//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements a set of utility VPlan to VPlan transformations.
///
//===----------------------------------------------------------------------===//
#include "VPlanTransforms.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
#include "VPlanUtils.h"
#include "VPlanVerifier.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
// NOTE(review): the doxygen scrape dropped several include lines here
// (original lines 25, 27, 29, 31-37, 41-44) — restore from upstream.

using namespace llvm;
using namespace VPlanPatternMatch;
using namespace SCEVPatternMatch;
49
51 VPlan &Plan, const TargetLibraryInfo &TLI) {
52
54 Plan.getVectorLoopRegion());
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
75 for (VPValue *Op : PhiR->operands())
76 NewRecipe->addOperand(Op);
77 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
78 assert(!isa<PHINode>(Inst) && "phis should be handled above");
79 // Create VPWidenMemoryRecipe for loads and stores.
80 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
81 NewRecipe = new VPWidenLoadRecipe(
82 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
83 false /*Consecutive*/, false /*Reverse*/, *VPI,
84 Ingredient.getDebugLoc());
85 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
86 NewRecipe = new VPWidenStoreRecipe(
87 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
88 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
89 Ingredient.getDebugLoc());
91 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
92 Ingredient.getDebugLoc());
93 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
94 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
95 if (VectorID == Intrinsic::not_intrinsic)
96 return false;
97 NewRecipe = new VPWidenIntrinsicRecipe(
98 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
99 drop_end(Ingredient.operands()), CI->getType(), VPIRFlags(*CI),
100 *VPI, CI->getDebugLoc());
101 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
102 NewRecipe = new VPWidenCastRecipe(
103 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
104 VPIRFlags(*CI), VPIRMetadata(*CI));
105 } else {
106 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
107 *VPI, Ingredient.getDebugLoc());
108 }
109 } else {
111 "inductions must be created earlier");
112 continue;
113 }
114
115 NewRecipe->insertBefore(&Ingredient);
116 if (NewRecipe->getNumDefinedValues() == 1)
117 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
118 else
119 assert(NewRecipe->getNumDefinedValues() == 0 &&
120 "Only recpies with zero or one defined values expected");
121 Ingredient.eraseFromParent();
122 }
123 }
124 return true;
125}
126
127/// Helper for extra no-alias checks via known-safe recipe and SCEV.
129 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
130 VPReplicateRecipe &GroupLeader;
132 const Loop &L;
133 VPTypeAnalysis &TypeInfo;
134
135 // Return true if \p A and \p B are known to not alias for all VFs in the
136 // plan, checked via the distance between the accesses
137 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
138 if (A->getOpcode() != Instruction::Store ||
139 B->getOpcode() != Instruction::Store)
140 return false;
141
142 VPValue *AddrA = A->getOperand(1);
143 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
144 VPValue *AddrB = B->getOperand(1);
145 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
147 return false;
148
149 const APInt *Distance;
150 ScalarEvolution &SE = *PSE.getSE();
151 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
152 return false;
153
154 const DataLayout &DL = SE.getDataLayout();
155 Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
156 uint64_t SizeA = DL.getTypeStoreSize(TyA);
157 Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
158 uint64_t SizeB = DL.getTypeStoreSize(TyB);
159
160 // Use the maximum store size to ensure no overlap from either direction.
161 // Currently only handles fixed sizes, as it is only used for
162 // replicating VPReplicateRecipes.
163 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
164
165 auto VFs = B->getParent()->getPlan()->vectorFactors();
167 if (MaxVF.isScalable())
168 return false;
169 return Distance->abs().uge(
170 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
171 }
172
173public:
176 const Loop &L, VPTypeAnalysis &TypeInfo)
177 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
178 L(L), TypeInfo(TypeInfo) {}
179
180 /// Return true if \p R should be skipped during alias checking, either
181 /// because it's in the exclude set or because no-alias can be proven via
182 /// SCEV.
183 bool shouldSkip(VPRecipeBase &R) const {
184 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
185 return ExcludeRecipes.contains(&R) ||
186 (Store && isNoAliasViaDistance(Store, &GroupLeader));
187 }
188};
189
// NOTE(review): doxygen scrape artifact — the leading numbers on each line
// below are doxygen line numbers, not code, and original line 197 (carrying
// the function name and the leading parameters, presumably the MemoryLocation
// argument) is missing. Kept byte-identical; restore from upstream.
196static bool
198 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
199 std::optional<SinkStoreInfo> SinkInfo = {}) {
// Reads only need checking when sinking stores; for load hoisting only
// writers between the blocks matter.
200 bool CheckReads = SinkInfo.has_value();
// Without noalias scope metadata nothing can be proven — bail conservatively.
201 if (!MemLoc.AATags.Scope)
202 return false;
203
// Walk the single-successor chain from FirstBB to LastBB.
204 for (VPBlockBase *Block = FirstBB; Block;
205 Block = Block->getSingleSuccessor()) {
206 assert(Block->getNumSuccessors() <= 1 &&
207 "Expected at most one successor in block chain");
208 auto *VPBB = cast<VPBasicBlock>(Block);
209 for (VPRecipeBase &R : *VPBB) {
// Recipes proven safe via the exclude set or SCEV distance are skipped.
210 if (SinkInfo && SinkInfo->shouldSkip(R))
211 continue;
212
213 // Skip recipes that don't need checking.
214 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
215 continue;
216
// NOTE(review): original line 217 (obtaining R's memory location into Loc)
// was dropped by the scrape.
218 if (!Loc)
219 // Conservatively assume aliasing for memory operations without
220 // location.
221 return false;
222
// NOTE(review): original line 223 (the scoped-noalias condition guarding this
// early return) was dropped by the scrape.
224 return false;
225 }
226
227 if (Block == LastBB)
228 break;
229 }
230 return true;
231}
232
233/// Collect either replicated Loads or Stores grouped by their address SCEV.
234template <unsigned Opcode>
// NOTE(review): doxygen scrape artifact — the embedded numbers are doxygen
// line numbers, and original lines 235-236 (return type and function name)
// are missing. Kept byte-identical; restore from upstream.
237 VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
238 function_ref<bool(VPReplicateRecipe *)> FilterFn) {
239 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
240 "Only Load and Store opcodes supported");
241 constexpr bool IsLoad = (Opcode == Instruction::Load);
// NOTE(review): original line 242 (the map type of RecipesByAddress, keyed by
// const SCEV *) was dropped by the scrape.
243 RecipesByAddress;
244 for (VPBlockBase *Block :
// NOTE(review): original line 245 (the range expression of this loop) was
// dropped by the scrape.
246 auto *VPBB = cast<VPBasicBlock>(Block);
247 for (VPRecipeBase &R : *VPBB) {
// Only replicated loads/stores matching the caller's filter participate.
248 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
249 if (!RepR || RepR->getOpcode() != Opcode || !FilterFn(RepR))
250 continue;
251
252 // For loads, operand 0 is address; for stores, operand 1 is address.
253 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
254 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
// Accesses without a computable SCEV address are not grouped at all.
255 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
256 RecipesByAddress[AddrSCEV].push_back(RepR);
257 }
258 }
259 auto Groups = to_vector(RecipesByAddress.values());
260 VPDominatorTree VPDT(Plan);
261 for (auto &Group : Groups) {
262 // Sort mem ops by dominance order, with earliest (most dominating) first.
// NOTE(review): original line 263 (the sort call with its lambda header) was
// dropped by the scrape.
264 return VPDT.properlyDominates(A, B);
265 });
266 }
267 return Groups;
268}
269
270/// Return true if we do not know how to (mechanically) hoist or sink \p R out
271/// of a loop region.
273 // Assumes don't alias anything or throw; as long as they're guaranteed to
274 // execute, they're safe to hoist.
276 return false;
277
278 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
279 // memory location is not modified in the vector loop.
280 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
281 return true;
282
283 // Allocas cannot be hoisted.
284 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
285 return RepR && RepR->getOpcode() == Instruction::Alloca;
286}
287
288static bool sinkScalarOperands(VPlan &Plan) {
289 auto Iter = vp_depth_first_deep(Plan.getEntry());
290 bool ScalarVFOnly = Plan.hasScalarVFOnly();
291 bool Changed = false;
292
294 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
295 VPBasicBlock *SinkTo, VPValue *Op) {
296 auto *Candidate =
297 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
298 if (!Candidate)
299 return;
300
301 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
302 // for now.
304 return;
305
306 if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
307 return;
308
309 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
310 if (!ScalarVFOnly && RepR->isSingleScalar())
311 return;
312
313 WorkList.insert({SinkTo, Candidate});
314 };
315
316 // First, collect the operands of all recipes in replicate blocks as seeds for
317 // sinking.
319 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
320 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
321 continue;
322 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
323 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
324 continue;
325 for (auto &Recipe : *VPBB)
326 for (VPValue *Op : Recipe.operands())
327 InsertIfValidSinkCandidate(VPBB, Op);
328 }
329
330 // Try to sink each replicate or scalar IV steps recipe in the worklist.
331 for (unsigned I = 0; I != WorkList.size(); ++I) {
332 VPBasicBlock *SinkTo;
333 VPSingleDefRecipe *SinkCandidate;
334 std::tie(SinkTo, SinkCandidate) = WorkList[I];
335
336 // All recipe users of SinkCandidate must be in the same block SinkTo or all
337 // users outside of SinkTo must only use the first lane of SinkCandidate. In
338 // the latter case, we need to duplicate SinkCandidate.
339 auto UsersOutsideSinkTo =
340 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
341 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
342 });
343 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
344 return !U->usesFirstLaneOnly(SinkCandidate);
345 }))
346 continue;
347 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
348
349 if (NeedsDuplicating) {
350 if (ScalarVFOnly)
351 continue;
352 VPSingleDefRecipe *Clone;
353 if (auto *SinkCandidateRepR =
354 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
355 // TODO: Handle converting to uniform recipes as separate transform,
356 // then cloning should be sufficient here.
357 Instruction *I = SinkCandidate->getUnderlyingInstr();
358 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
359 nullptr /*Mask*/, *SinkCandidateRepR,
360 *SinkCandidateRepR);
361 // TODO: add ".cloned" suffix to name of Clone's VPValue.
362 } else {
363 Clone = SinkCandidate->clone();
364 }
365
366 Clone->insertBefore(SinkCandidate);
367 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
368 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
369 });
370 }
371 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
372 for (VPValue *Op : SinkCandidate->operands())
373 InsertIfValidSinkCandidate(SinkTo, Op);
374 Changed = true;
375 }
376 return Changed;
377}
378
379/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
380/// the mask.
382 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
383 if (!EntryBB || EntryBB->size() != 1 ||
384 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
385 return nullptr;
386
387 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
388}
389
390/// If \p R is a triangle region, return the 'then' block of the triangle.
392 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
393 if (EntryBB->getNumSuccessors() != 2)
394 return nullptr;
395
396 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
397 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
398 if (!Succ0 || !Succ1)
399 return nullptr;
400
401 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
402 return nullptr;
403 if (Succ0->getSingleSuccessor() == Succ1)
404 return Succ0;
405 if (Succ1->getSingleSuccessor() == Succ0)
406 return Succ1;
407 return nullptr;
408}
409
410// Merge replicate regions in their successor region, if a replicate region
411// is connected to a successor replicate region with the same predicate by a
412// single, empty VPBasicBlock.
414 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
415
416 // Collect replicate regions followed by an empty block, followed by another
417 // replicate region with matching masks to process front. This is to avoid
418 // iterator invalidation issues while merging regions.
421 vp_depth_first_deep(Plan.getEntry()))) {
422 if (!Region1->isReplicator())
423 continue;
424 auto *MiddleBasicBlock =
425 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
426 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
427 continue;
428
429 auto *Region2 =
430 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
431 if (!Region2 || !Region2->isReplicator())
432 continue;
433
434 VPValue *Mask1 = getPredicatedMask(Region1);
435 VPValue *Mask2 = getPredicatedMask(Region2);
436 if (!Mask1 || Mask1 != Mask2)
437 continue;
438
439 assert(Mask1 && Mask2 && "both region must have conditions");
440 WorkList.push_back(Region1);
441 }
442
443 // Move recipes from Region1 to its successor region, if both are triangles.
444 for (VPRegionBlock *Region1 : WorkList) {
445 if (TransformedRegions.contains(Region1))
446 continue;
447 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
448 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
449
450 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
451 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
452 if (!Then1 || !Then2)
453 continue;
454
455 // Note: No fusion-preventing memory dependencies are expected in either
456 // region. Such dependencies should be rejected during earlier dependence
457 // checks, which guarantee accesses can be re-ordered for vectorization.
458 //
459 // Move recipes to the successor region.
460 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
461 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
462
463 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
464 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
465
466 // Move VPPredInstPHIRecipes from the merge block to the successor region's
467 // merge block. Update all users inside the successor region to use the
468 // original values.
469 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
470 VPValue *PredInst1 =
471 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
472 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
473 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
474 return cast<VPRecipeBase>(&U)->getParent() == Then2;
475 });
476
477 // Remove phi recipes that are unused after merging the regions.
478 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
479 Phi1ToMove.eraseFromParent();
480 continue;
481 }
482 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
483 }
484
485 // Remove the dead recipes in Region1's entry block.
486 for (VPRecipeBase &R :
487 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
488 R.eraseFromParent();
489
490 // Finally, remove the first region.
491 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
492 VPBlockUtils::disconnectBlocks(Pred, Region1);
493 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
494 }
495 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
496 TransformedRegions.insert(Region1);
497 }
498
499 return !TransformedRegions.empty();
500}
501
503 VPlan &Plan) {
504 Instruction *Instr = PredRecipe->getUnderlyingInstr();
505 // Build the triangular if-then region.
506 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
507 assert(Instr->getParent() && "Predicated instruction not in any basic block");
508 auto *BlockInMask = PredRecipe->getMask();
509 auto *MaskDef = BlockInMask->getDefiningRecipe();
510 auto *BOMRecipe = new VPBranchOnMaskRecipe(
511 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
512 auto *Entry =
513 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
514
515 // Replace predicated replicate recipe with a replicate recipe without a
516 // mask but in the replicate region.
517 auto *RecipeWithoutMask = new VPReplicateRecipe(
518 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
519 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
520 PredRecipe->getDebugLoc());
521 auto *Pred =
522 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
523
524 VPPredInstPHIRecipe *PHIRecipe = nullptr;
525 if (PredRecipe->getNumUsers() != 0) {
526 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
527 RecipeWithoutMask->getDebugLoc());
528 PredRecipe->replaceAllUsesWith(PHIRecipe);
529 PHIRecipe->setOperand(0, RecipeWithoutMask);
530 }
531 PredRecipe->eraseFromParent();
532 auto *Exiting =
533 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
535 Plan.createReplicateRegion(Entry, Exiting, RegionName);
536
537 // Note: first set Entry as region entry and then connect successors starting
538 // from it in order, to propagate the "parent" of each VPBasicBlock.
539 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
540 VPBlockUtils::connectBlocks(Pred, Exiting);
541
542 return Region;
543}
544
545static void addReplicateRegions(VPlan &Plan) {
548 vp_depth_first_deep(Plan.getEntry()))) {
549 for (VPRecipeBase &R : *VPBB)
550 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
551 if (RepR->isPredicated())
552 WorkList.push_back(RepR);
553 }
554 }
555
556 unsigned BBNum = 0;
557 for (VPReplicateRecipe *RepR : WorkList) {
558 VPBasicBlock *CurrentBlock = RepR->getParent();
559 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
560
561 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
562 SplitBlock->setName(
563 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
564 // Record predicated instructions for above packing optimizations.
566 Region->setParent(CurrentBlock->getParent());
568
569 VPRegionBlock *ParentRegion = Region->getParent();
570 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
571 ParentRegion->setExiting(SplitBlock);
572 }
573}
574
575/// Remove redundant VPBasicBlocks by merging them into their predecessor if
576/// the predecessor has a single successor.
580 vp_depth_first_deep(Plan.getEntry()))) {
581 // Don't fold the blocks in the skeleton of the Plan into their single
582 // predecessors for now.
583 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
584 if (!VPBB->getParent())
585 continue;
586 auto *PredVPBB =
587 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
588 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
589 isa<VPIRBasicBlock>(PredVPBB))
590 continue;
591 WorkList.push_back(VPBB);
592 }
593
594 for (VPBasicBlock *VPBB : WorkList) {
595 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
596 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
597 R.moveBefore(*PredVPBB, PredVPBB->end());
598 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
599 auto *ParentRegion = VPBB->getParent();
600 if (ParentRegion && ParentRegion->getExiting() == VPBB)
601 ParentRegion->setExiting(PredVPBB);
602 for (auto *Succ : to_vector(VPBB->successors())) {
604 VPBlockUtils::connectBlocks(PredVPBB, Succ);
605 }
606 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
607 }
608 return !WorkList.empty();
609}
610
612 // Convert masked VPReplicateRecipes to if-then region blocks.
614
615 bool ShouldSimplify = true;
616 while (ShouldSimplify) {
617 ShouldSimplify = sinkScalarOperands(Plan);
618 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
619 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
620 }
621}
622
623/// Remove redundant casts of inductions.
624///
625/// Such redundant casts are casts of induction variables that can be ignored,
626/// because we already proved that the casted phi is equal to the uncasted phi
627/// in the vectorized loop. There is no need to vectorize the cast - the same
628/// value can be used for both the phi and casts in the vector loop.
630 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
632 if (!IV || IV->getTruncInst())
633 continue;
634
635 // A sequence of IR Casts has potentially been recorded for IV, which
636 // *must be bypassed* when the IV is vectorized, because the vectorized IV
637 // will produce the desired casted value. This sequence forms a def-use
638 // chain and is provided in reverse order, ending with the cast that uses
639 // the IV phi. Search for the recipe of the last cast in the chain and
640 // replace it with the original IV. Note that only the final cast is
641 // expected to have users outside the cast-chain and the dead casts left
642 // over will be cleaned up later.
643 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
644 VPValue *FindMyCast = IV;
645 for (Instruction *IRCast : reverse(Casts)) {
646 VPSingleDefRecipe *FoundUserCast = nullptr;
647 for (auto *U : FindMyCast->users()) {
648 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
649 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
650 FoundUserCast = UserCast;
651 break;
652 }
653 }
654 FindMyCast = FoundUserCast;
655 }
656 FindMyCast->replaceAllUsesWith(IV);
657 }
658}
659
660/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
661/// recipe, if it exists.
663 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
664 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
665 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
666 for (VPUser *U : CanonicalIV->users()) {
668 if (WidenNewIV)
669 break;
670 }
671
672 if (!WidenNewIV)
673 return;
674
675 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
676 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
677 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
678
679 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
680 continue;
681
682 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
683 // everything WidenNewIV's users need. That is, WidenOriginalIV will
684 // generate a vector phi or all users of WidenNewIV demand the first lane
685 // only.
686 if (Plan.hasScalarVFOnly() ||
687 !vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
688 vputils::onlyFirstLaneUsed(WidenNewIV)) {
689 // We are replacing a wide canonical iv with a suitable wide induction.
690 // This is used to compute header mask, hence all lanes will be used and
691 // we need to drop wrap flags only applying to lanes guranteed to execute
692 // in the original scalar loop.
693 WidenOriginalIV->dropPoisonGeneratingFlags();
694 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
695 WidenNewIV->eraseFromParent();
696 return;
697 }
698 }
699}
700
701/// Returns true if \p R is dead and can be removed.
702static bool isDeadRecipe(VPRecipeBase &R) {
703 // Do remove conditional assume instructions as their conditions may be
704 // flattened.
705 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
706 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
708 if (IsConditionalAssume)
709 return true;
710
711 if (R.mayHaveSideEffects())
712 return false;
713
714 // Recipe is dead if no user keeps the recipe alive.
715 return all_of(R.definedValues(),
716 [](VPValue *V) { return V->getNumUsers() == 0; });
717}
718
721 vp_post_order_deep(Plan.getEntry()))) {
722 // The recipes in the block are processed in reverse order, to catch chains
723 // of dead recipes.
724 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
725 if (isDeadRecipe(R)) {
726 R.eraseFromParent();
727 continue;
728 }
729
730 // Check if R is a dead VPPhi <-> update cycle and remove it.
731 auto *PhiR = dyn_cast<VPPhi>(&R);
732 if (!PhiR || PhiR->getNumOperands() != 2)
733 continue;
734 VPUser *PhiUser = PhiR->getSingleUser();
735 if (!PhiUser)
736 continue;
737 VPValue *Incoming = PhiR->getOperand(1);
738 if (PhiUser != Incoming->getDefiningRecipe() ||
739 Incoming->getNumUsers() != 1)
740 continue;
741 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
742 PhiR->eraseFromParent();
743 Incoming->getDefiningRecipe()->eraseFromParent();
744 }
745 }
746}
747
750 Instruction::BinaryOps InductionOpcode,
751 FPMathOperator *FPBinOp, Instruction *TruncI,
752 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
753 VPBuilder &Builder) {
754 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
755 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
756 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
757 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
758 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
759
760 // Truncate base induction if needed.
761 VPTypeAnalysis TypeInfo(Plan);
762 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
763 if (TruncI) {
764 Type *TruncTy = TruncI->getType();
765 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
766 "Not truncating.");
767 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
768 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
769 ResultTy = TruncTy;
770 }
771
772 // Truncate step if needed.
773 Type *StepTy = TypeInfo.inferScalarType(Step);
774 if (ResultTy != StepTy) {
775 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
776 "Not truncating.");
777 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
778 auto *VecPreheader =
780 VPBuilder::InsertPointGuard Guard(Builder);
781 Builder.setInsertPoint(VecPreheader);
782 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
783 }
784 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
785 &Plan.getVF(), DL);
786}
787
790 for (unsigned I = 0; I != Users.size(); ++I) {
792 if (isa<VPHeaderPHIRecipe>(Cur))
793 continue;
794 for (VPValue *V : Cur->definedValues())
795 Users.insert_range(V->users());
796 }
797 return Users.takeVector();
798}
799
800/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
801/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
802/// generates scalar values.
803static VPValue *
805 VPlan &Plan, VPBuilder &Builder) {
807 VPIRValue *StartV = Plan.getZero(ID.getStep()->getType());
808 VPValue *StepV = PtrIV->getOperand(1);
810 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
811 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
812
813 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
814 PtrIV->getDebugLoc(), "next.gep");
815}
816
817/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
818/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
819/// VPWidenPointerInductionRecipe will generate vectors only. If some users
820/// require vectors while other require scalars, the scalar uses need to extract
821/// the scalars from the generated vectors (Note that this is different to how
822/// int/fp inductions are handled). Legalize extract-from-ends using uniform
823/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
824/// the correct end value is available. Also optimize
825/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
826/// providing them scalar steps built on the canonical scalar IV and update the
827/// original IV's users. This is an optional optimization to reduce the needs of
828/// vector extracts.
831 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
832 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
833 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
834 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
835 if (!PhiR)
836 continue;
837
838 // Try to narrow wide and replicating recipes to uniform recipes, based on
839 // VPlan analysis.
840 // TODO: Apply to all recipes in the future, to replace legacy uniformity
841 // analysis.
842 auto Users = collectUsersRecursively(PhiR);
843 for (VPUser *U : reverse(Users)) {
844 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
845 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
846 // Skip recipes that shouldn't be narrowed.
847 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
848 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
849 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
850 continue;
851
852 // Skip recipes that may have other lanes than their first used.
854 continue;
855
856 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
857 Def->operands(), /*IsUniform*/ true,
858 /*Mask*/ nullptr, /*Flags*/ *Def);
859 Clone->insertAfter(Def);
860 Def->replaceAllUsesWith(Clone);
861 }
862
863 // Replace wide pointer inductions which have only their scalars used by
864 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
865 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
866 if (!Plan.hasScalarVFOnly() &&
867 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
868 continue;
869
870 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
871 PtrIV->replaceAllUsesWith(PtrAdd);
872 continue;
873 }
874
875 // Replace widened induction with scalar steps for users that only use
876 // scalars.
877 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
878 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
879 return U->usesScalars(WideIV);
880 }))
881 continue;
882
883 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
885 Plan, ID.getKind(), ID.getInductionOpcode(),
886 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
887 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
888 WideIV->getDebugLoc(), Builder);
889
890 // Update scalar users of IV to use Step instead.
891 if (!HasOnlyVectorVFs) {
892 assert(!Plan.hasScalableVF() &&
893 "plans containing a scalar VF cannot also include scalable VFs");
894 WideIV->replaceAllUsesWith(Steps);
895 } else {
896 bool HasScalableVF = Plan.hasScalableVF();
897 WideIV->replaceUsesWithIf(Steps,
898 [WideIV, HasScalableVF](VPUser &U, unsigned) {
899 if (HasScalableVF)
900 return U.usesFirstLaneOnly(WideIV);
901 return U.usesScalars(WideIV);
902 });
903 }
904 }
905}
906
907/// Check if \p VPV is an untruncated wide induction, either before or after the
908/// increment. If so return the header IV (before the increment), otherwise
909/// return null.
// NOTE(review): the function header (original lines 910-911) is elided in this
// listing; from the uses below it presumably takes (VPValue *VPV,
// PredicatedScalarEvolution &PSE) — confirm against upstream.
912 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
913 if (WideIV) {
914 // VPV itself is a wide induction, separately compute the end value for exit
915 // users if it is not a truncated IV.
916 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
917 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
918 }
919
920 // Check if VPV is an optimizable induction increment.
921 VPRecipeBase *Def = VPV->getDefiningRecipe();
922 if (!Def || Def->getNumOperands() != 2)
923 return nullptr;
// The wide induction may be either operand of the (commutative) increment.
924 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
925 if (!WideIV)
926 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
927 if (!WideIV)
928 return nullptr;
929
// Returns true iff VPV is exactly "WideIV <op> step" for the induction's own
// opcode, i.e. the canonical increment of the induction.
930 auto IsWideIVInc = [&]() {
931 auto &ID = WideIV->getInductionDescriptor();
932
933 // Check if VPV increments the induction by the induction step.
934 VPValue *IVStep = WideIV->getStepValue();
935 switch (ID.getInductionOpcode()) {
936 case Instruction::Add:
937 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
938 case Instruction::FAdd:
939 return match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
940 case Instruction::FSub:
941 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
942 m_Specific(IVStep)));
943 case Instruction::Sub: {
944 // IVStep will be the negated step of the subtraction. Check if Step == -1
945 // * IVStep.
946 VPValue *Step;
947 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
948 return false;
// Use SCEV to prove Step == -IVStep; bail out if either cannot be computed.
949 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
950 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
951 ScalarEvolution &SE = *PSE.getSE();
952 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
953 !isa<SCEVCouldNotCompute>(StepSCEV) &&
954 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
955 }
956 default:
// Pointer inductions advance via GEP rather than an arithmetic opcode.
957 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
958 match(VPV, m_GetElementPtr(m_Specific(WideIV),
959 m_Specific(WideIV->getStepValue())));
960 }
961 llvm_unreachable("should have been covered by switch above");
962 };
963 return IsWideIVInc() ? WideIV : nullptr;
964}
965
966/// Attempts to optimize the induction variable exit values for users in the
967/// early exit block.
// NOTE(review): the signature line (original line 968, with PSE and the
// opening brace on 972) and the pattern match that binds Incoming and Mask
// (original lines 974-975) are elided in this listing; verify upstream.
969 VPTypeAnalysis &TypeInfo,
970 VPBlockBase *PredVPBB,
971 VPValue *Op,
973 VPValue *Incoming, *Mask;
976 return nullptr;
977
// Only optimizable (untruncated, non-truncating) wide inductions qualify.
978 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
979 if (!WideIV)
980 return nullptr;
981
982 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
983 if (WideIntOrFp && WideIntOrFp->getTruncInst())
984 return nullptr;
985
986 // Calculate the final index.
987 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
988 auto *CanonicalIV = LoopRegion->getCanonicalIV();
989 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
990 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
991
992 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
// The exit index is the canonical IV plus the index of the first active
// lane of the exit mask.
993 VPValue *FirstActiveLane =
994 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
995 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
// FirstActiveLane may be narrower or wider than the canonical IV; normalize
// its width before the add.
996 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
997 FirstActiveLaneType, DL);
998 VPValue *EndValue = B.createAdd(CanonicalIV, FirstActiveLane, DL);
999
1000 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1001 // changed it means the exit is using the incremented value, so we need to
1002 // add the step.
1003 if (Incoming != WideIV) {
1004 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
1005 EndValue = B.createAdd(EndValue, One, DL);
1006 }
1007
// Non-canonical inductions need the end value derived from start and step.
1008 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1009 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1010 VPIRValue *Start = WideIV->getStartValue();
1011 VPValue *Step = WideIV->getStepValue();
1012 EndValue = B.createDerivedIV(
1013 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1014 Start, EndValue, Step);
1015 }
1016
1017 return EndValue;
1018}
1019
1020/// Compute the end value for \p WideIV, unless it is truncated. Creates a
1021/// VPDerivedIVRecipe for non-canonical inductions.
// NOTE(review): the first signature line (original line 1022) and the
// declaration of ID (original line 1034, used below) are elided in this
// listing; verify against upstream.
1023 VPBuilder &VectorPHBuilder,
1024 VPTypeAnalysis &TypeInfo,
1025 VPValue *VectorTC) {
1026 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1027 // Truncated wide inductions resume from the last lane of their vector value
1028 // in the last vector iteration which is handled elsewhere.
1029 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1030 return nullptr;
1031
1032 VPIRValue *Start = WideIV->getStartValue();
1033 VPValue *Step = WideIV->getStepValue();
1035 VPValue *EndValue = VectorTC;
// For the canonical induction the end value is the vector trip count itself;
// otherwise derive it from the induction's start and step.
1036 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1037 EndValue = VectorPHBuilder.createDerivedIV(
1038 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1039 Start, VectorTC, Step);
1040 }
1041
1042 // EndValue is derived from the vector trip count (which has the same type as
1043 // the widest induction) and thus may be wider than the induction here.
1044 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
1045 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
1046 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
1047 ScalarTypeOfWideIV,
1048 WideIV->getDebugLoc());
1049 }
1050
1051 return EndValue;
1052}
1053
1054/// Attempts to optimize the induction variable exit values for users in the
1055/// exit block coming from the latch in the original scalar loop.
// NOTE(review): parts of the signature (original lines 1056, 1058-1059) and
// the match that binds Incoming (original line 1061) are elided in this
// listing; verify against upstream before editing.
1057 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
1060 VPWidenInductionRecipe *WideIV = nullptr;
1062 WideIV = getOptimizableIVOf(Incoming, PSE);
1063
1064 if (!WideIV)
1065 return nullptr;
1066
// End values are precomputed by the caller (see optimizeInductionExitUsers).
1067 VPValue *EndValue = EndValues.lookup(WideIV);
1068 assert(EndValue && "Must have computed the end value up front");
1069
1070 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1071 // changed it means the exit is using the incremented value, so we don't
1072 // need to subtract the step.
1073 if (Incoming != WideIV)
1074 return EndValue;
1075
1076 // Otherwise, subtract the step from the EndValue.
1077 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
1078 VPValue *Step = WideIV->getStepValue();
1079 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
1080 if (ScalarTy->isIntegerTy())
1081 return B.createSub(EndValue, Step, DebugLoc::getUnknown(), "ind.escape");
1082 if (ScalarTy->isPointerTy()) {
// Pointers have no subtract; add the negated step instead.
1083 Type *StepTy = TypeInfo.inferScalarType(Step);
1084 auto *Zero = Plan.getZero(StepTy);
1085 return B.createPtrAdd(EndValue, B.createSub(Zero, Step),
1086 DebugLoc::getUnknown(), "ind.escape");
1087 }
1088 if (ScalarTy->isFloatingPointTy()) {
// Undo the FP induction update by applying the inverse binary opcode,
// preserving the induction's fast-math flags.
1089 const auto &ID = WideIV->getInductionDescriptor();
1090 return B.createNaryOp(
1091 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1092 ? Instruction::FSub
1093 : Instruction::FAdd,
1094 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1095 }
1096 llvm_unreachable("all possible induction types must be handled");
1097 return nullptr;
1098}
1099
// NOTE(review): the first signature line (original line 1100), the EndValues
// map declaration (original line 1107), and two call lines (original lines
// 1113-1114 and 1143) are elided in this listing; verify against upstream.
1101 VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
1102 // Compute end values for all inductions.
1103 VPTypeAnalysis TypeInfo(Plan);
1104 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1105 auto *VectorPH = cast<VPBasicBlock>(VectorRegion->getSinglePredecessor());
1106 VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
// When folding the tail, the vector loop covers the full original trip count,
// so inductions resume from it rather than from the vector trip count.
1108 VPValue *ResumeTC =
1109 FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
1110 for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
1111 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&Phi);
1112 if (!WideIV)
1113 continue;
1115 WideIV, VectorPHBuilder, TypeInfo, ResumeTC))
1116 EndValues[WideIV] = EndValue;
1117 }
1118
// Replace middle-block recipes that extract the exiting IV value with the
// precomputed end value, then erase them.
1119 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
1120 for (VPRecipeBase &R : make_early_inc_range(*MiddleVPBB)) {
1121 VPValue *Op;
1122 if (!match(&R, m_ExitingIVValue(m_VPValue(Op))))
1123 continue;
1124 auto *WideIV = cast<VPWidenInductionRecipe>(Op);
1125 if (VPValue *EndValue = EndValues.lookup(WideIV)) {
1126 R.getVPSingleValue()->replaceAllUsesWith(EndValue);
1127 R.eraseFromParent();
1128 }
1129 }
1130
1131 // Then, optimize exit block users.
1132 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1133 for (VPRecipeBase &R : ExitVPBB->phis()) {
1134 auto *ExitIRI = cast<VPIRPhi>(&R);
1135
1136 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1137 VPValue *Escape = nullptr;
// Latch exits come through the middle block and use the precomputed end
// values; other (early) exits are handled by the early-exit variant
// (call elided at original line 1143).
1138 if (PredVPBB == MiddleVPBB)
1139 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
1140 ExitIRI->getOperand(Idx),
1141 EndValues, PSE);
1142 else
1144 Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), PSE);
1145 if (Escape)
1146 ExitIRI->setOperand(Idx, Escape);
1147 }
1148 }
1149 }
1150}
1151
1152/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1153/// them with already existing recipes expanding the same SCEV expression.
// NOTE(review): the signature, the SCEV2VPV map declaration (original lines
// 1154-1155), and the range expression of the loop (original line 1158) are
// elided in this listing; verify against upstream.
1156
1157 for (VPRecipeBase &R :
1159 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1160 if (!ExpR)
1161 continue;
1162
// First expansion of each SCEV wins; later duplicates are folded into it.
1163 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1164 if (Inserted)
1165 continue;
1166 ExpR->replaceAllUsesWith(V->second);
1167 ExpR->eraseFromParent();
1168 }
1169}
1170
// NOTE(review): the signature (original line 1171) and the declaration of the
// Seen set (original line 1173) are elided in this listing; from the code
// below this presumably takes a single VPValue *V — verify upstream.
1172 SmallVector<VPValue *> WorkList;
1174 WorkList.push_back(V);
1175
// Iterative DFS: erase a dead defining recipe, then reconsider its operands,
// which may themselves have become dead as a result.
1176 while (!WorkList.empty()) {
1177 VPValue *Cur = WorkList.pop_back_val();
// Guard against revisiting values reachable via multiple operand edges.
1178 if (!Seen.insert(Cur).second)
1179 continue;
1180 VPRecipeBase *R = Cur->getDefiningRecipe();
1181 if (!R)
1182 continue;
1183 if (!isDeadRecipe(*R))
1184 continue;
// Queue operands before erasing, as erasure drops the operand list.
1185 append_range(WorkList, R->operands());
1186 R->eraseFromParent();
1187 }
1188}
1189
1190/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1191/// Returns an optional pair, where the first element indicates whether it is
1192/// an intrinsic ID.
1193static std::optional<std::pair<bool, unsigned>>
// NOTE(review): the function-name/parameter line (original line 1194) and the
// recipe-type list of the first Case (original lines 1197-1198) are elided in
// this listing; verify against upstream.
1195 return TypeSwitch<const VPSingleDefRecipe *,
1196 std::optional<std::pair<bool, unsigned>>>(R)
1199 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1200 .Case([](const VPWidenIntrinsicRecipe *I) {
1201 return std::make_pair(true, I->getVectorIntrinsicID());
1202 })
1203 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
1204 // For recipes that do not directly map to LLVM IR instructions,
1205 // assign opcodes after the last VPInstruction opcode (which is also
1206 // after the last IR Instruction opcode), based on the VPRecipeID.
1207 return std::make_pair(false,
1208 VPInstruction::OpsEnd + 1 + I->getVPRecipeID());
1209 })
1210 .Default([](auto *) { return std::nullopt; });
1211}
1212
1213/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1214/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1215/// Operands are foldable live-ins.
// NOTE(review): the first signature line (original line 1216), the Ops vector
// declaration (original line 1224), and several case labels/arguments in the
// switch below (original lines 1251-1253, 1255-1256, 1269-1270) are elided in
// this listing; verify against upstream before editing.
1217 ArrayRef<VPValue *> Operands,
1218 const DataLayout &DL,
1219 VPTypeAnalysis &TypeInfo) {
1220 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1221 if (!OpcodeOrIID)
1222 return nullptr;
1223
// Folding is only attempted when every operand is a live-in with an
// underlying IR value; collect those IR values for the folder.
1225 for (VPValue *Op : Operands) {
1226 if (!match(Op, m_LiveIn()))
1227 return nullptr;
1228 Value *V = Op->getUnderlyingValue();
1229 if (!V)
1230 return nullptr;
1231 Ops.push_back(V);
1232 }
1233
// Returns the folded IR value, or nullptr if this opcode/intrinsic is not
// handled or cannot be simplified.
1234 auto FoldToIRValue = [&]() -> Value * {
1235 InstSimplifyFolder Folder(DL);
1236 if (OpcodeOrIID->first) {
// Intrinsic path: only binary intrinsics are handled.
1237 if (R.getNumOperands() != 2)
1238 return nullptr;
1239 unsigned ID = OpcodeOrIID->second;
1240 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1241 TypeInfo.inferScalarType(&R));
1242 }
1243 unsigned Opcode = OpcodeOrIID->second;
1244 if (Instruction::isBinaryOp(Opcode))
1245 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1246 Ops[0], Ops[1]);
1247 if (Instruction::isCast(Opcode))
1248 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1249 TypeInfo.inferScalarType(R.getVPSingleValue()));
1250 switch (Opcode) {
1252 return Folder.FoldSelect(Ops[0], Ops[1],
1254 case VPInstruction::Not:
1255 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1257 case Instruction::Select:
1258 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1259 case Instruction::ICmp:
1260 case Instruction::FCmp:
1261 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1262 Ops[1]);
1263 case Instruction::GetElementPtr: {
1264 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1265 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1266 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1267 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1268 }
// The case labels here (original lines 1269-1270) are elided; the body folds
// a pointer-add as an i8 GEP — presumably the VPInstruction pointer-add
// opcodes. Confirm against upstream.
1271 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1272 Ops[0], Ops[1],
1273 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1274 // An extract of a live-in is an extract of a broadcast, so return the
1275 // broadcasted element.
1276 case Instruction::ExtractElement:
1277 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1278 return Ops[0];
1279 }
1280 return nullptr;
1281 };
1282
// Register any successful fold as a live-in of the plan and return it.
1283 if (Value *V = FoldToIRValue())
1284 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1285 return nullptr;
1286}
1287
1288/// Try to simplify VPSingleDefRecipe \p Def.
// NOTE(review): the function signature (original line 1289) and a number of
// pattern-match lines throughout this function are elided in this listing
// (e.g. original lines 1364-1365, 1404-1405, 1415, 1420, 1424, 1491, 1518,
// 1523, 1548/1550, 1563, 1574/1576, 1590, 1607, 1615, 1630-1631, 1637, 1645,
// 1658, 1695, 1704, 1710-1711). Verify each match() condition against
// upstream before editing any of the combines below.
1290 VPlan *Plan = Def->getParent()->getPlan();
1291
1292 // Simplification of live-in IR values for SingleDef recipes using
1293 // InstSimplifyFolder.
1294 const DataLayout &DL = Plan->getDataLayout();
1295 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1296 return Def->replaceAllUsesWith(V);
1297
1298 // Fold PredPHI LiveIn -> LiveIn.
1299 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1300 VPValue *Op = PredPHI->getOperand(0);
1301 if (isa<VPIRValue>(Op))
1302 PredPHI->replaceAllUsesWith(Op);
1303 }
1304
1305 VPBuilder Builder(Def);
1306
1307 // Avoid replacing VPInstructions with underlying values with new
1308 // VPInstructions, as we would fail to create widen/replicate recipes from the
1309 // new VPInstructions without an underlying value, and miss out on some
1310 // transformations that only apply to widened/replicated recipes later, by
1311 // doing so.
1312 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1313 // VPInstructions without underlying values, as those will get skipped during
1314 // cost computation.
1315 bool CanCreateNewRecipe =
1316 !isa<VPInstruction>(Def) || !Def->getUnderlyingValue();
1317
// Fold trunc(ext(A)): drop the pair when types match, otherwise replace the
// pair with a single narrower/wider cast.
1318 VPValue *A;
1319 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1320 Type *TruncTy = TypeInfo.inferScalarType(Def);
1321 Type *ATy = TypeInfo.inferScalarType(A);
1322 if (TruncTy == ATy) {
1323 Def->replaceAllUsesWith(A);
1324 } else {
1325 // Don't replace a non-widened cast recipe with a widened cast.
1326 if (!isa<VPWidenCastRecipe>(Def))
1327 return;
1328 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1329
// Preserve the original extension kind (sext vs zext) in the new cast.
1330 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1331 ? Instruction::SExt
1332 : Instruction::ZExt;
1333 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1334 TruncTy);
1335 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1336 // UnderlyingExt has distinct return type, used to retain legacy cost.
1337 Ext->setUnderlyingValue(UnderlyingExt);
1338 }
1339 Def->replaceAllUsesWith(Ext);
1340 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1341 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1342 Def->replaceAllUsesWith(Trunc);
1343 }
1344 }
1345#ifndef NDEBUG
1346 // Verify that the cached type info for both A and its users is still
1347 // accurate by comparing it to freshly computed types.
1348 VPTypeAnalysis TypeInfo2(*Plan);
1349 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1350 for (VPUser *U : A->users()) {
1351 auto *R = cast<VPRecipeBase>(U);
1352 for (VPValue *VPV : R->definedValues())
1353 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1354 }
1355#endif
1356 }
1357
1358 // Simplify (X && Y) | (X && !Y) -> X.
1359 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1360 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1361 // recipes to be visited during simplification.
1362 VPValue *X, *Y, *Z;
1363 if (match(Def,
1366 Def->replaceAllUsesWith(X);
1367 Def->eraseFromParent();
1368 return;
1369 }
1370
1371 // x | AllOnes -> AllOnes
1372 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1373 return Def->replaceAllUsesWith(
1374 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1375
1376 // x | 0 -> x
1377 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1378 return Def->replaceAllUsesWith(X);
1379
1380 // x | !x -> AllOnes
1382 return Def->replaceAllUsesWith(
1383 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1384
1385 // x & 0 -> 0
1386 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1387 return Def->replaceAllUsesWith(
1388 Plan->getZero(TypeInfo.inferScalarType(Def)));
1389
1390 // x & AllOnes -> x
1391 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_AllOnes())))
1392 return Def->replaceAllUsesWith(X);
1393
1394 // x && false -> false
1395 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_False())))
1396 return Def->replaceAllUsesWith(Plan->getFalse());
1397
1398 // x && true -> x
1399 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_True())))
1400 return Def->replaceAllUsesWith(X);
1401
1402 // (x && y) | (x && z) -> x && (y | z)
1403 if (CanCreateNewRecipe &&
1406 // Simplify only if one of the operands has one use to avoid creating an
1407 // extra recipe.
1408 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1409 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1410 return Def->replaceAllUsesWith(
1411 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1412
1413 // x && (x && y) -> x && y
1414 if (match(Def, m_LogicalAnd(m_VPValue(X),
1416 return Def->replaceAllUsesWith(Def->getOperand(1));
1417
1418 // x && (y && x) -> x && y
1419 if (match(Def, m_LogicalAnd(m_VPValue(X),
1421 return Def->replaceAllUsesWith(Builder.createLogicalAnd(X, Y));
1422
1423 // x && !x -> 0
1425 return Def->replaceAllUsesWith(Plan->getFalse());
1426
// select c, x, x -> x
1427 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1428 return Def->replaceAllUsesWith(X);
1429
1430 // select c, false, true -> not c
1431 VPValue *C;
1432 if (CanCreateNewRecipe &&
1433 match(Def, m_Select(m_VPValue(C), m_False(), m_True())))
1434 return Def->replaceAllUsesWith(Builder.createNot(C));
1435
1436 // select !c, x, y -> select c, y, x
1437 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1438 Def->setOperand(0, C);
1439 Def->setOperand(1, Y);
1440 Def->setOperand(2, X);
1441 return;
1442 }
1443
// a + 0 -> a
1444 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1445 return Def->replaceAllUsesWith(A);
1446
// a * 1 -> a
1447 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1448 return Def->replaceAllUsesWith(A);
1449
// a * 0 -> 0
1450 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1451 return Def->replaceAllUsesWith(
1452 Plan->getZero(TypeInfo.inferScalarType(Def)));
1453
// a * -1 -> 0 - a
1454 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_AllOnes()))) {
1455 // Preserve nsw from the Mul on the new Sub.
1457 false, cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap()};
1458 return Def->replaceAllUsesWith(
1459 Builder.createSub(Plan->getZero(TypeInfo.inferScalarType(A)), A,
1460 Def->getDebugLoc(), "", NW));
1461 }
1462
// a * 2^k -> a << k
1463 const APInt *APC;
1464 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) &&
1465 APC->isPowerOf2())
1466 return Def->replaceAllUsesWith(Builder.createNaryOp(
1467 Instruction::Shl,
1468 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1469 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1470
1471 // Don't convert udiv to lshr inside a replicate region, as VPInstructions are
1472 // not allowed in them.
1473 const VPRegionBlock *ParentRegion = Def->getParent()->getParent();
1474 bool IsInReplicateRegion = ParentRegion && ParentRegion->isReplicator();
// a udiv 2^k -> a >>u k
1475 if (CanCreateNewRecipe && !IsInReplicateRegion &&
1476 match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) && APC->isPowerOf2())
1477 return Def->replaceAllUsesWith(Builder.createNaryOp(
1478 Instruction::LShr,
1479 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1480 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1481
1482 if (match(Def, m_Not(m_VPValue(A)))) {
// not (not a) -> a
1483 if (match(A, m_Not(m_VPValue(A))))
1484 return Def->replaceAllUsesWith(A);
1485
1486 // Try to fold Not into compares by adjusting the predicate in-place.
1487 CmpPredicate Pred;
1488 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1489 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
// Only invert the predicate if every user of the compare can absorb the
// inversion (nots are dropped, selects swap their arms).
1490 if (all_of(Cmp->users(),
1492 m_Not(m_Specific(Cmp)),
1493 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1494 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1495 for (VPUser *U : to_vector(Cmp->users())) {
1496 auto *R = cast<VPSingleDefRecipe>(U);
1497 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1498 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1499 R->setOperand(1, Y);
1500 R->setOperand(2, X);
1501 } else {
1502 // not (cmp pred) -> cmp inv_pred
1503 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1504 R->replaceAllUsesWith(Cmp);
1505 }
1506 }
1507 // If Cmp doesn't have a debug location, use the one from the negation,
1508 // to preserve the location.
1509 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1510 Cmp->setDebugLoc(Def->getDebugLoc());
1511 }
1512 }
1513 }
1514
1515 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1516 // any-of (fcmp uno %A, %B), ...
1517 if (match(Def, m_AnyOf())) {
1519 VPRecipeBase *UnpairedCmp = nullptr;
// Pair up single-use self-unordered compares two at a time; any operand that
// does not match stays as-is.
1520 for (VPValue *Op : Def->operands()) {
1521 VPValue *X;
1522 if (Op->getNumUsers() > 1 ||
1524 m_Deferred(X)))) {
1525 NewOps.push_back(Op);
1526 } else if (!UnpairedCmp) {
1527 UnpairedCmp = Op->getDefiningRecipe();
1528 } else {
1529 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1530 UnpairedCmp->getOperand(0), X));
1531 UnpairedCmp = nullptr;
1532 }
1533 }
1534
// An odd compare left over keeps its original form.
1535 if (UnpairedCmp)
1536 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1537
// Only rebuild the any-of if the pairing actually reduced the operand count.
1538 if (NewOps.size() < Def->getNumOperands()) {
1539 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1540 return Def->replaceAllUsesWith(NewAnyOf);
1541 }
1542 }
1543
1544 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1545 // This is useful for fmax/fmin without fast-math flags, where we need to
1546 // check if any operand is NaN.
1547 if (CanCreateNewRecipe &&
1549 m_Deferred(X)),
1551 m_Deferred(Y))))) {
1552 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1553 return Def->replaceAllUsesWith(NewCmp);
1554 }
1555
1556 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1557 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1558 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1559 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1560 TypeInfo.inferScalarType(Def))
1561 return Def->replaceAllUsesWith(Def->getOperand(1));
1562
// Match elided at original line 1563; the body replaces Def with X (possibly
// truncated to Def's type) — verify the matched pattern against upstream.
1564 m_One()))) {
1565 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1566 if (TypeInfo.inferScalarType(X) != WideStepTy)
1567 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1568 Def->replaceAllUsesWith(X);
1569 return;
1570 }
1571
1572 // For i1 vp.merges produced by AnyOf reductions:
1573 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1575 m_VPValue(X), m_VPValue())) &&
1577 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1578 Def->setOperand(1, Def->getOperand(0));
1579 Def->setOperand(0, Y);
1580 return;
1581 }
1582
// A first-order recurrence phi whose two incoming values agree is redundant.
1583 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1584 if (Phi->getOperand(0) == Phi->getOperand(1))
1585 Phi->replaceAllUsesWith(Phi->getOperand(0));
1586 return;
1587 }
1588
1589 // Simplify MaskedCond with no block mask to its single operand.
1591 !cast<VPInstruction>(Def)->isMasked())
1592 return Def->replaceAllUsesWith(Def->getOperand(0));
1593
1594 // Look through ExtractLastLane.
1595 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
// ExtractLastLane of a BuildVector is just its last operand.
1596 if (match(A, m_BuildVector())) {
1597 auto *BuildVector = cast<VPInstruction>(A);
1598 Def->replaceAllUsesWith(
1599 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1600 return;
1601 }
// With only scalar VFs, the "last lane" is the value itself.
1602 if (Plan->hasScalarVFOnly())
1603 return Def->replaceAllUsesWith(A);
1604 }
1605
1606 // Look through ExtractPenultimateElement (BuildVector ....).
1608 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1609 Def->replaceAllUsesWith(
1610 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1611 return;
1612 }
1613
// Extract of a known constant lane from a BuildVector is that operand
// (match elided at original line 1615 — verify upstream).
1614 uint64_t Idx;
1616 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1617 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1618 return;
1619 }
1620
// A BuildVector of all-identical operands is a Broadcast.
1621 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1622 Def->replaceAllUsesWith(
1623 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1624 return;
1625 }
1626
1627 // Look through broadcast of single-scalar when used as select conditions; in
1628 // that case the scalar condition can be used directly.
1629 if (match(Def,
1632 "broadcast operand must be single-scalar");
1633 Def->setOperand(0, C);
1634 return;
1635 }
1636
// Single-operand case collapses to its only input (condition elided at
// original line 1637 — verify upstream).
1638 if (Def->getNumOperands() == 1)
1639 Def->replaceAllUsesWith(Def->getOperand(0));
1640 return;
1641 }
1642
// Single-operand recipe wrapping a plain IR value collapses to that value
// (second condition elided at original line 1645 — verify upstream).
1643 VPIRValue *IRV;
1644 if (Def->getNumOperands() == 1 &&
1646 return Def->replaceAllUsesWith(IRV);
1647
1648 // Some simplifications can only be applied after unrolling. Perform them
1649 // below.
1650 if (!Plan->isUnrolled())
1651 return;
1652
1653 // After unrolling, extract-lane may be used to extract values from multiple
1654 // scalar sources. Only simplify when extracting from a single scalar source.
1655 VPValue *LaneToExtract;
1656 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1657 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1659 return Def->replaceAllUsesWith(A);
1660
1661 // Simplify extract-lane with single source to extract-element.
1662 Def->replaceAllUsesWith(Builder.createNaryOp(
1663 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1664 return;
1665 }
1666
1667 // Hoist an invariant increment Y of a phi X, by having X start at Y.
1668 if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && isa<VPIRValue>(Y) &&
1669 isa<VPPhi>(X)) {
1670 auto *Phi = cast<VPPhi>(X);
// Only safe when the phi starts at zero and the add is its sole user.
1671 if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
1672 Phi->getSingleUser() == Def) {
1673 Phi->setOperand(0, Y);
1674 Def->replaceAllUsesWith(Phi);
1675 return;
1676 }
1677 }
1678
1679 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1680 // just the pointer operand.
1681 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1682 if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
1683 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1684
1685 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1686 // the start index is zero and only the first lane 0 is demanded.
1687 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1688 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1689 Steps->replaceAllUsesWith(Steps->getOperand(0));
1690 return;
1691 }
1692 }
1693 // Simplify redundant ReductionStartVector recipes after unrolling.
1694 VPValue *StartV;
1696 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
// In-loop reduction phis only need the scalar start value, not the vector.
1697 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1698 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1699 return PhiR && PhiR->isInLoop();
1700 });
1701 return;
1702 }
1703
// Match elided at original line 1704; the body forwards A — verify upstream.
1705 Def->replaceAllUsesWith(A);
1706 return;
1707 }
1708
// ExtractLastLane of a single-scalar source is the source itself, provided
// all of A's other users only use its scalar lanes (conditions partially
// elided at original lines 1710-1711 — verify upstream).
1709 if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
1712 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1713 all_of(A->users(),
1714 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1715 return Def->replaceAllUsesWith(A);
1716 }
1717
// With a single unrolled part, ExtractLastPart is the identity.
1718 if (Plan->getConcreteUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1719 return Def->replaceAllUsesWith(A);
1720}
1721
// NOTE(review): the function signature and the reverse-post-order traversal
// declaration (original lines 1722-1723), as well as the outer loop over the
// traversal (original line 1726), are elided in this listing; verify against
// upstream. The visible body applies simplifyRecipe to every single-def
// recipe in each visited block.
1724 Plan.getEntry());
1725 VPTypeAnalysis TypeInfo(Plan);
1727 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1728 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1729 simplifyRecipe(Def, TypeInfo);
1730 }
1731}
1732
1733/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1734/// header mask to be simplified further when tail folding, e.g. in
1735/// optimizeEVLMasks.
1736static void reassociateHeaderMask(VPlan &Plan) {
1737 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1738 if (!HeaderMask)
1739 return;
1740
// Seed the worklist with logical-ands whose LHS is the header mask.
// NOTE(review): the push into the worklist (original line 1744) is elided in
// this listing; verify against upstream.
1741 SmallVector<VPUser *> Worklist;
1742 for (VPUser *U : HeaderMask->users())
1743 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
1745
1746 while (!Worklist.empty()) {
1747 auto *R = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
1748 VPValue *X, *Y;
// Only rewrite ((headermask && x) && y); anything else is dropped.
1749 if (!R || !match(R, m_LogicalAnd(
1750 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
1751 m_VPValue(Y))))
1752 continue;
// Users of the rewritten and may themselves become reassociation candidates.
1753 append_range(Worklist, R->users());
1754 VPBuilder Builder(R);
1755 R->replaceAllUsesWith(
1756 Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
1757 }
1758}
1759
// NOTE(review): the function signature (original line 1760), the block
// traversal (original lines 1768-1769), the recipe-kind filter (original line
// 1771), the Extract creation (original line 1795), and parts of the
// broadcast predicate (original lines 1837-1839) are elided in this listing;
// verify against upstream before editing.
1761 if (Plan.hasScalarVFOnly())
1762 return;
1763
1764 // Try to narrow wide and replicating recipes to single scalar recipes,
1765 // based on VPlan analysis. Only process blocks in the loop region for now,
1766 // without traversing into nested regions, as recipes in replicate regions
1767 // cannot be converted yet.
1770 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1772 VPWidenStoreRecipe>(&R))
1773 continue;
1774 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
// Already-single-scalar or predicated replicates need no narrowing.
1775 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1776 continue;
1777
1778 // Convert an unmasked scatter with a uniform address into
1779 // extract-last-lane + scalar store.
1780 // TODO: Add a profitability check comparing the cost of a scatter vs.
1781 // extract + scalar store.
1782 auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
1783 if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
1784 !WidenStoreR->isConsecutive()) {
1785 assert(!WidenStoreR->isReverse() &&
1786 "Not consecutive memory recipes shouldn't be reversed");
1787 VPValue *Mask = WidenStoreR->getMask();
1788
1789 // Only convert the scatter to a scalar store if it is unmasked.
1790 // TODO: Support converting scatter masked by the header mask to scalar
1791 // store.
1792 if (Mask)
1793 continue;
1794
1796 {WidenStoreR->getOperand(1)});
1797 Extract->insertBefore(WidenStoreR);
1798
1799 // TODO: Sink the scalar store recipe to middle block if possible.
1800 auto *ScalarStore = new VPReplicateRecipe(
1801 &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
1802 true /*IsSingleScalar*/, nullptr /*Mask*/, {},
1803 *WidenStoreR /*Metadata*/);
1804 ScalarStore->insertBefore(WidenStoreR);
1805 WidenStoreR->eraseFromParent();
1806 continue;
1807 }
1808
// Narrow a replicated store with a single-scalar stored value to a single
// scalar store that extracts the last lane (and last part, if uniform).
1809 auto *RepOrWidenR = dyn_cast<VPRecipeWithIRFlags>(&R);
1810 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1811 vputils::isSingleScalar(RepR->getOperand(1))) {
1812 auto *Clone = new VPReplicateRecipe(
1813 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1814 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1815 *RepR /*Metadata*/, RepR->getDebugLoc());
1816 Clone->insertBefore(RepOrWidenR);
1817 VPBuilder Builder(Clone);
1818 VPValue *ExtractOp = Clone->getOperand(0);
1819 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1820 ExtractOp =
1821 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1822 ExtractOp =
1823 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1824 Clone->setOperand(0, ExtractOp);
1825 RepR->eraseFromParent();
1826 continue;
1827 }
1828
1829 // Skip recipes that aren't single scalars.
1830 if (!RepOrWidenR || !vputils::isSingleScalar(RepOrWidenR))
1831 continue;
1832
1833 // Predicate to check if a user of Op introduces extra broadcasts.
1834 auto IntroducesBCastOf = [](const VPValue *Op) {
1835 return [Op](const VPUser *U) {
1836 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1840 VPI->getOpcode()))
1841 return false;
1842 }
1843 return !U->usesScalars(Op);
1844 };
1845 };
1846
// Only narrow when doing so does not merely shift the broadcast cost from
// this recipe onto its operands.
1847 if (any_of(RepOrWidenR->users(), IntroducesBCastOf(RepOrWidenR)) &&
1848 none_of(RepOrWidenR->operands(), [&](VPValue *Op) {
1849 if (any_of(
1850 make_filter_range(Op->users(), not_equal_to(RepOrWidenR)),
1851 IntroducesBCastOf(Op)))
1852 return false;
1853 // Non-constant live-ins require broadcasts, while constants do not
1854 // need explicit broadcasts.
1855 auto *IRV = dyn_cast<VPIRValue>(Op);
1856 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1857 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1858 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1859 }))
1860 continue;
1861
// Replace the wide/replicated recipe with a single-scalar clone.
1862 auto *Clone = new VPReplicateRecipe(
1863 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1864 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1865 Clone->insertBefore(RepOrWidenR);
1866 RepOrWidenR->replaceAllUsesWith(Clone);
1867 if (isDeadRecipe(*RepOrWidenR))
1868 RepOrWidenR->eraseFromParent();
1869 }
1870 }
1871}
1872
1873/// Try to see if all of \p Blend's masks share a common value logically and'ed
1874/// and remove it from the masks.
// Nothing to strip from a normalized blend (its first incoming value carries
// no mask).
1876 if (Blend->isNormalized())
1877 return;
1878 VPValue *CommonEdgeMask;
// The first mask must be of the form (CommonEdgeMask && X); otherwise there
// is no candidate common value to remove.
1879 if (!match(Blend->getMask(0),
1880 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1881 return;
// Bail out unless every incoming mask is a logical-and with the very same
// CommonEdgeMask on the left-hand side.
1882 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1883 if (!match(Blend->getMask(I),
1884 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1885 return;
// All masks share the common value: replace each mask by the right-hand
// operand (operand 1) of its defining logical-and.
1886 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1887 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1))
1889
1890/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1891/// to make sure the masks are simplified.
1892static void simplifyBlends(VPlan &Plan) {
1895 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1896 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1897 if (!Blend)
1898 continue;
1899
// First strip any mask component common to all incoming edges.
1900 removeCommonBlendMask(Blend);
1901
1902 // Try to remove redundant blend recipes.
// Collect the distinct incoming values that can actually be selected
// (values guarded by a known-false mask can never be chosen).
1903 SmallPtrSet<VPValue *, 4> UniqueValues;
1904 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1905 UniqueValues.insert(Blend->getIncomingValue(0));
1906 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1907 if (!match(Blend->getMask(I), m_False()))
1908 UniqueValues.insert(Blend->getIncomingValue(I));
1909
// Only one selectable value: the blend is redundant, forward the value.
1910 if (UniqueValues.size() == 1) {
1911 Blend->replaceAllUsesWith(*UniqueValues.begin());
1912 Blend->eraseFromParent();
1913 continue;
1914 }
1915
1916 if (Blend->isNormalized())
1917 continue;
1918
1919 // Normalize the blend so its first incoming value is used as the initial
1920 // value with the others blended into it.
1921
1922 unsigned StartIndex = 0;
1923 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1924 // If a value's mask is used only by the blend then is can be deadcoded.
1925 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1926 // that's used by multiple blends where it can be removed from them all.
1927 VPValue *Mask = Blend->getMask(I);
1928 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1929 StartIndex = I;
1930 break;
1931 }
1932 }
1933
// Build the normalized operand list: start value first, then
// (incoming value, mask) pairs for every other edge.
1934 SmallVector<VPValue *, 4> OperandsWithMask;
1935 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1936
1937 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1938 if (I == StartIndex)
1939 continue;
1940 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1941 OperandsWithMask.push_back(Blend->getMask(I));
1942 }
1943
1944 auto *NewBlend =
1945 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1946 OperandsWithMask, *Blend, Blend->getDebugLoc());
1947 NewBlend->insertBefore(&R);
1948
// The start value's mask is no longer referenced by the new blend.
1949 VPValue *DeadMask = Blend->getMask(StartIndex);
1950 Blend->replaceAllUsesWith(NewBlend);
1951 Blend->eraseFromParent();
1953
1954 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1955 VPValue *NewMask;
1956 if (NewBlend->getNumOperands() == 3 &&
1957 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
// Swap the two incoming values and drop the negation from the mask.
1958 VPValue *Inc0 = NewBlend->getOperand(0);
1959 VPValue *Inc1 = NewBlend->getOperand(1);
1960 VPValue *OldMask = NewBlend->getOperand(2);
1961 NewBlend->setOperand(0, Inc1);
1962 NewBlend->setOperand(1, Inc0);
1963 NewBlend->setOperand(2, NewMask);
// Erase the now-unused Not instruction, if nothing else uses it.
1964 if (OldMask->getNumUsers() == 0)
1965 cast<VPInstruction>(OldMask)->eraseFromParent();
1966 }
1967 }
1968 }
1969}
1970
1971/// Optimize the width of vector induction variables in \p Plan based on a known
1972/// constant Trip Count, \p BestVF and \p BestUF.
// Returns true if any induction was narrowed.
1974 ElementCount BestVF,
1975 unsigned BestUF) {
1976 // Only proceed if we have not completely removed the vector region.
1977 if (!Plan.getVectorLoopRegion())
1978 return false;
1979
// Requires a compile-time constant trip count and a fixed (non-scalable) VF.
1980 const APInt *TC;
1981 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
1982 return false;
1983
1984 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1985 // and UF. Returns at least 8.
1986 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1987 APInt AlignedTC =
1990 APInt MaxVal = AlignedTC - 1;
1991 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1992 };
1993 unsigned NewBitWidth =
1994 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1995
1996 LLVMContext &Ctx = Plan.getContext();
1997 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1998
1999 bool MadeChange = false;
2000
2001 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2002 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2003 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2004
2005 // Currently only handle canonical IVs as it is trivial to replace the start
2006 // and stop values, and we currently only perform the optimization when the
2007 // IV has a single use.
2008 if (!WideIV || !WideIV->isCanonical() ||
2009 WideIV->hasMoreThanOneUniqueUser() ||
2010 NewIVTy == WideIV->getScalarType())
2011 continue;
2012
2013 // Currently only handle cases where the single user is a header-mask
2014 // comparison with the backedge-taken-count.
2015 VPUser *SingleUser = WideIV->getSingleUser();
2016 if (!SingleUser ||
2017 !match(SingleUser, m_ICmp(m_Specific(WideIV),
2020 continue;
2021
2022 // Update IV operands and comparison bound to use new narrower type.
2023 auto *NewStart = Plan.getZero(NewIVTy);
2024 WideIV->setStartValue(NewStart);
2025 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
2026 WideIV->setStepValue(NewStep);
2027
// Truncate the backedge-taken count to the narrower IV type; the truncate
// is emitted in the vector preheader so it is computed once.
2028 auto *NewBTC = new VPWidenCastRecipe(
2029 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
2030 nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
2031 Plan.getVectorPreheader()->appendRecipe(NewBTC);
2032 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
2033 Cmp->setOperand(1, NewBTC);
2034
2035 MadeChange = true;
2036 }
2037
2038 return MadeChange;
2039}
2040
2041/// Return true if \p Cond is known to be true for given \p BestVF and \p
2042/// BestUF.
2044 ElementCount BestVF, unsigned BestUF,
// Recurse into the condition's operands; the condition is true if any
// operand is provably true (matches an elided disjunction case above).
2047 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
2048 &PSE](VPValue *C) {
2049 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
2050 });
2051
// Only the canonical-IV-vs-vector-trip-count compare form is handled below.
2052 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
2054 m_Specific(CanIV->getBackedgeValue()),
2055 m_Specific(&Plan.getVectorTripCount()))))
2056 return false;
2057
2058 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
2059 // count is not conveniently available as SCEV so far, so we compare directly
2060 // against the original trip count. This is stricter than necessary, as we
2061 // will only return true if the trip count == vector trip count.
2062 const SCEV *VectorTripCount =
2064 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2065 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
2066 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2067 "Trip count SCEV must be computable");
// Use SCEV to prove trip count == VF * UF.
2068 ScalarEvolution &SE = *PSE.getSE();
2069 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2070 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2071 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
2072}
2073
2074/// Try to replace multiple active lane masks used for control flow with
2075/// a single, wide active lane mask instruction followed by multiple
2076/// extract subvector intrinsics. This applies to the active lane mask
2077/// instructions both in the loop and in the preheader.
2078/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
2079/// new extracts from the first active lane mask, which has it's last
2080/// operand (multiplier) set to UF.
2082 unsigned UF) {
// Widening only pays off for vector VFs with interleaving (UF > 1), and is
// gated behind the EnableWideActiveLaneMask option.
2083 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
2084 return false;
2085
2086 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2087 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2088 auto *Term = &ExitingVPBB->back();
2089
// The latch terminator must match the active-lane-mask controlled form
// (pattern partially elided here).
2090 using namespace llvm::VPlanPatternMatch;
2092 m_VPValue(), m_VPValue(), m_VPValue())))))
2093 return false;
2094
2095 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2096 LLVMContext &Ctx = Plan.getContext();
2097
// Helper: for each unroll part, emit a vector_extract of the wide mask at
// offset VF * Part, inserted right after the wide ALM.
2098 auto ExtractFromALM = [&](VPInstruction *ALM,
2099 SmallVectorImpl<VPValue *> &Extracts) {
2100 DebugLoc DL = ALM->getDebugLoc();
2101 for (unsigned Part = 0; Part < UF; ++Part) {
2103 Ops.append({ALM, Plan.getConstantInt(64, VF.getKnownMinValue() * Part)});
2104 auto *Ext =
2105 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
2106 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
2107 Extracts[Part] = Ext;
2108 Ext->insertAfter(ALM);
2109 }
2110 };
2111
2112 // Create a list of each active lane mask phi, ordered by unroll part.
2114 for (VPRecipeBase &R : Header->phis()) {
2116 if (!Phi)
2117 continue;
2118 VPValue *Index = nullptr;
2119 match(Phi->getBackedgeValue(),
2121 assert(Index && "Expected index from ActiveLaneMask instruction");
2122
// Derive the unroll part from the index computation feeding the mask.
2123 uint64_t Part;
2124 if (match(Index,
2126 m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
2127 Phis[Part] = Phi;
2128 else {
2129 // Anything other than a CanonicalIVIncrementForPart is part 0
2130 assert(!match(
2131 Index,
2133 Phis[0] = Phi;
2134 }
2135 }
2136
2137 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
2138 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2139
// Part 0's phi supplies the two ALM instructions to widen: the preheader
// (start) one and the in-loop (backedge) one.
2140 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
2141 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
2142
2143 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2144 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2145 "Expected incoming values of Phi to be ActiveLaneMasks");
2146
2147 // When using wide lane masks, the return type of the get.active.lane.mask
2148 // intrinsic is VF x UF (last operand).
2149 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
2150 EntryALM->setOperand(2, ALMMultiplier);
2151 LoopALM->setOperand(2, ALMMultiplier);
2152
2153 // Create UF x extract vectors and insert into preheader.
2154 SmallVector<VPValue *> EntryExtracts(UF);
2155 ExtractFromALM(EntryALM, EntryExtracts);
2156
2157 // Create UF x extract vectors and insert before the loop compare & branch,
2158 // updating the compare to use the first extract.
2159 SmallVector<VPValue *> LoopExtracts(UF);
2160 ExtractFromALM(LoopALM, LoopExtracts);
2161 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2162 Not->setOperand(0, LoopExtracts[0]);
2163
2164 // Update the incoming values of active lane mask phis.
2165 for (unsigned Part = 0; Part < UF; ++Part) {
2166 Phis[Part]->setStartValue(EntryExtracts[Part]);
2167 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2168 }
2169
2170 return true;
2171}
2172
2173/// Try to simplify the branch condition of \p Plan. This may restrict the
2174/// resulting plan to \p BestVF and \p BestUF.
// Returns true if the latch branch was folded to an always-taken exit
// (possibly removing the vector loop region entirely).
2176 unsigned BestUF,
2178 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2179 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2180 auto *Term = &ExitingVPBB->back();
2181 VPValue *Cond;
2182 if (match(Term,
2184 m_VPValue())) ||
2186 m_VPValue(), m_VPValue(), m_VPValue())))) {
2187 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2188 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2189 const SCEV *VectorTripCount =
2191 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2192 VectorTripCount =
2194 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2195 "Trip count SCEV must be computable");
// Prove VectorTC <= VF * UF via SCEV; otherwise the loop may iterate more
// than once and the branch cannot be folded.
2196 ScalarEvolution &SE = *PSE.getSE();
2197 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2198 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2199 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2200 return false;
2201 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2203 // For BranchOnCond, check if we can prove the condition to be true using VF
2204 // and UF.
2205 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2206 return false;
2207 } else {
2208 return false;
2209 }
2210
2211 // The vector loop region only executes once. If possible, completely remove
2212 // the region, otherwise replace the terminator controlling the latch with
2213 // (BranchOnCond true).
2214 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
2215 // support for other non-canonical widen induction recipes (e.g.,
2216 // VPWidenPointerInductionRecipe).
2217 // TODO: fold branch-on-constant after dissolving region.
2218 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
// Region removal is only legal when every header phi is of a kind whose
// single-iteration value can be materialized directly (see list below).
2219 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
2220 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
2221 return R->isCanonical();
2222 return isa<VPCanonicalIVPHIRecipe, VPCurrentIterationPHIRecipe,
2223 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
2224 })) {
// Replace each header phi with its single-iteration equivalent: a
// StepVector for canonical widened IVs, the first incoming value otherwise.
2225 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
2226 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
2227 VPBuilder Builder(Plan.getVectorPreheader());
2228 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
2229 R->getScalarType());
2230 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
2231 HeaderR.eraseFromParent();
2232 continue;
2233 }
2234 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
2235 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
2236 HeaderR.eraseFromParent();
2237 }
2238
// Dissolve the region: detach it from its predecessor/successors, clear
// the parent of its blocks, and splice the body into the surrounding CFG.
2239 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
2240 SmallVector<VPBlockBase *> Exits = to_vector(VectorRegion->getSuccessors());
2241 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
2242 for (VPBlockBase *Exit : Exits)
2243 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
2244
2245 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
2246 B->setParent(nullptr);
2247
2248 VPBlockUtils::connectBlocks(Preheader, Header);
2249
2250 for (VPBlockBase *Exit : Exits)
2251 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
2252
2253 // Replace terminating branch-on-two-conds with branch-on-cond to early
2254 // exit.
2255 if (Exits.size() != 1) {
2256 assert(match(Term, m_BranchOnTwoConds()) && Exits.size() == 2 &&
2257 "BranchOnTwoConds needs 2 remaining exits");
2259 Term->getOperand(0));
2260 }
2262 } else {
2263 // The vector region contains header phis for which we cannot remove the
2264 // loop region yet.
2265
2266 // For BranchOnTwoConds, set the latch exit condition to true directly.
2267 if (match(Term, m_BranchOnTwoConds())) {
2268 Term->setOperand(1, Plan.getTrue());
2269 return true;
2270 }
2271
// Otherwise append an unconditional-exit terminator (BranchOnCond true).
2272 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
2273 {}, {}, Term->getDebugLoc());
2274 ExitingVPBB->appendRecipe(BOC);
2275 }
2276
2277 Term->eraseFromParent();
2278
2279 return true;
2280}
2281
2282/// From the definition of llvm.experimental.get.vector.length,
2283/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
// NOTE(review): the function signature is elided in this listing; per the
// doc comment and body it replaces a provably-saturated EVL by its AVL and
// returns true on change.
2287 vp_depth_first_deep(Plan.getEntry()))) {
2288 for (VPRecipeBase &R : *VPBB) {
2289 VPValue *AVL;
2290 if (!match(&R, m_EVL(m_VPValue(AVL))))
2291 continue;
2292
// Use SCEV to prove AVL <= VF; bail out if the AVL is not analyzable.
2293 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2294 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2295 continue;
2296 ScalarEvolution &SE = *PSE.getSE();
2297 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2298 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2299 continue;
2300
// The EVL result is i32, so the AVL may need a truncating cast; attempt
// to constant-fold the truncate when all its operands are live-ins.
2302 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2303 R.getDebugLoc());
2304 if (Trunc != AVL) {
2305 auto *TruncR = cast<VPSingleDefRecipe>(Trunc);
2306 const DataLayout &DL = Plan.getDataLayout();
2307 VPTypeAnalysis TypeInfo(Plan);
2308 if (VPValue *Folded =
2309 tryToFoldLiveIns(*TruncR, TruncR->operands(), DL, TypeInfo))
2310 Trunc = Folded;
2311 }
2312 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2313 return true;
2314 }
2315 }
2316 return false;
2317}
2318
// Applies the VF/UF-specific simplifications above once the final VF and UF
// have been chosen, then pins the plan to BestVF. (Function name elided in
// this listing; presumably VPlanTransforms::optimizeForVFAndUF — confirm.)
2320 unsigned BestUF,
2322 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2323 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2324
2325 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2326 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2327 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2328 MadeChange |= simplifyKnownEVL(Plan, BestVF, PSE);
2329
// Only restrict the plan's VF when a transform actually relied on it.
2330 if (MadeChange) {
2331 Plan.setVF(BestVF);
2332 assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2333 }
2334}
2335
2336/// Sink users of \p FOR after the recipe defining the previous value \p
2337/// Previous of the recurrence. \returns true if all users of \p FOR could be
2338/// re-arranged as needed or false if it is not possible.
2339static bool
2341 VPRecipeBase *Previous,
2342 VPDominatorTree &VPDT) {
2343 // If Previous is a live-in (no defining recipe), it naturally dominates all
2344 // recipes in the loop, so no sinking is needed.
2345 if (!Previous)
2346 return true;
2347
2348 // Collect recipes that need sinking.
2351 Seen.insert(Previous);
// Returns false if SinkCandidate cannot legally be moved below Previous;
// otherwise records it in WorkList (unless it already dominates Previous
// or was seen before).
2352 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
2353 // The previous value must not depend on the users of the recurrence phi. In
2354 // that case, FOR is not a fixed order recurrence.
2355 if (SinkCandidate == Previous)
2356 return false;
2357
2358 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
2359 !Seen.insert(SinkCandidate).second ||
2360 VPDT.properlyDominates(Previous, SinkCandidate))
2361 return true;
2362
2363 if (cannotHoistOrSinkRecipe(*SinkCandidate))
2364 return false;
2365
2366 WorkList.push_back(SinkCandidate);
2367 return true;
2368 };
2369
2370 // Recursively sink users of FOR after Previous.
// Worklist is grown while iterating, giving a transitive traversal of all
// users of the values defined by recipes already enqueued.
2371 WorkList.push_back(FOR);
2372 for (unsigned I = 0; I != WorkList.size(); ++I) {
2373 VPRecipeBase *Current = WorkList[I];
2374 assert(Current->getNumDefinedValues() == 1 &&
2375 "only recipes with a single defined value expected");
2376
2377 for (VPUser *User : Current->getVPSingleValue()->users()) {
2378 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
2379 return false;
2380 }
2381 }
2382
2383 // Keep recipes to sink ordered by dominance so earlier instructions are
2384 // processed first.
2385 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2386 return VPDT.properlyDominates(A, B);
2387 });
2388
// Move each candidate (except FOR itself) just after the previously-moved
// one, preserving their relative dominance order below Previous.
2389 for (VPRecipeBase *SinkCandidate : WorkList) {
2390 if (SinkCandidate == FOR)
2391 continue;
2392
2393 SinkCandidate->moveAfter(Previous);
2394 Previous = SinkCandidate;
2395 }
2396 return true;
2397}
2398
2399/// Try to hoist \p Previous and its operands before all users of \p FOR.
// \returns true if no hoisting was required or all required recipes could be
// hoisted before HoistPoint; false if hoisting is not possible.
2401 VPRecipeBase *Previous,
2402 VPDominatorTree &VPDT) {
2403 if (cannotHoistOrSinkRecipe(*Previous))
2404 return false;
2405
2406 // Collect recipes that need hoisting.
2407 SmallVector<VPRecipeBase *> HoistCandidates;
2409 VPRecipeBase *HoistPoint = nullptr;
2410 // Find the closest hoist point by looking at all users of FOR and selecting
2411 // the recipe dominating all other users.
2412 for (VPUser *U : FOR->users()) {
2413 auto *R = cast<VPRecipeBase>(U);
2414 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
2415 HoistPoint = R;
2416 }
2417 assert(all_of(FOR->users(),
2418 [&VPDT, HoistPoint](VPUser *U) {
2419 auto *R = cast<VPRecipeBase>(U);
2420 return HoistPoint == R ||
2421 VPDT.properlyDominates(HoistPoint, R);
2422 }) &&
2423 "HoistPoint must dominate all users of FOR");
2424
// Returns the defining recipe of HoistCandidateV if (and only if) it still
// needs to be hoisted above HoistPoint; nullptr when no hoisting is needed
// (live-in, already visited, outside the loop region, header phi, or
// already dominating HoistPoint).
2425 auto NeedsHoisting = [HoistPoint, &VPDT,
2426 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2427 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2428 if (!HoistCandidate)
2429 return nullptr;
2430 VPRegionBlock *EnclosingLoopRegion =
2431 HoistCandidate->getParent()->getEnclosingLoopRegion();
2432 assert((!HoistCandidate->getRegion() ||
2433 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2434 "CFG in VPlan should still be flat, without replicate regions");
2435 // Hoist candidate was already visited, no need to hoist.
2436 if (!Visited.insert(HoistCandidate).second)
2437 return nullptr;
2438
2439 // Candidate is outside loop region or a header phi, dominates FOR users w/o
2440 // hoisting.
2441 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
2442 return nullptr;
2443
2444 // If we reached a recipe that dominates HoistPoint, we don't need to
2445 // hoist the recipe.
2446 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
2447 return nullptr;
2448 return HoistCandidate;
2449 };
2450
2451 if (!NeedsHoisting(Previous->getVPSingleValue()))
2452 return true;
2453
2454 // Recursively try to hoist Previous and its operands before all users of FOR.
2455 HoistCandidates.push_back(Previous);
2456
// Worklist traversal: for each candidate, enqueue any operand-defining
// recipe that must also move above HoistPoint.
2457 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2458 VPRecipeBase *Current = HoistCandidates[I];
2459 assert(Current->getNumDefinedValues() == 1 &&
2460 "only recipes with a single defined value expected");
2461 if (cannotHoistOrSinkRecipe(*Current))
2462 return false;
2463
2464 for (VPValue *Op : Current->operands()) {
2465 // If we reach FOR, it means the original Previous depends on some other
2466 // recurrence that in turn depends on FOR. If that is the case, we would
2467 // also need to hoist recipes involving the other FOR, which may break
2468 // dependencies.
2469 if (Op == FOR)
2470 return false;
2471
2472 if (auto *R = NeedsHoisting(Op)) {
2473 // Bail out if the recipe defines multiple values.
2474 // TODO: Hoisting such recipes requires additional handling.
2475 if (R->getNumDefinedValues() != 1)
2476 return false;
2477 HoistCandidates.push_back(R);
2478 }
2479 }
2480 }
2481
2482 // Order recipes to hoist by dominance so earlier instructions are processed
2483 // first.
2484 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2485 return VPDT.properlyDominates(A, B);
2486 });
2487
// Move the candidates, in dominance order, directly before HoistPoint so
// def-use order among them is preserved.
2488 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2489 HoistCandidate->moveBefore(*HoistPoint->getParent(),
2490 HoistPoint->getIterator());
2491 }
2492
2493 return true;
2494}
2495
// Rearranges recipes around each fixed-order recurrence phi (sinking users
// after, or hoisting the previous value before, the recurrence) and then
// materializes the splice combining previous- and current-iteration values.
// Returns false if any recurrence cannot be legalized. (Signature head is
// elided in this listing — presumably
// VPlanTransforms::adjustFixedOrderRecurrences; confirm against upstream.)
2497 VPBuilder &LoopBuilder) {
2498 VPDominatorTree VPDT(Plan);
2499 VPTypeAnalysis TypeInfo(Plan);
2500
// Collect the recurrence phis first: the loop below rewrites users, which
// would invalidate iteration over the header phis.
2502 for (VPRecipeBase &R :
2505 RecurrencePhis.push_back(FOR);
2506
2507 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2509 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2510 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2511 // to terminate.
2512 while (auto *PrevPhi =
2514 assert(PrevPhi->getParent() == FOR->getParent());
2515 assert(SeenPhis.insert(PrevPhi).second);
2516 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2517 }
2518
2519 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2520 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2521 return false;
2522
2523 // Introduce a recipe to combine the incoming and previous values of a
2524 // fixed-order recurrence.
2525 VPBasicBlock *InsertBlock =
2526 Previous ? Previous->getParent() : FOR->getParent();
2527 if (!Previous || isa<VPHeaderPHIRecipe>(Previous))
2528 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2529 else
2530 LoopBuilder.setInsertPoint(InsertBlock,
2531 std::next(Previous->getIterator()));
2532
2533 auto *RecurSplice =
2535 {FOR, FOR->getBackedgeValue()});
2536
2537 FOR->replaceAllUsesWith(RecurSplice);
2538 // Set the first operand of RecurSplice to FOR again, after replacing
2539 // all users.
2540 RecurSplice->setOperand(0, FOR);
2541
2542 // Check for users extracting at the penultimate active lane of the FOR.
2543 // If only a single lane is active in the current iteration, we need to
2544 // select the last element from the previous iteration (from the FOR phi
2545 // directly).
2546 for (VPUser *U : RecurSplice->users()) {
2548 m_Specific(RecurSplice))))
2549 continue;
2550
// Compute both candidates (penultimate lane of the current iteration and
// last lane of the previous iteration) and select based on whether
// LastActiveLane is zero.
2552 VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
2553 Type *Ty = TypeInfo.inferScalarType(LastActiveLane);
2554 VPValue *Zero = Plan.getConstantInt(Ty, 0);
2555 VPValue *One = Plan.getConstantInt(Ty, 1);
2556 VPValue *PenultimateIndex = B.createSub(LastActiveLane, One);
2557 VPValue *PenultimateLastIter =
2558 B.createNaryOp(VPInstruction::ExtractLane,
2559 {PenultimateIndex, FOR->getBackedgeValue()});
2560 VPValue *LastPrevIter =
2561 B.createNaryOp(VPInstruction::ExtractLastLane, FOR);
2562
2563 VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero);
2564 VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter);
2565 cast<VPInstruction>(U)->replaceAllUsesWith(Sel);
2566 }
2567 }
2568 return true;
2569}
2570
// For each reduction phi of an additive/multiplicative recurrence kind,
// drop poison-generating flags from all (recursively collected) users, since
// reordering such reductions could otherwise introduce poison. (Function
// signature is elided in this listing — confirm the name upstream.)
2572 for (VPRecipeBase &R :
2574 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2575 if (!PhiR)
2576 continue;
// Only the listed recurrence kinds are affected (fourth kind elided here).
2577 RecurKind RK = PhiR->getRecurrenceKind();
2578 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2580 continue;
2581
2582 for (VPUser *U : collectUsersRecursively(PhiR))
2583 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2584 RecWithFlags->dropPoisonGeneratingFlags();
2585 }
2586 }
2587}
2588
2589namespace {
// DenseMap traits used by cseRecipes below: hashing and equality are defined
// over a recipe's "underlying data" (recipe ID, opcode/intrinsic, GEP source
// element type, inferred scalar type, operands, and predicate if present) so
// that structurally identical recipes collide.
2590struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2591 static bool isSentinel(const VPSingleDefRecipe *Def) {
2592 return Def == getEmptyKey() || Def == getTombstoneKey();
2593 }
2594
2595 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2596 /// return that source element type.
2597 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2598 // All VPInstructions that lower to GEPs must have the i8 source element
2599 // type (as they are PtrAdds), so we omit it.
2601 .Case([](const VPReplicateRecipe *I) -> Type * {
2602 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2603 return GEP->getSourceElementType();
2604 return nullptr;
2605 })
2606 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2607 [](auto *I) { return I->getSourceElementType(); })
2608 .Default([](auto *) { return nullptr; });
2609 }
2610
2611 /// Returns true if recipe \p Def can be safely handed for CSE.
2612 static bool canHandle(const VPSingleDefRecipe *Def) {
2613 // We can extend the list of handled recipes in the future,
2614 // provided we account for the data embedded in them while checking for
2615 // equality or hashing.
2616 auto C = getOpcodeOrIntrinsicID(Def);
2617
2618 // The issue with (Insert|Extract)Value is that the index of the
2619 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2620 // VPlan.
2621 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2622 C->second == Instruction::ExtractValue)))
2623 return false;
2624
2625 // During CSE, we can only handle recipes that don't read from memory: if
2626 // they read from memory, there could be an intervening write to memory
2627 // before the next instance is CSE'd, leading to an incorrect result.
2628 return !Def->mayReadFromMemory();
2629 }
2630
2631 /// Hash the underlying data of \p Def.
2632 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2633 const VPlan *Plan = Def->getParent()->getPlan();
2634 VPTypeAnalysis TypeInfo(*Plan);
2635 hash_code Result = hash_combine(
2636 Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2637 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
// Fold the predicate into the hash when present, so compares with
// different predicates land in different buckets.
2639 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2640 if (RFlags->hasPredicate())
2641 return hash_combine(Result, RFlags->getPredicate());
2642 return Result;
2643 }
2644
2645 /// Check equality of underlying data of \p L and \p R.
2646 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2647 if (isSentinel(L) || isSentinel(R))
2648 return L == R;
2649 if (L->getVPRecipeID() != R->getVPRecipeID() ||
2651 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2653 !equal(L->operands(), R->operands()))
2654 return false;
2656 "must have valid opcode info for both recipes");
2657 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2658 if (LFlags->hasPredicate() &&
2659 LFlags->getPredicate() !=
2660 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2661 return false;
2662 // Recipes in replicate regions implicitly depend on predicate. If either
2663 // recipe is in a replicate region, only consider them equal if both have
2664 // the same parent.
2665 const VPRegionBlock *RegionL = L->getRegion();
2666 const VPRegionBlock *RegionR = R->getRegion();
2667 if (((RegionL && RegionL->isReplicator()) ||
2668 (RegionR && RegionR->isReplicator())) &&
2669 L->getParent() != R->getParent())
2670 return false;
// Finally, the inferred scalar result types must agree.
2671 const VPlan *Plan = L->getParent()->getPlan();
2672 VPTypeAnalysis TypeInfo(*Plan);
2673 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2674 }
2675};
2676} // end anonymous namespace
2677
2678/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2679/// Plan.
// Walks all blocks in depth-first order keeping a map of canonical recipes
// (keyed/compared via VPCSEDenseMapInfo); a recipe equal to an
// already-seen, dominating one is replaced by it.
2681 VPDominatorTree VPDT(Plan);
2683
2685 vp_depth_first_deep(Plan.getEntry()))) {
2686 for (VPRecipeBase &R : *VPBB) {
2687 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2688 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2689 continue;
2690 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2691 // V must dominate Def for a valid replacement.
2692 if (!VPDT.dominates(V->getParent(), VPBB))
2693 continue;
2694 // Only keep flags present on both V and Def.
2695 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2696 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2697 Def->replaceAllUsesWith(V);
2698 continue;
2699 }
// First occurrence of this expression: record it as the canonical recipe.
2700 CSEMap[Def] = Def;
2701 }
2702 }
2703}
2704
2705/// Move loop-invariant recipes out of the vector loop region in \p Plan.
2706static void licm(VPlan &Plan) {
2707 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2708
2709 // Hoist any loop invariant recipes from the vector loop region to the
2710 // preheader. Preform a shallow traversal of the vector loop region, to
2711 // exclude recipes in replicate regions. Since the top-level blocks in the
2712 // vector loop region are guaranteed to execute if the vector pre-header is,
2713 // we don't need to check speculation safety.
2714 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2715 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2716 "Expected vector prehader's successor to be the vector loop region");
2718 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2719 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2721 continue;
2722 if (any_of(R.operands(), [](VPValue *Op) {
2723 return !Op->isDefinedOutsideLoopRegions();
2724 }))
2725 continue;
2726 R.moveBefore(*Preheader, Preheader->end());
2727 }
2728 }
2729
2730#ifndef NDEBUG
2731 VPDominatorTree VPDT(Plan);
2732#endif
2733 // Sink recipes with no users inside the vector loop region if all users are
2734 // in the same exit block of the region.
2735 // TODO: Extend to sink recipes from inner loops.
2737 vp_post_order_shallow(LoopRegion->getEntry()))) {
2738 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
2740 continue;
2741
2742 // TODO: Support sinking VPReplicateRecipe after ensuring replicateByVF
2743 // handles sunk recipes correctly.
2744 if (isa<VPReplicateRecipe>(&R))
2745 continue;
2746
2747 // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2748 // support recipes with multiple defined values (e.g., interleaved loads).
2749 auto *Def = cast<VPSingleDefRecipe>(&R);
2750 // Skip recipes without users as we cannot determine a sink block.
2751 // TODO: Clone sinkable recipes without users to all exit blocks to reduce
2752 // their execution frequency.
2753 if (Def->getNumUsers() == 0)
2754 continue;
2755
2756 VPBasicBlock *SinkBB = nullptr;
2757 // Cannot sink the recipe if any user
2758 // * is defined in any loop region, or
2759 // * is a phi, or
2760 // * multiple users in different blocks.
2761 if (any_of(Def->users(), [&SinkBB](VPUser *U) {
2762 auto *UserR = cast<VPRecipeBase>(U);
2763 VPBasicBlock *Parent = UserR->getParent();
2764 // TODO: If the user is a PHI node, we should check the block of
2765 // incoming value. Support PHI node users if needed.
2766 if (UserR->isPhi() || Parent->getEnclosingLoopRegion())
2767 return true;
2768 // TODO: Support sinking when users are in multiple blocks.
2769 if (SinkBB && SinkBB != Parent)
2770 return true;
2771 SinkBB = Parent;
2772 return false;
2773 }))
2774 continue;
2775
2776 // Only sink to dedicated exit blocks of the loop region.
2777 if (SinkBB->getSinglePredecessor() != LoopRegion)
2778 continue;
2779
2780 // TODO: This will need to be a check instead of a assert after
2781 // conditional branches in vectorized loops are supported.
2782 assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2783 "Defining block must dominate sink block");
2784 // TODO: Clone the recipe if users are on multiple exit paths, instead of
2785 // just moving.
2786 Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
2787 }
2788 }
2789}
2790
// Narrow integer recipes to the bit widths recorded in \p MinBWs (as computed
// by the legality/cost analysis), truncating operands and zero-extending
// results so that the types of surrounding recipes remain valid.
// NOTE(review): the leading signature line is not visible here — presumably
// VPlanTransforms::truncateToMinimalBitwidths; confirm against the header.
    VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
  if (Plan.hasScalarVFOnly())
    return;
  // Keep track of created truncates, so they can be re-used. Note that we
  // cannot use RAUW after creating a new truncate, as this could make
  // other uses have different types for their operands, making them invalidly
  // typed.
  VPTypeAnalysis TypeInfo(Plan);
  VPBasicBlock *PH = Plan.getVectorPreheader();
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
        continue;

      VPValue *ResultVPV = R.getVPSingleValue();
      auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
      unsigned NewResSizeInBits = MinBWs.lookup(UI);
      // A zero lookup result means no minimal bit width was recorded for this
      // instruction, so it must keep its original type.
      if (!NewResSizeInBits)
        continue;

      // If the value wasn't vectorized, we must maintain the original scalar
      // type. Skip those here, after incrementing NumProcessedRecipes. Also
      // skip casts which do not need to be handled explicitly here, as
      // redundant casts will be removed during recipe simplification.
        continue;

      Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
      unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
      assert(OldResTy->isIntegerTy() && "only integer types supported");
      (void)OldResSizeInBits;

      auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);

      // Any wrapping introduced by shrinking this operation shouldn't be
      // considered undefined behavior. So, we can't unconditionally copy
      // arithmetic wrapping flags to VPW.
      if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
        VPW->dropPoisonGeneratingFlags();

      if (OldResSizeInBits != NewResSizeInBits &&
          !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
        // Extend result to original width.
        auto *Ext = new VPWidenCastRecipe(
            Instruction::ZExt, ResultVPV, OldResTy, nullptr,
            VPIRFlags::getDefaultFlags(Instruction::ZExt));
        Ext->insertAfter(&R);
        ResultVPV->replaceAllUsesWith(Ext);
        // Re-point the extend at the narrowed result after the RAUW above so
        // it does not end up using itself.
        Ext->setOperand(0, ResultVPV);
        assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
      } else {
        assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
               "Only ICmps should not need extending the result.");
      }

      assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
        continue;

      // Shrink operands by introducing truncates as needed.
      // Selects keep their (boolean) condition operand untouched.
      unsigned StartIdx =
          match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
      for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
        auto *Op = R.getOperand(Idx);
        unsigned OpSizeInBits =
        if (OpSizeInBits == NewResSizeInBits)
          continue;
        assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
        auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
        // Re-use a previously created truncate of the same operand.
        if (!IterIsEmpty) {
          R.setOperand(Idx, ProcessedIter->second);
          continue;
        }

        // Live-ins are truncated once in the preheader; loop-defined values
        // are truncated right before their use.
        VPBuilder Builder;
        if (isa<VPIRValue>(Op))
          Builder.setInsertPoint(PH);
        else
          Builder.setInsertPoint(&R);
        VPWidenCastRecipe *NewOp =
            Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
        ProcessedIter->second = NewOp;
        R.setOperand(Idx, NewOp);
      }

    }
  }
}
2884
2888 VPValue *Cond;
2889 // Skip blocks that are not terminated by BranchOnCond.
2890 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2891 continue;
2892
2893 assert(VPBB->getNumSuccessors() == 2 &&
2894 "Two successors expected for BranchOnCond");
2895 unsigned RemovedIdx;
2896 if (match(Cond, m_True()))
2897 RemovedIdx = 1;
2898 else if (match(Cond, m_False()))
2899 RemovedIdx = 0;
2900 else
2901 continue;
2902
2903 VPBasicBlock *RemovedSucc =
2904 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2905 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2906 "There must be a single edge between VPBB and its successor");
2907 // Values coming from VPBB into phi recipes of RemoveSucc are removed from
2908 // these recipes.
2909 for (VPRecipeBase &R : RemovedSucc->phis())
2910 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2911
2912 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2913 // automatically on VPlan destruction if it becomes unreachable.
2914 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2915 VPBB->back().eraseFromParent();
2916 }
2917}
2918
2940
2941// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2942// the loop terminator with a branch-on-cond recipe with the negated
2943// active-lane-mask as operand. Note that this turns the loop into an
2944// uncountable one. Only the existing terminator is replaced, all other existing
2945// recipes/users remain unchanged, except for poison-generating flags being
2946// dropped from the canonical IV increment. Return the created
2947// VPActiveLaneMaskPHIRecipe.
2948//
2949// The function adds the following recipes:
2950//
2951// vector.ph:
2952// %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
2953// %EntryALM = active-lane-mask %EntryInc, TC
2954//
2955// vector.body:
2956// ...
2957// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2958// ...
2959// %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
2960// %ALM = active-lane-mask %InLoopInc, TC
2961// %Negated = Not %ALM
2962// branch-on-cond %Negated
2963//
2966 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2967 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2968 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2969 VPValue *StartV = CanonicalIVPHI->getStartValue();
2970
2971 auto *CanonicalIVIncrement =
2972 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2973 // TODO: Check if dropping the flags is needed.
2974 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2975 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2976 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2977 // we have to take unrolling into account. Each part needs to start at
2978 // Part * VF
2979 auto *VecPreheader = Plan.getVectorPreheader();
2980 VPBuilder Builder(VecPreheader);
2981
2982 // Create the ActiveLaneMask instruction using the correct start values.
2983 VPValue *TC = Plan.getTripCount();
2984 VPValue *VF = &Plan.getVF();
2985
2986 auto *EntryIncrement = Builder.createOverflowingOp(
2987 VPInstruction::CanonicalIVIncrementForPart, {StartV, VF}, {false, false},
2988 DL, "index.part.next");
2989
2990 // Create the active lane mask instruction in the VPlan preheader.
2991 VPValue *ALMMultiplier =
2992 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2993 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2994 {EntryIncrement, TC, ALMMultiplier}, DL,
2995 "active.lane.mask.entry");
2996
2997 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2998 // preheader ActiveLaneMask instruction.
2999 auto *LaneMaskPhi =
3001 LaneMaskPhi->insertAfter(CanonicalIVPHI);
3002
3003 // Create the active lane mask for the next iteration of the loop before the
3004 // original terminator.
3005 VPRecipeBase *OriginalTerminator = EB->getTerminator();
3006 Builder.setInsertPoint(OriginalTerminator);
3007 auto *InLoopIncrement = Builder.createOverflowingOp(
3009 {CanonicalIVIncrement, &Plan.getVF()}, {false, false}, DL);
3010 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
3011 {InLoopIncrement, TC, ALMMultiplier}, DL,
3012 "active.lane.mask.next");
3013 LaneMaskPhi->addOperand(ALM);
3014
3015 // Replace the original terminator with BranchOnCond. We have to invert the
3016 // mask here because a true condition means jumping to the exit block.
3017 auto *NotMask = Builder.createNot(ALM, DL);
3018 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
3019 OriginalTerminator->eraseFromParent();
3020 return LaneMaskPhi;
3021}
3022
// Replace the plan's header mask with an active-lane-mask. When
// UseActiveLaneMaskForControlFlow is set, the loop terminator is also
// rewritten to branch on the negated mask (see
// addVPLaneMaskPhiAndUpdateExitBranch); otherwise a plain ActiveLaneMask
// based on the widened canonical IV is created.
// NOTE(review): the first signature line is not visible here — presumably
// VPlanTransforms::addActiveLaneMask(VPlan &Plan, ...); confirm in header.
                                        bool UseActiveLaneMaskForControlFlow) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  // NOTE(review): find_if arguments truncated in this view — expected to
  // locate the VPWidenCanonicalIVRecipe user of the canonical IV; confirm.
  auto *FoundWidenCanonicalIVUser = find_if(
  assert(FoundWidenCanonicalIVUser &&
         "Must have widened canonical IV when tail folding!");
  VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
  auto *WideCanonicalIV =
      cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
  VPSingleDefRecipe *LaneMask;
  if (UseActiveLaneMaskForControlFlow) {
    LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
  } else {
    // Insert the mask right after the widened canonical IV it depends on.
    VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
    VPValue *ALMMultiplier =
        Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
    LaneMask =
        B.createNaryOp(VPInstruction::ActiveLaneMask,
                       {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
                       nullptr, "active.lane.mask");
  }

  // Walk users of WideCanonicalIV and replace the header mask of the form
  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
  // removing the old one to ensure there is always only a single header mask.
  HeaderMask->replaceAllUsesWith(LaneMask);
  HeaderMask->eraseFromParent();
}
3052
/// Matcher that accepts either the mask \p In itself, or a logical-and of
/// \p In with another value. On a match of just \p In, \p Out is set to
/// nullptr; otherwise \p Out is bound to the other logical-and operand.
template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
  Op0_t In;
  Op1_t &Out;

  RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}

  template <typename OpTy> bool match(OpTy *V) const {
    if (m_Specific(In).match(V)) {
      Out = nullptr;
      return true;
    }
    return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
  }
};
3067
3068/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
3069/// Returns the remaining part \p Out if so, or nullptr otherwise.
3070template <typename Op0_t, typename Op1_t>
3071static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
3072 Op1_t &Out) {
3073 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
3074}
3075
3076/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
3077/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
3078/// recipe could be created.
3079/// \p HeaderMask Header Mask.
3080/// \p CurRecipe Recipe to be transform.
3081/// \p TypeInfo VPlan-based type analysis.
3082/// \p EVL The explicit vector length parameter of vector-predication
3083/// intrinsics.
3085 VPRecipeBase &CurRecipe,
3086 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
3087 VPlan *Plan = CurRecipe.getParent()->getPlan();
3088 DebugLoc DL = CurRecipe.getDebugLoc();
3089 VPValue *Addr, *Mask, *EndPtr;
3090
3091 /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
3092 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
3093 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
3094 EVLEndPtr->insertBefore(&CurRecipe);
3095 EVLEndPtr->setOperand(1, &EVL);
3096 return EVLEndPtr;
3097 };
3098
3099 if (match(&CurRecipe,
3100 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
3101 !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
3102 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
3103 EVL, Mask);
3104
3105 VPValue *ReversedVal;
3106 if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
3107 match(ReversedVal,
3108 m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
3109 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
3110 cast<VPWidenLoadRecipe>(ReversedVal)->isReverse()) {
3111 auto *LoadR = new VPWidenLoadEVLRecipe(
3112 *cast<VPWidenLoadRecipe>(ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask);
3113 LoadR->insertBefore(&CurRecipe);
3114 return new VPWidenIntrinsicRecipe(
3115 Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
3116 TypeInfo.inferScalarType(LoadR), {}, {}, DL);
3117 }
3118
3119 VPValue *StoredVal;
3120 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
3121 m_RemoveMask(HeaderMask, Mask))) &&
3122 !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
3123 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3124 StoredVal, EVL, Mask);
3125
3126 if (match(&CurRecipe,
3127 m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
3128 m_RemoveMask(HeaderMask, Mask))) &&
3129 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
3130 cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) {
3131 auto *NewReverse = new VPWidenIntrinsicRecipe(
3132 Intrinsic::experimental_vp_reverse,
3133 {ReversedVal, Plan->getTrue(), &EVL},
3134 TypeInfo.inferScalarType(ReversedVal), {}, {}, DL);
3135 NewReverse->insertBefore(&CurRecipe);
3136 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
3137 AdjustEndPtr(EndPtr), NewReverse, EVL,
3138 Mask);
3139 }
3140
3141 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
3142 if (Rdx->isConditional() &&
3143 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
3144 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
3145
3146 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
3147 if (Interleave->getMask() &&
3148 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
3149 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
3150
3151 VPValue *LHS, *RHS;
3152 if (match(&CurRecipe,
3153 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
3154 return new VPWidenIntrinsicRecipe(
3155 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
3156 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3157
3158 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
3159 m_VPValue(RHS))))
3160 return new VPWidenIntrinsicRecipe(
3161 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
3162 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3163
3164 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
3165 Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
3166 VPValue *ZExt = VPBuilder(&CurRecipe)
3168 &EVL, Ty, TypeInfo.inferScalarType(&EVL), DL);
3169 return new VPInstruction(
3170 Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, 1)},
3171 VPIRFlags::getDefaultFlags(Instruction::Sub), {}, DL);
3172 }
3173
3174 return nullptr;
3175}
3176
/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
/// The transforms here need to preserve the original semantics.
/// (EVL = Explicit Vector Length; the header mask has the canonical form
/// icmp ult step-vector, EVL after addExplicitVectorLength.)
  // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
  VPValue *HeaderMask = nullptr, *EVL = nullptr;
                  m_VPValue(EVL))) &&
        match(EVL, m_EVL(m_VPValue()))) {
      HeaderMask = R.getVPSingleValue();
      break;
    }
  }
  if (!HeaderMask)
    return;

  // TypeInfo must stay valid while recipes are rewritten; old recipes are
  // therefore collected and only erased at the very end.
  VPTypeAnalysis TypeInfo(Plan);
  SmallVector<VPRecipeBase *> OldRecipes;
  // First convert masked recipes (loads, stores, reductions, interleaves,
  // selects, ...) into their EVL-predicated variants.
  for (VPUser *U : collectUsersRecursively(HeaderMask)) {
    if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, TypeInfo, *EVL)) {
      NewR->insertBefore(R);
      for (auto [Old, New] :
           zip_equal(R->definedValues(), NewR->definedValues()))
        Old->replaceAllUsesWith(New);
      OldRecipes.push_back(R);
    }
  }

  // Replace remaining (HeaderMask && Mask) with vp.merge (True, Mask,
  // False, EVL)
  for (VPUser *U : collectUsersRecursively(HeaderMask)) {
    VPValue *Mask;
    if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) {
      auto *LogicalAnd = cast<VPInstruction>(U);
      auto *Merge = new VPWidenIntrinsicRecipe(
          Intrinsic::vp_merge, {Plan.getTrue(), Mask, Plan.getFalse(), EVL},
          TypeInfo.inferScalarType(Mask), {}, {}, LogicalAnd->getDebugLoc());
      Merge->insertBefore(LogicalAnd);
      LogicalAnd->replaceAllUsesWith(Merge);
      OldRecipes.push_back(LogicalAnd);
    }
  }

  // Erase old recipes at the end so we don't invalidate TypeInfo.
  for (VPRecipeBase *R : reverse(OldRecipes)) {
    // Operands may become dead once the recipe is gone; clean them up too.
    SmallVector<VPValue *> PossiblyDead(R->operands());
    R->eraseFromParent();
    for (VPValue *Op : PossiblyDead)
  }
}
3229
/// After replacing the canonical IV with an EVL-based IV, fixup recipes that
/// use VF to use the EVL instead to avoid incorrect updates on the penultimate
/// iteration.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
  VPTypeAnalysis TypeInfo(Plan);
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();

  // NOTE(review): the all_of predicate and replace-predicate lambda bodies
  // are truncated in this view — presumably they restrict the VF users that
  // may be rewritten to EVL; confirm against upstream.
  assert(all_of(Plan.getVF().users(),
         "User of VF that we can't transform to EVL.");
  Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
  });

  assert(all_of(Plan.getVFxUF().users(),
                [&LoopRegion, &Plan](VPUser *U) {
                  return match(U,
                               m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
                                       m_Specific(&Plan.getVFxUF()))) ||
                         isa<VPWidenPointerInductionRecipe>(U);
                }) &&
         "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
         "increment of the canonical induction.");
  Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
    // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
    // canonical induction must not be updated.
  });

  // Create a scalar phi to track the previous EVL if fixed-order recurrence is
  // contained.
  // NOTE(review): initializer truncated here — presumably scans the header
  // phis for VPFirstOrderRecurrencePHIRecipe; confirm.
  bool ContainsFORs =
  if (ContainsFORs) {
    // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
    VPValue *MaxEVL = &Plan.getVF();
    // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
    VPBuilder Builder(LoopRegion->getPreheaderVPBB());
    MaxEVL = Builder.createScalarZExtOrTrunc(
        MaxEVL, Type::getInt32Ty(Plan.getContext()),
        TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());

    Builder.setInsertPoint(Header, Header->getFirstNonPhi());
    VPValue *PrevEVL = Builder.createScalarPhi(
        {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");

      // Rewrite first-order-recurrence splices to @llvm.vp.splice using the
      // previous iteration's EVL and the current EVL.
      for (VPRecipeBase &R : *VPBB) {
        VPValue *V1, *V2;
        if (!match(&R,
                   m_VPValue(V1), m_VPValue(V2))))
          continue;
        VPValue *Imm = Plan.getOrAddLiveIn(
            Intrinsic::experimental_vp_splice,
            {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
            TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
            R.getDebugLoc());
        VPSplice->insertBefore(&R);
        R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
      }
    }
  }

  VPValue *HeaderMask = vputils::findHeaderMask(Plan);
  if (!HeaderMask)
    return;

  // Replace header masks with a mask equivalent to predicating by EVL:
  //
  // icmp ule widen-canonical-iv backedge-taken-count
  // ->
  // icmp ult step-vector, EVL
  VPRecipeBase *EVLR = EVL.getDefiningRecipe();
  VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
  Type *EVLType = TypeInfo.inferScalarType(&EVL);
  VPValue *EVLMask = Builder.createICmp(
      Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
  HeaderMask->replaceAllUsesWith(EVLMask);
}
3316
3317/// Converts a tail folded vector loop region to step by
3318/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3319/// iteration.
3320///
3321/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3322/// replaces all uses except the canonical IV increment of
3323/// VPCanonicalIVPHIRecipe with a VPCurrentIterationPHIRecipe.
3324/// VPCanonicalIVPHIRecipe is used only for loop iterations counting after
3325/// this transformation.
3326///
3327/// - The header mask is replaced with a header mask based on the EVL.
3328///
3329/// - Plans with FORs have a new phi added to keep track of the EVL of the
3330/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3331/// @llvm.vp.splice.
3332///
3333/// The function uses the following definitions:
3334/// %StartV is the canonical induction start value.
3335///
3336/// The function adds the following recipes:
3337///
3338/// vector.ph:
3339/// ...
3340///
3341/// vector.body:
3342/// ...
3343/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3344/// [ %NextIter, %vector.body ]
3345/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3346/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3347/// ...
3348/// %OpEVL = cast i32 %VPEVL to IVSize
3349/// %NextIter = add IVSize %OpEVL, %CurrentIter
3350/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3351/// ...
3352///
3353/// If MaxSafeElements is provided, the function adds the following recipes:
3354/// vector.ph:
3355/// ...
3356///
3357/// vector.body:
3358/// ...
3359/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3360/// [ %NextIter, %vector.body ]
3361/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3362/// %cmp = cmp ult %AVL, MaxSafeElements
3363/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3364/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3365/// ...
3366/// %OpEVL = cast i32 %VPEVL to IVSize
3367/// %NextIter = add IVSize %OpEVL, %CurrentIter
3368/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3369/// ...
3370///
3372 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3373 if (Plan.hasScalarVFOnly())
3374 return;
3375 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3376 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3377
3378 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
3379 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3380 VPValue *StartV = CanonicalIVPHI->getStartValue();
3381
3382 // Create the CurrentIteration recipe in the vector loop.
3383 auto *CurrentIteration =
3385 CurrentIteration->insertAfter(CanonicalIVPHI);
3386 VPBuilder Builder(Header, Header->getFirstNonPhi());
3387 // Create the AVL (application vector length), starting from TC -> 0 in steps
3388 // of EVL.
3389 VPPhi *AVLPhi = Builder.createScalarPhi(
3390 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3391 VPValue *AVL = AVLPhi;
3392
3393 if (MaxSafeElements) {
3394 // Support for MaxSafeDist for correct loop emission.
3395 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3396 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3397 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3398 "safe_avl");
3399 }
3400 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3401 DebugLoc::getUnknown(), "evl");
3402
3403 auto *CanonicalIVIncrement =
3404 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
3405 Builder.setInsertPoint(CanonicalIVIncrement);
3406 VPValue *OpVPEVL = VPEVL;
3407
3408 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3409 OpVPEVL = Builder.createScalarZExtOrTrunc(
3410 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3411
3412 auto *NextIter = Builder.createAdd(OpVPEVL, CurrentIteration,
3413 CanonicalIVIncrement->getDebugLoc(),
3414 "current.iteration.next",
3415 {CanonicalIVIncrement->hasNoUnsignedWrap(),
3416 CanonicalIVIncrement->hasNoSignedWrap()});
3417 CurrentIteration->addOperand(NextIter);
3418
3419 VPValue *NextAVL =
3420 Builder.createSub(AVLPhi, OpVPEVL, DebugLoc::getCompilerGenerated(),
3421 "avl.next", {/*NUW=*/true, /*NSW=*/false});
3422 AVLPhi->addOperand(NextAVL);
3423
3424 fixupVFUsersForEVL(Plan, *VPEVL);
3425 removeDeadRecipes(Plan);
3426
3427 // Replace all uses of VPCanonicalIVPHIRecipe by
3428 // VPCurrentIterationPHIRecipe except for the canonical IV increment.
3429 CanonicalIVPHI->replaceAllUsesWith(CurrentIteration);
3430 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
3431 // TODO: support unroll factor > 1.
3432 Plan.setUF(1);
3433}
3434
  // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
  // There should be only one VPCurrentIteration in the entire plan.
  VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;

    for (VPRecipeBase &R : VPBB->phis())
      if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
        assert(!CurrentIteration &&
               "Found multiple CurrentIteration. Only one expected");
        CurrentIteration = PhiR;
      }

  // Early return if it is not variable-length stepping.
  if (!CurrentIteration)
    return;

  VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
  VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();

  // Convert CurrentIteration to concrete recipe (a plain scalar phi).
  auto *ScalarR =
      VPBuilder(CurrentIteration)
              {CurrentIteration->getStartValue(), CurrentIterationIncr},
              CurrentIteration->getDebugLoc(), "current.iteration.iv");
  CurrentIteration->replaceAllUsesWith(ScalarR);
  CurrentIteration->eraseFromParent();

  // Replace CanonicalIVInc with CurrentIteration increment.
  // NOTE(review): assumes the canonical IV is the first phi in the header at
  // this point — confirm this invariant holds after earlier lowering.
  auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
  VPValue *Backedge = CanonicalIV->getIncomingValue(1);
  assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
                                 m_Specific(&Plan.getVFxUF()))) &&
         "Unexpected canonical iv");
  Backedge->replaceAllUsesWith(CurrentIterationIncr);

  // Remove unused phi and increment.
  VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
  CanonicalIVIncrement->eraseFromParent();
  CanonicalIV->eraseFromParent();
}
3478
  // Simplify the latch exit condition of an EVL tail-folded loop: exit when
  // the remaining AVL reaches zero, instead of comparing the canonical IV
  // increment against the vector trip count.
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  // The canonical IV may not exist at this stage.
  if (!LoopRegion ||
    return;
  VPCanonicalIVPHIRecipe *CanIV = LoopRegion->getCanonicalIV();
  if (std::next(CanIV->getIterator()) == CanIV->getParent()->end())
    return;
  // The EVL IV is always immediately after the canonical IV.
      std::next(CanIV->getIterator()));
  if (!EVLPhi)
    return;

  // Bail if not an EVL tail folded loop.
  VPValue *AVL;
  if (!match(EVLPhi->getBackedgeValue(),
             m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))))
    return;

  // The AVL may be capped to a safe distance.
  VPValue *SafeAVL;
  if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
    AVL = SafeAVL;

  // Locate the AVL backedge value (avl.next = trip-count - consumed lanes).
  VPValue *AVLNext;
  [[maybe_unused]] bool FoundAVLNext =
              m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
  assert(FoundAVLNext && "Didn't find AVL backedge?");

  VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
  auto *LatchBr = cast<VPInstruction>(Latch->getTerminator());
  // An unconditional (always-true) latch branch needs no rewriting.
  if (match(LatchBr, m_BranchOnCond(m_True())))
    return;

  assert(
      match(LatchBr,
            m_Specific(&Plan.getVectorTripCount())))) &&
      "Expected BranchOnCond with ICmp comparing CanIV increment with vector "
      "trip count");

  // Exit when the next AVL is zero, i.e. all remaining lanes were processed.
  Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
  VPBuilder Builder(LatchBr);
  LatchBr->setOperand(
      0, Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, Plan.getZero(AVLTy)));
}
3529
3531 VPlan &Plan, PredicatedScalarEvolution &PSE,
3532 const DenseMap<Value *, const SCEV *> &StridesMap) {
3533 // Replace VPValues for known constant strides guaranteed by predicate scalar
3534 // evolution.
3535 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3536 auto *R = cast<VPRecipeBase>(&U);
3537 return R->getRegion() ||
3538 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3539 };
3540 ValueToSCEVMapTy RewriteMap;
3541 for (const SCEV *Stride : StridesMap.values()) {
3542 using namespace SCEVPatternMatch;
3543 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3544 const APInt *StrideConst;
3545 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3546 // Only handle constant strides for now.
3547 continue;
3548
3549 auto *CI = Plan.getConstantInt(*StrideConst);
3550 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3551 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3552
3553 // The versioned value may not be used in the loop directly but through a
3554 // sext/zext. Add new live-ins in those cases.
3555 for (Value *U : StrideV->users()) {
3557 continue;
3558 VPValue *StrideVPV = Plan.getLiveIn(U);
3559 if (!StrideVPV)
3560 continue;
3561 unsigned BW = U->getType()->getScalarSizeInBits();
3562 APInt C =
3563 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3564 VPValue *CI = Plan.getConstantInt(C);
3565 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3566 }
3567 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3568 }
3569
3570 for (VPRecipeBase &R : *Plan.getEntry()) {
3571 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3572 if (!ExpSCEV)
3573 continue;
3574 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3575 auto *NewSCEV =
3576 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3577 if (NewSCEV != ScevExpr) {
3578 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3579 ExpSCEV->replaceAllUsesWith(NewExp);
3580 if (Plan.getTripCount() == ExpSCEV)
3581 Plan.resetTripCount(NewExp);
3582 }
3583 }
3584}
3585
// NOTE(review): the function's declaration line (3586) is elided in this dump,
// as are the worklist/visited-set declarations (3591/3593) and a couple of
// isa<> lines; the visible body drops poison-generating flags from recipes in
// the backward slice of addresses of predicated, consecutive wide memory
// accesses.
3587     VPlan &Plan,
3588     const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3589   // Collect recipes in the backward slice of `Root` that may generate a poison
3590   // value that is used after vectorization.
3592   auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3594     Worklist.push_back(Root);
3595
3596     // Traverse the backward slice of Root through its use-def chain.
3597     while (!Worklist.empty()) {
3598       VPRecipeBase *CurRec = Worklist.pop_back_val();
3599
3600       if (!Visited.insert(CurRec).second)
3601         continue;
3602
3603       // Prune search if we find another recipe generating a widen memory
3604       // instruction. Widen memory instructions involved in address computation
3605       // will lead to gather/scatter instructions, which don't need to be
3606       // handled.
3608                VPHeaderPHIRecipe>(CurRec))
3609         continue;
3610
3611       // This recipe contributes to the address computation of a widen
3612       // load/store. If the underlying instruction has poison-generating flags,
3613       // drop them directly.
3614       if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3615         VPValue *A, *B;
3616         // Dropping disjoint from an OR may yield incorrect results, as some
3617         // analysis may have converted it to an Add implicitly (e.g. SCEV used
3618         // for dependence analysis). Instead, replace it with an equivalent Add.
3619         // This is possible as all users of the disjoint OR only access lanes
3620         // where the operands are disjoint or poison otherwise.
3621         if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3622             RecWithFlags->isDisjoint()) {
3623           VPBuilder Builder(RecWithFlags);
3624           VPInstruction *New =
3625               Builder.createAdd(A, B, RecWithFlags->getDebugLoc());
3626           New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3627           RecWithFlags->replaceAllUsesWith(New);
3628           RecWithFlags->eraseFromParent();
3629           CurRec = New;
3630         } else
3631           RecWithFlags->dropPoisonGeneratingFlags();
3632       } else {
          // NOTE(review): the lines declaring Instr (3633-3634) are elided in
          // this dump; the assert below sanity-checks that no flag-bearing
          // instruction escapes the VPRecipeWithIRFlags case above.
3635         (void)Instr;
3636         assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3637                "found instruction with poison generating flags not covered by "
3638                "VPRecipeWithIRFlags");
3639       }
3640
3641       // Add new definitions to the worklist.
3642       for (VPValue *Operand : CurRec->operands())
3643         if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3644           Worklist.push_back(OpDef);
3645     }
3646   });
3647
3648   // Traverse all the recipes in the VPlan and collect the poison-generating
3649   // recipes in the backward slice starting at the address of a VPWidenRecipe or
3650   // VPInterleaveRecipe.
3651   auto Iter = vp_depth_first_deep(Plan.getEntry());
3653     for (VPRecipeBase &Recipe : *VPBB) {
3654       if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3655         Instruction &UnderlyingInstr = WidenRec->getIngredient();
3656         VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3657         if (AddrDef && WidenRec->isConsecutive() &&
3658             BlockNeedsPredication(UnderlyingInstr.getParent()))
3659           CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3660       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3661         VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3662         if (AddrDef) {
3663           // Check if any member of the interleave group needs predication.
3664           const InterleaveGroup<Instruction> *InterGroup =
3665               InterleaveRec->getInterleaveGroup();
3666           bool NeedPredication = false;
3667           for (int I = 0, NumMembers = InterGroup->getNumMembers();
3668                I < NumMembers; ++I) {
3669             Instruction *Member = InterGroup->getMember(I);
3670             if (Member)
3671               NeedPredication |= BlockNeedsPredication(Member->getParent());
3672           }
3673
3674           if (NeedPredication)
3675             CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3676         }
3677       }
3678     }
3679   }
3680 }
3681
// NOTE(review): the declaration lines (3682/3684) are elided in this dump; the
// visible body replaces the member-wise widen memory recipes of each relevant
// interleave group with a single VPInterleaveRecipe at the group's insert
// position.
3683     VPlan &Plan,
3685         &InterleaveGroups,
3686     VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3687   if (InterleaveGroups.empty())
3688     return;
3689
3690   // Interleave memory: for each Interleave Group we marked earlier as relevant
3691   // for this VPlan, replace the Recipes widening its memory instructions with a
3692   // single VPInterleaveRecipe at its insertion point.
3693   VPDominatorTree VPDT(Plan);
3694   for (const auto *IG : InterleaveGroups) {
3695     auto *Start =
3696         cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3697     VPIRMetadata InterleaveMD(*Start);
3698     SmallVector<VPValue *, 4> StoredValues;
3699     if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3700       StoredValues.push_back(StoreR->getStoredValue());
       // Collect stored values of the remaining members and intersect their
       // metadata with member zero's.
3701     for (unsigned I = 1; I < IG->getFactor(); ++I) {
3702       Instruction *MemberI = IG->getMember(I);
3703       if (!MemberI)
3704         continue;
3705       VPWidenMemoryRecipe *MemoryR =
3706           cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3707       if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3708         StoredValues.push_back(StoreR->getStoredValue());
3709       InterleaveMD.intersect(*MemoryR);
3710     }
3711
3712     bool NeedsMaskForGaps =
3713         (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3714         (!StoredValues.empty() && !IG->isFull());
3715
3716     Instruction *IRInsertPos = IG->getInsertPos();
3717     auto *InsertPos =
3718         cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3719
     // NOTE(review): the line declaring NW (3720, the no-wrap flags taken over
     // from the IR GEP) is elided in this dump.
3721     if (auto *Gep = dyn_cast<GetElementPtrInst>(
3722             getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3723       NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3724
3725     // Get or create the start address for the interleave group.
3726     VPValue *Addr = Start->getAddr();
3727     VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3728     if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3729       // We cannot re-use the address of member zero because it does not
3730       // dominate the insert position. Instead, use the address of the insert
3731       // position and create a PtrAdd adjusting it to the address of member
3732       // zero.
3733       // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3734       // InsertPos or sink loads above zero members to join it.
3735       assert(IG->getIndex(IRInsertPos) != 0 &&
3736              "index of insert position shouldn't be zero");
3737       auto &DL = IRInsertPos->getDataLayout();
3738       APInt Offset(32,
3739                    DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3740                        IG->getIndex(IRInsertPos),
3741                    /*IsSigned=*/true);
3742       VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3743       VPBuilder B(InsertPos);
3744       Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3745     }
3746     // If the group is reverse, adjust the index to refer to the last vector
3747     // lane instead of the first. We adjust the index from the first vector
3748     // lane, rather than directly getting the pointer for lane VF - 1, because
3749     // the pointer operand of the interleaved access is supposed to be uniform.
3750     if (IG->isReverse()) {
3751       auto *ReversePtr = new VPVectorEndPointerRecipe(
3752           Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3753           -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3754       ReversePtr->insertBefore(InsertPos);
3755       Addr = ReversePtr;
3756     }
3757     auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3758                                         InsertPos->getMask(), NeedsMaskForGaps,
3759                                         InterleaveMD, InsertPos->getDebugLoc());
3760     VPIG->insertBefore(InsertPos);
3761
     // Rewire users of each member to the corresponding result value of the
     // interleave recipe, then erase the now-dead member recipes.
3762     unsigned J = 0;
3763     for (unsigned i = 0; i < IG->getFactor(); ++i)
3764       if (Instruction *Member = IG->getMember(i)) {
3765         VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3766         if (!Member->getType()->isVoidTy()) {
3767           VPValue *OriginalV = MemberR->getVPSingleValue();
3768           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3769           J++;
3770         }
3771         MemberR->eraseFromParent();
3772       }
3773   }
3774 }
3775
3776/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3777/// value, phi and backedge value. In the following example:
3778///
3779/// vector.ph:
3780/// Successor(s): vector loop
3781///
3782/// <x1> vector loop: {
3783/// vector.body:
3784/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3785/// ...
3786/// EMIT branch-on-count ...
3787/// No successors
3788/// }
3789///
3790/// WIDEN-INDUCTION will get expanded to:
3791///
3792/// vector.ph:
3793/// ...
3794/// vp<%induction.start> = ...
3795/// vp<%induction.increment> = ...
3796///
3797/// Successor(s): vector loop
3798///
3799/// <x1> vector loop: {
3800/// vector.body:
3801/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3802/// ...
3803/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3804/// EMIT branch-on-count ...
3805/// No successors
3806/// }
3807 static void
// NOTE(review): the line naming this helper and its first parameter (3808) is
// elided in this dump; per the doc comment above this is
// expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, ...).
3809                               VPTypeAnalysis &TypeInfo) {
3810   VPlan *Plan = WidenIVR->getParent()->getPlan();
3811   VPValue *Start = WidenIVR->getStartValue();
3812   VPValue *Step = WidenIVR->getStepValue();
3813   VPValue *VF = WidenIVR->getVFValue();
3814   DebugLoc DL = WidenIVR->getDebugLoc();
3815
3816   // The value from the original loop to which we are mapping the new induction
3817   // variable.
3818   Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3819
3820   const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
     // NOTE(review): the declarations of AddOp/MulOp (3821-3822) are elided in
     // this dump; they select integer vs floating-point arithmetic below.
3823   VPIRFlags Flags = *WidenIVR;
3824   if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3825     AddOp = Instruction::Add;
3826     MulOp = Instruction::Mul;
3827   } else {
3828     AddOp = ID.getInductionOpcode();
3829     MulOp = Instruction::FMul;
3830   }
3831
3832   // If the phi is truncated, truncate the start and step values.
3833   VPBuilder Builder(Plan->getVectorPreheader());
3834   Type *StepTy = TypeInfo.inferScalarType(Step);
3835   if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3836     assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3837     Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3838     Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3839     // Truncation doesn't preserve WrapFlags.
3840     Flags.dropPoisonGeneratingFlags();
3841     StepTy = Ty;
3842   }
3843
3844   // Construct the initial value of the vector IV in the vector loop preheader.
3845   Type *IVIntTy =
     // NOTE(review): line 3846 (the integer-type initializer) is elided in this
     // dump.
3847   VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3848   if (StepTy->isFloatingPointTy())
3849     Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3850
3851   VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3852   VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3853
     // init = start + step-vector * step, broadcast across all lanes.
3854   Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3855   Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3856                               DebugLoc::getUnknown(), "induction");
3857
3858   // Create the widened phi of the vector IV.
3859   auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3860                                        WidenIVR->getDebugLoc(), "vec.ind");
3861   WidePHI->insertBefore(WidenIVR);
3862
3863   // Create the backedge value for the vector IV.
3864   VPValue *Inc;
3865   VPValue *Prev;
3866   // If unrolled, use the increment and prev value from the operands.
3867   if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3868     Inc = SplatVF;
3869     Prev = WidenIVR->getLastUnrolledPartOperand();
3870   } else {
3871     if (VPRecipeBase *R = VF->getDefiningRecipe())
3872       Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3873     // Multiply the vectorization factor by the step using integer or
3874     // floating-point arithmetic as appropriate.
3875     if (StepTy->isFloatingPointTy())
3876       VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3877                                     DL);
3878     else
3879       VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3880                                            TypeInfo.inferScalarType(VF), DL);
3881
3882     Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3883     Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3884     Prev = WidePHI;
3885   }
3886
     // NOTE(review): line 3887 (the ExitingBB lookup) is elided in this dump;
     // the backedge increment is emitted before the exiting block's terminator.
3888   Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3889   auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3890                                     WidenIVR->getDebugLoc(), "vec.ind.next");
3891
3892   WidePHI->addOperand(Next);
3893
3894   WidenIVR->replaceAllUsesWith(WidePHI);
3895 }
3896
3897/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3898/// initial value, phi and backedge value. In the following example:
3899///
3900/// <x1> vector loop: {
3901/// vector.body:
3902/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3903/// ...
3904/// EMIT branch-on-count ...
3905/// }
3906///
3907/// WIDEN-POINTER-INDUCTION will get expanded to:
3908///
3909/// <x1> vector loop: {
3910/// vector.body:
3911/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3912/// EMIT %mul = mul %stepvector, %step
3913/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3914/// ...
3915/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3916/// EMIT branch-on-count ...
3917/// }
// NOTE(review): the line naming this helper (3918) is elided in this dump; per
// the doc comment above this is
// expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, ...).
3919                                          VPTypeAnalysis &TypeInfo) {
3920   VPlan *Plan = R->getParent()->getPlan();
3921   VPValue *Start = R->getStartValue();
3922   VPValue *Step = R->getStepValue();
3923   VPValue *VF = R->getVFValue();
3924
3925   assert(R->getInductionDescriptor().getKind() ==
3927          "Not a pointer induction according to InductionDescriptor!");
3928   assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3929   assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3930          "Recipe should have been replaced");
3931
3932   VPBuilder Builder(R);
3933   DebugLoc DL = R->getDebugLoc();
3934
3935   // Build a scalar pointer phi.
3936   VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3937
3938   // Create actual address geps that use the pointer phi as base and a
3939   // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3940   Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3941   Type *StepTy = TypeInfo.inferScalarType(Step);
3942   VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3943   Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3944   VPValue *PtrAdd =
3945       Builder.createWidePtrAdd(ScalarPtrPhi, Offset, DL, "vector.gep");
3946   R->replaceAllUsesWith(PtrAdd);
3947
3948   // Create the backedge value for the scalar pointer phi.
     // NOTE(review): line 3949 (the ExitingBB lookup) is elided in this dump.
3950   Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3951   VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3952                                        DL);
3953   VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3954
3955   VPValue *InductionGEP =
3956       Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3957   ScalarPtrPhi->addOperand(InductionGEP);
3958 }
3959
// NOTE(review): the declaration line (3960) is elided in this dump; the body
// collects all non-replicator (loop) regions and dissolves each into plain
// CFG via dissolveToCFGLoop().
3961   // Replace loop regions with explicit CFG.
3962   SmallVector<VPRegionBlock *> LoopRegions;
3964        vp_depth_first_deep(Plan.getEntry()))) {
3965     if (!R->isReplicator())
3966       LoopRegions.push_back(R);
3967   }
     // Dissolve after collecting, so the traversal above is not invalidated.
3968   for (VPRegionBlock *R : LoopRegions)
3969     R->dissolveToCFGLoop();
3970 }
3971
// NOTE(review): the declaration lines (3972-3973) are elided in this dump; the
// body expands each BranchOnTwoConds terminator into two single-condition
// branches connected through a new interim block.
3974   // The transform runs after dissolving loop regions, so all VPBasicBlocks
3975   // terminated with BranchOnTwoConds are reached via a shallow traversal.
3978     if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
3979       WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
3980   }
3981
3982   // Expand BranchOnTwoConds instructions into explicit CFG with two new
3983   // single-condition branches:
3984   // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
3985   //    the first condition is true, and otherwise jumps to a new interim block.
3986   // 2. A branch that ends the interim block, jumps to the second successor if
3987   //    the second condition is true, and otherwise jumps to the third
3988   //    successor.
3989   for (VPInstruction *Br : WorkList) {
3990     assert(Br->getNumOperands() == 2 &&
3991            "BranchOnTwoConds must have exactly 2 conditions");
3992     DebugLoc DL = Br->getDebugLoc();
3993     VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
3994     const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
3995     assert(Successors.size() == 3 &&
3996            "BranchOnTwoConds must have exactly 3 successors");
3997
3998     for (VPBlockBase *Succ : Successors)
3999       VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
4000
4001     VPValue *Cond0 = Br->getOperand(0);
4002     VPValue *Cond1 = Br->getOperand(1);
4003     VPBlockBase *Succ0 = Successors[0];
4004     VPBlockBase *Succ1 = Successors[1];
4005     VPBlockBase *Succ2 = Successors[2];
4006     assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
4007            !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
4008
4009     VPBasicBlock *InterimBB =
4010         Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
4011
     // NOTE(review): the lines emitting the two BranchOnCond instructions
     // (4013 and 4017) are elided in this dump.
4012     VPBuilder(BrOnTwoCondsBB)
4014     VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
4015     VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
4016
4018     VPBlockUtils::connectBlocks(InterimBB, Succ1);
4019     VPBlockUtils::connectBlocks(InterimBB, Succ2);
4020     Br->eraseFromParent();
4021   }
4022 }
4023
// NOTE(review): the declaration line (4024) and the ToRemove/loop-header lines
// (4026-4027) are elided in this dump; the visible body lowers abstract
// recipes (wide inductions, blends, expressions, LastActiveLane, MaskedCond,
// BranchOnCount, WideIVStep) into concrete, executable recipes.
4025   VPTypeAnalysis TypeInfo(Plan);
4028            vp_depth_first_deep(Plan.getEntry()))) {
4029     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4030       if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
4031         expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
4032         ToRemove.push_back(WidenIVR);
4033         continue;
4034       }
4035
4036       if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
4037         // If the recipe only generates scalars, scalarize it instead of
4038         // expanding it.
4039         if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
4040           VPBuilder Builder(WidenIVR);
4041           VPValue *PtrAdd =
4042               scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
4043           WidenIVR->replaceAllUsesWith(PtrAdd);
4044           ToRemove.push_back(WidenIVR);
4045           continue;
4046         }
4047         expandVPWidenPointerInduction(WidenIVR, TypeInfo);
4048         ToRemove.push_back(WidenIVR);
4049         continue;
4050       }
4051
4052       // Expand VPBlendRecipe into VPInstruction::Select.
4053       VPBuilder Builder(&R);
4054       if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
         // Fold the incoming values into a chain of selects, later masks
         // taking precedence over earlier ones.
4055         VPValue *Select = Blend->getIncomingValue(0);
4056         for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
4057           Select = Builder.createSelect(Blend->getMask(I),
4058                                         Blend->getIncomingValue(I), Select,
4059                                         R.getDebugLoc(), "predphi", *Blend);
4060         Blend->replaceAllUsesWith(Select);
4061         ToRemove.push_back(Blend);
4062       }
4063
4064       if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
4065         if (!VEPR->getOffset()) {
4066           assert(Plan.getConcreteUF() == 1 &&
4067                  "Expected unroller to have materialized offset for UF != 1");
4068           VEPR->materializeOffset();
4069         }
4070       }
4071
4072       if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
4073         Expr->decompose();
4074         ToRemove.push_back(Expr);
4075       }
4076
4077       // Expand LastActiveLane into Not + FirstActiveLane + Sub.
4078       auto *LastActiveL = dyn_cast<VPInstruction>(&R);
4079       if (LastActiveL &&
4080           LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
4081         // Create Not(Mask) for all operands.
         // NOTE(review): the NotMasks declaration (4082) is elided in this
         // dump.
4083         for (VPValue *Op : LastActiveL->operands()) {
4084           VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
4085           NotMasks.push_back(NotMask);
4086         }
4087
4088         // Create FirstActiveLane on the inverted masks.
4089         VPValue *FirstInactiveLane = Builder.createNaryOp(
4091             LastActiveL->getDebugLoc(), "first.inactive.lane");
4092
4093         // Subtract 1 to get the last active lane.
4094         VPValue *One =
4095             Plan.getConstantInt(TypeInfo.inferScalarType(FirstInactiveLane), 1);
4096         VPValue *LastLane =
4097             Builder.createSub(FirstInactiveLane, One,
4098                               LastActiveL->getDebugLoc(), "last.active.lane");
4099
4100         LastActiveL->replaceAllUsesWith(LastLane);
4101         ToRemove.push_back(LastActiveL);
4102         continue;
4103       }
4104
4105       // Lower MaskedCond with block mask to LogicalAnd.
       // NOTE(review): the matching if-line (4106) is elided in this dump.
4107         auto *VPI = cast<VPInstruction>(&R);
4108         assert(VPI->isMasked() &&
4109                "Unmasked MaskedCond should be simplified earlier");
4110         VPI->replaceAllUsesWith(Builder.createNaryOp(
4111             VPInstruction::LogicalAnd, {VPI->getMask(), VPI->getOperand(0)}));
4112         ToRemove.push_back(VPI);
4113         continue;
4114       }
4115
4116       // Lower BranchOnCount to ICmp + BranchOnCond.
4117       VPValue *IV, *TC;
4118       if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
4119         auto *BranchOnCountInst = cast<VPInstruction>(&R);
4120         DebugLoc DL = BranchOnCountInst->getDebugLoc();
4121         VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
4122         Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
4123         ToRemove.push_back(BranchOnCountInst);
4124         continue;
4125       }
4126
4127       VPValue *VectorStep;
4128       VPValue *ScalarStep;
4130                     m_VPValue(VectorStep), m_VPValue(ScalarStep))))
4131         continue;
4132
4133       // Expand WideIVStep.
4134       auto *VPI = cast<VPInstruction>(&R);
4135       Type *IVTy = TypeInfo.inferScalarType(VPI);
4136       if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
4138                                     ? Instruction::UIToFP
4139                                     : Instruction::Trunc;
4140         VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
4141       }
4142
4143       assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
4144       if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
4145         ScalarStep =
4146             Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
4147       }
4148
4149       VPIRFlags Flags;
4150       unsigned MulOpc;
4151       if (IVTy->isFloatingPointTy()) {
4152         MulOpc = Instruction::FMul;
4153         Flags = VPI->getFastMathFlags();
4154       } else {
4155         MulOpc = Instruction::Mul;
4156         Flags = VPIRFlags::getDefaultFlags(MulOpc);
4157       }
4158
4159       VPInstruction *Mul = Builder.createNaryOp(
4160           MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
4161       VectorStep = Mul;
4162       VPI->replaceAllUsesWith(VectorStep);
4163       ToRemove.push_back(VPI);
4164     }
4165   }
4166
4167   for (VPRecipeBase *R : ToRemove)
4168     R->eraseFromParent();
4169 }
4170
// NOTE(review): the declaration line (4171) is elided in this dump; the
// visible parameters and body collect the plan's uncountable early exits,
// combine their conditions into an AnyOf latch check, and route taken exits
// through per-exit vector.early.exit blocks and a dispatch chain.
4172                                           VPBasicBlock *HeaderVPBB,
4173                                           VPBasicBlock *LatchVPBB,
4174                                           VPBasicBlock *MiddleVPBB,
4175                                           UncountableExitStyle Style) {
4176   struct EarlyExitInfo {
4177     VPBasicBlock *EarlyExitingVPBB;
4178     VPIRBasicBlock *EarlyExitVPBB;
4179     VPValue *CondToExit;
4180   };
4181
4182   VPDominatorTree VPDT(Plan);
4183   VPBuilder Builder(LatchVPBB->getTerminator());
     // NOTE(review): the Exits vector declaration (4184) is elided in this
     // dump.
4185   for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
4186     for (VPBlockBase *Pred : to_vector(ExitBlock->getPredecessors())) {
4187       if (Pred == MiddleVPBB)
4188         continue;
4189       // Collect condition for this early exit.
4190       auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
4191       VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
4192       VPValue *CondOfEarlyExitingVPBB;
4193       [[maybe_unused]] bool Matched =
4194           match(EarlyExitingVPBB->getTerminator(),
4195                 m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
4196       assert(Matched && "Terminator must be BranchOnCond");
4197
4198       // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
4199       // the correct block mask.
4200       VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
4201       auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
         // NOTE(review): the opcode line (4202) is elided in this dump; the
         // condition is negated when the exit is on the false successor.
4203           TrueSucc == ExitBlock
4204               ? CondOfEarlyExitingVPBB
4205               : EarlyExitingBuilder.createNot(CondOfEarlyExitingVPBB));
4206       assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
4207               !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
4208               VPDT.properlyDominates(
4209                   CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
4210                   LatchVPBB)) &&
4211              "exit condition must dominate the latch");
4212       Exits.push_back({
4213           EarlyExitingVPBB,
4214           ExitBlock,
4215           CondToEarlyExit,
4216       });
4217     }
4218   }
4219
4220   assert(!Exits.empty() && "must have at least one early exit");
4221   // Sort exits by RPO order to get correct program order. RPO gives a
4222   // topological ordering of the CFG, ensuring upstream exits are checked
4223   // before downstream exits in the dispatch chain.
     // NOTE(review): the RPOT/RPOIdx declaration lines (4224/4226) are elided
     // in this dump.
4225       HeaderVPBB);
4227   for (const auto &[Num, VPB] : enumerate(RPOT))
4228     RPOIdx[VPB] = Num;
4229   llvm::sort(Exits, [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
4230     return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
4231   });
4232 #ifndef NDEBUG
4233   // After RPO sorting, verify that for any pair where one exit dominates
4234   // another, the dominating exit comes first. This is guaranteed by RPO
4235   // (topological order) and is required for the dispatch chain correctness.
4236   for (unsigned I = 0; I + 1 < Exits.size(); ++I)
4237     for (unsigned J = I + 1; J < Exits.size(); ++J)
4238       assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
4239                                      Exits[I].EarlyExitingVPBB) &&
4240              "RPO sort must place dominating exits before dominated ones");
4241 #endif
4242
4243   // Build the AnyOf condition for the latch terminator using logical OR
4244   // to avoid poison propagation from later exit conditions when an earlier
4245   // exit is taken.
4246   VPValue *Combined = Exits[0].CondToExit;
4247   for (const EarlyExitInfo &Info : drop_begin(Exits))
4248     Combined = Builder.createLogicalOr(Combined, Info.CondToExit);
4249
4250   VPValue *IsAnyExitTaken =
4251       Builder.createNaryOp(VPInstruction::AnyOf, {Combined});
4252
     // NOTE(review): the assert condition line (4253, presumably checking the
     // Style parameter) is elided in this dump.
4254          "Early exit store masking not implemented");
4255
4256   // Create the vector.early.exit blocks.
4257   SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
4258   for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
4259     Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
4260     VPBasicBlock *VectorEarlyExitVPBB =
4261         Plan.createVPBasicBlock("vector.early.exit" + BlockSuffix);
4262     VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
4263   }
4264
4265   // Create the dispatch block (or reuse the single exit block if only one
4266   // exit). The dispatch block computes the first active lane of the combined
4267   // condition and, for multiple exits, chains through conditions to determine
4268   // which exit to take.
4269   VPBasicBlock *DispatchVPBB =
4270       Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
4271                         : Plan.createVPBasicBlock("vector.early.exit.check");
4272   VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
4273   VPValue *FirstActiveLane =
4274       DispatchBuilder.createNaryOp(VPInstruction::FirstActiveLane, {Combined},
4275                                    DebugLoc::getUnknown(), "first.active.lane");
4276
4277   // For each early exit, disconnect the original exiting block
4278   // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
4279   // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
4280   // values at the first active lane:
4281   //
4282   // Input:
4283   // early.exiting.I:
4284   //   ...
4285   //   EMIT branch-on-cond vp<%cond.I>
4286   // Successor(s): in.loop.succ, ir-bb<exit.I>
4287   //
4288   // ir-bb<exit.I>:
4289   //   IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
4290   //
4291   // Output:
4292   // early.exiting.I:
4293   //   ...
4294   // Successor(s): in.loop.succ
4295   //
4296   // vector.early.exit.I:
4297   //   EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
4298   // Successor(s): ir-bb<exit.I>
4299   //
4300   // ir-bb<exit.I>:
4301   //   IR %phi = phi ... (extra operand: vp<%exit.val> from
4302   //   vector.early.exit.I)
4303   //
4304   for (auto [Exit, VectorEarlyExitVPBB] :
4305        zip_equal(Exits, VectorEarlyExitVPBBs)) {
4306     auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
4307     // Adjust the phi nodes in EarlyExitVPBB.
4308     // 1. remove incoming values from EarlyExitingVPBB,
4309     // 2. extract the incoming value at FirstActiveLane
4310     // 3. add back the extracts as last operands for the phis
4311     // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
4312     // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
4313     // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
4314     // values from VectorEarlyExitVPBB.
4315     for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
4316       auto *ExitIRI = cast<VPIRPhi>(&R);
4317       VPValue *IncomingVal =
4318           ExitIRI->getIncomingValueForBlock(EarlyExitingVPBB);
4319       VPValue *NewIncoming = IncomingVal;
4320       if (!isa<VPIRValue>(IncomingVal)) {
4321         VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
4322         NewIncoming = EarlyExitBuilder.createNaryOp(
4323             VPInstruction::ExtractLane, {FirstActiveLane, IncomingVal},
4324             DebugLoc::getUnknown(), "early.exit.value");
4325       }
4326       ExitIRI->removeIncomingValueFor(EarlyExitingVPBB);
4327       ExitIRI->addOperand(NewIncoming);
4328     }
4329
4330     EarlyExitingVPBB->getTerminator()->eraseFromParent();
4331     VPBlockUtils::disconnectBlocks(EarlyExitingVPBB, EarlyExitVPBB);
4332     VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
4333   }
4334
4335   // Chain through exits: for each exit, check if its condition is true at
4336   // the first active lane. If so, take that exit; otherwise, try the next.
4337   // The last exit needs no check since it must be taken if all others fail.
4338   //
4339   // For 3 exits (cond.0, cond.1, cond.2), this creates:
4340   //
4341   // latch:
4342   //   ...
4343   //   EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
4344   //   ...
4345   //
4346   // vector.early.exit.check:
4347   //   EMIT vp<%first.lane> = first-active-lane vp<%combined>
4348   //   EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
4349   //   EMIT branch-on-cond vp<%at.cond.0>
4350   // Successor(s): vector.early.exit.0, vector.early.exit.check.0
4351   //
4352   // vector.early.exit.check.0:
4353   //   EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
4354   //   EMIT branch-on-cond vp<%at.cond.1>
4355   // Successor(s): vector.early.exit.1, vector.early.exit.2
4356   VPBasicBlock *CurrentBB = DispatchVPBB;
4357   for (auto [I, Exit] : enumerate(ArrayRef(Exits).drop_back())) {
4358     VPValue *LaneVal = DispatchBuilder.createNaryOp(
4359         VPInstruction::ExtractLane, {FirstActiveLane, Exit.CondToExit},
4360         DebugLoc::getUnknown(), "exit.cond.at.lane");
4361
4362     // For the last dispatch, branch directly to the last exit on false;
4363     // otherwise, create a new check block.
4364     bool IsLastDispatch = (I + 2 == Exits.size());
4365     VPBasicBlock *FalseBB =
4366         IsLastDispatch ? VectorEarlyExitVPBBs.back()
4367                        : Plan.createVPBasicBlock(
4368                              Twine("vector.early.exit.check.") + Twine(I));
4369
4370     DispatchBuilder.createNaryOp(VPInstruction::BranchOnCond, {LaneVal});
4371     CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
4372     VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
4373     FalseBB->setPredecessors({CurrentBB});
4374
4375     CurrentBB = FalseBB;
4376     DispatchBuilder.setInsertPoint(CurrentBB);
4377   }
4378
4379   // Replace the latch terminator with the new branching logic.
4380   auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4381   assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
4382          "Unexpected terminator");
4383   auto *IsLatchExitTaken =
4384       Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
4385                          LatchExitingBranch->getOperand(1));
4386
4387   DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4388   LatchExitingBranch->eraseFromParent();
4389   Builder.setInsertPoint(LatchVPBB);
4390   Builder.createNaryOp(VPInstruction::BranchOnTwoConds,
4391                        {IsAnyExitTaken, IsLatchExitTaken}, LatchDL);
4392   LatchVPBB->clearSuccessors();
4393   LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
4394   DispatchVPBB->setPredecessors({LatchVPBB});
4395 }
4396
4397 /// This function tries to convert extended in-loop reductions to
4398 /// VPExpressionRecipe and clamp the \p Range if it is beneficial and
4399 /// valid. The created recipe must be decomposed to its constituent
4400 /// recipes before execution.
4401 static VPExpressionRecipe *
// NOTE(review): the line naming this helper and its leading parameters (4402)
// is elided in this dump; per the doc comment above it matches reduce(ext(A))
// and bundles it into a VPExpressionRecipe when profitable.
4403                                     VFRange &Range) {
4404   Type *RedTy = Ctx.Types.inferScalarType(Red);
4405   VPValue *VecOp = Red->getVecOp();
4406
4407   // Clamp the range if using extended-reduction is profitable.
4408   auto IsExtendedRedValidAndClampRange =
4409       [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
     // NOTE(review): lines 4410 and 4413-4415 (range-clamp call, ExtKind and
     // cost-variable declarations) are elided in this dump.
4411         [&](ElementCount VF) {
4412           auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4414
4416           InstructionCost ExtCost =
4417               cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4418           InstructionCost RedCost = Red->computeCost(VF, Ctx);
4419
4420           if (Red->isPartialReduction()) {
4423             // FIXME: Move partial reduction creation, costing and clamping
4424             // here from LoopVectorize.cpp.
4425             ExtRedCost = Ctx.TTI.getPartialReductionCost(
4426                 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
4427                 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind,
4428                 RedTy->isFloatingPointTy()
4429                     ? std::optional{Red->getFastMathFlags()}
4430                     : std::nullopt);
4431           } else if (!RedTy->isFloatingPointTy()) {
4432             // TTI::getExtendedReductionCost only supports integer types.
4433             ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4434                 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4435                 Red->getFastMathFlags(), CostKind);
4436           }
4437           return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4438         },
4439         Range);
4440   };
4441
4442   VPValue *A;
4443   // Match reduce(ext).
4444   if (isa<VPWidenCastRecipe>(VecOp) &&
4445       (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) ||
4446        match(VecOp, m_FPExt(m_VPValue(A)))) &&
4447       IsExtendedRedValidAndClampRange(
4448           RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4449           cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
4450           Ctx.Types.inferScalarType(A)))
4451     return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4452
4453   return nullptr;
4454 }
4455
/// This function tries to convert extended in-loop reductions to
/// VPExpressionRecipe and clamp the \p Range if it is beneficial
/// and valid. The created VPExpressionRecipe must be decomposed to its
/// constituent recipes before execution. Patterns of the
/// VPExpressionRecipe:
///   reduce.add(mul(...)),
///   reduce.add(mul(ext(A), ext(B))),
///   reduce.add(ext(mul(ext(A), ext(B)))),
///   reduce.fadd(fmul(ext(A), ext(B)))
static VPExpressionRecipe *
                               VPCostContext &Ctx, VFRange &Range) {
  // Only add/sub/fadd reductions can be fused with a multiply.
  unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
  if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
      Opcode != Instruction::FAdd)
    return nullptr;

  Type *RedTy = Ctx.Types.inferScalarType(Red);

  // Clamp the range if using multiply-accumulate-reduction is profitable.
  // Ext0/Ext1 may be null when the multiply operands are not extended;
  // OuterExt is the optional extend wrapping the multiply itself.
  auto IsMulAccValidAndClampRange =
          VPWidenCastRecipe *OuterExt) -> bool {
        [&](ElementCount VF) {
          Type *SrcTy =
              Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
          InstructionCost MulAccCost;

          if (Red->isPartialReduction()) {
            Type *SrcTy2 =
                Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
            // FIXME: Move partial reduction creation, costing and clamping
            // here from LoopVectorize.cpp.
            MulAccCost = Ctx.TTI.getPartialReductionCost(
                Opcode, SrcTy, SrcTy2, RedTy, VF,
                    Ext0->getOpcode())
                    Ext1->getOpcode())
                Mul->getOpcode(), CostKind,
                RedTy->isFloatingPointTy()
                    ? std::optional{Red->getFastMathFlags()}
                    : std::nullopt);
          } else {
            // Only partial reductions support mixed or floating-point extends
            // at the moment.
            if (Ext0 && Ext1 &&
                (Ext0->getOpcode() != Ext1->getOpcode() ||
                 Ext0->getOpcode() == Instruction::CastOps::FPExt))
              return false;

            // With no extends, treat the source as zero-extended by default.
            bool IsZExt =
                !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
            auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
            MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
                                                        SrcVecTy, CostKind);
          }

          // The fused cost must beat the sum of all recipes it replaces.
          InstructionCost MulCost = Mul->computeCost(VF, Ctx);
          InstructionCost RedCost = Red->computeCost(VF, Ctx);
          InstructionCost ExtCost = 0;
          if (Ext0)
            ExtCost += Ext0->computeCost(VF, Ctx);
          if (Ext1)
            ExtCost += Ext1->computeCost(VF, Ctx);
          if (OuterExt)
            ExtCost += OuterExt->computeCost(VF, Ctx);

          return MulAccCost.isValid() &&
                 MulAccCost < ExtCost + MulCost + RedCost;
        },
        Range);
  };

  VPValue *VecOp = Red->getVecOp();
  VPRecipeBase *Sub = nullptr;
  VPValue *A, *B;
  VPValue *Tmp = nullptr;

  // Try to match reduce.fadd(fmul(fpext(...), fpext(...))).
  if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue())))) {
    assert(Opcode == Instruction::FAdd &&
           "MulAccumulateReduction from an FMul must accumulate into an FAdd "
           "instruction");
    auto *FMul = dyn_cast<VPWidenRecipe>(VecOp);
    if (!FMul)
      return nullptr;

    auto *RecipeA = dyn_cast<VPWidenCastRecipe>(FMul->getOperand(0));
    auto *RecipeB = dyn_cast<VPWidenCastRecipe>(FMul->getOperand(1));

    if (RecipeA && RecipeB &&
        IsMulAccValidAndClampRange(FMul, RecipeA, RecipeB, nullptr)) {
      return new VPExpressionRecipe(RecipeA, RecipeB, FMul, Red);
    }
  }
  // All remaining patterns are integer-only.
  if (RedTy->isFloatingPointTy())
    return nullptr;

  // Sub reductions could have a sub between the add reduction and vec op.
  if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
    Sub = VecOp->getDefiningRecipe();
    VecOp = Tmp;
  }

  // If ValB is a constant and can be safely extended, truncate it to the same
  // type as ExtA's operand, then extend it to the same type as ExtA. This
  // creates two uniform extends that can more easily be matched by the rest of
  // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
  // replaced with the new extend of the constant.
  auto ExtendAndReplaceConstantOp = [&Ctx, &Red](VPWidenCastRecipe *ExtA,
                                                 VPWidenCastRecipe *&ExtB,
                                                 VPValue *&ValB,
                                                 VPWidenRecipe *Mul) {
    // Bail out unless exactly one operand is extended and the other is a
    // live-in IR value; partial reductions are handled elsewhere.
    if (!ExtA || ExtB || !isa<VPIRValue>(ValB) || Red->isPartialReduction())
      return;
    Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
    Instruction::CastOps ExtOpc = ExtA->getOpcode();
    const APInt *Const;
    if (!match(ValB, m_APInt(Const)) ||
            Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
      return;
    // The truncate ensures that the type of each extended operand is the
    // same, and it's been proven that the constant can be extended from
    // NarrowTy safely. Necessary since ExtA's extended operand would be
    // e.g. an i8, while the const will likely be an i32. This will be
    // elided by later optimisations.
    VPBuilder Builder(Mul);
    auto *Trunc =
        Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
    Type *WideTy = Ctx.Types.inferScalarType(ExtA);
    ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
    Mul->setOperand(1, ExtB);
  };

  // Try to match reduce.add(mul(...)).
  if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
    auto *Mul = cast<VPWidenRecipe>(VecOp);

    // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
    ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);

    // Match reduce.add/sub(mul(ext, ext)).
    if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
        match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
        IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
      if (Sub)
        return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
                                      cast<VPWidenRecipe>(Sub), Red);
      return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
    }
    // TODO: Add an expression type for this variant with a negated mul
    if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
      return new VPExpressionRecipe(Mul, Red);
  }
  // TODO: Add an expression type for negated versions of other expression
  // variants.
  if (Sub)
    return nullptr;

  // Match reduce.add(ext(mul(A, B))).
  if (!Red->isPartialReduction() &&
      match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
    auto *Ext = cast<VPWidenCastRecipe>(VecOp);
    auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));

    // reduce.add(ext(mul(ext, const)))
    // -> reduce.add(ext(mul(ext, ext(const))))
    ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);

    // reduce.add(ext(mul(ext(A), ext(B))))
    // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
    // The inner extends must either have the same opcode as the outer extend or
    // be the same, in which case the multiply can never result in a negative
    // value and the outer extend can be folded away by doing wider
    // extends for the operands of the mul.
    if (Ext0 && Ext1 &&
        (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
        Ext0->getOpcode() == Ext1->getOpcode() &&
        IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
      // Replace the inner extends with wider ones producing the outer
      // extend's result type directly, making the outer extend dead.
      auto *NewExt0 = new VPWidenCastRecipe(
          Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
          *Ext0, *Ext0, Ext0->getDebugLoc());
      NewExt0->insertBefore(Ext0);

      VPWidenCastRecipe *NewExt1 = NewExt0;
      if (Ext0 != Ext1) {
        NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
                                        Ext->getResultType(), nullptr, *Ext1,
                                        *Ext1, Ext1->getDebugLoc());
        NewExt1->insertBefore(Ext1);
      }
      Mul->setOperand(0, NewExt0);
      Mul->setOperand(1, NewExt1);
      Red->setOperand(1, Mul);
      return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
    }
  }
  return nullptr;
}
4665
/// This function tries to create abstract recipes from the reduction recipe
/// \p Red for following optimizations and cost estimation. Prefers a
/// multiply-accumulate expression over a plain extended reduction; if neither
/// matches, the plan is left unchanged.
                                            VPCostContext &Ctx,
                                            VFRange &Range) {
  VPExpressionRecipe *AbstractR = nullptr;
  // Capture the insertion point before any matcher mutates the block, so the
  // new expression recipe lands directly after Red.
  auto IP = std::next(Red->getIterator());
  auto *VPBB = Red->getParent();
  if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
    AbstractR = MulAcc;
  else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
    AbstractR = ExtRed;
  // Cannot create abstract inloop reduction recipes.
  if (!AbstractR)
    return;

  AbstractR->insertBefore(*VPBB, IP);
  Red->replaceAllUsesWith(AbstractR);
}
4685
4696
  // Nothing to broadcast if only scalar VFs are possible.
  if (Plan.hasScalarVFOnly())
    return;

#ifndef NDEBUG
  // Dominator tree only needed for the assertion below.
  VPDominatorTree VPDT(Plan);
#endif

  // Candidates: live-ins plus everything defined in the entry block.
  SmallVector<VPValue *> VPValues;
  append_range(VPValues, Plan.getLiveIns());
  for (VPRecipeBase &R : *Plan.getEntry())
    append_range(VPValues, R.definedValues());

  auto *VectorPreheader = Plan.getVectorPreheader();
  for (VPValue *VPV : VPValues) {
        (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
      continue;

    // Add explicit broadcast at the insert point that dominates all users.
    VPBasicBlock *HoistBlock = VectorPreheader;
    VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
    for (VPUser *User : VPV->users()) {
      if (User->usesScalars(VPV))
        continue;
      // If a vector user lives in the preheader itself, broadcast at the
      // block start so it precedes that user.
      if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
        HoistPoint = HoistBlock->begin();
      else
        assert(VPDT.dominates(VectorPreheader,
                              cast<VPRecipeBase>(User)->getParent()) &&
               "All users must be in the vector preheader or dominated by it");
    }

    VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
    auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
    // Only vector users switch to the broadcast; scalar users keep VPV.
    VPV->replaceUsesWithIf(Broadcast,
                           [VPV, Broadcast](VPUser &U, unsigned Idx) {
                             return Broadcast != &U && !U.usesScalars(VPV);
                           });
  }
}
4740
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();

  // Collect candidate loads with invariant addresses and noalias scopes
  // metadata and memory-writing recipes with noalias metadata.
      vp_depth_first_shallow(LoopRegion->getEntry()))) {
    for (VPRecipeBase &R : *VPBB) {
      // Only handle single-scalar replicated loads with invariant addresses.
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isPredicated() || !RepR->isSingleScalar() ||
            RepR->getOpcode() != Instruction::Load)
          continue;

        VPValue *Addr = RepR->getOperand(0);
        if (Addr->isDefinedOutsideLoopRegions()) {
          // Without scope metadata we cannot disambiguate against stores.
          if (!Loc.AATags.Scope)
            continue;
          CandidateLoads.push_back({RepR, Loc});
        }
      }
      if (R.mayWriteToMemory()) {
        // Any write without full noalias metadata defeats the whole
        // transform, so give up immediately.
        if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
          return;
        Stores.push_back(*Loc);
      }
    }
  }

  VPBasicBlock *Preheader = Plan.getVectorPreheader();
  for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
    // Hoist the load to the preheader if it doesn't alias with any stores
    // according to the noalias metadata. Other loads should have been hoisted
    // by other passes.
    const AAMDNodes &LoadAA = LoadLoc.AATags;
    if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
                 LoadAA.Scope, StoreLoc.AATags.NoAlias);
        })) {
      LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
    }
  }
}
4788
// Collect common metadata from a group of replicate recipes by intersecting
// metadata from all recipes in the group. The result only keeps metadata
// valid for every member, so it is safe to attach to a merged recipe.
  // Start from the first recipe's metadata and narrow it member by member.
  VPIRMetadata CommonMetadata = *Recipes.front();
  for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
    CommonMetadata.intersect(*Recipe);
  return CommonMetadata;
}
4797
template <unsigned Opcode>
                                                       const Loop *L) {
  static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
                "Only Load and Store opcodes supported");
  constexpr bool IsLoad = (Opcode == Instruction::Load);
  VPTypeAnalysis TypeInfo(Plan);

  // For each address, collect operations with the same or complementary masks.
  // For loads the value type is the recipe's own result; for stores it is the
  // type of the stored operand (operand 0).
  auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
    return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
  };
      Plan, PSE, L,
      [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
  for (auto Recipes : Groups) {
    if (Recipes.size() < 2)
      continue;

    // Collect groups with the same or complementary masks. Entries are nulled
    // out once consumed so each recipe joins at most one group.
    for (VPReplicateRecipe *&RecipeI : Recipes) {
      if (!RecipeI)
        continue;

      VPValue *MaskI = RecipeI->getMask();
      Type *TypeI = GetLoadStoreValueType(RecipeI);
      Group.push_back(RecipeI);
      RecipeI = nullptr;

      // Find all operations with the same or complementary masks.
      bool HasComplementaryMask = false;
      for (VPReplicateRecipe *&RecipeJ : Recipes) {
        if (!RecipeJ)
          continue;

        VPValue *MaskJ = RecipeJ->getMask();
        Type *TypeJ = GetLoadStoreValueType(RecipeJ);
        if (TypeI == TypeJ) {
          // Check if any operation in the group has a complementary mask with
          // another, that is M1 == NOT(M2) or M2 == NOT(M1).
          HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
                                  match(MaskJ, m_Not(m_Specific(MaskI)));
          Group.push_back(RecipeJ);
          RecipeJ = nullptr;
        }
      }

      // Only keep groups where at least one pair of masks is complementary;
      // those are the ones that can be merged into unconditional accesses.
      if (HasComplementaryMask) {
        assert(Group.size() >= 2 && "must have at least 2 entries");
        AllGroups.push_back(std::move(Group));
      }
    }
  }

  return AllGroups;
}
4858
// Find the recipe with minimum alignment in the group. Used so a merged
// load/store conservatively inherits the weakest alignment of its members.
// InstType must be LoadInst or StoreInst (callers pass it explicitly).
template <typename InstType>
static VPReplicateRecipe *
  return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
    return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
           cast<InstType>(B->getUnderlyingInstr())->getAlign();
  });
}
4868
                                          const Loop *L) {
  auto Groups =
  if (Groups.empty())
    return;

  // Process each group of loads.
  for (auto &Group : Groups) {
    // Try to use the earliest (most dominating) load to replace all others.
    VPReplicateRecipe *EarliestLoad = Group[0];
    VPBasicBlock *FirstBB = EarliestLoad->getParent();
    VPBasicBlock *LastBB = Group.back()->getParent();

    // Check that the load doesn't alias with stores between first and last.
    auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
    if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
      continue;

    // Collect common metadata from all loads in the group, so the merged load
    // only carries metadata valid for every member.
    VPIRMetadata CommonMetadata = getCommonMetadata(Group);

    // Find the load with minimum alignment to use; the merged load must not
    // claim stronger alignment than any member guaranteed.
    auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);

    bool IsSingleScalar = EarliestLoad->isSingleScalar();
    assert(all_of(Group,
                  [IsSingleScalar](VPReplicateRecipe *R) {
                    return R->isSingleScalar() == IsSingleScalar;
                  }) &&
           "all members in group must agree on IsSingleScalar");

    // Create an unpredicated version of the earliest load with common
    // metadata.
    auto *UnpredicatedLoad = new VPReplicateRecipe(
        LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
        IsSingleScalar, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);

    UnpredicatedLoad->insertBefore(EarliestLoad);

    // Replace all loads in the group with the unpredicated load.
    for (VPReplicateRecipe *Load : Group) {
      Load->replaceAllUsesWith(UnpredicatedLoad);
      Load->eraseFromParent();
    }
  }
}
4917
// Returns true if the group of predicated stores can be sunk to the position
// of its last member, based on noalias scope metadata. Requires scope
// metadata on the first store's location.
static bool
                              PredicatedScalarEvolution &PSE, const Loop &L,
                              VPTypeAnalysis &TypeInfo) {
  auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
  if (!StoreLoc || !StoreLoc->AATags.Scope)
    return false;

  // When sinking a group of stores, all members of the group alias each other.
  // Skip them during the alias checks.
  SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
                                                 StoresToSink.end());

  // The alias check only needs to cover the span from the first to the last
  // store in the group.
  VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
  VPBasicBlock *LastBB = StoresToSink.back()->getParent();
  SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
  return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
}
4936
                                         const Loop *L) {
  auto Groups =
  if (Groups.empty())
    return;

  VPTypeAnalysis TypeInfo(Plan);

  for (auto &Group : Groups) {
    if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L, TypeInfo))
      continue;

    // Use the last (most dominated) store's location for the unconditional
    // store.
    VPReplicateRecipe *LastStore = Group.back();
    VPBasicBlock *InsertBB = LastStore->getParent();

    // Collect common alias metadata from all stores in the group.
    VPIRMetadata CommonMetadata = getCommonMetadata(Group);

    // Build select chain for stored values. Later stores in the group win:
    // each select overrides the accumulated value where its mask is true.
    VPValue *SelectedValue = Group[0]->getOperand(0);
    VPBuilder Builder(InsertBB, LastStore->getIterator());

    bool IsSingleScalar = Group[0]->isSingleScalar();
    for (unsigned I = 1; I < Group.size(); ++I) {
      assert(IsSingleScalar == Group[I]->isSingleScalar() &&
             "all members in group must agree on IsSingleScalar");
      VPValue *Mask = Group[I]->getMask();
      VPValue *Value = Group[I]->getOperand(0);
      SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
                                           Group[I]->getDebugLoc());
    }

    // Find the store with minimum alignment to use; the merged store must not
    // claim stronger alignment than any member guaranteed.
    auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);

    // Create unconditional store with selected value and common metadata.
    auto *UnpredicatedStore = new VPReplicateRecipe(
        StoreWithMinAlign->getUnderlyingInstr(),
        {SelectedValue, LastStore->getOperand(1)}, IsSingleScalar,
        /*Mask=*/nullptr, *LastStore, CommonMetadata);
    UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());

    // Remove all predicated stores from the group.
    for (VPReplicateRecipe *Store : Group)
      Store->eraseFromParent();
  }
}
4988
    VPlan &Plan, ElementCount BestVF, unsigned BestUF,
  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
  assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");

  // Nothing to materialize if the trip count is unused.
  VPValue *TC = Plan.getTripCount();
  if (TC->getNumUsers() == 0)
    return;

  // Skip cases for which the trip count may be non-trivial to materialize.
  // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
  // tail is required.
  if (!Plan.hasScalarTail() ||
          Plan.getScalarPreheader() ||
      !isa<VPIRValue>(TC))
    return;

  // Materialize vector trip counts for constants early if it can simply
  // be computed as (Original TC / VF * UF) * VF * UF.
  // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
  // tail-folded loops.
  ScalarEvolution &SE = *PSE.getSE();
  auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
  if (!isa<SCEVConstant>(TCScev))
    return;
  // VecTC = (TC udiv VFxUF) * VFxUF, i.e. TC rounded down to a multiple of
  // VF * UF; only set it when SCEV folds this to a constant.
  const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
  auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
  if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
    Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
}
5021
                                             VPBasicBlock *VectorPH) {
  // Nothing to do if the backedge-taken count has no users.
  if (BTC->getNumUsers() == 0)
    return;

  // Materialize BTC as (trip count - 1) at the start of the vector preheader.
  VPBuilder Builder(VectorPH, VectorPH->begin());
  auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
  auto *TCMO =
      Builder.createSub(Plan.getTripCount(), Plan.getConstantInt(TCTy, 1),
                        DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
  BTC->replaceAllUsesWith(TCMO);
}
5035
  // Packs/unpacks are only needed when vector VFs are possible.
  if (Plan.hasScalarVFOnly())
    return;

  VPTypeAnalysis TypeInfo(Plan);
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
  auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
      vp_depth_first_shallow(LoopRegion->getEntry()));
  // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
  // VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
  // regions. Those are not materialized explicitly yet. Those vector users are
  // still handled in VPReplicateRegion::execute(), via shouldPack().
  // TODO: materialize build vectors for replicating recipes in replicating
  // regions.
  for (VPBasicBlock *VPBB :
       concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
        continue;
      auto *DefR = cast<VPSingleDefRecipe>(&R);
      // A user needs the packed vector form if it consumes vectors, or if it
      // lives outside the loop region (e.g. inside a replicate region).
      auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
        VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
        return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
      };
      if ((isa<VPReplicateRecipe>(DefR) &&
           cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
          (isa<VPInstruction>(DefR) &&
           !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
          none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
        continue;

      // Struct results need BuildStructVector; plain scalars use BuildVector.
      Type *ScalarTy = TypeInfo.inferScalarType(DefR);
      unsigned Opcode = ScalarTy->isStructTy()
      auto *BuildVector = new VPInstruction(Opcode, {DefR});
      BuildVector->insertAfter(DefR);

      DefR->replaceUsesWithIf(
          BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
                           VPUser &U, unsigned) {
            return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
          });
    }
  }

  // Create explicit VPInstructions to convert vectors to scalars. The current
  // implementation is conservative - it may miss some cases that may or may not
  // be vector values. TODO: introduce Unpacks speculatively - remove them later
  // if they are known to operate on scalar values.
  for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
        continue;
      for (VPValue *Def : R.definedValues()) {
        // Skip recipes that are single-scalar or only have their first lane
        // used.
        // TODO: The Defs skipped here may or may not be vector values.
        // Introduce Unpacks, and remove them later, if they are guaranteed to
        // produce scalar values.
          continue;

        // At the moment, we create unpacks only for scalar users outside
        // replicate regions. Recipes inside replicate regions still extract the
        // required lanes implicitly.
        // TODO: Remove once replicate regions are unrolled completely.
        auto IsCandidateUnpackUser = [Def](VPUser *U) {
          VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
          return U->usesScalars(Def) &&
                 (!ParentRegion || !ParentRegion->isReplicator());
        };
        if (none_of(Def->users(), IsCandidateUnpackUser))
          continue;

        // Phis must keep all phi nodes grouped at the block start, so insert
        // the unpack after the phi section instead of directly after R.
        auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
        if (R.isPhi())
          Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
        else
          Unpack->insertAfter(&R);
        Def->replaceUsesWithIf(Unpack,
                               [&IsCandidateUnpackUser](VPUser &U, unsigned) {
                                 return IsCandidateUnpackUser(&U);
                               });
      }
    }
  }
}
5128
5130 VPBasicBlock *VectorPHVPBB,
5131 bool TailByMasking,
5132 bool RequiresScalarEpilogue,
5133 VPValue *Step) {
5134 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
5135 // There's nothing to do if there are no users of the vector trip count or its
5136 // IR value has already been set.
5137 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
5138 return;
5139
5140 VPValue *TC = Plan.getTripCount();
5141 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
5142 VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
5143 if (auto *StepR = Step->getDefiningRecipe()) {
5144 assert(StepR->getParent() == VectorPHVPBB &&
5145 "Step must be defined in VectorPHVPBB");
5146 // Insert after Step's definition to maintain valid def-use ordering.
5147 InsertPt = std::next(StepR->getIterator());
5148 }
5149 VPBuilder Builder(VectorPHVPBB, InsertPt);
5150
5151 // If the tail is to be folded by masking, round the number of iterations N
5152 // up to a multiple of Step instead of rounding down. This is done by first
5153 // adding Step-1 and then rounding down. Note that it's ok if this addition
5154 // overflows: the vector induction variable will eventually wrap to zero given
5155 // that it starts at zero and its Step is a power of two; the loop will then
5156 // exit, with the last early-exit vector comparison also producing all-true.
5157 if (TailByMasking) {
5158 TC = Builder.createAdd(
5159 TC, Builder.createSub(Step, Plan.getConstantInt(TCTy, 1)),
5160 DebugLoc::getCompilerGenerated(), "n.rnd.up");
5161 }
5162
5163 // Now we need to generate the expression for the part of the loop that the
5164 // vectorized body will execute. This is equal to N - (N % Step) if scalar
5165 // iterations are not required for correctness, or N - Step, otherwise. Step
5166 // is equal to the vectorization factor (number of SIMD elements) times the
5167 // unroll factor (number of SIMD instructions).
5168 VPValue *R =
5169 Builder.createNaryOp(Instruction::URem, {TC, Step},
5170 DebugLoc::getCompilerGenerated(), "n.mod.vf");
5171
5172 // There are cases where we *must* run at least one iteration in the remainder
5173 // loop. See the cost model for when this can happen. If the step evenly
5174 // divides the trip count, we set the remainder to be equal to the step. If
5175 // the step does not evenly divide the trip count, no adjustment is necessary
5176 // since there will already be scalar iterations. Note that the minimum
5177 // iterations check ensures that N >= Step.
5178 if (RequiresScalarEpilogue) {
5179 assert(!TailByMasking &&
5180 "requiring scalar epilogue is not supported with fail folding");
5181 VPValue *IsZero =
5182 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getZero(TCTy));
5183 R = Builder.createSelect(IsZero, Step, R);
5184 }
5185
5186 VPValue *Res =
5187 Builder.createSub(TC, R, DebugLoc::getCompilerGenerated(), "n.vec");
5188 VectorTC.replaceAllUsesWith(Res);
5189}
5190
                                       ElementCount VFEC) {
  // If VF and VFxUF have already been materialized (no remaining users),
  // there's nothing more to do.
  if (Plan.getVF().isMaterialized()) {
    assert(Plan.getVFxUF().isMaterialized() &&
           "VF and VFxUF must be materialized together");
    return;
  }

  VPBuilder Builder(VectorPH, VectorPH->begin());
  Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
  VPValue &VF = Plan.getVF();
  VPValue &VFxUF = Plan.getVFxUF();
  // If there are no users of the runtime VF, compute VFxUF by constant folding
  // the multiplication of VF and UF.
  if (VF.getNumUsers() == 0) {
    VPValue *RuntimeVFxUF =
        Builder.createElementCount(TCTy, VFEC * Plan.getConcreteUF());
    VFxUF.replaceAllUsesWith(RuntimeVFxUF);
    return;
  }

  // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
  // vscale) * UF.
  VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
    // Vector users of VF receive a broadcast of the scalar runtime VF.
    VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
        BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
  }
  VF.replaceAllUsesWith(RuntimeVF);

  // VFxUF = RuntimeVF * UF, marked nuw (no unsigned wrap).
  VPValue *MulByUF = Builder.createOverflowingOp(
      Instruction::Mul,
      {RuntimeVF, Plan.getConstantInt(TCTy, Plan.getConcreteUF())},
      {true, false});
  VFxUF.replaceAllUsesWith(MulByUF);
}
5230
  SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);

  auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
  BasicBlock *EntryBB = Entry->getIRBasicBlock();
  DenseMap<const SCEV *, Value *> ExpandedSCEVs;
  // Expand each VPExpandSCEVRecipe at the front of the entry block into IR
  // before the entry block's terminator, and replace the recipe with a
  // live-in for the expanded value.
  for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
      continue;
    auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
    if (!ExpSCEV)
      break;
    const SCEV *Expr = ExpSCEV->getSCEV();
    Value *Res =
        Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
    ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
    VPValue *Exp = Plan.getOrAddLiveIn(Res);
    ExpSCEV->replaceAllUsesWith(Exp);
    // Keep the plan's trip count pointing at the expanded value.
    if (Plan.getTripCount() == ExpSCEV)
      Plan.resetTripCount(Exp);
    ExpSCEV->eraseFromParent();
  }
         "VPExpandSCEVRecipes must be at the beginning of the entry block, "
         "before any VPIRInstructions");
  // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
  // to the VPIRBasicBlock. EI walks the existing VPIRInstructions in lockstep
  // with the IR instructions; anything not already wrapped gets appended.
  auto EI = Entry->begin();
  for (Instruction &I : drop_end(*EntryBB)) {
    if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
        &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
      EI++;
      continue;
    }
  }

  return ExpandedSCEVs;
}
5271
5272/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
5273/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
5274/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
5275/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
5276/// an index-independent load if it feeds all wide ops at all indices (\p OpV
5277/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
5278/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
5279/// is defined at \p Idx of a load interleave group.
5280static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5281 VPValue *OpV, unsigned Idx, bool IsScalable) {
5282 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
5283 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5284 if (!Member0OpR)
5285 return Member0Op == OpV;
5286 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
5287 // For scalable VFs, the narrowed plan processes vscale iterations at once,
5288 // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
5289 return !IsScalable && !W->getMask() && W->isConsecutive() &&
5290 Member0Op == OpV;
5291 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
5292 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
5293 return false;
5294}
5295
// Returns true if all values in \p Ops are defined by recipes with the same
// opcode/intrinsic whose operands can, recursively, either be narrowed
// themselves or satisfy canNarrowLoad.
static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
  auto *WideMember0 = dyn_cast<VPSingleDefRecipe>(Ops[0]);
  if (!WideMember0)
    return false;
  // All members must be single-def recipes computing the same operation.
  for (VPValue *V : Ops) {
      return false;
    auto *R = cast<VPSingleDefRecipe>(V);
    if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
      return false;
  }

  // Check each operand position across all members: either the column of
  // operands can be narrowed recursively, or every entry passes the
  // per-operand canNarrowLoad check against member 0.
  for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
    for (VPValue *Op : Ops)
      OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));

    if (canNarrowOps(OpsI, IsScalable))
      continue;

    if (any_of(enumerate(OpsI), [WideMember0, Idx, IsScalable](const auto &P) {
          const auto &[OpIdx, OpV] = P;
          return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
        }))
      return false;
  }

  return true;
}
5326
5327 /// Returns VF from \p VFs if \p IR is a full interleave group with factor and
5328 /// number of members both equal to VF. The interleave group must also access
5329 /// the full vector width.
5330 static std::optional<ElementCount> isConsecutiveInterleaveGroup(
5332 VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
// Bail out when there is no interleave recipe or the group is masked.
5333 if (!InterleaveR || InterleaveR->getMask())
5334 return std::nullopt;
5335
// All members of the group must share a single scalar element type; for load
// groups check the defined values, for store groups the stored values.
5336 Type *GroupElementTy = nullptr;
5337 if (InterleaveR->getStoredValues().empty()) {
5338 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
5339 if (!all_of(InterleaveR->definedValues(),
5340 [&TypeInfo, GroupElementTy](VPValue *Op) {
5341 return TypeInfo.inferScalarType(Op) == GroupElementTy;
5342 }))
5343 return std::nullopt;
5344 } else {
5345 GroupElementTy =
5346 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
5347 if (!all_of(InterleaveR->getStoredValues(),
5348 [&TypeInfo, GroupElementTy](VPValue *Op) {
5349 return TypeInfo.inferScalarType(Op) == GroupElementTy;
5350 }))
5351 return std::nullopt;
5352 }
5353
// Require a full group: one member for every position in the factor.
5354 auto IG = InterleaveR->getInterleaveGroup();
5355 if (IG->getFactor() != IG->getNumMembers())
5356 return std::nullopt;
5357
// Width in bits of the target's vector register for a given VF.
// NOTE(review): the arguments to getRegisterBitWidth are elided in this
// rendering.
5358 auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
5359 TypeSize Size = TTI.getRegisterBitWidth(
5362 assert(Size.isScalable() == VF.isScalable() &&
5363 "if Size is scalable, VF must be scalable and vice versa");
5364 return Size.getKnownMinValue();
5365 };
5366
// Accept a VF whose member count matches the group factor and whose total
// group size fills the vector register exactly.
5367 for (ElementCount VF : VFs) {
5368 unsigned MinVal = VF.getKnownMinValue();
5369 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
5370 if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
5371 return {VF};
5372 }
5373 return std::nullopt;
5374 }
5375
5376/// Returns true if \p VPValue is a narrow VPValue.
5377static bool isAlreadyNarrow(VPValue *VPV) {
5378 if (isa<VPIRValue>(VPV))
5379 return true;
5380 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
5381 return RepR && RepR->isSingleScalar();
5382}
5383
5384 // Convert a wide recipe defining a VPValue \p V feeding an interleave group to
5385 // a narrow variant.
// NOTE(review): the parameter-list line (taking \p V and the NarrowedOps set)
// is elided in this rendering.
5386 static VPValue *
5388 auto *R = V->getDefiningRecipe();
5389 if (!R || NarrowedOps.contains(V))
5390 return V;
5391
// Already-narrow values need no conversion.
5392 if (isAlreadyNarrow(V))
5393 return V;
5394
// NOTE(review): the condition opening this branch (which recursively narrows
// the operands of a matching wide recipe) is elided in this rendering; the
// closing brace below pairs with it.
5396 auto *WideMember0 = cast<VPSingleDefRecipe>(R);
5397 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
5398 WideMember0->setOperand(
5399 Idx,
5400 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
5401 return V;
5402 }
5403
5404 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
5405 // Narrow interleave group to wide load, as transformed VPlan will only
5406 // process one original iteration.
5407 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
5408 auto *L = new VPWidenLoadRecipe(
5409 *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
5410 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
5411 L->insertBefore(LoadGroup);
5412 NarrowedOps.insert(L)
5413 return L;
5414 }
5415
// Single-scalar replicated loads are already narrow; just record them.
5416 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
5417 assert(RepR->isSingleScalar() &&
5418 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
5419 "must be a single scalar load");
5420 NarrowedOps.insert(RepR);
5421 return RepR;
5422 }
5423
5424 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
5425 VPValue *PtrOp = WideLoad->getAddr();
5426 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
5427 PtrOp = VecPtr->getOperand(0);
5428 // Narrow wide load to uniform scalar load, as transformed VPlan will only
5429 // process one original iteration.
5430 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
5431 /*IsUniform*/ true,
5432 /*Mask*/ nullptr, {}, *WideLoad);
5433 N->insertBefore(WideLoad);
5434 NarrowedOps.insert(N);
5435 return N;
5436 }
5437
/// Try to convert a plan whose interleave groups saturate full vector
/// registers into a narrowed plan that processes one original iteration per
/// vector step; returns a clone of the plan retaining the remaining VFs, or
/// nullptr when the transform does not apply.
// NOTE(review): the line naming this function and its first parameters is
// elided in this rendering.
5438 std::unique_ptr<VPlan>
5440 const TargetTransformInfo &TTI) {
5441 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5442
5443 if (!VectorLoop)
5444 return nullptr;
5445
5446 // Only handle single-block loops for now.
5447 if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
5448 return nullptr;
5449
5450 // Skip plans when we may not be able to properly narrow.
5451 VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
5452 if (!match(&Exiting->back(), m_BranchOnCount()))
5453 return nullptr;
5454
5455 assert(match(&Exiting->back(),
5457 m_Specific(&Plan.getVectorTripCount()))) &&
5458 "unexpected branch-on-count");
5459
// NOTE(review): the declaration of StoreGroups is elided in this rendering.
5460 VPTypeAnalysis TypeInfo(Plan);
5462 std::optional<ElementCount> VFToOptimize;
5463 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
// NOTE(review): the conditions guarding the two continues below are elided
// in this rendering.
5465 continue;
5466
5469 continue;
5470
5471 // Bail out on recipes not supported at the moment:
5472 // * phi recipes other than the canonical induction
5473 // * recipes writing to memory except interleave groups
5474 // Only support plans with a canonical induction phi.
5475 if (R.isPhi())
5476 return nullptr;
5477
5478 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
5479 if (R.mayWriteToMemory() && !InterleaveR)
5480 return nullptr;
5481
5482 // All other ops are allowed, but we reject uses that cannot be converted
5483 // when checking all allowed consumers (store interleave groups) below.
5484 if (!InterleaveR)
5485 continue;
5486
5487 // Try to find a single VF, where all interleave groups are consecutive and
5488 // saturate the full vector width. If we already have a candidate VF, check
5489 // if it is applicable for the current InterleaveR, otherwise look for a
5490 // suitable VF across the Plan's VFs.
// NOTE(review): the declaration line for VFs is elided in this rendering.
5492 VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
5493 : to_vector(Plan.vectorFactors());
5494 std::optional<ElementCount> NarrowedVF =
5495 isConsecutiveInterleaveGroup(InterleaveR, VFs, TypeInfo, TTI);
5496 if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
5497 return nullptr;
5498 VFToOptimize = NarrowedVF;
5499
5500 // Skip read interleave groups.
5501 if (InterleaveR->getStoredValues().empty())
5502 continue;
5503
5504 // Narrow interleave groups, if all operands are already matching narrow
5505 // ops.
5506 auto *Member0 = InterleaveR->getStoredValues()[0];
5507 if (isAlreadyNarrow(Member0) &&
5508 all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
5509 StoreGroups.push_back(InterleaveR);
5510 continue;
5511 }
5512
5513 // For now, we only support full interleave groups storing load interleave
5514 // groups.
5515 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
5516 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
5517 if (!DefR)
5518 return false;
5519 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
5520 return IR && IR->getInterleaveGroup()->isFull() &&
5521 IR->getVPValue(Op.index()) == Op.value();
5522 })) {
5523 StoreGroups.push_back(InterleaveR);
5524 continue;
5525 }
5526
5527 // Check if all values feeding InterleaveR are matching wide recipes, which
5528 // operands that can be narrowed.
5529 if (!canNarrowOps(InterleaveR->getStoredValues(),
5530 VFToOptimize->isScalable()))
5531 return nullptr;
5532 StoreGroups.push_back(InterleaveR);
5533 }
5534
5535 if (StoreGroups.empty())
5536 return nullptr;
5537
5538 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5539 bool RequiresScalarEpilogue =
5540 MiddleVPBB->getNumSuccessors() == 1 &&
5541 MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
5542 // Bail out for tail-folding (middle block with a single successor to exit).
5543 if (MiddleVPBB->getNumSuccessors() != 2 && !RequiresScalarEpilogue)
5544 return nullptr;
5545
5546 // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
5547 // original Plan into 2: a) a new clone which contains all VFs of Plan, except
5548 // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
5549 // TODO: Handle cases where only some interleave groups can be narrowed.
5550 std::unique_ptr<VPlan> NewPlan;
5551 if (size(Plan.vectorFactors()) != 1) {
5552 NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
5553 Plan.setVF(*VFToOptimize);
5554 NewPlan->removeVF(*VFToOptimize);
5555 }
5556
5557 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
5558 SmallPtrSet<VPValue *, 4> NarrowedOps;
5559 // Narrow operation tree rooted at store groups.
5560 for (auto *StoreGroup : StoreGroups) {
5561 VPValue *Res =
5562 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
5563 auto *SI =
5564 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
5565 auto *S = new VPWidenStoreRecipe(
5566 *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
5567 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
5568 S->insertBefore(StoreGroup);
5569 StoreGroup->eraseFromParent();
5570 }
5571
5572 // Adjust induction to reflect that the transformed plan only processes one
5573 // original iteration.
5574 auto *CanIV = VectorLoop->getCanonicalIV();
5575 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
5576 VPBasicBlock *VectorPH = Plan.getVectorPreheader();
5577 VPBuilder PHBuilder(VectorPH, VectorPH->begin());
5578
5579 VPValue *UF = &Plan.getUF();
5580 VPValue *Step;
// For scalable VFs the step becomes vscale * UF; for fixed VFs it is UF.
// NOTE(review): argument lines for createElementCount and the fixed-VF
// replacement statement are partially elided in this rendering.
5581 if (VFToOptimize->isScalable()) {
5582 VPValue *VScale = PHBuilder.createElementCount(
5584 Step = PHBuilder.createOverflowingOp(Instruction::Mul, {VScale, UF},
5585 {true, false});
5586 Plan.getVF().replaceAllUsesWith(VScale);
5587 } else {
5588 Step = UF;
5590 Plan.getConstantInt(CanIV->getScalarType(), 1));
5591 }
5592 // Materialize vector trip count with the narrowed step.
5593 materializeVectorTripCount(Plan, VectorPH, /*TailByMasking=*/false,
5594 RequiresScalarEpilogue, Step);
5595
5596 Inc->setOperand(1, Step);
5597 Plan.getVFxUF().replaceAllUsesWith(Step);
5598
5599 removeDeadRecipes(Plan);
5600 assert(none_of(*VectorLoop->getEntryBasicBlock(),
5602 "All VPVectorPointerRecipes should have been removed");
5603 return NewPlan;
5604 }
5605
5606 /// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5607 /// BranchOnCond recipe.
// NOTE(review): the line naming this function is elided in this rendering.
5609 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5610 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
// NOTE(review): the initializer locating the middle-block terminator is
// elided in this rendering.
5611 auto *MiddleTerm =
5613 // Only add branch metadata if there is a (conditional) terminator.
5614 if (!MiddleTerm)
5615 return;
5616
5617 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5618 "must have a BranchOnCond");
5619 // Assume that `TripCount % VectorStep ` is equally distributed.
5620 unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
5621 if (VF.isScalable() && VScaleForTuning.has_value())
5622 VectorStep *= *VScaleForTuning;
5623 assert(VectorStep > 0 && "trip count should not be zero");
// Weight the exit edge 1 : VectorStep-1, i.e. one exit per VectorStep
// iterations on average.
5624 MDBuilder MDB(Plan.getContext());
5625 MDNode *BranchWeights =
5626 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
5627 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
5628 }
5629
5631 VFRange &Range) {
5632 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5633 auto *MiddleVPBB = Plan.getMiddleBlock();
5634 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5635
5636 auto IsScalableOne = [](ElementCount VF) -> bool {
5637 return VF == ElementCount::getScalable(1);
5638 };
5639
// Visit each first-order recurrence header phi and create extracts for users
// outside the loop (see the worked example below).
5640 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5641 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5642 if (!FOR)
5643 continue;
5644
5645 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5646 "Cannot handle loops with uncountable early exits");
5647
5648 // This is the second phase of vectorizing first-order recurrences, creating
5649 // extract for users outside the loop. An overview of the transformation is
5650 // described below. Suppose we have the following loop with some use after
5651 // the loop of the last a[i-1],
5652 //
5653 // for (int i = 0; i < n; ++i) {
5654 // t = a[i - 1];
5655 // b[i] = a[i] - t;
5656 // }
5657 // use t;
5658 //
5659 // There is a first-order recurrence on "a". For this loop, the shorthand
5660 // scalar IR looks like:
5661 //
5662 // scalar.ph:
5663 // s.init = a[-1]
5664 // br scalar.body
5665 //
5666 // scalar.body:
5667 // i = phi [0, scalar.ph], [i+1, scalar.body]
5668 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5669 // s2 = a[i]
5670 // b[i] = s2 - s1
5671 // br cond, scalar.body, exit.block
5672 //
5673 // exit.block:
5674 // use = lcssa.phi [s1, scalar.body]
5675 //
5676 // In this example, s1 is a recurrence because it's value depends on the
5677 // previous iteration. In the first phase of vectorization, we created a
5678 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5679 // for users in the scalar preheader and exit block.
5680 //
5681 // vector.ph:
5682 // v_init = vector(..., ..., ..., a[-1])
5683 // br vector.body
5684 //
5685 // vector.body
5686 // i = phi [0, vector.ph], [i+4, vector.body]
5687 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5688 // v2 = a[i, i+1, i+2, i+3]
5689 // b[i] = v2 - v1
5690 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
5691 // b[i, i+1, i+2, i+3] = v2 - v1
5692 // br cond, vector.body, middle.block
5693 //
5694 // middle.block:
5695 // vector.recur.extract.for.phi = v2(2)
5696 // vector.recur.extract = v2(3)
5697 // br cond, scalar.ph, exit.block
5698 //
5699 // scalar.ph:
5700 // scalar.recur.init = phi [vector.recur.extract, middle.block],
5701 // [s.init, otherwise]
5702 // br scalar.body
5703 //
5704 // scalar.body:
5705 // i = phi [0, scalar.ph], [i+1, scalar.body]
5706 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5707 // s2 = a[i]
5708 // b[i] = s2 - s1
5709 // br cond, scalar.body, exit.block
5710 //
5711 // exit.block:
5712 // lo = lcssa.phi [s1, scalar.body],
5713 // [vector.recur.extract.for.phi, middle.block]
5714 //
5715 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
5716 // Extract the penultimate value of the recurrence and use it as operand for
5717 // the VPIRInstruction modeling the phi.
// NOTE(review): the loop header iterating the middle block's recipes and the
// guard preceding the continue below are partially elided in this rendering.
5719 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
5721 continue;
5722
5723 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5724 // penultimate value of the recurrence. Instead we rely on the existing
5725 // extract of the last element from the result of
5726 // VPInstruction::FirstOrderRecurrenceSplice.
5727 // TODO: Consider vscale_range info and UF.
5729 Range))
5730 return;
5731 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5732 VPInstruction::ExtractPenultimateElement, FOR->getBackedgeValue(), {},
5733 "vector.recur.extract.for.phi");
// Rewire exit-block LCSSA phis to use the extracted penultimate element.
5734 for (VPUser *U : to_vector(cast<VPInstruction>(&R)->users())) {
5735 auto *ExitPhi = dyn_cast<VPIRPhi>(U);
5736 if (!ExitPhi)
5737 continue;
5738 ExitPhi->replaceUsesOfWith(cast<VPInstruction>(&R), PenultimateElement);
5739 }
5740 }
5741 }
5742 }
5743
5746 Loop &L) {
// Transform find-last-style reduction phis into FindIV reduction phis: use a
// sentinel value when SCEV proves the IV range excludes it, otherwise fall
// back to an auxiliary boolean AnyOf reduction.
5747 ScalarEvolution &SE = *PSE.getSE();
5748 VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();
5749
5750 // Helper lambda to check if the IV range excludes the sentinel value.
5751 auto CheckSentinel = [&SE](const SCEV *IVSCEV, bool UseMax,
5752 bool Signed) -> std::optional<APInt> {
5753 unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
// NOTE(review): the ternary arms selecting the signed/unsigned min/max
// sentinel constant are elided in this rendering.
5754 APInt Sentinel =
5755 UseMax
5758
5759 ConstantRange IVRange =
5760 Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
5761 if (!IVRange.contains(Sentinel))
5762 return Sentinel;
5763 return std::nullopt;
5764 };
5765
5766 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
5767 for (VPRecipeBase &Phi :
5768 make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
// NOTE(review): the recurrence-kind filter condition is partially elided in
// this rendering.
5769 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
5771 PhiR->getRecurrenceKind()))
5772 continue;
5773
5774 Type *PhiTy = VPTypeAnalysis(Plan).inferScalarType(PhiR);
5775 if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
5776 continue;
5777
5778 // If there's a header mask, the backedge select will not be the find-last
5779 // select.
5780 VPValue *BackedgeVal = PhiR->getBackedgeValue();
5781 VPValue *CondSelect = BackedgeVal;
5782 if (HeaderMask &&
5783 !match(BackedgeVal, m_Select(m_Specific(HeaderMask),
5784 m_VPValue(CondSelect), m_Specific(PhiR))))
5785 llvm_unreachable("expected header mask select");
5786
5787 // Get the IV from the conditional select of the reduction phi.
5788 // The conditional select should be a select between the phi and the IV.
5789 VPValue *Cond, *TrueVal, *FalseVal;
5790 if (!match(CondSelect, m_Select(m_VPValue(Cond), m_VPValue(TrueVal),
5791 m_VPValue(FalseVal))))
5792 continue;
5793
5794 // The non-phi operand of the select is the IV.
5795 assert(is_contained(CondSelect->getDefiningRecipe()->operands(), PhiR));
5796 VPValue *IV = TrueVal == PhiR ? FalseVal : TrueVal;
5797
5798 const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(IV, PSE, &L);
5799 const SCEV *Step;
5800 if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step))))
5801 continue;
5802
5803 // Determine direction from SCEV step.
5804 if (!SE.isKnownNonZero(Step))
5805 continue;
5806
5807 // Positive step means we need UMax/SMax to find the last IV value, and
5808 // UMin/SMin otherwise.
5809 bool UseMax = SE.isKnownPositive(Step);
5810 bool UseSigned = true;
5811 std::optional<APInt> SentinelVal =
5812 CheckSentinel(IVSCEV, UseMax, /*IsSigned=*/true);
5813 if (!SentinelVal) {
5814 SentinelVal = CheckSentinel(IVSCEV, UseMax, /*IsSigned=*/false);
5815 UseSigned = false;
5816 }
5817
5818 // If no sentinel was found, fall back to a boolean AnyOf reduction to track
5819 // if the condition was ever true. Requires the IV to not wrap, otherwise we
5820 // cannot use min/max.
5821 if (!SentinelVal) {
5822 auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
5823 if (AR->hasNoSignedWrap())
5824 UseSigned = true;
5825 else if (AR->hasNoUnsignedWrap())
5826 UseSigned = false;
5827 else
5828 continue;
5829 }
5830
// NOTE(review): the statement locating RdxResult (the reduction-result user
// of BackedgeVal) is partially elided in this rendering.
5832 BackedgeVal,
5834
5835 RecurKind MinMaxKind =
5836 UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
5837 : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
5838 VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
5839 FastMathFlags());
5840 DebugLoc ExitDL = RdxResult->getDebugLoc();
5841 VPBuilder MiddleBuilder(RdxResult);
// NOTE(review): the builder call creating the min/max reduction of the IV
// values is partially elided in this rendering.
5842 VPValue *ReducedIV =
5844 RdxResult->getOperand(0), Flags, ExitDL);
5845
5846 VPValue *NewRdxResult;
5847 VPValue *StartVPV = PhiR->getStartValue();
5848 if (SentinelVal) {
5849 // Sentinel-based approach: reduce IVs with min/max, compare against
5850 // sentinel to detect if condition was ever true, select accordingly.
5851 VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
5852 auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
5853 Sentinel, ExitDL);
5854 NewRdxResult =
5855 MiddleBuilder.createSelect(Cmp, ReducedIV, StartVPV, ExitDL);
5856 StartVPV = Sentinel;
5857 } else {
5858 // Introduce a boolean AnyOf reduction to track if the condition was ever
5859 // true in the loop. Use it to select the initial start value, if it was
5860 // never true.
5861 auto *AnyOfPhi = new VPReductionPHIRecipe(
5862 /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
5863 RdxUnordered{1}, {}, /*HasUsesOutsideReductionChain=*/false);
5864 AnyOfPhi->insertAfter(PhiR);
5865
5866 VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
5867 VPValue *AnyOfCond = Cond;
5868 if (TrueVal == PhiR)
5869 AnyOfCond = LoopBuilder.createNot(Cond);
5870 VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, AnyOfCond);
5871 AnyOfPhi->setOperand(1, OrVal);
5872
5873 NewRdxResult =
5875 {StartVPV, ReducedIV, OrVal}, {}, ExitDL);
5876
5877 // Initialize the IV reduction phi with the neutral element, not the
5878 // original start value, to ensure correct min/max reduction results.
5879 StartVPV = Plan.getOrAddLiveIn(
5880 getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
5881 }
5882 RdxResult->replaceAllUsesWith(NewRdxResult);
5883 RdxResult->eraseFromParent();
5884
// Replace the original reduction phi with a FindIV reduction phi.
5885 auto *NewPhiR = new VPReductionPHIRecipe(
5886 cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
5887 *CondSelect, RdxUnordered{1}, {}, PhiR->hasUsesOutsideReductionChain());
5888 NewPhiR->insertBefore(PhiR);
5889 PhiR->replaceAllUsesWith(NewPhiR);
5890 PhiR->eraseFromParent();
5891 }
5892 }
5893
5894 namespace {
5895
5896 /// Holds the binary operation used to compute the extended operand and the
5897 /// casts that feed into it.
5898 struct ExtendedReductionOperand {
5899 VPWidenRecipe *BinOp = nullptr;
5900 // Note: The second cast recipe may be null.
5901 std::array<VPWidenCastRecipe *, 2> CastRecipes = {};
5902 };
5903
5904 /// A chain of recipes that form a partial reduction. Matches either
5905 /// reduction_bin_op (extend (A), accumulator), or
5906 /// reduction_bin_op (bin_op (extend (A), (extend (B))), accumulator).
5907 struct VPPartialReductionChain {
5908 /// The top-level binary operation that forms the reduction to a scalar
5909 /// after the loop body.
5910 VPWidenRecipe *ReductionBinOp;
5911 /// The user of the extends that is then reduced.
5912 ExtendedReductionOperand ExtendedOp;
/// The VF scale factor; used as the VFScaleFactor of the resulting partial
/// reduction recipe and its phi.
5913 unsigned ScaleFactor;
5914 /// The recurrence kind for the entire partial reduction chain.
5915 /// This allows distinguishing between Sub and AddWithSub recurrences,
5916 /// when the ReductionBinOp is a Instruction::Sub.
5917 RecurKind RK;
5918 };
5919
/// Rewrite the extends feeding a partial-reduction operand so they can be
/// folded into the partial reduction (see the two patterns below). Returns
/// the recipe to reduce, which may differ from \p BinOp.
5920 static VPSingleDefRecipe *
5921 optimizeExtendsForPartialReduction(VPSingleDefRecipe *BinOp,
5922 VPTypeAnalysis &TypeInfo) {
5923 // reduce.add(mul(ext(A), C))
5924 // -> reduce.add(mul(ext(A), ext(trunc(C))))
5925 const APInt *Const;
5926 if (match(BinOp, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(Const)))) {
5927 auto *ExtA = cast<VPWidenCastRecipe>(BinOp->getOperand(0));
5928 Instruction::CastOps ExtOpc = ExtA->getOpcode();
5929 Type *NarrowTy = TypeInfo.inferScalarType(ExtA->getOperand(0));
// NOTE(review): part of this condition (the call checking the constant can
// be extended) is elided in this rendering.
5930 if (!BinOp->hasOneUse() ||
5932 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
5933 return BinOp;
5934
// Materialize the constant as ext(trunc(C)) so both mul operands carry a
// matching extend.
5935 VPBuilder Builder(BinOp);
5936 auto *Trunc = Builder.createWidenCast(Instruction::CastOps::Trunc,
5937 BinOp->getOperand(1), NarrowTy);
5938 Type *WideTy = TypeInfo.inferScalarType(ExtA);
5939 BinOp->setOperand(1, Builder.createWidenCast(ExtOpc, Trunc, WideTy));
5940 return BinOp;
5941 }
5942
5943 // reduce.add(ext(mul(ext(A), ext(B))))
5944 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
5945 // TODO: Support this optimization for float types.
// NOTE(review): the opening of the following match condition is elided in
// this rendering.
5947 m_ZExtOrSExt(m_VPValue()))))) {
5948 auto *Ext = cast<VPWidenCastRecipe>(BinOp);
5949 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
5950 auto *MulLHS = cast<VPWidenCastRecipe>(Mul->getOperand(0));
5951 auto *MulRHS = cast<VPWidenCastRecipe>(Mul->getOperand(1));
// Only fold when the inner extends agree with each other (and, unless both
// operands are the same recipe, with the outer extend).
5952 if (!Mul->hasOneUse() ||
5953 (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
5954 MulLHS->getOpcode() != MulRHS->getOpcode())
5955 return BinOp;
5956 VPBuilder Builder(Mul);
5957 Mul->setOperand(0, Builder.createWidenCast(MulLHS->getOpcode(),
5958 MulLHS->getOperand(0),
5959 Ext->getResultType()));
5960 Mul->setOperand(1, MulLHS == MulRHS
5961 ? Mul->getOperand(0)
5962 : Builder.createWidenCast(MulRHS->getOpcode(),
5963 MulRHS->getOperand(0),
5964 Ext->getResultType()));
5965 return Mul;
5966 }
5967
5968 return BinOp;
5969 }
5970
5971 // Helper to transform a partial reduction chain into a partial reduction
5972 // recipe. Assumes profitability has been checked.
5973 static void transformToPartialReduction(const VPPartialReductionChain &Chain,
5974 VPTypeAnalysis &TypeInfo, VPlan &Plan,
5975 VPReductionPHIRecipe *RdxPhi) {
5976 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
5977 assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
5978
5979 VPValue *BinOpVal = WidenRecipe->getOperand(0);
5980 VPValue *Accumulator = WidenRecipe->getOperand(1);
5981
5982 // Swap if needed to ensure Accumulator is the PHI or partial reduction.
// NOTE(review): the first line of the following condition is elided in this
// rendering.
5984 isa<VPExpressionRecipe>(BinOpVal))
5985 std::swap(BinOpVal, Accumulator);
5986 auto *BinOp = cast<VPSingleDefRecipe>(BinOpVal->getDefiningRecipe());
5987
5988 // Sub-reductions can be implemented in two ways:
5989 // (1) negate the operand in the vector loop (the default way).
5990 // (2) subtract the reduced value from the init value in the middle block.
5991 // Both ways keep the reduction itself as an 'add' reduction.
5992 //
5993 // The ISD nodes for partial reductions don't support folding the
5994 // sub/negation into its operands because the following is not a valid
5995 // transformation:
5996 // sub(0, mul(ext(a), ext(b)))
5997 // -> mul(ext(a), ext(sub(0, b)))
5998 //
5999 // It's therefore better to choose option (2) such that the partial
6000 // reduction is always positive (starting at '0') and to do a final
6001 // subtract in the middle block.
6002 if (WidenRecipe->getOpcode() == Instruction::Sub &&
6003 Chain.RK != RecurKind::Sub) {
6004 VPBuilder Builder(WidenRecipe);
6005 Type *ElemTy = TypeInfo.inferScalarType(BinOp);
6006 auto *Zero = Plan.getZero(ElemTy);
6007 VPIRFlags Flags = WidenRecipe->getUnderlyingInstr()
6008 ? VPIRFlags(*WidenRecipe->getUnderlyingInstr())
6009 : VPIRFlags();
// NOTE(review): the trailing constructor argument line is elided in this
// rendering.
6010 auto *NegRecipe = new VPWidenRecipe(Instruction::Sub, {Zero, BinOp}, Flags,
6012 Builder.insert(NegRecipe);
6013 BinOp = NegRecipe;
6014 }
6015
6016 // FIXME: Do these transforms before invoking the cost-model.
6017 BinOp = optimizeExtendsForPartialReduction(BinOp, TypeInfo);
6018
6019 // Check if WidenRecipe is the final result of the reduction. If so look
6020 // through selects for predicated reductions.
// NOTE(review): the statement initializing ExitValue from the select match
// below is partially elided in this rendering.
6021 VPValue *Cond = nullptr;
6023 WidenRecipe,
6024 m_Select(m_VPValue(Cond), m_Specific(WidenRecipe), m_Specific(RdxPhi))));
6025 bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
6026 RdxPhi->getBackedgeValue() == ExitValue;
6027 assert((!ExitValue || IsLastInChain) &&
6028 "if we found ExitValue, it must match RdxPhi's backedge value");
6029
// Create the partial reduction recipe and replace the wide recipe with it.
6030 Type *PhiType = TypeInfo.inferScalarType(RdxPhi);
6031 RecurKind RdxKind =
6033 auto *PartialRed = new VPReductionRecipe(
6034 RdxKind,
6035 RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlags()
6036 : FastMathFlags(),
6037 WidenRecipe->getUnderlyingInstr(), Accumulator, BinOp, Cond,
6038 RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
6039 PartialRed->insertBefore(WidenRecipe);
6040
6041 if (Cond)
6042 ExitValue->replaceAllUsesWith(PartialRed);
6043 WidenRecipe->replaceAllUsesWith(PartialRed);
6044
6045 // We only need to update the PHI node once, which is when we find the
6046 // last reduction in the chain.
6047 if (!IsLastInChain)
6048 return;
6049
6050 // Scale the PHI and ReductionStartVector by the VFScaleFactor
6051 assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
6052 RdxPhi->setVFScaleFactor(Chain.ScaleFactor);
6053
6054 auto *StartInst = cast<VPInstruction>(RdxPhi->getStartValue());
6055 assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
6056 auto *NewScaleFactor = Plan.getConstantInt(32, Chain.ScaleFactor);
6057 StartInst->setOperand(2, NewScaleFactor);
6058
6059 // If this is the last value in a sub-reduction chain, then update the PHI
6060 // node to start at `0` and update the reduction-result to subtract from
6061 // the PHI's start value.
6062 if (Chain.RK != RecurKind::Sub)
6063 return;
6064
6065 VPValue *OldStartValue = StartInst->getOperand(0);
6066 StartInst->setOperand(0, StartInst->getOperand(1));
6067
6068 // Replace reduction_result by 'sub (startval, reductionresult)'.
// NOTE(review): the statement locating RdxResult is elided in this rendering.
6070 assert(RdxResult && "Could not find reduction result");
6071
6072 VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult);
6073 constexpr unsigned SubOpc = Instruction::BinaryOps::Sub;
6074 VPInstruction *NewResult = Builder.createNaryOp(
6075 SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc),
6076 RdxPhi->getDebugLoc());
// Rewire all users except the new subtract itself.
6077 RdxResult->replaceUsesWithIf(
6078 NewResult,
6079 [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
6080 }
6081
6082 /// Check if a partial reduction chain is supported by the target (i.e. does
6083 /// not have an invalid cost) for the given VF range. Clamps the range and
6084 /// returns true if profitable for any VF.
6085 static bool isValidPartialReduction(const VPPartialReductionChain &Chain,
6086 Type *PhiType, VPCostContext &CostCtx,
6087 VFRange &Range) {
// Returns the pre-extend operand type and extend kind for a cast recipe, or
// {nullptr, PR_None} when there is no cast.
6088 auto GetExtInfo = [&CostCtx](VPWidenCastRecipe *Ext)
6089 -> std::pair<Type *, TargetTransformInfo::PartialReductionExtendKind> {
6090 if (!Ext)
6091 return {nullptr, TargetTransformInfo::PR_None};
6092 Type *ExtOpType = CostCtx.Types.inferScalarType(Ext->getOperand(0));
6094 static_cast<Instruction::CastOps>(Ext->getOpcode()));
6095 return {ExtOpType, ExtKind};
6096 };
6097 ExtendedReductionOperand ExtendedOp = Chain.ExtendedOp;
6098 VPWidenCastRecipe *ExtendA = ExtendedOp.CastRecipes[0];
6099 VPWidenCastRecipe *ExtendB = ExtendedOp.CastRecipes[1];
6100
6101 Type *ExtOpTypeA, *ExtOpTypeB;
// NOTE(review): the declarations of ExtKindA/ExtKindB are elided in this
// rendering.
6103 std::tie(ExtOpTypeA, ExtKindA) = GetExtInfo(ExtendA);
6104 std::tie(ExtOpTypeB, ExtKindB) = GetExtInfo(ExtendB);
6105
6106 // If ExtendB is nullptr but there's a separate BinOp, the second operand
6107 // was a constant that can use the same extend kind as the first.
6108 if (!ExtendB && ExtendedOp.BinOp &&
6109 ExtendedOp.BinOp != Chain.ReductionBinOp) {
6110 const APInt *Const = nullptr;
6111 for (VPValue *Op : ExtendedOp.BinOp->operands()) {
6112 if (match(Op, m_APInt(Const)))
6113 break;
6114 }
6115 if (!Const || !canConstantBeExtended(Const, ExtOpTypeA, ExtKindA))
6116 return false;
6117 ExtOpTypeB = ExtOpTypeA;
6118 ExtKindB = ExtKindA;
6119 }
6120
6121 std::optional<unsigned> BinOpc;
6122 if (ExtendedOp.BinOp && ExtendedOp.BinOp != Chain.ReductionBinOp)
6123 BinOpc = ExtendedOp.BinOp->getOpcode();
6124
// Clamp the VF range to those VFs for which the target reports a valid
// partial-reduction cost.
// NOTE(review): the call wrapping the lambda below and the name of the cost
// query are partially elided in this rendering.
6125 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
6127 [&](ElementCount VF) {
6128 return CostCtx.TTI
6130 WidenRecipe->getOpcode(), ExtOpTypeA, ExtOpTypeB, PhiType, VF,
6131 ExtKindA, ExtKindB, BinOpc, CostCtx.CostKind,
6132 PhiType->isFloatingPointTy()
6133 ? std::optional{WidenRecipe->getFastMathFlags()}
6134 : std::nullopt)
6135 .isValid();
6136 },
6137 Range);
6138 }
6139
/// Maps a widen-cast recipe to the corresponding partial-reduction extend
/// kind.
// NOTE(review): the return-type line and the body of this helper are elided
// in this rendering; presumably it forwards Cast's opcode to
// TTI::getPartialReductionExtendKind — confirm against upstream.
6141 getPartialReductionExtendKind(VPWidenCastRecipe *Cast) {
6143 }
6144
6145/// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
6146/// operand. This is an operand where the source of the value (e.g. a load) has
6147/// been extended (sext, zext, or fpext) before it is used in the reduction.
6148///
6149/// Possible forms matched by this function:
6150/// - UpdateR(PrevValue, ext(...))
6151/// - UpdateR(PrevValue, BinOp(ext(...), ext(...)))
6152/// - UpdateR(PrevValue, BinOp(ext(...), Constant))
6153/// - UpdateR(PrevValue, neg(BinOp(ext(...), ext(...))))
6154/// - UpdateR(PrevValue, neg(BinOp(ext(...), Constant)))
6155/// - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
6156/// - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
6157///
6158/// Note: The second operand of UpdateR corresponds to \p Op in the examples.
6159static std::optional<ExtendedReductionOperand>
6160matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op) {
6161 assert(is_contained(UpdateR->operands(), Op) &&
6162 "Op should be operand of UpdateR");
6163
  // Extend kind of an outer extend wrapping an inner mul (the ext(mul(...))
  // form); remains unset for all other forms matched below.
6164 std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
6165 if (match(Op, m_AnyExtend(m_VPValue()))) {
6166 auto *CastRecipe = cast<VPWidenCastRecipe>(Op);
6167 VPValue *CastSource = CastRecipe->getOperand(0);
6168 if (match(CastSource, m_Mul(m_VPValue(), m_VPValue())) ||
6169 match(CastSource, m_FMul(m_VPValue(), m_VPValue()))) {
6170 // Match: ext(mul(...))
6171 // Record the outer extend kind and set `Op` to the mul. We can then match
6172 // this as a binary operation. Note: We can optimize out the outer extend
6173 // by widening the inner extends to match it. See
6174 // optimizeExtendsForPartialReduction.
6175 Op = CastSource;
6176 OuterExtKind = getPartialReductionExtendKind(CastRecipe);
6177 } else if (UpdateR->getOpcode() == Instruction::Add ||
6178 UpdateR->getOpcode() == Instruction::FAdd) {
6179 // Match: UpdateR(PrevValue, ext(...))
6180 // TODO: Remove the add/fadd restriction (we should be able to handle this
6181 // case for sub reductions too).
6182 return ExtendedReductionOperand{UpdateR, {CastRecipe, nullptr}};
6183 }
6184 }
6185
  // Beyond this point the operand is re-associated into the reduction, which
  // is only legal if nothing else observes the intermediate value.
6186 if (!Op->hasOneUse())
6187 return std::nullopt;
6188
6189 // Handle neg(...) pattern (aka sub(0, ...)).
6190 VPValue *NegatedOp = nullptr;
6191 if (match(Op, m_Sub(m_ZeroInt(), m_VPValue(NegatedOp))))
6192 Op = NegatedOp;
6193
  // NOTE(review): source line 6194 — the declaration of `BinOp`, presumably a
  // dyn_cast of `Op` to a widened recipe carrying an opcode — was dropped by
  // the doc extraction; confirm against upstream VPlanTransforms.cpp.
6195 if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()))
6196 return std::nullopt;
6197
6198 // The rest of the matching assumes `Op` is a (possibly extended/negated)
6199 // binary operation.
6200
6201 VPValue *LHS = BinOp->getOperand(0);
6202 VPValue *RHS = BinOp->getOperand(1);
6203
6204 // The LHS of the operation must always be an extend.
6205 if (!match(LHS, m_AnyExtend(m_VPValue())))
6206 return std::nullopt;
6207
6208 auto *LHSCast = cast<VPWidenCastRecipe>(LHS);
6209
6210 // The RHS of the operation can be an extend or a constant integer.
6211 // The constant will be validated in isValidPartialReduction.
6212 VPWidenCastRecipe *RHSCast = nullptr;
6213 if (match(RHS, m_AnyExtend(m_VPValue())))
6214 RHSCast = cast<VPWidenCastRecipe>(RHS);
6215 else if (!isa<VPConstantInt>(RHS))
6216 return std::nullopt;
6217
6218 // The outer extend kind must match the inner extends for folding.
6219 for (VPWidenCastRecipe *Cast : {LHSCast, RHSCast})
6220 if (Cast && OuterExtKind &&
6221 getPartialReductionExtendKind(Cast) != OuterExtKind)
6222 return std::nullopt;
6223
  // On success, CastRecipes[0] is the (mandatory) LHS extend; CastRecipes[1]
  // is the RHS extend or nullptr when the RHS was a constant.
6224 return ExtendedReductionOperand{BinOp, {LHSCast, RHSCast}};
6225}
6226
6227/// Examines each operation in the reduction chain corresponding to \p RedPhiR,
6228/// and determines if the target can use a cheaper operation with a wider
6229/// per-iteration input VF and narrower PHI VF. If successful, returns the chain
6230/// of operations in the reduction.
6231static std::optional<SmallVector<VPPartialReductionChain>>
6232getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPCostContext &CostCtx,
6233 VFRange &Range) {
6234 // Get the backedge value from the reduction PHI and find the
6235 // ComputeReductionResult that uses it (directly or through a select for
6236 // predicated reductions).
6237 auto *RdxResult = vputils::findComputeReductionResult(RedPhiR);
6238 if (!RdxResult)
6239 return std::nullopt;
6240 VPValue *ExitValue = RdxResult->getOperand(0);
  // Look through a tail-folding select; on a match this rebinds ExitValue to
  // the select's true operand.
6241 match(ExitValue, m_Select(m_VPValue(), m_VPValue(ExitValue), m_VPValue()));

  // NOTE(review): source lines 6242-6243 — including the declaration of the
  // `Chains` vector (presumably SmallVector<VPPartialReductionChain>) that is
  // populated and returned below — were dropped by the doc extraction; confirm
  // against upstream VPlanTransforms.cpp.
6244 RecurKind RK = RedPhiR->getRecurrenceKind();
6245 Type *PhiType = CostCtx.Types.inferScalarType(RedPhiR);
6246 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6247
6248 // Work backwards from the ExitValue examining each reduction operation.
6249 VPValue *CurrentValue = ExitValue;
6250 while (CurrentValue != RedPhiR) {
   // Every link of the chain must be a widened binary operation; bail out
   // entirely otherwise.
6251 auto *UpdateR = dyn_cast<VPWidenRecipe>(CurrentValue);
6252 if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
6253 return std::nullopt;
6254
6255 VPValue *Op = UpdateR->getOperand(1);
6256 VPValue *PrevValue = UpdateR->getOperand(0);
6257
6258 // Find the extended operand. The other operand (PrevValue) is the next link
6259 // in the reduction chain.
6260 std::optional<ExtendedReductionOperand> ExtendedOp =
6261 matchExtendedReductionOperand(UpdateR, Op);
6262 if (!ExtendedOp) {
   // The extended operand may be in position 0 instead; retry swapped.
6263 ExtendedOp = matchExtendedReductionOperand(UpdateR, PrevValue);
6264 if (!ExtendedOp)
6265 return std::nullopt;
6266 std::swap(Op, PrevValue);
6267 }
6268
   // The scale factor is the ratio between the PHI width and the width of the
   // pre-extension source; it must divide evenly.
6269 Type *ExtSrcType = CostCtx.Types.inferScalarType(
6270 ExtendedOp->CastRecipes[0]->getOperand(0));
6271 TypeSize ExtSrcSize = ExtSrcType->getPrimitiveSizeInBits();
6272 if (!PHISize.hasKnownScalarFactor(ExtSrcSize))
6273 return std::nullopt;
6274
6275 VPPartialReductionChain Chain(
6276 {UpdateR, *ExtendedOp,
6277 static_cast<unsigned>(PHISize.getKnownScalarFactor(ExtSrcSize)), RK});
   // A single invalid link invalidates the whole chain for this PHI.
6278 if (!isValidPartialReduction(Chain, PhiType, CostCtx, Range))
6279 return std::nullopt;
6280
6281 Chains.push_back(Chain);
6282 CurrentValue = PrevValue;
6283 }
6284
6285 // The chains were collected by traversing backwards from the exit value.
6286 // Reverse the chains so they are in program order.
6287 std::reverse(Chains.begin(), Chains.end());
6288 return Chains;
6289}
6290} // namespace
6291
// NOTE(review): source line 6292 — the opening signature line of this function
// (presumably `void VPlanTransforms::convertToPartialReductions(VPlan &Plan,`,
// given the uses of `Plan` below) — was dropped by the doc extraction; confirm
// against upstream VPlanTransforms.cpp.
6293 VPCostContext &CostCtx,
6294 VFRange &Range) {
6295 // Find all possible valid partial reductions, grouping chains by their PHI.
6296 // This grouping allows invalidating the whole chain, if any link is not a
6297 // valid partial reduction.
 // NOTE(review): source line 6298 — the type of `ChainsByPhi` (presumably a
 // MapVector keyed by VPReductionPHIRecipe*, mapping to a vector of
 // VPPartialReductionChain, given the try_emplace/structured-binding uses
 // below) — was dropped by the doc extraction; confirm against upstream.
6299 ChainsByPhi;
6300 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
6301 for (VPRecipeBase &R : HeaderVPBB->phis()) {
6302 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
6303 if (!RedPhiR)
6304 continue;
6305
  // Only record PHIs whose entire chain validated successfully.
6306 if (auto Chains = getScaledReductions(RedPhiR, CostCtx, Range))
6307 ChainsByPhi.try_emplace(RedPhiR, std::move(*Chains));
6308 }
6309
6310 if (ChainsByPhi.empty())
6311 return;
6312
6313 // Build set of partial reduction operations for extend user validation and
6314 // a map of reduction bin ops to their scale factors for scale validation.
6315 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
6316 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
6317 for (const auto &[_, Chains] : ChainsByPhi)
6318 for (const VPPartialReductionChain &Chain : Chains) {
6319 PartialReductionOps.insert(Chain.ExtendedOp.BinOp);
6320 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
6321 }
6322
6323 // A partial reduction is invalid if any of its extends are used by
6324 // something that isn't another partial reduction. This is because the
6325 // extends are intended to be lowered along with the reduction itself.
6326 auto ExtendUsersValid = [&](VPWidenCastRecipe *Ext) {
6327 return !Ext || all_of(Ext->users(), [&](VPUser *U) {
6328 return PartialReductionOps.contains(cast<VPRecipeBase>(U));
6329 });
6330 };
6331
6332 // Validate chains: check that extends are only used by partial reductions,
6333 // and that reduction bin ops are only used by other partial reductions with
6334 // matching scale factors, are outside the loop region or the select
6335 // introduced by tail-folding. Otherwise we would create users of scaled
6336 // reductions where the types of the other operands don't match.
6337 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
6338 for (const VPPartialReductionChain &Chain : Chains) {
   // Clearing Chains marks this PHI's whole group as rejected; the final
   // transform loop below simply skips empty groups.
6339 if (!all_of(Chain.ExtendedOp.CastRecipes, ExtendUsersValid)) {
6340 Chains.clear();
6341 break;
6342 }
6343 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
6344 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
6345 return PhiR == RedPhiR;
6346 auto *R = cast<VPSingleDefRecipe>(U);
   // NOTE(review): source line 6348 — the start of a match() expression whose
   // continuation is the `m_Specific(Chain.ReductionBinOp)))` line below —
   // was dropped by the doc extraction; confirm against upstream.
6347 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
6349 m_Specific(Chain.ReductionBinOp))) ||
6350 match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
6351 m_Specific(RedPhiR)));
6352 };
6353 if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
6354 Chains.clear();
6355 break;
6356 }
6357
6358 // Check if the compute-reduction-result is used by a sunk store.
6359 // TODO: Also form partial reductions in those cases.
6360 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
6361 if (any_of(RdxResult->users(), [](VPUser *U) {
6362 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
6363 return RepR && isa<StoreInst>(RepR->getUnderlyingInstr());
6364 })) {
6365 Chains.clear();
6366 break;
6367 }
6368 }
6369 }
6370 }
6371
 // All surviving chains are rewritten into partial reductions.
6372 for (auto &[Phi, Chains] : ChainsByPhi)
6373 for (const VPPartialReductionChain &Chain : Chains)
6374 transformToPartialReduction(Chain, CostCtx.Types, Plan, Phi);
6375}
assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:849
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
#define _
iv Induction Variable Users
Definition IVUsers.cpp:48
iv users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:81
static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, ScalarEvolution &SE)
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool sinkScalarOperands(VPlan &Plan)
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R)
Return true if we do not know how to (mechanically) hoist or sink R out of a loop region.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with a EVL-based IV, fixup recipes that use VF to use the EVL instea...
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx, bool IsScalable)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static std::optional< ElementCount > isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ArrayRef< ElementCount > VFs, VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI)
Returns VF from VFs if IR is a full interleave group with factor and number of members both equal to ...
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectGroupedReplicateMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L, function_ref< bool(VPReplicateRecipe *)> FilterFn)
Collect either replicated Loads or Stores grouped by their address SCEV.
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute the end value for WideIV, unless it is truncated.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
From the definition of llvm.experimental.get.vector.length, VPInstruction::ExplicitVectorLength(AVL) ...
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static void recursivelyDeleteDeadRecipes(VPValue *V)
static void reassociateHeaderMask(VPlan &Plan)
Reassociate (headermask && x) && y -> headermask && (x && y) to allow the header mask to be simplifie...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan)
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations in blocks between FirstBB and LastBB...
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool canNarrowOps(ArrayRef< VPValue * > Ops, bool IsScalable)
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static void narrowToSingleScalarRecipes(VPlan &Plan)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1023
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1527
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:207
APInt abs() const
Get the absolute value.
Definition APInt.h:1810
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:210
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition APInt.h:217
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:220
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:996
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
@ NoAlias
The two locations do not alias at all.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
iterator begin() const
Definition ArrayRef.h:130
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This class represents a range of values.
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:215
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1603
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1080
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
bool empty() const
Definition MapVector.h:77
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:116
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getUDivExpr(SCEVUse LHS, SCEVUse RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
LLVM_ABI bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
static LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
op_range operands()
Definition User.h:267
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3875
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4255
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4330
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4282
iterator end()
Definition VPlan.h:4292
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4290
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4343
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:232
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:598
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:565
const VPRecipeBase & front() const
Definition VPlan.h:4302
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:644
const VPRecipeBase & back() const
Definition VPlan.h:4304
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2780
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2816
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2806
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2822
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2802
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:98
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccs as successor of this VPBlockBase.
Definition VPlan.h:319
VPRegionBlock * getParent()
Definition VPlan.h:190
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:202
size_t getNumSuccessors() const
Definition VPlan.h:241
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:310
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:226
VPlan * getPlan()
Definition VPlan.cpp:177
const std::string & getName() const
Definition VPlan.h:181
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:329
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:237
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:182
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:283
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:231
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:215
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:266
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:287
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:200
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:218
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:236
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3284
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRMetadata &Metadata={})
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3817
A recipe for generating the phi node tracking the current scalar iteration index.
Definition VPlan.h:3907
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:465
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:438
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:450
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:460
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3987
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3329
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2292
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2334
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2323
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4408
Class to record and manage LLVM IR flags.
Definition VPlan.h:690
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for I, if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1156
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1211
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1313
@ ComputeAnyOfResult
Compute the final result of a AnyOf reduction with select(cmp(),x,y), where one of (x,...
Definition VPlan.h:1258
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1255
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1307
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1250
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1247
@ CanonicalIVIncrementForPart
Definition VPlan.h:1231
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2925
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2917
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2946
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2998
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2956
VPValue * getIncomingValue(unsigned Idx) const
Returns the incoming VPValue with index Idx.
Definition VPlan.h:1579
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3471
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:406
VPRegionBlock * getRegion()
Definition VPlan.h:4560
VPBasicBlock * getParent()
Definition VPlan.h:481
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:555
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3158
A recipe for handling reduction phis.
Definition VPlan.h:2686
void setVFScaleFactor(unsigned ScaleFactor)
Set the VFScaleFactor for this reduction phi.
Definition VPlan.h:2733
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2726
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2744
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3049
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4443
const VPBlockBase * getEntry() const
Definition VPlan.h:4479
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4554
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4511
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4496
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4541
const VPBlockBase * getExiting() const
Definition VPlan.h:4491
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4504
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3203
bool isSingleScalar() const
Definition VPlan.h:3244
bool isPredicated() const
Definition VPlan.h:3246
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3268
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:4059
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:607
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:675
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:296
operand_range operands()
Definition VPlanValue.h:364
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:340
unsigned getNumOperands() const
Definition VPlanValue.h:334
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:335
void addOperand(VPValue *Operand)
Definition VPlanValue.h:329
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:46
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:137
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1431
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:127
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:70
bool hasOneUse() const
Definition VPlanValue.h:166
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:196
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1434
unsigned getNumUsers() const
Definition VPlanValue.h:107
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1440
user_range users()
Definition VPlanValue.h:149
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2140
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3950
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1826
Instruction::CastOps getOpcode() const
Definition VPlan.h:1864
A recipe for handling GEP instructions.
Definition VPlan.h:2076
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2358
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2386
PHINode * getPHINode() const
Returns the underlying PHINode if one exists, or null otherwise.
Definition VPlan.h:2404
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2389
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2409
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2440
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2487
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2491
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2518
A recipe for widening vector intrinsics.
Definition VPlan.h:1878
A common base class for widening memory operations.
Definition VPlan.h:3514
A recipe for widened phis.
Definition VPlan.h:2576
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1770
unsigned getOpcode() const
Definition VPlan.h:1807
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition VPlan.h:4573
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4881
bool hasVF(ElementCount VF) const
Definition VPlan.h:4786
const DataLayout & getDataLayout() const
Definition VPlan.h:4768
LLVMContext & getContext() const
Definition VPlan.h:4764
VPBasicBlock * getEntry()
Definition VPlan.h:4665
bool hasScalableVF() const
Definition VPlan.h:4787
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4723
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4744
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4793
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:4852
VPSymbolicValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4762
VPIRValue * getAllOnesValue(Type *Ty)
Return a VPIRValue wrapping the AllOnes value of type Ty.
Definition VPlan.h:4858
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4928
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4884
bool hasUF(unsigned UF) const
Definition VPlan.h:4804
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4713
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4752
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4829
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:4855
void setVF(ElementCount VF)
Definition VPlan.h:4774
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4820
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1064
unsigned getConcreteUF() const
Returns the concrete UF of the plan, after unrolling.
Definition VPlan.h:4807
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4737
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4690
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4907
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:4849
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4759
bool hasScalarVFOnly() const
Definition VPlan.h:4797
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4704
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4670
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4755
void setUF(unsigned UF)
Definition VPlan.h:4812
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4960
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1212
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4863
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:427
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2774
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
SCEVAffineAddRec_match< Op0_t, Op1_t, class_match< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
bool match(const SCEV *S, const Pattern &P)
class_match< const SCEV > m_SCEV()
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
auto m_AnyExtend(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExitingIVValue, Op0_t > m_ExitingIVValue(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
bind_ty< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
auto m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
auto m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:111
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
static VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
Definition VPlanUtils.h:132
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
void stable_sort(R &&Range)
Definition STLExtras.h:2116
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:280
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1152
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:273
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
UncountableExitStyle
Different methods of handling early exits.
Definition VPlan.h:83
@ ReadOnly
No side effects to worry about, so we can process any uncountable exits in the loop and branch either...
Definition VPlan.h:88
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:552
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1796
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
iterator_range< po_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_post_order_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order.
Definition VPlanCFG.h:266
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2088
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2166
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2146
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:789
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2668
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
TargetTransformInfo::TargetCostKind CostKind
VPTypeAnalysis Types
const TargetTransformInfo & TTI
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2624
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:207
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:141
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:247
bool isMaterialized() const
Returns true if this symbolic value has been materialized.
Definition VPlanValue.h:255
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3647
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3605
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3732
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3688
static void handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, UncountableExitStyle Style)
Update Plan to account for uncountable early exits by introducing appropriate branching logic in the ...
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first order reductions in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replaces all uses except the canoni...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step)
Materialize vector trip count computations to a set of VPInstructions.
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...