LLVM 23.0.0git
VPlanTransforms.cpp
Go to the documentation of this file.
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
45
46using namespace llvm;
47using namespace VPlanPatternMatch;
48using namespace SCEVPatternMatch;
49
51 VPlan &Plan, const TargetLibraryInfo &TLI) {
52
54 Plan.getVectorLoopRegion());
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
75 for (VPValue *Op : PhiR->operands())
76 NewRecipe->addOperand(Op);
77 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
78 assert(!isa<PHINode>(Inst) && "phis should be handled above");
79 // Create VPWidenMemoryRecipe for loads and stores.
80 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
81 NewRecipe = new VPWidenLoadRecipe(
82 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
83 false /*Consecutive*/, false /*Reverse*/, *VPI,
84 Ingredient.getDebugLoc());
85 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
86 NewRecipe = new VPWidenStoreRecipe(
87 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
88 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
89 Ingredient.getDebugLoc());
91 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
92 Ingredient.getDebugLoc());
93 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
94 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
95 if (VectorID == Intrinsic::not_intrinsic)
96 return false;
97 NewRecipe = new VPWidenIntrinsicRecipe(
98 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
99 drop_end(Ingredient.operands()), CI->getType(), VPIRFlags(*CI),
100 *VPI, CI->getDebugLoc());
101 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
102 NewRecipe = new VPWidenCastRecipe(
103 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
104 VPIRFlags(*CI), VPIRMetadata(*CI));
105 } else {
106 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
107 *VPI, Ingredient.getDebugLoc());
108 }
109 } else {
111 "inductions must be created earlier");
112 continue;
113 }
114
115 NewRecipe->insertBefore(&Ingredient);
116 if (NewRecipe->getNumDefinedValues() == 1)
117 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
118 else
119 assert(NewRecipe->getNumDefinedValues() == 0 &&
120 "Only recpies with zero or one defined values expected");
121 Ingredient.eraseFromParent();
122 }
123 }
124 return true;
125}
126
127/// Helper for extra no-alias checks via known-safe recipe and SCEV.
129 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
130 VPReplicateRecipe &GroupLeader;
132 const Loop &L;
133 VPTypeAnalysis &TypeInfo;
134
135 // Return true if \p A and \p B are known to not alias for all VFs in the
136 // plan, checked via the distance between the accesses
137 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
138 if (A->getOpcode() != Instruction::Store ||
139 B->getOpcode() != Instruction::Store)
140 return false;
141
142 VPValue *AddrA = A->getOperand(1);
143 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
144 VPValue *AddrB = B->getOperand(1);
145 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
147 return false;
148
149 const APInt *Distance;
150 ScalarEvolution &SE = *PSE.getSE();
151 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
152 return false;
153
154 const DataLayout &DL = SE.getDataLayout();
155 Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
156 uint64_t SizeA = DL.getTypeStoreSize(TyA);
157 Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
158 uint64_t SizeB = DL.getTypeStoreSize(TyB);
159
160 // Use the maximum store size to ensure no overlap from either direction.
161 // Currently only handles fixed sizes, as it is only used for
162 // replicating VPReplicateRecipes.
163 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
164
165 auto VFs = B->getParent()->getPlan()->vectorFactors();
167 if (MaxVF.isScalable())
168 return false;
169 return Distance->abs().uge(
170 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
171 }
172
173public:
176 const Loop &L, VPTypeAnalysis &TypeInfo)
177 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
178 L(L), TypeInfo(TypeInfo) {}
179
180 /// Return true if \p R should be skipped during alias checking, either
181 /// because it's in the exclude set or because no-alias can be proven via
182 /// SCEV.
183 bool shouldSkip(VPRecipeBase &R) const {
184 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
185 return ExcludeRecipes.contains(&R) ||
186 (Store && isNoAliasViaDistance(Store, &GroupLeader));
187 }
188};
189
190/// Check if a memory operation doesn't alias with memory operations in blocks
191/// between \p FirstBB and \p LastBB using scoped noalias metadata. If
192/// \p SinkInfo is std::nullopt, only recipes that may write to memory are
193/// checked (for load hoisting). Otherwise recipes that both read and write
194/// memory are checked, and SCEV is used to prove no-alias between the group
195/// leader and other replicate recipes (for store sinking).
196static bool
198 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
199 std::optional<SinkStoreInfo> SinkInfo = {}) {
200 bool CheckReads = SinkInfo.has_value();
201 if (!MemLoc.AATags.Scope)
202 return false;
203
204 for (VPBlockBase *Block = FirstBB; Block;
205 Block = Block->getSingleSuccessor()) {
206 assert(Block->getNumSuccessors() <= 1 &&
207 "Expected at most one successor in block chain");
208 auto *VPBB = cast<VPBasicBlock>(Block);
209 for (VPRecipeBase &R : *VPBB) {
210 if (SinkInfo && SinkInfo->shouldSkip(R))
211 continue;
212
213 // Skip recipes that don't need checking.
214 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
215 continue;
216
218 if (!Loc)
219 // Conservatively assume aliasing for memory operations without
220 // location.
221 return false;
222
224 return false;
225 }
226
227 if (Block == LastBB)
228 break;
229 }
230 return true;
231}
232
233/// Return true if we do not know how to (mechanically) hoist or sink \p R out
234/// of a loop region.
236 // Assumes don't alias anything or throw; as long as they're guaranteed to
237 // execute, they're safe to hoist.
239 return false;
240
241 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
242 // memory location is not modified in the vector loop.
243 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
244 return true;
245
246 // Allocas cannot be hoisted.
247 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
248 return RepR && RepR->getOpcode() == Instruction::Alloca;
249}
250
251static bool sinkScalarOperands(VPlan &Plan) {
252 auto Iter = vp_depth_first_deep(Plan.getEntry());
253 bool ScalarVFOnly = Plan.hasScalarVFOnly();
254 bool Changed = false;
255
257 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
258 VPBasicBlock *SinkTo, VPValue *Op) {
259 auto *Candidate =
260 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
261 if (!Candidate)
262 return;
263
264 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
265 // for now.
267 return;
268
269 if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
270 return;
271
272 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
273 if (!ScalarVFOnly && RepR->isSingleScalar())
274 return;
275
276 WorkList.insert({SinkTo, Candidate});
277 };
278
279 // First, collect the operands of all recipes in replicate blocks as seeds for
280 // sinking.
282 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
283 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
284 continue;
285 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
286 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
287 continue;
288 for (auto &Recipe : *VPBB)
289 for (VPValue *Op : Recipe.operands())
290 InsertIfValidSinkCandidate(VPBB, Op);
291 }
292
293 // Try to sink each replicate or scalar IV steps recipe in the worklist.
294 for (unsigned I = 0; I != WorkList.size(); ++I) {
295 VPBasicBlock *SinkTo;
296 VPSingleDefRecipe *SinkCandidate;
297 std::tie(SinkTo, SinkCandidate) = WorkList[I];
298
299 // All recipe users of SinkCandidate must be in the same block SinkTo or all
300 // users outside of SinkTo must only use the first lane of SinkCandidate. In
301 // the latter case, we need to duplicate SinkCandidate.
302 auto UsersOutsideSinkTo =
303 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
304 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
305 });
306 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
307 return !U->usesFirstLaneOnly(SinkCandidate);
308 }))
309 continue;
310 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
311
312 if (NeedsDuplicating) {
313 if (ScalarVFOnly)
314 continue;
315 VPSingleDefRecipe *Clone;
316 if (auto *SinkCandidateRepR =
317 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
318 // TODO: Handle converting to uniform recipes as separate transform,
319 // then cloning should be sufficient here.
320 Instruction *I = SinkCandidate->getUnderlyingInstr();
321 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
322 nullptr /*Mask*/, *SinkCandidateRepR,
323 *SinkCandidateRepR);
324 // TODO: add ".cloned" suffix to name of Clone's VPValue.
325 } else {
326 Clone = SinkCandidate->clone();
327 }
328
329 Clone->insertBefore(SinkCandidate);
330 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
331 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
332 });
333 }
334 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
335 for (VPValue *Op : SinkCandidate->operands())
336 InsertIfValidSinkCandidate(SinkTo, Op);
337 Changed = true;
338 }
339 return Changed;
340}
341
342/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
343/// the mask.
345 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
346 if (!EntryBB || EntryBB->size() != 1 ||
347 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
348 return nullptr;
349
350 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
351}
352
353/// If \p R is a triangle region, return the 'then' block of the triangle.
355 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
356 if (EntryBB->getNumSuccessors() != 2)
357 return nullptr;
358
359 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
360 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
361 if (!Succ0 || !Succ1)
362 return nullptr;
363
364 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
365 return nullptr;
366 if (Succ0->getSingleSuccessor() == Succ1)
367 return Succ0;
368 if (Succ1->getSingleSuccessor() == Succ0)
369 return Succ1;
370 return nullptr;
371}
372
373// Merge replicate regions in their successor region, if a replicate region
374// is connected to a successor replicate region with the same predicate by a
375// single, empty VPBasicBlock.
377 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
378
379 // Collect replicate regions followed by an empty block, followed by another
380 // replicate region with matching masks to process front. This is to avoid
381 // iterator invalidation issues while merging regions.
384 vp_depth_first_deep(Plan.getEntry()))) {
385 if (!Region1->isReplicator())
386 continue;
387 auto *MiddleBasicBlock =
388 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
389 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
390 continue;
391
392 auto *Region2 =
393 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
394 if (!Region2 || !Region2->isReplicator())
395 continue;
396
397 VPValue *Mask1 = getPredicatedMask(Region1);
398 VPValue *Mask2 = getPredicatedMask(Region2);
399 if (!Mask1 || Mask1 != Mask2)
400 continue;
401
402 assert(Mask1 && Mask2 && "both region must have conditions");
403 WorkList.push_back(Region1);
404 }
405
406 // Move recipes from Region1 to its successor region, if both are triangles.
407 for (VPRegionBlock *Region1 : WorkList) {
408 if (TransformedRegions.contains(Region1))
409 continue;
410 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
411 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
412
413 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
414 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
415 if (!Then1 || !Then2)
416 continue;
417
418 // Note: No fusion-preventing memory dependencies are expected in either
419 // region. Such dependencies should be rejected during earlier dependence
420 // checks, which guarantee accesses can be re-ordered for vectorization.
421 //
422 // Move recipes to the successor region.
423 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
424 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
425
426 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
427 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
428
429 // Move VPPredInstPHIRecipes from the merge block to the successor region's
430 // merge block. Update all users inside the successor region to use the
431 // original values.
432 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
433 VPValue *PredInst1 =
434 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
435 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
436 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
437 return cast<VPRecipeBase>(&U)->getParent() == Then2;
438 });
439
440 // Remove phi recipes that are unused after merging the regions.
441 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
442 Phi1ToMove.eraseFromParent();
443 continue;
444 }
445 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
446 }
447
448 // Remove the dead recipes in Region1's entry block.
449 for (VPRecipeBase &R :
450 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
451 R.eraseFromParent();
452
453 // Finally, remove the first region.
454 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
455 VPBlockUtils::disconnectBlocks(Pred, Region1);
456 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
457 }
458 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
459 TransformedRegions.insert(Region1);
460 }
461
462 return !TransformedRegions.empty();
463}
464
466 VPlan &Plan) {
467 Instruction *Instr = PredRecipe->getUnderlyingInstr();
468 // Build the triangular if-then region.
469 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
470 assert(Instr->getParent() && "Predicated instruction not in any basic block");
471 auto *BlockInMask = PredRecipe->getMask();
472 auto *MaskDef = BlockInMask->getDefiningRecipe();
473 auto *BOMRecipe = new VPBranchOnMaskRecipe(
474 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
475 auto *Entry =
476 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
477
478 // Replace predicated replicate recipe with a replicate recipe without a
479 // mask but in the replicate region.
480 auto *RecipeWithoutMask = new VPReplicateRecipe(
481 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
482 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
483 PredRecipe->getDebugLoc());
484 auto *Pred =
485 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
486
487 VPPredInstPHIRecipe *PHIRecipe = nullptr;
488 if (PredRecipe->getNumUsers() != 0) {
489 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
490 RecipeWithoutMask->getDebugLoc());
491 PredRecipe->replaceAllUsesWith(PHIRecipe);
492 PHIRecipe->setOperand(0, RecipeWithoutMask);
493 }
494 PredRecipe->eraseFromParent();
495 auto *Exiting =
496 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
498 Plan.createReplicateRegion(Entry, Exiting, RegionName);
499
500 // Note: first set Entry as region entry and then connect successors starting
501 // from it in order, to propagate the "parent" of each VPBasicBlock.
502 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
503 VPBlockUtils::connectBlocks(Pred, Exiting);
504
505 return Region;
506}
507
508static void addReplicateRegions(VPlan &Plan) {
511 vp_depth_first_deep(Plan.getEntry()))) {
512 for (VPRecipeBase &R : *VPBB)
513 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
514 if (RepR->isPredicated())
515 WorkList.push_back(RepR);
516 }
517 }
518
519 unsigned BBNum = 0;
520 for (VPReplicateRecipe *RepR : WorkList) {
521 VPBasicBlock *CurrentBlock = RepR->getParent();
522 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
523
524 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
525 SplitBlock->setName(
526 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
527 // Record predicated instructions for above packing optimizations.
529 Region->setParent(CurrentBlock->getParent());
531
532 VPRegionBlock *ParentRegion = Region->getParent();
533 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
534 ParentRegion->setExiting(SplitBlock);
535 }
536}
537
538/// Remove redundant VPBasicBlocks by merging them into their predecessor if
539/// the predecessor has a single successor.
543 vp_depth_first_deep(Plan.getEntry()))) {
544 // Don't fold the blocks in the skeleton of the Plan into their single
545 // predecessors for now.
546 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
547 if (!VPBB->getParent())
548 continue;
549 auto *PredVPBB =
550 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
551 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
552 isa<VPIRBasicBlock>(PredVPBB))
553 continue;
554 WorkList.push_back(VPBB);
555 }
556
557 for (VPBasicBlock *VPBB : WorkList) {
558 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
559 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
560 R.moveBefore(*PredVPBB, PredVPBB->end());
561 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
562 auto *ParentRegion = VPBB->getParent();
563 if (ParentRegion && ParentRegion->getExiting() == VPBB)
564 ParentRegion->setExiting(PredVPBB);
565 for (auto *Succ : to_vector(VPBB->successors())) {
567 VPBlockUtils::connectBlocks(PredVPBB, Succ);
568 }
569 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
570 }
571 return !WorkList.empty();
572}
573
575 // Convert masked VPReplicateRecipes to if-then region blocks.
577
578 bool ShouldSimplify = true;
579 while (ShouldSimplify) {
580 ShouldSimplify = sinkScalarOperands(Plan);
581 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
582 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
583 }
584}
585
586/// Remove redundant casts of inductions.
587///
588/// Such redundant casts are casts of induction variables that can be ignored,
589/// because we already proved that the casted phi is equal to the uncasted phi
590/// in the vectorized loop. There is no need to vectorize the cast - the same
591/// value can be used for both the phi and casts in the vector loop.
593 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
595 if (!IV || IV->getTruncInst())
596 continue;
597
598 // A sequence of IR Casts has potentially been recorded for IV, which
599 // *must be bypassed* when the IV is vectorized, because the vectorized IV
600 // will produce the desired casted value. This sequence forms a def-use
601 // chain and is provided in reverse order, ending with the cast that uses
602 // the IV phi. Search for the recipe of the last cast in the chain and
603 // replace it with the original IV. Note that only the final cast is
604 // expected to have users outside the cast-chain and the dead casts left
605 // over will be cleaned up later.
606 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
607 VPValue *FindMyCast = IV;
608 for (Instruction *IRCast : reverse(Casts)) {
609 VPSingleDefRecipe *FoundUserCast = nullptr;
610 for (auto *U : FindMyCast->users()) {
611 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
612 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
613 FoundUserCast = UserCast;
614 break;
615 }
616 }
617 FindMyCast = FoundUserCast;
618 }
619 FindMyCast->replaceAllUsesWith(IV);
620 }
621}
622
623/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
624/// recipe, if it exists.
626 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
627 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
628 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
629 for (VPUser *U : CanonicalIV->users()) {
631 if (WidenNewIV)
632 break;
633 }
634
635 if (!WidenNewIV)
636 return;
637
638 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
639 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
640 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
641
642 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
643 continue;
644
645 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
646 // everything WidenNewIV's users need. That is, WidenOriginalIV will
647 // generate a vector phi or all users of WidenNewIV demand the first lane
648 // only.
649 if (Plan.hasScalarVFOnly() ||
650 !vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
651 vputils::onlyFirstLaneUsed(WidenNewIV)) {
652 // We are replacing a wide canonical iv with a suitable wide induction.
653 // This is used to compute header mask, hence all lanes will be used and
654 // we need to drop wrap flags only applying to lanes guranteed to execute
655 // in the original scalar loop.
656 WidenOriginalIV->dropPoisonGeneratingFlags();
657 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
658 WidenNewIV->eraseFromParent();
659 return;
660 }
661 }
662}
663
664/// Returns true if \p R is dead and can be removed.
665static bool isDeadRecipe(VPRecipeBase &R) {
666 // Do remove conditional assume instructions as their conditions may be
667 // flattened.
668 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
669 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
671 if (IsConditionalAssume)
672 return true;
673
674 if (R.mayHaveSideEffects())
675 return false;
676
677 // Recipe is dead if no user keeps the recipe alive.
678 return all_of(R.definedValues(),
679 [](VPValue *V) { return V->getNumUsers() == 0; });
680}
681
684 vp_post_order_deep(Plan.getEntry()))) {
685 // The recipes in the block are processed in reverse order, to catch chains
686 // of dead recipes.
687 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
688 if (isDeadRecipe(R)) {
689 R.eraseFromParent();
690 continue;
691 }
692
693 // Check if R is a dead VPPhi <-> update cycle and remove it.
694 auto *PhiR = dyn_cast<VPPhi>(&R);
695 if (!PhiR || PhiR->getNumOperands() != 2)
696 continue;
697 VPUser *PhiUser = PhiR->getSingleUser();
698 if (!PhiUser)
699 continue;
700 VPValue *Incoming = PhiR->getOperand(1);
701 if (PhiUser != Incoming->getDefiningRecipe() ||
702 Incoming->getNumUsers() != 1)
703 continue;
704 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
705 PhiR->eraseFromParent();
706 Incoming->getDefiningRecipe()->eraseFromParent();
707 }
708 }
709}
710
713 Instruction::BinaryOps InductionOpcode,
714 FPMathOperator *FPBinOp, Instruction *TruncI,
715 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
716 VPBuilder &Builder) {
717 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
718 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
719 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
720 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
721 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
722
723 // Truncate base induction if needed.
724 VPTypeAnalysis TypeInfo(Plan);
725 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
726 if (TruncI) {
727 Type *TruncTy = TruncI->getType();
728 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
729 "Not truncating.");
730 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
731 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
732 ResultTy = TruncTy;
733 }
734
735 // Truncate step if needed.
736 Type *StepTy = TypeInfo.inferScalarType(Step);
737 if (ResultTy != StepTy) {
738 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
739 "Not truncating.");
740 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
741 auto *VecPreheader =
743 VPBuilder::InsertPointGuard Guard(Builder);
744 Builder.setInsertPoint(VecPreheader);
745 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
746 }
747 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
748 &Plan.getVF(), DL);
749}
750
753 for (unsigned I = 0; I != Users.size(); ++I) {
755 if (isa<VPHeaderPHIRecipe>(Cur))
756 continue;
757 for (VPValue *V : Cur->definedValues())
758 Users.insert_range(V->users());
759 }
760 return Users.takeVector();
761}
762
763/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
764/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
765/// generates scalar values.
766static VPValue *
768 VPlan &Plan, VPBuilder &Builder) {
770 VPIRValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0);
771 VPValue *StepV = PtrIV->getOperand(1);
773 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
774 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
775
776 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
777 PtrIV->getDebugLoc(), "next.gep");
778}
779
780/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
781/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
782/// VPWidenPointerInductionRecipe will generate vectors only. If some users
783/// require vectors while other require scalars, the scalar uses need to extract
784/// the scalars from the generated vectors (Note that this is different to how
785/// int/fp inductions are handled). Legalize extract-from-ends using uniform
786/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
787/// the correct end value is available. Also optimize
788/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
789/// providing them scalar steps built on the canonical scalar IV and update the
790/// original IV's users. This is an optional optimization to reduce the needs of
791/// vector extracts.
794 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
795 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
796 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
797 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
798 if (!PhiR)
799 continue;
800
801 // Try to narrow wide and replicating recipes to uniform recipes, based on
802 // VPlan analysis.
803 // TODO: Apply to all recipes in the future, to replace legacy uniformity
804 // analysis.
805 auto Users = collectUsersRecursively(PhiR);
806 for (VPUser *U : reverse(Users)) {
807 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
808 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
809 // Skip recipes that shouldn't be narrowed.
810 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
811 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
812 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
813 continue;
814
815 // Skip recipes that may have other lanes than their first used.
817 continue;
818
819 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
820 Def->operands(), /*IsUniform*/ true,
821 /*Mask*/ nullptr, /*Flags*/ *Def);
822 Clone->insertAfter(Def);
823 Def->replaceAllUsesWith(Clone);
824 }
825
826 // Replace wide pointer inductions which have only their scalars used by
827 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
828 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
829 if (!Plan.hasScalarVFOnly() &&
830 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
831 continue;
832
833 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
834 PtrIV->replaceAllUsesWith(PtrAdd);
835 continue;
836 }
837
838 // Replace widened induction with scalar steps for users that only use
839 // scalars.
840 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
841 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
842 return U->usesScalars(WideIV);
843 }))
844 continue;
845
846 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
848 Plan, ID.getKind(), ID.getInductionOpcode(),
849 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
850 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
851 WideIV->getDebugLoc(), Builder);
852
853 // Update scalar users of IV to use Step instead.
854 if (!HasOnlyVectorVFs) {
855 assert(!Plan.hasScalableVF() &&
856 "plans containing a scalar VF cannot also include scalable VFs");
857 WideIV->replaceAllUsesWith(Steps);
858 } else {
859 bool HasScalableVF = Plan.hasScalableVF();
860 WideIV->replaceUsesWithIf(Steps,
861 [WideIV, HasScalableVF](VPUser &U, unsigned) {
862 if (HasScalableVF)
863 return U.usesFirstLaneOnly(WideIV);
864 return U.usesScalars(WideIV);
865 });
866 }
867 }
868}
869
870/// Check if \p VPV is an untruncated wide induction, either before or after the
871/// increment. If so return the header IV (before the increment), otherwise
872/// return null.
// NOTE(review): this listing is a doxygen scrape; the function signature
// (original lines 873-874) was dropped along with its hyperlink markup.
// Presumably "static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV,
// PredicatedScalarEvolution &PSE) {" -- confirm against upstream source.
875 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
876 if (WideIV) {
877 // VPV itself is a wide induction, separately compute the end value for exit
878 // users if it is not a truncated IV.
879 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
880 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
881 }
882
883 // Check if VPV is an optimizable induction increment.
// The increment is a two-operand recipe with the wide IV as either operand.
884 VPRecipeBase *Def = VPV->getDefiningRecipe();
885 if (!Def || Def->getNumOperands() != 2)
886 return nullptr;
887 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
888 if (!WideIV)
889 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
890 if (!WideIV)
891 return nullptr;
892
// Verify that VPV really is "WideIV op Step" for the IV's own step, per the
// induction opcode recorded in the InductionDescriptor.
893 auto IsWideIVInc = [&]() {
894 auto &ID = WideIV->getInductionDescriptor();
895
896 // Check if VPV increments the induction by the induction step.
897 VPValue *IVStep = WideIV->getStepValue();
898 switch (ID.getInductionOpcode()) {
899 case Instruction::Add:
900 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
901 case Instruction::FAdd:
902 return match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
903 case Instruction::FSub:
904 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
905 m_Specific(IVStep)));
906 case Instruction::Sub: {
907 // IVStep will be the negated step of the subtraction. Check if Step == -1
908 // * IVStep.
// Uses SCEV to prove the negation; bail out if either side cannot be
// expressed as a SCEV.
909 VPValue *Step;
910 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
911 return false;
912 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
913 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
914 ScalarEvolution &SE = *PSE.getSE();
915 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
916 !isa<SCEVCouldNotCompute>(StepSCEV) &&
917 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
918 }
919 default:
// Pointer inductions are incremented via GEP rather than an arithmetic op.
920 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
921 match(VPV, m_GetElementPtr(m_Specific(WideIV),
922 m_Specific(WideIV->getStepValue())));
923 }
924 llvm_unreachable("should have been covered by switch above");
925 };
926 return IsWideIVInc() ? WideIV : nullptr;
927}
928
929/// Attempts to optimize the induction variable exit values for users in the
930/// early exit block.
// NOTE(review): doxygen scrape dropped original lines 931 and 935 (the start
// of the signature and the trailing PredicatedScalarEvolution parameter), and
// lines 937-938 (presumably the match() extracting Incoming/Mask from an
// extract-first-active-lane pattern) -- confirm against upstream source.
932 VPTypeAnalysis &TypeInfo,
933 VPBlockBase *PredVPBB,
934 VPValue *Op,
936 VPValue *Incoming, *Mask;
939 return nullptr;
940
// Only inductions that are untruncated and recognized by getOptimizableIVOf
// can have their exit value recomputed here.
941 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
942 if (!WideIV)
943 return nullptr;
944
945 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
946 if (WideIntOrFp && WideIntOrFp->getTruncInst())
947 return nullptr;
948
949 // Calculate the final index.
950 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
951 auto *CanonicalIV = LoopRegion->getCanonicalIV();
952 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
953 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
954
// The exiting iteration index is canonical-IV + index of the first active
// lane in the early-exit mask, zext/trunc'd to the canonical IV type.
955 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
956 VPValue *FirstActiveLane =
957 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
958 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
959 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
960 FirstActiveLaneType, DL);
961 VPValue *EndValue = B.createAdd(CanonicalIV, FirstActiveLane, DL);
962
963 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
964 // changed it means the exit is using the incremented value, so we need to
965 // add the step.
966 if (Incoming != WideIV) {
967 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
968 EndValue = B.createAdd(EndValue, One, DL);
969 }
970
// Non-canonical inductions need the final index mapped through start/step
// via a derived IV computation.
971 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
972 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
973 VPIRValue *Start = WideIV->getStartValue();
974 VPValue *Step = WideIV->getStepValue();
975 EndValue = B.createDerivedIV(
976 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
977 Start, EndValue, Step);
978 }
979
980 return EndValue;
981}
982
983/// Attempts to optimize the induction variable exit values for users in the
984/// exit block coming from the latch in the original scalar loop.
// NOTE(review): doxygen scrape dropped original lines 985 and 987-988 (the
// rest of the signature, including EndValues and PSE parameters) and lines
// 990/996 (the match() guards binding Incoming, e.g. matching the
// extract-last-lane / resume patterns) -- confirm against upstream source.
986 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
989 VPWidenInductionRecipe *WideIV = nullptr;
991 WideIV = getOptimizableIVOf(Incoming, PSE);
992 assert(WideIV && "must have an optimizable IV");
993 return EndValues.lookup(WideIV);
994 }
995
997 WideIV = getOptimizableIVOf(Incoming, PSE);
998
999 if (!WideIV)
1000 return nullptr;
1001
// End values for optimizable IVs are pre-computed and cached in EndValues.
1002 VPValue *EndValue = EndValues.lookup(WideIV);
1003 assert(EndValue && "end value must have been pre-computed");
1004
1005 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1006 // changed it means the exit is using the incremented value, so we don't
1007 // need to subtract the step.
1008 if (Incoming != WideIV)
1009 return EndValue;
1010
1011 // Otherwise, subtract the step from the EndValue.
// Integer, pointer and floating-point inductions each need their own form
// of "subtract one step".
1012 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
1013 VPValue *Step = WideIV->getStepValue();
1014 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
1015 if (ScalarTy->isIntegerTy())
1016 return B.createSub(EndValue, Step, DebugLoc::getUnknown(), "ind.escape");
1017 if (ScalarTy->isPointerTy()) {
// Pointers step backwards via ptradd with a negated (0 - Step) offset.
1018 Type *StepTy = TypeInfo.inferScalarType(Step);
1019 auto *Zero = Plan.getConstantInt(StepTy, 0);
1020 return B.createPtrAdd(EndValue, B.createSub(Zero, Step),
1021 DebugLoc::getUnknown(), "ind.escape");
1022 }
1023 if (ScalarTy->isFloatingPointTy()) {
// Invert the FP induction op (FAdd <-> FSub), preserving its FMF.
1024 const auto &ID = WideIV->getInductionDescriptor();
1025 return B.createNaryOp(
1026 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1027 ? Instruction::FSub
1028 : Instruction::FAdd,
1029 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1030 }
1031 llvm_unreachable("all possible induction types must be handled");
1032 return nullptr;
1033}
1034
// Walks every exit-block phi and, per predecessor, rewrites induction exit
// values either via the early-exit path or via the latch path.
// NOTE(review): doxygen scrape dropped original lines 1035/1037 (the
// "void VPlanTransforms::optimizeInductionExitUsers(" signature start and the
// trailing PredicatedScalarEvolution parameter) and line 1051 (presumably
// "Escape = optimizeEarlyExitInductionUser(") -- confirm against upstream.
1036 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
1038 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
1039 VPTypeAnalysis TypeInfo(Plan);
1040 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1041 for (VPRecipeBase &R : ExitVPBB->phis()) {
1042 auto *ExitIRI = cast<VPIRPhi>(&R);
1043
1044 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1045 VPValue *Escape = nullptr;
// Latch exits (via the middle block) use cached end values; other
// predecessors are early exits.
1046 if (PredVPBB == MiddleVPBB)
1047 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
1048 ExitIRI->getOperand(Idx),
1049 EndValues, PSE);
1050 else
1052 Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), PSE);
1053 if (Escape)
1054 ExitIRI->setOperand(Idx, Escape);
1055 }
1056 }
1057 }
1058}
1059
1060/// Remove redundant EpxandSCEVRecipes in \p Plan's entry block by replacing
1061/// them with already existing recipes expanding the same SCEV expression.
// NOTE(review): doxygen scrape dropped original lines 1062-1063 (the function
// signature) and the content of line 1064 (presumably the declaration of the
// SCEV2VPV map, e.g. "DenseMap<const SCEV *, VPValue *> SCEV2VPV;") --
// confirm against upstream source. Typo "EpxandSCEVRecipes" is in the
// original comment.
1064
// Deduplicate: the first expansion of each SCEV wins; later identical
// expansions are RAUW'd to it and erased.
1065 for (VPRecipeBase &R :
1067 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1068 if (!ExpR)
1069 continue;
1070
1071 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1072 if (Inserted)
1073 continue;
1074 ExpR->replaceAllUsesWith(V->second);
1075 ExpR->eraseFromParent();
1076 }
1077}
1078
// Worklist-based transitive deletion: starting from a value V, erase its
// defining recipe if dead, then re-check the operands that may have become
// dead in turn.
// NOTE(review): doxygen scrape dropped original line 1079 (the function
// signature; presumably takes the seed VPValue *V) and the content of line
// 1082's guard/loop header around "WorkList.push_back(V)" -- confirm against
// upstream source.
1080 SmallVector<VPValue *> WorkList;
1082 WorkList.push_back(V);
1083
1084 while (!WorkList.empty()) {
1085 VPValue *Cur = WorkList.pop_back_val();
// Seen-set prevents revisiting values reachable through multiple users.
1086 if (!Seen.insert(Cur).second)
1087 continue;
1088 VPRecipeBase *R = Cur->getDefiningRecipe();
1089 if (!R)
1090 continue;
1091 if (!isDeadRecipe(*R))
1092 continue;
// Queue the operands before erasing R, since erasure drops the uses.
1093 append_range(WorkList, R->operands());
1094 R->eraseFromParent();
1095 }
1096}
1097
1098/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1099/// Returns an optional pair, where the first element indicates whether it is
1100/// an intrinsic ID.
1101static std::optional<std::pair<bool, unsigned>>
// NOTE(review): doxygen scrape dropped the rest of the signature line
// (orig. 1102, the function name and parameter) and the .Case<...> type list
// on orig. 1105-1106 -- confirm against upstream source.
1103 return TypeSwitch<const VPSingleDefRecipe *,
1104 std::optional<std::pair<bool, unsigned>>>(R)
1107 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1108 .Case([](const VPWidenIntrinsicRecipe *I) {
// Intrinsic recipes report (true, intrinsic ID).
1109 return std::make_pair(true, I->getVectorIntrinsicID());
1110 })
1111 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
1112 // For recipes that do not directly map to LLVM IR instructions,
1113 // assign opcodes after the last VPInstruction opcode (which is also
1114 // after the last IR Instruction opcode), based on the VPRecipeID.
1115 return std::make_pair(false,
1116 VPInstruction::OpsEnd + 1 + I->getVPRecipeID());
1117 })
1118 .Default([](auto *) { return std::nullopt; });
1119}
1120
1121/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1122/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1123/// Operands are foldable live-ins.
// NOTE(review): doxygen scrape dropped the first signature line (orig. 1124,
// presumably "static VPValue *tryToFoldLiveIns(const VPSingleDefRecipe &R,")
// and several case labels / operand expressions below (orig. 1132, 1159-1164,
// 1177-1178) -- confirm against upstream source.
1125 ArrayRef<VPValue *> Operands,
1126 const DataLayout &DL,
1127 VPTypeAnalysis &TypeInfo) {
1128 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1129 if (!OpcodeOrIID)
1130 return nullptr;
1131
// Collect the underlying IR values; all operands must be live-ins with
// concrete IR values, otherwise constant folding is impossible.
1133 for (VPValue *Op : Operands) {
1134 if (!match(Op, m_LiveIn()))
1135 return nullptr;
1136 Value *V = Op->getUnderlyingValue();
1137 if (!V)
1138 return nullptr;
1139 Ops.push_back(V);
1140 }
1141
// Dispatch on the opcode/intrinsic kind and delegate the actual folding to
// InstSimplifyFolder; returns nullptr when the opcode is unhandled.
1142 auto FoldToIRValue = [&]() -> Value * {
1143 InstSimplifyFolder Folder(DL);
1144 if (OpcodeOrIID->first) {
// Only binary intrinsics are folded here.
1145 if (R.getNumOperands() != 2)
1146 return nullptr;
1147 unsigned ID = OpcodeOrIID->second;
1148 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1149 TypeInfo.inferScalarType(&R));
1150 }
1151 unsigned Opcode = OpcodeOrIID->second;
1152 if (Instruction::isBinaryOp(Opcode))
1153 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1154 Ops[0], Ops[1]);
1155 if (Instruction::isCast(Opcode))
1156 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1157 TypeInfo.inferScalarType(R.getVPSingleValue()));
1158 switch (Opcode) {
1160 return Folder.FoldSelect(Ops[0], Ops[1],
1162 case VPInstruction::Not:
// Not is lowered as xor with all-ones (second operand dropped by scrape).
1163 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1165 case Instruction::Select:
1166 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1167 case Instruction::ICmp:
1168 case Instruction::FCmp:
1169 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1170 Ops[1]);
1171 case Instruction::GetElementPtr: {
1172 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1173 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1174 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1175 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1176 }
// Case labels on orig. 1177-1178 (presumably VPInstruction::PtrAdd and a
// related opcode) were dropped by the scrape; they fold to an i8 GEP.
1179 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1180 Ops[0], Ops[1],
1181 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1182 // An extract of a live-in is an extract of a broadcast, so return the
1183 // broadcasted element.
1184 case Instruction::ExtractElement:
1185 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1186 return Ops[0];
1187 }
1188 return nullptr;
1189 };
1190
// A successful fold is re-introduced into the plan as a live-in value.
1191 if (Value *V = FoldToIRValue())
1192 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1193 return nullptr;
1194}
1195
1196/// Try to simplify VPSingleDefRecipe \p Def.
// NOTE(review): doxygen scrape dropped the signature line (orig. 1197,
// presumably "static void simplifyRecipe(VPSingleDefRecipe *Def,
// VPTypeAnalysis &TypeInfo) {") and numerous pattern-match expressions below
// wherever hyperlinked identifiers appeared (orig. 1203, 1273-1274, 1291,
// 1312-1313, 1322, 1377, 1380, 1407, 1412, 1437-1439, 1452, 1463-1465,
// 1479, 1496, 1504, 1519-1520, 1526, 1534, 1547, 1584-1585, 1593,
// 1599-1600) -- confirm each against upstream source before relying on this
// listing. This function applies a long sequence of peephole rewrites; each
// either RAUWs Def and returns, or falls through to the next pattern.
1198 VPlan *Plan = Def->getParent()->getPlan();
1199
1200 // Simplification of live-in IR values for SingleDef recipes using
1201 // InstSimplifyFolder.
1202 const DataLayout &DL =
1204 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1205 return Def->replaceAllUsesWith(V);
1206
1207 // Fold PredPHI LiveIn -> LiveIn.
1208 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1209 VPValue *Op = PredPHI->getOperand(0);
1210 if (isa<VPIRValue>(Op))
1211 PredPHI->replaceAllUsesWith(Op);
1212 }
1213
1214 VPBuilder Builder(Def);
1215
1216 // Avoid replacing VPInstructions with underlying values with new
1217 // VPInstructions, as we would fail to create widen/replicate recpes from the
1218 // new VPInstructions without an underlying value, and miss out on some
1219 // transformations that only apply to widened/replicated recipes later, by
1220 // doing so.
1221 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1222 // VPInstructions without underlying values, as those will get skipped during
1223 // cost computation.
1224 bool CanCreateNewRecipe =
1225 !isa<VPInstruction>(Def) || !Def->getUnderlyingValue();
1226
// trunc(ext(A)): drop the cast pair when types round-trip, otherwise
// replace with a single narrower/wider cast.
1227 VPValue *A;
1228 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1229 Type *TruncTy = TypeInfo.inferScalarType(Def);
1230 Type *ATy = TypeInfo.inferScalarType(A);
1231 if (TruncTy == ATy) {
1232 Def->replaceAllUsesWith(A);
1233 } else {
1234 // Don't replace a non-widened cast recipe with a widened cast.
1235 if (!isa<VPWidenCastRecipe>(Def))
1236 return;
1237 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1238
// Preserve the original extension kind (sext vs zext).
1239 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1240 ? Instruction::SExt
1241 : Instruction::ZExt;
1242 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1243 TruncTy);
1244 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1245 // UnderlyingExt has distinct return type, used to retain legacy cost.
1246 Ext->setUnderlyingValue(UnderlyingExt);
1247 }
1248 Def->replaceAllUsesWith(Ext);
1249 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1250 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1251 Def->replaceAllUsesWith(Trunc);
1252 }
1253 }
1254#ifndef NDEBUG
1255 // Verify that the cached type info is for both A and its users is still
1256 // accurate by comparing it to freshly computed types.
1257 VPTypeAnalysis TypeInfo2(*Plan);
1258 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1259 for (VPUser *U : A->users()) {
1260 auto *R = cast<VPRecipeBase>(U);
1261 for (VPValue *VPV : R->definedValues())
1262 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1263 }
1264#endif
1265 }
1266
1267 // Simplify (X && Y) | (X && !Y) -> X.
1268 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1269 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1270 // recipes to be visited during simplification.
1271 VPValue *X, *Y, *Z;
1272 if (match(Def,
1275 Def->replaceAllUsesWith(X);
1276 Def->eraseFromParent();
1277 return;
1278 }
1279
1280 // x | 1 -> 1
1281 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
// The operand-index trick selects whichever operand is the constant.
1282 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1283
1284 // x | 0 -> x
1285 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1286 return Def->replaceAllUsesWith(X);
1287
1288 // x | !x -> AllOnes
1289 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_Not(m_Deferred(X))))) {
1290 return Def->replaceAllUsesWith(Plan->getOrAddLiveIn(
1292 }
1293
1294 // x & 0 -> 0
1295 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1296 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1297
1298 // x & AllOnes -> x
1299 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_AllOnes())))
1300 return Def->replaceAllUsesWith(X);
1301
1302 // x && false -> false
1303 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_False())))
1304 return Def->replaceAllUsesWith(Plan->getFalse());
1305
1306 // x && true -> x
1307 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_True())))
1308 return Def->replaceAllUsesWith(X);
1309
1310 // (x && y) | (x && z) -> x && (y | z)
1311 if (CanCreateNewRecipe &&
1314 // Simplify only if one of the operands has one use to avoid creating an
1315 // extra recipe.
1316 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1317 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1318 return Def->replaceAllUsesWith(
1319 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1320
1321 // x && !x -> 0
1323 return Def->replaceAllUsesWith(Plan->getFalse());
1324
// select c, x, x -> x
1325 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1326 return Def->replaceAllUsesWith(X);
1327
1328 // select c, false, true -> not c
1329 VPValue *C;
1330 if (CanCreateNewRecipe &&
1331 match(Def, m_Select(m_VPValue(C), m_False(), m_True())))
1332 return Def->replaceAllUsesWith(Builder.createNot(C));
1333
1334 // select !c, x, y -> select c, y, x
1335 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1336 Def->setOperand(0, C);
1337 Def->setOperand(1, Y);
1338 Def->setOperand(2, X);
1339 return;
1340 }
1341
// Arithmetic identities: x+0, x*1, x*0.
1342 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1343 return Def->replaceAllUsesWith(A);
1344
1345 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1346 return Def->replaceAllUsesWith(A);
1347
1348 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1349 return Def->replaceAllUsesWith(
1350 Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0));
1351
// Strength reduction: mul/udiv by power-of-2 constant -> shl/lshr.
1352 const APInt *APC;
1353 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) &&
1354 APC->isPowerOf2())
1355 return Def->replaceAllUsesWith(Builder.createNaryOp(
1356 Instruction::Shl,
1357 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1358 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1359
1360 // Don't convert udiv to lshr inside a replicate region, as VPInstructions are
1361 // not allowed in them.
1362 const VPRegionBlock *ParentRegion = Def->getParent()->getParent();
1363 bool IsInReplicateRegion = ParentRegion && ParentRegion->isReplicator();
1364 if (CanCreateNewRecipe && !IsInReplicateRegion &&
1365 match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) && APC->isPowerOf2())
1366 return Def->replaceAllUsesWith(Builder.createNaryOp(
1367 Instruction::LShr,
1368 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1369 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1370
1371 if (match(Def, m_Not(m_VPValue(A)))) {
// not(not(a)) -> a.
1372 if (match(A, m_Not(m_VPValue(A))))
1373 return Def->replaceAllUsesWith(A);
1374
1375 // Try to fold Not into compares by adjusting the predicate in-place.
1376 CmpPredicate Pred;
1377 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1378 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
// Only legal when every user is either the Not itself or a select on the
// compare, so all users can be rewritten consistently.
1379 if (all_of(Cmp->users(),
1381 m_Not(m_Specific(Cmp)),
1382 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1383 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1384 for (VPUser *U : to_vector(Cmp->users())) {
1385 auto *R = cast<VPSingleDefRecipe>(U);
1386 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1387 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1388 R->setOperand(1, Y);
1389 R->setOperand(2, X);
1390 } else {
1391 // not (cmp pred) -> cmp inv_pred
1392 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1393 R->replaceAllUsesWith(Cmp);
1394 }
1395 }
1396 // If Cmp doesn't have a debug location, use the one from the negation,
1397 // to preserve the location.
1398 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1399 Cmp->setDebugLoc(Def->getDebugLoc());
1400 }
1401 }
1402 }
1403
1404 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1405 // any-of (fcmp uno %A, %B), ...
1406 if (match(Def, m_AnyOf())) {
1408 VPRecipeBase *UnpairedCmp = nullptr;
// Pair up single-use self-NaN checks two at a time into one fcmp uno.
1409 for (VPValue *Op : Def->operands()) {
1410 VPValue *X;
1411 if (Op->getNumUsers() > 1 ||
1413 m_Deferred(X)))) {
1414 NewOps.push_back(Op);
1415 } else if (!UnpairedCmp) {
1416 UnpairedCmp = Op->getDefiningRecipe();
1417 } else {
1418 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1419 UnpairedCmp->getOperand(0), X));
1420 UnpairedCmp = nullptr;
1421 }
1422 }
1423
// An odd leftover self-check is kept as-is.
1424 if (UnpairedCmp)
1425 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1426
1427 if (NewOps.size() < Def->getNumOperands()) {
1428 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1429 return Def->replaceAllUsesWith(NewAnyOf);
1430 }
1431 }
1432
1433 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1434 // This is useful for fmax/fmin without fast-math flags, where we need to
1435 // check if any operand is NaN.
1436 if (CanCreateNewRecipe &&
1438 m_Deferred(X)),
1440 m_Deferred(Y))))) {
1441 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1442 return Def->replaceAllUsesWith(NewCmp);
1443 }
1444
1445 // Remove redundant DerviedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1446 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1447 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1448 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1449 TypeInfo.inferScalarType(Def))
1450 return Def->replaceAllUsesWith(Def->getOperand(1));
1451
// Scalar-steps-like recipe with unit step collapses to (possibly truncated)
// source value; the matched pattern head was dropped by the scrape.
1453 m_One()))) {
1454 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1455 if (TypeInfo.inferScalarType(X) != WideStepTy)
1456 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1457 Def->replaceAllUsesWith(X);
1458 return;
1459 }
1460
1461 // For i1 vp.merges produced by AnyOf reductions:
1462 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1464 m_VPValue(X), m_VPValue())) &&
1466 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1467 Def->setOperand(1, Def->getOperand(0));
1468 Def->setOperand(0, Y);
1469 return;
1470 }
1471
// FOR phi with identical start and backedge value is invariant.
1472 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1473 if (Phi->getOperand(0) == Phi->getOperand(1))
1474 Phi->replaceAllUsesWith(Phi->getOperand(0));
1475 return;
1476 }
1477
1478 // Simplify MaskedCond with no block mask to its single operand.
1480 !cast<VPInstruction>(Def)->isMasked())
1481 return Def->replaceAllUsesWith(Def->getOperand(0));
1482
1483 // Look through ExtractLastLane.
1484 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
// Extracting the last lane of a BuildVector is just its last operand.
1485 if (match(A, m_BuildVector())) {
1486 auto *BuildVector = cast<VPInstruction>(A);
1487 Def->replaceAllUsesWith(
1488 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1489 return;
1490 }
1491 if (Plan->hasScalarVFOnly())
1492 return Def->replaceAllUsesWith(A);
1493 }
1494
1495 // Look through ExtractPenultimateElement (BuildVector ....).
1497 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1498 Def->replaceAllUsesWith(
1499 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1500 return;
1501 }
1502
// Extract of a constant lane from a BuildVector -> that operand directly.
1503 uint64_t Idx;
1505 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1506 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1507 return;
1508 }
1509
// BuildVector of all-equal elements is a broadcast.
1510 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1511 Def->replaceAllUsesWith(
1512 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1513 return;
1514 }
1515
1516 // Look through broadcast of single-scalar when used as select conditions; in
1517 // that case the scalar condition can be used directly.
1518 if (match(Def,
1521 "broadcast operand must be single-scalar");
1522 Def->setOperand(0, C);
1523 return;
1524 }
1525
// Single-operand pack/concat-like instruction collapses to its operand
// (the matched opcode on orig. 1526 was dropped by the scrape).
1527 if (Def->getNumOperands() == 1)
1528 Def->replaceAllUsesWith(Def->getOperand(0));
1529 return;
1530 }
1531
1532 VPIRValue *IRV;
1533 if (Def->getNumOperands() == 1 &&
1535 return Def->replaceAllUsesWith(IRV);
1536
1537 // Some simplifications can only be applied after unrolling. Perform them
1538 // below.
1539 if (!Plan->isUnrolled())
1540 return;
1541
1542 // After unrolling, extract-lane may be used to extract values from multiple
1543 // scalar sources. Only simplify when extracting from a single scalar source.
1544 VPValue *LaneToExtract;
1545 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1546 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1548 return Def->replaceAllUsesWith(A);
1549
1550 // Simplify extract-lane with single source to extract-element.
1551 Def->replaceAllUsesWith(Builder.createNaryOp(
1552 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1553 return;
1554 }
1555
1556 // Hoist an invariant increment Y of a phi X, by having X start at Y.
1557 if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && isa<VPIRValue>(Y) &&
1558 isa<VPPhi>(X)) {
1559 auto *Phi = cast<VPPhi>(X);
// Only when the phi starts at 0 and the add is its sole user, so folding
// the increment into the start value is observationally equivalent.
1560 if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
1561 Phi->getSingleUser() == Def) {
1562 Phi->setOperand(0, Y);
1563 Def->replaceAllUsesWith(Phi);
1564 return;
1565 }
1566 }
1567
1568 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1569 // just the pointer operand.
1570 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1571 if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
1572 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1573
1574 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1575 // the start index is zero and only the first lane 0 is demanded.
1576 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1577 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1578 Steps->replaceAllUsesWith(Steps->getOperand(0));
1579 return;
1580 }
1581 }
1582 // Simplify redundant ReductionStartVector recipes after unrolling.
1583 VPValue *StartV;
1585 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
// In-loop reduction phis can take the scalar start value directly.
1586 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1587 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1588 return PhiR && PhiR->isInLoop();
1589 });
1590 return;
1591 }
1592
// Unwrap a pass-through pattern (match head on orig. 1593 dropped by scrape).
1594 Def->replaceAllUsesWith(A);
1595 return;
1596 }
1597
// Drop ExtractLastLane of a single-scalar source when all of its users only
// consume scalars (or are this extract itself).
1598 if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
1601 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1602 all_of(A->users(),
1603 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1604 return Def->replaceAllUsesWith(A);
1605 }
1606
// With a concrete UF of 1 there is only one part to extract.
1607 if (Plan->getConcreteUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1608 return Def->replaceAllUsesWith(A);
1609}
1610
// Driver: visit every basic block (presumably RPO from the plan entry; the
// traversal setup on the dropped signature lines, orig. 1611-1612, is not
// visible in this scrape -- confirm against upstream) and run simplifyRecipe
// on each single-def recipe.
1613 Plan.getEntry());
1614 VPTypeAnalysis TypeInfo(Plan);
// early-inc range: simplifyRecipe may erase the current recipe.
1616 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1617 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1618 simplifyRecipe(Def, TypeInfo);
1619 }
1620}
1621
1622/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1623/// header mask to be simplified further when tail folding, e.g. in
1624/// optimizeEVLMasks.
1625static void reassociateHeaderMask(VPlan &Plan) {
1626 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1627 if (!HeaderMask)
1628 return;
1629
// Seed the worklist with every logical-and that uses the header mask
// directly (the push on orig. 1633/1634 was partially dropped by the scrape).
1630 SmallVector<VPUser *> Worklist;
1631 for (VPUser *U : HeaderMask->users())
1632 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
1634
1635 while (!Worklist.empty()) {
1636 auto *R = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
1637 VPValue *X, *Y;
// Only rewrite the nested shape ((HM && X) && Y); anything else is skipped.
1638 if (!R || !match(R, m_LogicalAnd(
1639 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
1640 m_VPValue(Y))))
1641 continue;
// Users of the rewritten recipe may expose further nesting; revisit them.
1642 append_range(Worklist, R->users());
1643 VPBuilder Builder(R);
1644 R->replaceAllUsesWith(
1645 Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
1646 }
1647}
1648
// Narrows wide/replicating recipes to single-scalar recipes where the VPlan
// analysis proves only one scalar value is needed.
// NOTE(review): doxygen scrape dropped the signature (orig. 1649; presumably
// a VPlanTransforms member taking VPlan &Plan) and several expressions below
// (orig. 1657-1658 loop header, 1660, 1684, 1725-1728) -- confirm against
// upstream source.
1650 if (Plan.hasScalarVFOnly())
1651 return;
1652
1653 // Try to narrow wide and replicating recipes to single scalar recipes,
1654 // based on VPlan analysis. Only process blocks in the loop region for now,
1655 // without traversing into nested regions, as recipes in replicate regions
1656 // cannot be converted yet.
1659 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1661 VPWidenStoreRecipe>(&R))
1662 continue;
1663 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1664 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1665 continue;
1666
1667 // Convert an unmasked scatter with an uniform address into
1668 // extract-last-lane + scalar store.
1669 // TODO: Add a profitability check comparing the cost of a scatter vs.
1670 // extract + scalar store.
1671 auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
1672 if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
1673 !WidenStoreR->isConsecutive()) {
1674 assert(!WidenStoreR->isReverse() &&
1675 "Not consecutive memory recipes shouldn't be reversed");
1676 VPValue *Mask = WidenStoreR->getMask();
1677
1678 // Only convert the scatter to a scalar store if it is unmasked.
1679 // TODO: Support converting scatter masked by the header mask to scalar
1680 // store.
1681 if (Mask)
1682 continue;
1683
// Extract creation on orig. 1684 partially dropped by the scrape
// (presumably an ExtractLastLane VPInstruction over the stored value).
1685 {WidenStoreR->getOperand(1)});
1686 Extract->insertBefore(WidenStoreR);
1687
1688 // TODO: Sink the scalar store recipe to middle block if possible.
1689 auto *ScalarStore = new VPReplicateRecipe(
1690 &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
1691 true /*IsSingleScalar*/, nullptr /*Mask*/, {},
1692 *WidenStoreR /*Metadata*/);
1693 ScalarStore->insertBefore(WidenStoreR);
1694 WidenStoreR->eraseFromParent();
1695 continue;
1696 }
1697
// Replicated store to a single-scalar address: rebuild as a single-scalar
// store fed by extract-last-part/last-lane of the stored value.
1698 auto *RepOrWidenR = dyn_cast<VPRecipeWithIRFlags>(&R);
1699 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1700 vputils::isSingleScalar(RepR->getOperand(1))) {
1701 auto *Clone = new VPReplicateRecipe(
1702 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1703 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1704 *RepR /*Metadata*/, RepR->getDebugLoc());
1705 Clone->insertBefore(RepOrWidenR);
1706 VPBuilder Builder(Clone);
1707 VPValue *ExtractOp = Clone->getOperand(0);
1708 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1709 ExtractOp =
1710 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1711 ExtractOp =
1712 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1713 Clone->setOperand(0, ExtractOp);
1714 RepR->eraseFromParent();
1715 continue;
1716 }
1717
1718 // Skip recipes that aren't single scalars.
1719 if (!RepOrWidenR || !vputils::isSingleScalar(RepOrWidenR))
1720 continue;
1721
1722 // Predicate to check if a user of Op introduces extra broadcasts.
1723 auto IntroducesBCastOf = [](const VPValue *Op) {
1724 return [Op](const VPUser *U) {
1725 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
// Opcode allow-list on orig. 1726-1728 dropped by the scrape.
1729 VPI->getOpcode()))
1730 return false;
1731 }
1732 return !U->usesScalars(Op);
1733 };
1734 };
1735
// Only narrow when doing so does not merely shift the broadcast cost from
// this recipe onto its operands.
1736 if (any_of(RepOrWidenR->users(), IntroducesBCastOf(RepOrWidenR)) &&
1737 none_of(RepOrWidenR->operands(), [&](VPValue *Op) {
1738 if (any_of(
1739 make_filter_range(Op->users(), not_equal_to(RepOrWidenR)),
1740 IntroducesBCastOf(Op)))
1741 return false;
1742 // Non-constant live-ins require broadcasts, while constants do not
1743 // need explicit broadcasts.
1744 auto *IRV = dyn_cast<VPIRValue>(Op);
1745 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1746 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1747 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1748 }))
1749 continue;
1750
1751 auto *Clone = new VPReplicateRecipe(
1752 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1753 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1754 Clone->insertBefore(RepOrWidenR);
1755 RepOrWidenR->replaceAllUsesWith(Clone);
1756 if (isDeadRecipe(*RepOrWidenR))
1757 RepOrWidenR->eraseFromParent();
1758 }
1759 }
1760}
1761
1762/// Try to see if all of \p Blend's masks share a common value logically and'ed
1763/// and remove it from the masks.
// NOTE(review): doxygen scrape dropped the signature line (orig. 1764;
// presumably "static void removeCommonBlendMask(VPBlendRecipe *Blend) {") --
// confirm against upstream source.
1765 if (Blend->isNormalized())
1766 return;
// Candidate common factor is taken from mask 0 ...
1767 VPValue *CommonEdgeMask;
1768 if (!match(Blend->getMask(0),
1769 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1770 return;
// ... and must appear as the left operand of every mask's logical-and.
1771 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1772 if (!match(Blend->getMask(I),
1773 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1774 return;
// Strip the common factor: each mask becomes the right-hand operand.
1775 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1776 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1777}
1778
1779/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1780/// to make sure the masks are simplified.
1781static void simplifyBlends(VPlan &Plan) {
// NOTE(review): the block-traversal loop header (orig. 1782-1783) was
// dropped by the doxygen scrape -- confirm against upstream source.
1784 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1785 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1786 if (!Blend)
1787 continue;
1788
1789 removeCommonBlendMask(Blend);
1790
1791 // Try to remove redundant blend recipes.
// Collect incoming values whose masks are not provably false; if only one
// distinct value survives, the blend is a no-op.
1792 SmallPtrSet<VPValue *, 4> UniqueValues;
1793 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1794 UniqueValues.insert(Blend->getIncomingValue(0));
1795 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1796 if (!match(Blend->getMask(I), m_False()))
1797 UniqueValues.insert(Blend->getIncomingValue(I));
1798
1799 if (UniqueValues.size() == 1) {
1800 Blend->replaceAllUsesWith(*UniqueValues.begin());
1801 Blend->eraseFromParent();
1802 continue;
1803 }
1804
1805 if (Blend->isNormalized())
1806 continue;
1807
1808 // Normalize the blend so its first incoming value is used as the initial
1809 // value with the others blended into it.
1810
// Prefer a start value whose mask is single-use (that mask then becomes
// dead and can be removed).
1811 unsigned StartIndex = 0;
1812 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1813 // If a value's mask is used only by the blend then is can be deadcoded.
1814 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1815 // that's used by multiple blends where it can be removed from them all.
1816 VPValue *Mask = Blend->getMask(I);
1817 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1818 StartIndex = I;
1819 break;
1820 }
1821 }
1822
1823 SmallVector<VPValue *, 4> OperandsWithMask;
1824 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1825
1826 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1827 if (I == StartIndex)
1828 continue;
1829 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1830 OperandsWithMask.push_back(Blend->getMask(I));
1831 }
1832
1833 auto *NewBlend =
1834 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1835 OperandsWithMask, *Blend, Blend->getDebugLoc());
1836 NewBlend->insertBefore(&R);
1837
// NOTE(review): the use of DeadMask on orig. 1841 (presumably a
// recursive dead-recipe deletion call) was dropped by the scrape.
1838 VPValue *DeadMask = Blend->getMask(StartIndex);
1839 Blend->replaceAllUsesWith(NewBlend);
1840 Blend->eraseFromParent();
1842
1843 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1844 VPValue *NewMask;
1845 if (NewBlend->getNumOperands() == 3 &&
1846 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1847 VPValue *Inc0 = NewBlend->getOperand(0);
1848 VPValue *Inc1 = NewBlend->getOperand(1);
1849 VPValue *OldMask = NewBlend->getOperand(2);
1850 NewBlend->setOperand(0, Inc1);
1851 NewBlend->setOperand(1, Inc0);
1852 NewBlend->setOperand(2, NewMask);
// The inverted mask may now be dead; clean it up eagerly.
1853 if (OldMask->getNumUsers() == 0)
1854 cast<VPInstruction>(OldMask)->eraseFromParent();
1855 }
1856 }
1857 }
1858}
1859
1860/// Optimize the width of vector induction variables in \p Plan based on a known
1861/// constant Trip Count, \p BestVF and \p BestUF.
1863                                                     ElementCount BestVF,
1864                                                     unsigned BestUF) {
1865  // Only proceed if we have not completely removed the vector region.
1866  if (!Plan.getVectorLoopRegion())
1867    return false;
1868
  // Requires a compile-time-constant trip count and a fixed (non-scalable) VF.
1869  const APInt *TC;
1870  if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
1871    return false;
1872
1873  // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1874  // and UF. Returns at least 8.
1875  auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
    // NOTE(review): AlignedTC's initializer continues on lines not shown in
    // this excerpt; presumably TC rounded up to a multiple of Align — confirm.
1876    APInt AlignedTC =
1879    APInt MaxVal = AlignedTC - 1;
1880    return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1881  };
1882  unsigned NewBitWidth =
1883      ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1884
1885  LLVMContext &Ctx = Plan.getContext();
1886  auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1887
1888  bool MadeChange = false;
1889
1890  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1891  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1892    auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1893
1894    // Currently only handle canonical IVs as it is trivial to replace the start
1895    // and stop values, and we currently only perform the optimization when the
1896    // IV has a single use.
1897    if (!WideIV || !WideIV->isCanonical() ||
1898        WideIV->hasMoreThanOneUniqueUser() ||
1899        NewIVTy == WideIV->getScalarType())
1900      continue;
1901
1902    // Currently only handle cases where the single user is a header-mask
1903    // comparison with the backedge-taken-count.
1904    VPUser *SingleUser = WideIV->getSingleUser();
1905    if (!SingleUser ||
1906        !match(SingleUser, m_ICmp(m_Specific(WideIV),
1909      continue;
1910
1911    // Update IV operands and comparison bound to use new narrower type.
1912    auto *NewStart = Plan.getConstantInt(NewIVTy, 0);
1913    WideIV->setStartValue(NewStart);
1914    auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
1915    WideIV->setStepValue(NewStep);
1916
    // Truncate the backedge-taken count in the preheader so the header-mask
    // compare operates entirely in the narrow type.
1917    auto *NewBTC = new VPWidenCastRecipe(
1918        Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
1919        nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
1920    Plan.getVectorPreheader()->appendRecipe(NewBTC);
1921    auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
1922    Cmp->setOperand(1, NewBTC);
1923
1924    MadeChange = true;
1925  }
1926
1927  return MadeChange;
1928}
1929
1930/// Return true if \p Cond is known to be true for given \p BestVF and \p
1931/// BestUF.
1933                                      ElementCount BestVF, unsigned BestUF,
    // For a disjunction it suffices that any one operand is known true, so
    // recurse into the operands of the defining recipe.
1936    return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1937                                                          &PSE](VPValue *C) {
1938      return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
1939    });
1940
1941  auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
1943                   m_Specific(CanIV->getBackedgeValue()),
1944                   m_Specific(&Plan.getVectorTripCount()))))
1945    return false;
1946
1947  // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1948  // count is not conveniently available as SCEV so far, so we compare directly
1949  // against the original trip count. This is stricter than necessary, as we
1950  // will only return true if the trip count == vector trip count.
1951  const SCEV *VectorTripCount =
1953  if (isa<SCEVCouldNotCompute>(VectorTripCount))
1954    VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
1955  assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1956         "Trip count SCEV must be computable");
  // The condition is true iff SCEV proves trip-count == VF * UF, i.e. the
  // vector loop executes exactly one iteration.
1957  ScalarEvolution &SE = *PSE.getSE();
1958  ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1959  const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1960  return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1961}
1962
1963/// Try to replace multiple active lane masks used for control flow with
1964/// a single, wide active lane mask instruction followed by multiple
1965/// extract subvector intrinsics. This applies to the active lane mask
1966/// instructions both in the loop and in the preheader.
1967/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1968/// new extracts from the first active lane mask, which has it's last
1969/// operand (multiplier) set to UF.
1971                                     unsigned UF) {
  // Only profitable behind the flag, with a vector VF and actual unrolling.
1972  if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1973    return false;
1974
1975  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1976  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1977  auto *Term = &ExitingVPBB->back();
1978
1979  using namespace llvm::VPlanPatternMatch;
1981                                           m_VPValue(), m_VPValue(), m_VPValue())))))
1982    return false;
1983
1984  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1985  LLVMContext &Ctx = Plan.getContext();
1986
  // Emit one llvm.vector.extract per unroll part, pulling a VF-wide slice of
  // the wide mask at offset VF * Part; extracts are placed right after ALM.
1987  auto ExtractFromALM = [&](VPInstruction *ALM,
1988                            SmallVectorImpl<VPValue *> &Extracts) {
1989    DebugLoc DL = ALM->getDebugLoc();
1990    for (unsigned Part = 0; Part < UF; ++Part) {
1992      Ops.append({ALM, Plan.getConstantInt(64, VF.getKnownMinValue() * Part)});
1993      auto *Ext =
1994          new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1995                                     IntegerType::getInt1Ty(Ctx), {}, {}, DL);
1996      Extracts[Part] = Ext;
1997      Ext->insertAfter(ALM);
1998    }
1999  };
2000
2001  // Create a list of each active lane mask phi, ordered by unroll part.
2003  for (VPRecipeBase &R : Header->phis()) {
2005    if (!Phi)
2006      continue;
2007    VPValue *Index = nullptr;
2008    match(Phi->getBackedgeValue(),
2010    assert(Index && "Expected index from ActiveLaneMask instruction");
2011
    // The unroll part is recovered from the multiplier of the
    // CanonicalIVIncrementForPart feeding the mask's index.
2012    uint64_t Part;
2013    if (match(Index,
2015                  m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
2016      Phis[Part] = Phi;
2017    else {
2018      // Anything other than a CanonicalIVIncrementForPart is part 0
2019      assert(!match(
2020          Index,
2022      Phis[0] = Phi;
2023    }
2024  }
2025
2026  assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
2027         "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2028
  // Part 0's incoming values are the masks that get widened; the other parts'
  // masks become dead once the phis are rewired to the extracts below.
2029  auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
2030  auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
2031
2032  assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2033          LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2034         "Expected incoming values of Phi to be ActiveLaneMasks");
2035
2036  // When using wide lane masks, the return type of the get.active.lane.mask
2037  // intrinsic is VF x UF (last operand).
2038  VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
2039  EntryALM->setOperand(2, ALMMultiplier);
2040  LoopALM->setOperand(2, ALMMultiplier);
2041
2042  // Create UF x extract vectors and insert into preheader.
2043  SmallVector<VPValue *> EntryExtracts(UF);
2044  ExtractFromALM(EntryALM, EntryExtracts);
2045
2046  // Create UF x extract vectors and insert before the loop compare & branch,
2047  // updating the compare to use the first extract.
2048  SmallVector<VPValue *> LoopExtracts(UF);
2049  ExtractFromALM(LoopALM, LoopExtracts);
2050  VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2051  Not->setOperand(0, LoopExtracts[0]);
2052
2053  // Update the incoming values of active lane mask phis.
2054  for (unsigned Part = 0; Part < UF; ++Part) {
2055    Phis[Part]->setStartValue(EntryExtracts[Part]);
2056    Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2057  }
2058
2059  return true;
2060}
2061
2062/// Try to simplify the branch condition of \p Plan. This may restrict the
2063/// resulting plan to \p BestVF and \p BestUF.
2065                                              unsigned BestUF,
2067  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2068  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2069  auto *Term = &ExitingVPBB->back();
2070  VPValue *Cond;
2071  if (match(Term,
2073                  m_VPValue())) ||
2075                                          m_VPValue(), m_VPValue(), m_VPValue()))))) {
2076    // Try to simplify the branch condition if VectorTC <= VF * UF when the
2077    // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2078    const SCEV *VectorTripCount =
2080    if (isa<SCEVCouldNotCompute>(VectorTripCount))
2081      VectorTripCount =
2083    assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2084           "Trip count SCEV must be computable");
2085    ScalarEvolution &SE = *PSE.getSE();
2086    ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2087    const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
    // Bail unless SCEV proves the loop body runs at most once.
2088    if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2089      return false;
2090  } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2092    // For BranchOnCond, check if we can prove the condition to be true using VF
2093    // and UF.
2094    if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2095      return false;
2096  } else {
2097    return false;
2098  }
2099
2100  // The vector loop region only executes once. If possible, completely remove
2101  // the region, otherwise replace the terminator controlling the latch with
2102  // (BranchOnCond true).
2103  // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
2104  // support for other non-canonical widen induction recipes (e.g.,
2105  // VPWidenPointerInductionRecipe).
2106  // TODO: fold branch-on-constant after dissolving region.
2107  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2108  if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
2109        if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
2110          return R->isCanonical();
2111        return isa<VPCanonicalIVPHIRecipe, VPCurrentIterationPHIRecipe,
2112                   VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
2113      })) {
    // All header phis can be replaced by their single-iteration values: a
    // canonical widened IV becomes the step vector 0..VF-1 (computed in the
    // preheader), every other phi becomes its incoming value from outside.
2114    for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
2115      if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
2116        VPBuilder Builder(Plan.getVectorPreheader());
2117        VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
2118                                              R->getScalarType());
2119        HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
2120        HeaderR.eraseFromParent();
2121        continue;
2122      }
2123      auto *Phi = cast<VPPhiAccessors>(&HeaderR);
2124      HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
2125      HeaderR.eraseFromParent();
2126    }
2127
    // Dissolve the region: splice its blocks between the preheader and the
    // former exit blocks, clearing the blocks' parent region pointers.
2128    VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
2129    SmallVector<VPBlockBase *> Exits = to_vector(VectorRegion->getSuccessors());
2130    VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
2131    for (VPBlockBase *Exit : Exits)
2132      VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
2133
2134    for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
2135      B->setParent(nullptr);
2136
2137    VPBlockUtils::connectBlocks(Preheader, Header);
2138
2139    for (VPBlockBase *Exit : Exits)
2140      VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
2141
2142    // Replace terminating branch-on-two-conds with branch-on-cond to early
2143    // exit.
2144    if (Exits.size() != 1) {
2145      assert(match(Term, m_BranchOnTwoConds()) && Exits.size() == 2 &&
2146             "BranchOnTwoConds needs 2 remaining exits");
2148                       Term->getOperand(0));
2149    }
2151  } else {
2152    // The vector region contains header phis for which we cannot remove the
2153    // loop region yet.
2154
2155    // For BranchOnTwoConds, set the latch exit condition to true directly.
2156    if (match(Term, m_BranchOnTwoConds())) {
2157      Term->setOperand(1, Plan.getTrue());
2158      return true;
2159    }
2160
2161    auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
2162                                  {}, {}, Term->getDebugLoc());
2163    ExitingVPBB->appendRecipe(BOC);
2164  }
2165
2166  Term->eraseFromParent();
2167
2168  return true;
2169}
2170
2171/// From the definition of llvm.experimental.get.vector.length,
2172/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
2176           vp_depth_first_deep(Plan.getEntry()))) {
2177    for (VPRecipeBase &R : *VPBB) {
2178      VPValue *AVL;
2179      if (!match(&R, m_EVL(m_VPValue(AVL))))
2180        continue;
2181
      // Use SCEV to prove AVL <= VF; only then is EVL(%AVL) == %AVL.
2182      const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2183      if (isa<SCEVCouldNotCompute>(AVLSCEV))
2184        continue;
2185      ScalarEvolution &SE = *PSE.getSE();
2186      const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2187      if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2188        continue;
2189
      // Replace the EVL recipe with AVL cast to the EVL's i32 type.
2191          AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2192          R.getDebugLoc());
2193      R.getVPSingleValue()->replaceAllUsesWith(Trunc);
      // Only one EVL recipe is simplified per call; report the change.
2194      return true;
2195    }
2196  }
2197  return false;
2198}
2199
2201                                         unsigned BestUF,
2203  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2204  assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2205
  // Run the VF/UF-specific simplifications; each reports whether it changed
  // the plan so the VF can be pinned below once any of them restricts it.
2206  bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2207  MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2208  MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2209  MadeChange |= simplifyKnownEVL(Plan, BestVF, PSE);
2210
2211  if (MadeChange) {
    // The transforms above are only valid for BestVF/BestUF, so restrict the
    // plan to that single VF.
2212    Plan.setVF(BestVF);
2213    assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2214  }
2215}
2216
2217/// Sink users of \p FOR after the recipe defining the previous value \p
2218/// Previous of the recurrence. \returns true if all users of \p FOR could be
2219/// re-arranged as needed or false if it is not possible.
2220static bool
2222                                 VPRecipeBase *Previous,
2223                                 VPDominatorTree &VPDT) {
2224  // If Previous is a live-in (no defining recipe), it naturally dominates all
2225  // recipes in the loop, so no sinking is needed.
2226  if (!Previous)
2227    return true;
2228
2229  // Collect recipes that need sinking.
2232  Seen.insert(Previous);
  // Returns false if SinkCandidate cannot legally be sunk; otherwise queues it
  // (unless it already dominates Previous or was visited before).
2233  auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
2234    // The previous value must not depend on the users of the recurrence phi. In
2235    // that case, FOR is not a fixed order recurrence.
2236    if (SinkCandidate == Previous)
2237      return false;
2238
2239    if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
2240        !Seen.insert(SinkCandidate).second ||
2241        VPDT.properlyDominates(Previous, SinkCandidate))
2242      return true;
2243
2244    if (cannotHoistOrSinkRecipe(*SinkCandidate))
2245      return false;
2246
2247    WorkList.push_back(SinkCandidate);
2248    return true;
2249  };
2250
2251  // Recursively sink users of FOR after Previous.
  // Worklist-style transitive closure: WorkList doubles as the set of recipes
  // to sink, in discovery order; it is re-sorted by dominance below.
2252  WorkList.push_back(FOR);
2253  for (unsigned I = 0; I != WorkList.size(); ++I) {
2254    VPRecipeBase *Current = WorkList[I];
2255    assert(Current->getNumDefinedValues() == 1 &&
2256           "only recipes with a single defined value expected");
2257
2258    for (VPUser *User : Current->getVPSingleValue()->users()) {
2259      if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
2260        return false;
2261    }
2262  }
2263
2264  // Keep recipes to sink ordered by dominance so earlier instructions are
2265  // processed first.
2266  sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2267    return VPDT.properlyDominates(A, B);
2268  });
2269
  // Move each candidate directly after the previously-moved one, preserving
  // their relative (dominance) order after Previous.
2270  for (VPRecipeBase *SinkCandidate : WorkList) {
2271    if (SinkCandidate == FOR)
2272      continue;
2273
2274    SinkCandidate->moveAfter(Previous);
2275    Previous = SinkCandidate;
2276  }
2277  return true;
2278}
2279
2280/// Try to hoist \p Previous and its operands before all users of \p FOR.
2282                                        VPRecipeBase *Previous,
2283                                        VPDominatorTree &VPDT) {
2284  if (cannotHoistOrSinkRecipe(*Previous))
2285    return false;
2286
2287  // Collect recipes that need hoisting.
2288  SmallVector<VPRecipeBase *> HoistCandidates;
2290  VPRecipeBase *HoistPoint = nullptr;
2291  // Find the closest hoist point by looking at all users of FOR and selecting
2292  // the recipe dominating all other users.
2293  for (VPUser *U : FOR->users()) {
2294    auto *R = cast<VPRecipeBase>(U);
2295    if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
2296      HoistPoint = R;
2297  }
2298  assert(all_of(FOR->users(),
2299                [&VPDT, HoistPoint](VPUser *U) {
2300                  auto *R = cast<VPRecipeBase>(U);
2301                  return HoistPoint == R ||
2302                         VPDT.properlyDominates(HoistPoint, R);
2303                }) &&
2304         "HoistPoint must dominate all users of FOR");
2305
  // Returns the defining recipe of HoistCandidateV if it still needs to be
  // hoisted above HoistPoint, or nullptr if it already dominates it (or is a
  // live-in, a header phi, outside the loop region, or seen before).
2306  auto NeedsHoisting = [HoistPoint, &VPDT,
2307                        &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2308    VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2309    if (!HoistCandidate)
2310      return nullptr;
2311    VPRegionBlock *EnclosingLoopRegion =
2312        HoistCandidate->getParent()->getEnclosingLoopRegion();
2313    assert((!HoistCandidate->getRegion() ||
2314            HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2315           "CFG in VPlan should still be flat, without replicate regions");
2316    // Hoist candidate was already visited, no need to hoist.
2317    if (!Visited.insert(HoistCandidate).second)
2318      return nullptr;
2319
2320    // Candidate is outside loop region or a header phi, dominates FOR users w/o
2321    // hoisting.
2322    if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
2323      return nullptr;
2324
2325    // If we reached a recipe that dominates HoistPoint, we don't need to
2326    // hoist the recipe.
2327    if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
2328      return nullptr;
2329    return HoistCandidate;
2330  };
2331
2332  if (!NeedsHoisting(Previous->getVPSingleValue()))
2333    return true;
2334
2335  // Recursively try to hoist Previous and its operands before all users of FOR.
2336  HoistCandidates.push_back(Previous);
2337
2338  for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2339    VPRecipeBase *Current = HoistCandidates[I];
2340    assert(Current->getNumDefinedValues() == 1 &&
2341           "only recipes with a single defined value expected");
2342    if (cannotHoistOrSinkRecipe(*Current))
2343      return false;
2344
2345    for (VPValue *Op : Current->operands()) {
2346      // If we reach FOR, it means the original Previous depends on some other
2347      // recurrence that in turn depends on FOR. If that is the case, we would
2348      // also need to hoist recipes involving the other FOR, which may break
2349      // dependencies.
2350      if (Op == FOR)
2351        return false;
2352
2353      if (auto *R = NeedsHoisting(Op)) {
2354        // Bail out if the recipe defines multiple values.
2355        // TODO: Hoisting such recipes requires additional handling.
2356        if (R->getNumDefinedValues() != 1)
2357          return false;
2358        HoistCandidates.push_back(R);
2359      }
2360    }
2361  }
2362
2363  // Order recipes to hoist by dominance so earlier instructions are processed
2364  // first.
2365  sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2366    return VPDT.properlyDominates(A, B);
2367  });
2368
  // Moving in dominance order keeps each candidate's operands ahead of it.
2369  for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2370    HoistCandidate->moveBefore(*HoistPoint->getParent(),
2371                               HoistPoint->getIterator());
2372  }
2373
2374  return true;
2375}
2376
2378                                                  VPBuilder &LoopBuilder) {
2379  VPDominatorTree VPDT(Plan);
2380
2382  for (VPRecipeBase &R :
2385      RecurrencePhis.push_back(FOR);
2386
2387  for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
    // Follow chains of recurrence phis to the first non-phi recipe defining
    // the previous value.
2389    VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2390    // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2391    // to terminate.
2392    while (auto *PrevPhi =
2394      assert(PrevPhi->getParent() == FOR->getParent());
2395      assert(SeenPhis.insert(PrevPhi).second);
2396      Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2397    }
2398
    // All users of FOR must end up after Previous; try sinking them, and if
    // that fails, try hoisting Previous instead. Give up if neither works.
2399    if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2400        !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2401      return false;
2402
2403    // Introduce a recipe to combine the incoming and previous values of a
2404    // fixed-order recurrence.
2405    VPBasicBlock *InsertBlock =
2406        Previous ? Previous->getParent() : FOR->getParent();
2407    if (!Previous || isa<VPHeaderPHIRecipe>(Previous))
2408      LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2409    else
2410      LoopBuilder.setInsertPoint(InsertBlock,
2411                                 std::next(Previous->getIterator()));
2412
2413    auto *RecurSplice =
2415                                 {FOR, FOR->getBackedgeValue()});
2416
2417    FOR->replaceAllUsesWith(RecurSplice);
2418    // Set the first operand of RecurSplice to FOR again, after replacing
2419    // all users.
2420    RecurSplice->setOperand(0, FOR);
2421
2422    // Check for users extracting at the penultimate active lane of the FOR.
2423    // If only a single lane is active in the current iteration, we need to
2424    // select the last element from the previous iteration (from the FOR phi
2425    // directly).
2426    for (VPUser *U : RecurSplice->users()) {
2428                    m_Specific(RecurSplice))))
2429        continue;
2430
      // NOTE(review): B is the VPBuilder used for the new extract/select
      // chain; it is constructed on a line not shown in this excerpt,
      // presumably positioned at the extracting user U — confirm.
2432      VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
2433      VPValue *Zero = Plan.getConstantInt(64, 0);
2434      VPValue *One = Plan.getConstantInt(64, 1);
      // lane(LastActiveLane - 1) of the current iteration, unless
      // LastActiveLane == 0, in which case take the last lane of the previous
      // iteration (the FOR phi itself).
2435      VPValue *PenultimateIndex = B.createSub(LastActiveLane, One);
2436      VPValue *PenultimateLastIter =
2437          B.createNaryOp(VPInstruction::ExtractLane,
2438                         {PenultimateIndex, FOR->getBackedgeValue()});
2439      VPValue *LastPrevIter =
2440          B.createNaryOp(VPInstruction::ExtractLastLane, FOR);
2441
2442      VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero);
2443      VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter);
2444      cast<VPInstruction>(U)->replaceAllUsesWith(Sel);
2445    }
2446  }
2447  return true;
2448}
2449
2451  for (VPRecipeBase &R :
2453    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2454    if (!PhiR)
2455      continue;
    // Only integer add/mul/sub-style reductions carry wrap flags worth
    // clearing; other recurrence kinds are skipped.
2456    RecurKind RK = PhiR->getRecurrenceKind();
2457    if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2459      continue;
2460
    // Drop nuw/nsw (and similar poison-generating) flags from every recipe
    // reachable from the reduction phi, presumably because vectorization
    // reorders/reassociates the reduction — NOTE(review): confirm rationale.
2461    for (VPUser *U : collectUsersRecursively(PhiR))
2462      if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2463        RecWithFlags->dropPoisonGeneratingFlags();
2464      }
2465  }
2466}
2467
2468namespace {
// DenseMap traits that let VPSingleDefRecipe pointers be hashed and compared
// by the *content* of the recipe (opcode, operands, flags, inferred type), so
// cseRecipes can detect equivalent recipes via a plain DenseMap.
2469struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2470  static bool isSentinel(const VPSingleDefRecipe *Def) {
2471    return Def == getEmptyKey() || Def == getTombstoneKey();
2472  }
2473
2474  /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2475  /// return that source element type.
2476  static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2477    // All VPInstructions that lower to GEPs must have the i8 source element
2478    // type (as they are PtrAdds), so we omit it.
2480        .Case([](const VPReplicateRecipe *I) -> Type * {
2481          if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2482            return GEP->getSourceElementType();
2483          return nullptr;
2484        })
2485        .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2486            [](auto *I) { return I->getSourceElementType(); })
2487        .Default([](auto *) { return nullptr; });
2488  }
2489
2490  /// Returns true if recipe \p Def can be safely handed for CSE.
2491  static bool canHandle(const VPSingleDefRecipe *Def) {
2492    // We can extend the list of handled recipes in the future,
2493    // provided we account for the data embedded in them while checking for
2494    // equality or hashing.
2495    auto C = getOpcodeOrIntrinsicID(Def);
2496
2497    // The issue with (Insert|Extract)Value is that the index of the
2498    // insert/extract is not a proper operand in LLVM IR, and hence also not in
2499    // VPlan.
2500    if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2501                             C->second == Instruction::ExtractValue)))
2502      return false;
2503
2504    // During CSE, we can only handle recipes that don't read from memory: if
2505    // they read from memory, there could be an intervening write to memory
2506    // before the next instance is CSE'd, leading to an incorrect result.
2507    return !Def->mayReadFromMemory();
2508  }
2509
2510  /// Hash the underlying data of \p Def.
  // Must stay consistent with isEqual below: every field hashed here is also
  // compared there. The predicate is folded in only when present.
2511  static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2512    const VPlan *Plan = Def->getParent()->getPlan();
2513    VPTypeAnalysis TypeInfo(*Plan);
2514    hash_code Result = hash_combine(
2515        Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2516        getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2518    if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2519      if (RFlags->hasPredicate())
2520        return hash_combine(Result, RFlags->getPredicate());
2521    return Result;
2522  }
2523
2524  /// Check equality of underlying data of \p L and \p R.
2525  static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2526    if (isSentinel(L) || isSentinel(R))
2527      return L == R;
2528    if (L->getVPRecipeID() != R->getVPRecipeID() ||
2530        getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2532        !equal(L->operands(), R->operands()))
2533      return false;
2535           "must have valid opcode info for both recipes");
2536    if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2537      if (LFlags->hasPredicate() &&
2538          LFlags->getPredicate() !=
2539              cast<VPRecipeWithIRFlags>(R)->getPredicate())
2540        return false;
2541    // Recipes in replicate regions implicitly depend on predicate. If either
2542    // recipe is in a replicate region, only consider them equal if both have
2543    // the same parent.
2544    const VPRegionBlock *RegionL = L->getRegion();
2545    const VPRegionBlock *RegionR = R->getRegion();
2546    if (((RegionL && RegionL->isReplicator()) ||
2547         (RegionR && RegionR->isReplicator())) &&
2548        L->getParent() != R->getParent())
2549      return false;
    // Final tie-breaker: the inferred scalar result types must agree.
2550    const VPlan *Plan = L->getParent()->getPlan();
2551    VPTypeAnalysis TypeInfo(*Plan);
2552    return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2553  }
2554};
2555} // end anonymous namespace
2556
2557/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2558/// Plan.
2560  VPDominatorTree VPDT(Plan);
2562
  // Single forward pass in depth-first order: CSEMap maps each handled recipe
  // to the first equivalent recipe encountered (keyed by content via
  // VPCSEDenseMapInfo).
2564           vp_depth_first_deep(Plan.getEntry()))) {
2565    for (VPRecipeBase &R : *VPBB) {
2566      auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2567      if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2568        continue;
2569      if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2570        // V must dominate Def for a valid replacement.
2571        if (!VPDT.dominates(V->getParent(), VPBB))
2572          continue;
2573        // Only keep flags present on both V and Def.
2574        if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2575          RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
        // Def becomes dead; removal is left to later dead-recipe cleanup.
2576        Def->replaceAllUsesWith(V);
2577        continue;
2578      }
2579      CSEMap[Def] = Def;
2580    }
2581  }
2582}
2583
2584/// Move loop-invariant recipes out of the vector loop region in \p Plan.
// Two phases: (1) hoist invariant recipes into the vector preheader,
// (2) sink recipes only used in a single dedicated exit block out of the loop.
2585static void licm(VPlan &Plan) {
2586  VPBasicBlock *Preheader = Plan.getVectorPreheader();
2587
2588  // Hoist any loop invariant recipes from the vector loop region to the
2589  // preheader. Preform a shallow traversal of the vector loop region, to
2590  // exclude recipes in replicate regions. Since the top-level blocks in the
2591  // vector loop region are guaranteed to execute if the vector pre-header is,
2592  // we don't need to check speculation safety.
2593  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2594  assert(Preheader->getSingleSuccessor() == LoopRegion &&
2595         "Expected vector prehader's successor to be the vector loop region");
2597           vp_depth_first_shallow(LoopRegion->getEntry()))) {
2598    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2600        continue;
      // A recipe is loop-invariant iff all its operands are defined outside
      // any loop region.
2601      if (any_of(R.operands(), [](VPValue *Op) {
2602            return !Op->isDefinedOutsideLoopRegions();
2603          }))
2604        continue;
2605      R.moveBefore(*Preheader, Preheader->end());
2606    }
2607  }
2608
2609#ifndef NDEBUG
2610  VPDominatorTree VPDT(Plan);
2611#endif
2612  // Sink recipes with no users inside the vector loop region if all users are
2613  // in the same exit block of the region.
2614  // TODO: Extend to sink recipes from inner loops.
  // Post-order + reverse iteration visits users before definitions, so a
  // recipe's users may themselves already have been sunk.
2616           vp_post_order_shallow(LoopRegion->getEntry()))) {
2617    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
2619        continue;
2620
2621      // TODO: Support sinking VPReplicateRecipe after ensuring replicateByVF
2622      // handles sunk recipes correctly.
2623      if (isa<VPReplicateRecipe>(&R))
2624        continue;
2625
2626      // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2627      // support recipes with multiple defined values (e.g., interleaved loads).
2628      auto *Def = cast<VPSingleDefRecipe>(&R);
2629      // Skip recipes without users as we cannot determine a sink block.
2630      // TODO: Clone sinkable recipes without users to all exit blocks to reduce
2631      // their execution frequency.
2632      if (Def->getNumUsers() == 0)
2633        continue;
2634
2635      VPBasicBlock *SinkBB = nullptr;
2636      // Cannot sink the recipe if any user
2637      // * is defined in any loop region, or
2638      // * is a phi, or
2639      // * multiple users in different blocks.
2640      if (any_of(Def->users(), [&SinkBB](VPUser *U) {
2641            auto *UserR = cast<VPRecipeBase>(U);
2642            VPBasicBlock *Parent = UserR->getParent();
2643            // TODO: If the user is a PHI node, we should check the block of
2644            // incoming value. Support PHI node users if needed.
2645            if (UserR->isPhi() || Parent->getEnclosingLoopRegion())
2646              return true;
2647            // TODO: Support sinking when users are in multiple blocks.
2648            if (SinkBB && SinkBB != Parent)
2649              return true;
2650            SinkBB = Parent;
2651            return false;
2652          }))
2653        continue;
2654
2655      // Only sink to dedicated exit blocks of the loop region.
2656      if (SinkBB->getSinglePredecessor() != LoopRegion)
2657        continue;
2658
2659      // TODO: This will need to be a check instead of a assert after
2660      // conditional branches in vectorized loops are supported.
2661      assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2662             "Defining block must dominate sink block");
2663      // TODO: Clone the recipe if users are on multiple exit paths, instead of
2664      // just moving.
2665      Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
2666    }
2667  }
2668}
2669
2671    VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2672  if (Plan.hasScalarVFOnly())
2673    return;
2674  // Keep track of created truncates, so they can be re-used. Note that we
2675  // cannot use RAUW after creating a new truncate, as this would could make
2676  // other uses have different types for their operands, making them invalidly
2677  // typed.
2679  VPTypeAnalysis TypeInfo(Plan);
2680  VPBasicBlock *PH = Plan.getVectorPreheader();
2683    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2686        continue;
2687
      // MinBWs is keyed by the recipe's underlying IR instruction; a zero
      // lookup result means no narrowing was requested for this value.
2688      VPValue *ResultVPV = R.getVPSingleValue();
2689      auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2690      unsigned NewResSizeInBits = MinBWs.lookup(UI);
2691      if (!NewResSizeInBits)
2692        continue;
2693
2694      // If the value wasn't vectorized, we must maintain the original scalar
2695      // type. Skip those here, after incrementing NumProcessedRecipes. Also
2696      // skip casts which do not need to be handled explicitly here, as
2697      // redundant casts will be removed during recipe simplification.
2699        continue;
2700
2701      Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2702      unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2703      assert(OldResTy->isIntegerTy() && "only integer types supported");
2704      (void)OldResSizeInBits;
2705
2706      auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2707
2708      // Any wrapping introduced by shrinking this operation shouldn't be
2709      // considered undefined behavior. So, we can't unconditionally copy
2710      // arithmetic wrapping flags to VPW.
2711      if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2712        VPW->dropPoisonGeneratingFlags();
2713
2714      if (OldResSizeInBits != NewResSizeInBits &&
2715          !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2716        // Extend result to original width.
        // Insert the ZExt after R, RAUW R's result, then restore the ZExt's
        // operand to R (which the RAUW also rewrote).
2717        auto *Ext = new VPWidenCastRecipe(
2718            Instruction::ZExt, ResultVPV, OldResTy, nullptr,
2719            VPIRFlags::getDefaultFlags(Instruction::ZExt));
2720        Ext->insertAfter(&R);
2721        ResultVPV->replaceAllUsesWith(Ext);
2722        Ext->setOperand(0, ResultVPV);
2723        assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2724      } else {
2725        assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2726               "Only ICmps should not need extending the result.");
2727      }
2728
2729      assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2731        continue;
2732
2733      // Shrink operands by introducing truncates as needed.
      // For selects skip operand 0 (the condition), which must stay i1-based.
2734      unsigned StartIdx =
2735          match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2736      for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2737        auto *Op = R.getOperand(Idx);
2738        unsigned OpSizeInBits =
2740        if (OpSizeInBits == NewResSizeInBits)
2741          continue;
2742        assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
        // Re-use a previously created truncate for this operand if available.
2743        auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2744        if (!IterIsEmpty) {
2745          R.setOperand(Idx, ProcessedIter->second);
2746          continue;
2747        }
2748
        // Live-ins are truncated once in the preheader; loop-defined values
        // right before their (first) narrowed user.
2749        VPBuilder Builder;
2750        if (isa<VPIRValue>(Op))
2751          Builder.setInsertPoint(PH);
2752        else
2753          Builder.setInsertPoint(&R);
2754        VPWidenCastRecipe *NewOp =
2755            Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2756        ProcessedIter->second = NewOp;
2757        R.setOperand(Idx, NewOp);
2758      }
2759
2760    }
2761  }
2762}
2763
// Body fragment: fold BranchOnCond terminators whose condition is a constant
// true/false. The statically dead successor edge is disconnected (phi
// incoming values from this block removed first) and the terminator erased.
// NOTE(review): the enclosing function signature and block-iteration loop
// header (original lines before 2767) are elided in this extract.
2767    VPValue *Cond;
2768    // Skip blocks that are not terminated by BranchOnCond.
2769    if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2770      continue;
2771
2772    assert(VPBB->getNumSuccessors() == 2 &&
2773           "Two successors expected for BranchOnCond");
    // A true condition always takes successor 0, so successor 1 is dead (and
    // vice versa). Non-constant conditions are left untouched.
2774    unsigned RemovedIdx;
2775    if (match(Cond, m_True()))
2776      RemovedIdx = 1;
2777    else if (match(Cond, m_False()))
2778      RemovedIdx = 0;
2779    else
2780      continue;
2781
2782    VPBasicBlock *RemovedSucc =
2783        cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2784    assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2785           "There must be a single edge between VPBB and its successor");
2786    // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2787    // these recipes.
2788    for (VPRecipeBase &R : RemovedSucc->phis())
2789      cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2790
2791    // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2792    // automatically on VPlan destruction if it becomes unreachable.
2793    VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2794    VPBB->back().eraseFromParent();
2795  }
2796}
2797
2819
2820// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2821// the loop terminator with a branch-on-cond recipe with the negated
2822// active-lane-mask as operand. Note that this turns the loop into an
2823// uncountable one. Only the existing terminator is replaced, all other existing
2824// recipes/users remain unchanged, except for poison-generating flags being
2825// dropped from the canonical IV increment. Return the created
2826// VPActiveLaneMaskPHIRecipe.
2827//
2828// The function uses the following definitions:
2829//
2830// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
2831// calculate-trip-count-minus-VF (original TC) : original TC
2832// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
2833// CanonicalIVPhi : CanonicalIVIncrement
2834// %StartV is the canonical induction start value.
2835//
2836// The function adds the following recipes:
2837//
2838// vector.ph:
2839// %TripCount = calculate-trip-count-minus-VF (original TC)
2840// [if DataWithControlFlowWithoutRuntimeCheck]
2841// %EntryInc = canonical-iv-increment-for-part %StartV
2842// %EntryALM = active-lane-mask %EntryInc, %TripCount
2843//
2844// vector.body:
2845// ...
2846// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2847// ...
2848// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2849// %ALM = active-lane-mask %InLoopInc, TripCount
2850// %Negated = Not %ALM
2851// branch-on-cond %Negated
2852//
// NOTE(review): the function declarator (original lines 2853-2854) and a few
// interior lines containing linked tokens are elided in this extract.
2855  VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2856  VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2857  auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2858  VPValue *StartV = CanonicalIVPHI->getStartValue();
2859
2860  auto *CanonicalIVIncrement =
2861      cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2862  // TODO: Check if dropping the flags is needed if
2863  // !DataAndControlFlowWithoutRuntimeCheck.
2864  CanonicalIVIncrement->dropPoisonGeneratingFlags();
2865  DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2866  // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2867  // we have to take unrolling into account. Each part needs to start at
2868  // Part * VF
2869  auto *VecPreheader = Plan.getVectorPreheader();
2870  VPBuilder Builder(VecPreheader);
2871
2872  // Create the ActiveLaneMask instruction using the correct start values.
2873  VPValue *TC = Plan.getTripCount();
2874  VPValue *VFxUF = &Plan.getVFxUF();
2875  VPValue *VF = &Plan.getVF();
2876
2877  VPValue *TripCount, *IncrementValue;
  // NOTE(review): the branch condition line (original 2878, presumably testing
  // DataAndControlFlowWithoutRuntimeCheck) is elided in this extract.
2879    // When the loop is guarded by a runtime overflow check for the loop
2880    // induction variable increment by VF, we can increment the value before
2881    // the get.active.lane mask and use the unmodified tripcount.
2882    IncrementValue = CanonicalIVIncrement;
2883    TripCount = TC;
2884  } else {
2885    // When avoiding a runtime check, the active.lane.mask inside the loop
2886    // uses a modified trip count and the induction variable increment is
2887    // done after the active.lane.mask intrinsic is called.
2888    IncrementValue = CanonicalIVPHI;
2889    TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2890                                     {TC, VFxUF}, DL);
2891  }
2892  auto *EntryIncrement = Builder.createOverflowingOp(
2893      VPInstruction::CanonicalIVIncrementForPart, {StartV, VF}, {false, false},
2894      DL, "index.part.next");
2895
2896  // Create the active lane mask instruction in the VPlan preheader.
2897  VPValue *ALMMultiplier =
2898      Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2899  auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2900                                        {EntryIncrement, TC, ALMMultiplier}, DL,
2901                                        "active.lane.mask.entry");
2902
2903  // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2904  // preheader ActiveLaneMask instruction.
2905  auto *LaneMaskPhi =
2907  LaneMaskPhi->insertAfter(CanonicalIVPHI);
2908
2909  // Create the active lane mask for the next iteration of the loop before the
2910  // original terminator.
2911  VPRecipeBase *OriginalTerminator = EB->getTerminator();
2912  Builder.setInsertPoint(OriginalTerminator);
2913  auto *InLoopIncrement = Builder.createOverflowingOp(
2915      {IncrementValue, &Plan.getVF()}, {false, false}, DL);
2916  auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2917                                   {InLoopIncrement, TripCount, ALMMultiplier},
2918                                   DL, "active.lane.mask.next");
  // Complete the phi: second operand is the backedge (next iteration's mask).
2919  LaneMaskPhi->addOperand(ALM);
2920
2921  // Replace the original terminator with BranchOnCond. We have to invert the
2922  // mask here because a true condition means jumping to the exit block.
2923  auto *NotMask = Builder.createNot(ALM, DL);
2924  Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2925  OriginalTerminator->eraseFromParent();
2926  return LaneMaskPhi;
2927}
2928
// Replace the plan's header mask with an active-lane-mask: either the phi
// created by addVPLaneMaskPhiAndUpdateExitBranch (when the lane mask also
// drives control flow) or a plain ActiveLaneMask computed from the widened
// canonical IV and the trip count.
// NOTE(review): the function declarator and a few interior lines are elided
// in this extract; visible code is kept byte-for-byte.
2930    VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2933          UseActiveLaneMaskForControlFlow) &&
2934         "DataAndControlFlowWithoutRuntimeCheck implies "
2935         "UseActiveLaneMaskForControlFlow");
2936
2937  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2938  auto *FoundWidenCanonicalIVUser = find_if(
2940  assert(FoundWidenCanonicalIVUser &&
2941         "Must have widened canonical IV when tail folding!");
2942  VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
2943  auto *WideCanonicalIV =
2944      cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2945  VPSingleDefRecipe *LaneMask;
2946  if (UseActiveLaneMaskForControlFlow) {
2949  } else {
2950    VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2951    VPValue *ALMMultiplier =
2952        Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
2953    LaneMask =
2954        B.createNaryOp(VPInstruction::ActiveLaneMask,
2955                       {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2956                       nullptr, "active.lane.mask");
2957  }
2958
2959  // Walk users of WideCanonicalIV and replace the header mask of the form
2960  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2961  // removing the old one to ensure there is always only a single header mask.
2962  HeaderMask->replaceAllUsesWith(LaneMask);
2963  HeaderMask->eraseFromParent();
2964}
2965
// Pattern-match helper used by m_RemoveMask: matches either the specific mask
// value In itself (setting Out to nullptr), or (logical-and In, X) (binding
// Out to X, the remaining mask). Used below to strip a known header mask off
// a recipe's mask operand.
// NOTE(review): the declaration of the `Out` member (original line 2968,
// presumably `Op1_t &Out;`) is elided in this extract.
2966 template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2967  Op0_t In;
2969
2970  RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2971
2972  template <typename OpTy> bool match(OpTy *V) const {
2973    if (m_Specific(In).match(V)) {
      // Exact match: nothing remains once the mask is removed.
2974      Out = nullptr;
2975      return true;
2976    }
2977    return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
2978  }
2979};
2980
2981/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
2982/// Returns the remaining part \p Out if so, or nullptr otherwise.
2983template <typename Op0_t, typename Op1_t>
2984static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2985 Op1_t &Out) {
2986 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2987}
2988
2989/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2990/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2991/// recipe could be created.
2992/// \p HeaderMask Header Mask.
2993/// \p CurRecipe Recipe to be transformed.
2994/// \p TypeInfo VPlan-based type analysis.
2995/// \p EVL The explicit vector length parameter of vector-predication
2996/// intrinsics.
// The new recipe is returned un-inserted; the caller inserts it and rewires
// uses. Auxiliary recipes (adjusted end pointers, vp.reverse) are inserted
// here, before CurRecipe.
// NOTE(review): the function declarator line (original 2997) is elided in
// this extract.
2998                                      VPRecipeBase &CurRecipe,
2999                                      VPTypeAnalysis &TypeInfo, VPValue &EVL) {
3000  VPlan *Plan = CurRecipe.getParent()->getPlan();
3001  DebugLoc DL = CurRecipe.getDebugLoc();
3002  VPValue *Addr, *Mask, *EndPtr;
3003
3004  /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
3005  auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
3006    auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
3007    EVLEndPtr->insertBefore(&CurRecipe);
3008    EVLEndPtr->setOperand(1, &EVL);
3009    return EVLEndPtr;
3010  };
3011
  // Forward (non-reverse) masked load -> vp load with EVL.
3012  if (match(&CurRecipe,
3013            m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
3014      !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
3015    return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
3016                                    EVL, Mask);
3017
  // reverse(masked load from end-pointer) -> EVL load + experimental_vp_reverse.
3018  VPValue *ReversedVal;
3019  if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
3020      match(ReversedVal,
3021            m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
3022      match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
3023      cast<VPWidenLoadRecipe>(ReversedVal)->isReverse()) {
3024    auto *LoadR = new VPWidenLoadEVLRecipe(
3025        *cast<VPWidenLoadRecipe>(ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask);
3026    LoadR->insertBefore(&CurRecipe);
3027    return new VPWidenIntrinsicRecipe(
3028        Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
3029        TypeInfo.inferScalarType(LoadR), {}, {}, DL);
3030  }
3031
  // Forward (non-reverse) masked store -> vp store with EVL.
3032  VPValue *StoredVal;
3033  if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
3034                                      m_RemoveMask(HeaderMask, Mask))) &&
3035      !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
3036    return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3037                                     StoredVal, EVL, Mask);
3038
  // masked store of reverse(value) -> experimental_vp_reverse + EVL store.
3039  if (match(&CurRecipe,
3040            m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
3041                          m_RemoveMask(HeaderMask, Mask))) &&
3042      match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
3043      cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) {
3044    auto *NewReverse = new VPWidenIntrinsicRecipe(
3045        Intrinsic::experimental_vp_reverse,
3046        {ReversedVal, Plan->getTrue(), &EVL},
3047        TypeInfo.inferScalarType(ReversedVal), {}, {}, DL);
3048    NewReverse->insertBefore(&CurRecipe);
3049    return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
3050                                     AdjustEndPtr(EndPtr), NewReverse, EVL,
3051                                     Mask);
3052  }
3053
  // Conditional reduction whose condition contains the header mask.
3054  if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
3055    if (Rdx->isConditional() &&
3056        match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
3057      return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
3058
  // Masked interleave group whose mask contains the header mask.
3059  if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
3060    if (Interleave->getMask() &&
3061        match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
3062      return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
3063
  // select(header-mask, a, b) -> vp.merge with an all-true mask.
3064  VPValue *LHS, *RHS;
3065  if (match(&CurRecipe,
3066            m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
3067    return new VPWidenIntrinsicRecipe(
3068        Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
3069        TypeInfo.inferScalarType(LHS), {}, {}, DL);
3070
  // select(header-mask & M, a, b) -> vp.merge keeping the remaining mask M.
3071  if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
3072                                 m_VPValue(RHS))))
3073    return new VPWidenIntrinsicRecipe(
3074        Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
3075        TypeInfo.inferScalarType(LHS), {}, {}, DL);
3076
  // last-active-lane(header-mask) is simply EVL - 1.
3077  if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
3078    Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
3079    VPValue *ZExt =
3080        VPBuilder(&CurRecipe).createScalarCast(Instruction::ZExt, &EVL, Ty, DL);
3081    return new VPInstruction(
3082        Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, 1)},
3083        VPIRFlags::getDefaultFlags(Instruction::Sub), {}, DL);
3084  }
3085
3086  return nullptr;
3087}
3088
3089/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3090/// The transforms here need to preserve the original semantics.
// NOTE(review): the function declarator (original line 3091) and the
// recipe-iteration loop header (3094-3095) are elided in this extract.
3092  // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3093  VPValue *HeaderMask = nullptr, *EVL = nullptr;
3096                        m_VPValue(EVL))) &&
3097        match(EVL, m_EVL(m_VPValue()))) {
3098      HeaderMask = R.getVPSingleValue();
3099      break;
3100    }
3101  }
3102  if (!HeaderMask)
3103    return;
3104
3105  VPTypeAnalysis TypeInfo(Plan);
3106  SmallVector<VPRecipeBase *> OldRecipes;
  // For each (transitive) user of the header mask, try to rewrite it as an
  // EVL-predicated recipe and transfer all uses to the replacement.
3107  for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3109    if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, TypeInfo, *EVL)) {
3110      NewR->insertBefore(R);
3111      for (auto [Old, New] :
3112           zip_equal(R->definedValues(), NewR->definedValues()))
3113        Old->replaceAllUsesWith(New);
3114      OldRecipes.push_back(R);
3115    }
3116  }
3117  // Erase old recipes at the end so we don't invalidate TypeInfo.
3118  for (VPRecipeBase *R : reverse(OldRecipes)) {
    // Remember the operands before erasing; they may become dead afterwards.
3119    SmallVector<VPValue *> PossiblyDead(R->operands());
3120    R->eraseFromParent();
3121    for (VPValue *Op : PossiblyDead)
3123  }
3124}
3125
3126/// After replacing the canonical IV with an EVL-based IV, fixup recipes that use
3127/// VF to use the EVL instead to avoid incorrect updates on the penultimate
3128/// iteration.
// Also replaces the header mask (if any) with `icmp ult step-vector, EVL`,
// and rewrites fixed-order recurrences to @llvm.experimental.vp.splice using
// a new phi tracking the previous iteration's EVL.
// NOTE(review): several interior lines (lambda predicates, loop headers) are
// elided in this extract; visible code is kept byte-for-byte.
3129 static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3130  VPTypeAnalysis TypeInfo(Plan);
3131  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3132  VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3133
3134  assert(all_of(Plan.getVF().users(),
3137         "User of VF that we can't transform to EVL.");
3138  Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3140  });
3141
3142  assert(all_of(Plan.getVFxUF().users(),
3143                [&LoopRegion, &Plan](VPUser *U) {
3144                  return match(U,
3145                               m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3146                                       m_Specific(&Plan.getVFxUF()))) ||
3147                         isa<VPWidenPointerInductionRecipe>(U);
3148                }) &&
3149         "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3150         "increment of the canonical induction.");
3151  Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3152    // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
3153    // canonical induction must not be updated.
3155  });
3156
3157  // Create a scalar phi to track the previous EVL if fixed-order recurrence is
3158  // contained.
3159  bool ContainsFORs =
3161  if (ContainsFORs) {
3162    // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3163    VPValue *MaxEVL = &Plan.getVF();
3164    // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
3165    VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3166    MaxEVL = Builder.createScalarZExtOrTrunc(
3167        MaxEVL, Type::getInt32Ty(Plan.getContext()),
3168        TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
3169
    // prev.evl starts at the maximum EVL and takes this iteration's EVL on
    // the backedge.
3170    Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3171    VPValue *PrevEVL = Builder.createScalarPhi(
3172        {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3173
3176      for (VPRecipeBase &R : *VPBB) {
3177        VPValue *V1, *V2;
3178        if (!match(&R,
3180                       m_VPValue(V1), m_VPValue(V2))))
3181          continue;
        // Replace the first-order-recurrence splice with vp.splice using the
        // previous and current EVLs.
3182        VPValue *Imm = Plan.getOrAddLiveIn(
3185            Intrinsic::experimental_vp_splice,
3186            {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3187            TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
3188            R.getDebugLoc());
3189        VPSplice->insertBefore(&R);
3190        R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3191      }
3192    }
3193  }
3194
3195  VPValue *HeaderMask = vputils::findHeaderMask(Plan);
3196  if (!HeaderMask)
3197    return;
3198
3199  // Replace header masks with a mask equivalent to predicating by EVL:
3200  //
3201  // icmp ule widen-canonical-iv backedge-taken-count
3202  // ->
3203  // icmp ult step-vector, EVL
3204  VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3205  VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3206  Type *EVLType = TypeInfo.inferScalarType(&EVL);
3207  VPValue *EVLMask = Builder.createICmp(
3209      Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3210  HeaderMask->replaceAllUsesWith(EVLMask);
3211}
3212
3213/// Converts a tail folded vector loop region to step by
3214/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3215/// iteration.
3216///
3217/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3218/// replaces all uses except the canonical IV increment of
3219/// VPCanonicalIVPHIRecipe with a VPCurrentIterationPHIRecipe.
3220/// VPCanonicalIVPHIRecipe is used only for loop iterations counting after
3221/// this transformation.
3222///
3223/// - The header mask is replaced with a header mask based on the EVL.
3224///
3225/// - Plans with FORs have a new phi added to keep track of the EVL of the
3226/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3227/// @llvm.vp.splice.
3228///
3229/// The function uses the following definitions:
3230/// %StartV is the canonical induction start value.
3231///
3232/// The function adds the following recipes:
3233///
3234/// vector.ph:
3235/// ...
3236///
3237/// vector.body:
3238/// ...
3239/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3240/// [ %NextIter, %vector.body ]
3241/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3242/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3243/// ...
3244/// %OpEVL = cast i32 %VPEVL to IVSize
3245/// %NextIter = add IVSize %OpEVL, %CurrentIter
3246/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3247/// ...
3248///
3249/// If MaxSafeElements is provided, the function adds the following recipes:
3250/// vector.ph:
3251/// ...
3252///
3253/// vector.body:
3254/// ...
3255/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3256/// [ %NextIter, %vector.body ]
3257/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3258/// %cmp = cmp ult %AVL, MaxSafeElements
3259/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3260/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3261/// ...
3262/// %OpEVL = cast i32 %VPEVL to IVSize
3263/// %NextIter = add IVSize %OpEVL, %CurrentIter
3264/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3265/// ...
3266///
// NOTE(review): the function declarator line (original 3267) and the
// VPCurrentIterationPHIRecipe construction line (3280) are elided in this
// extract; visible code is kept byte-for-byte.
3268    VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3269  if (Plan.hasScalarVFOnly())
3270    return;
3271  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3272  VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3273
3274  auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
3275  auto *CanIVTy = LoopRegion->getCanonicalIVType();
3276  VPValue *StartV = CanonicalIVPHI->getStartValue();
3277
3278  // Create the CurrentIteration recipe in the vector loop.
3279  auto *CurrentIteration =
3281  CurrentIteration->insertAfter(CanonicalIVPHI);
3282  VPBuilder Builder(Header, Header->getFirstNonPhi());
3283  // Create the AVL (application vector length), starting from TC -> 0 in steps
3284  // of EVL.
3285  VPPhi *AVLPhi = Builder.createScalarPhi(
3286      {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3287  VPValue *AVL = AVLPhi;
3288
3289  if (MaxSafeElements) {
3290    // Support for MaxSafeDist for correct loop emission.
    // Clamp the AVL so a single step never exceeds the safe dependence
    // distance: safe_avl = min(avl, MaxSafeElements).
3291    VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3292    VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3293    AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3294                               "safe_avl");
3295  }
3296  auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3297                                     DebugLoc::getUnknown(), "evl");
3298
3299  auto *CanonicalIVIncrement =
3300      cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
3301  Builder.setInsertPoint(CanonicalIVIncrement);
3302  VPValue *OpVPEVL = VPEVL;
3303
  // The EVL is i32; widen/narrow it to the canonical IV type for arithmetic.
3304  auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3305  OpVPEVL = Builder.createScalarZExtOrTrunc(
3306      OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3307
3308  auto *NextIter = Builder.createAdd(OpVPEVL, CurrentIteration,
3309                                     CanonicalIVIncrement->getDebugLoc(),
3310                                     "current.iteration.next",
3311                                     {CanonicalIVIncrement->hasNoUnsignedWrap(),
3312                                      CanonicalIVIncrement->hasNoSignedWrap()});
3313  CurrentIteration->addOperand(NextIter);
3314
3315  VPValue *NextAVL =
3316      Builder.createSub(AVLPhi, OpVPEVL, DebugLoc::getCompilerGenerated(),
3317                        "avl.next", {/*NUW=*/true, /*NSW=*/false});
3318  AVLPhi->addOperand(NextAVL);
3319
3320  fixupVFUsersForEVL(Plan, *VPEVL);
3321  removeDeadRecipes(Plan);
3322
3323  // Replace all uses of VPCanonicalIVPHIRecipe by
3324  // VPCurrentIterationPHIRecipe except for the canonical IV increment.
3325  CanonicalIVPHI->replaceAllUsesWith(CurrentIteration);
3326  CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
3327  // TODO: support unroll factor > 1.
3328  Plan.setUF(1);
3329}
3330
// Lower the abstract VPCurrentIterationPHIRecipe (created by
// addExplicitVectorLength) to a concrete scalar phi, and retire the now
// redundant canonical IV phi/increment in favor of the current-iteration
// increment.
// NOTE(review): the function declarator and the block-iteration/createScalarPhi
// lines are elided in this extract; visible code is kept byte-for-byte.
3332  // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
3333  // There should be only one VPCurrentIteration in the entire plan.
3334  VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
3335
3338    for (VPRecipeBase &R : VPBB->phis())
3339      if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
3340        assert(!CurrentIteration &&
3341               "Found multiple CurrentIteration. Only one expected");
3342        CurrentIteration = PhiR;
3343      }
3344
3345  // Early return if it is not variable-length stepping.
3346  if (!CurrentIteration)
3347    return;
3348
3349  VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3350  VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3351
3352  // Convert CurrentIteration to concrete recipe.
3353  auto *ScalarR =
3354      VPBuilder(CurrentIteration)
3356              {CurrentIteration->getStartValue(), CurrentIterationIncr},
3357              CurrentIteration->getDebugLoc(), "current.iteration.iv");
3358  CurrentIteration->replaceAllUsesWith(ScalarR);
3359  CurrentIteration->eraseFromParent();
3360
3361  // Replace CanonicalIVInc with CurrentIteration increment.
  // The canonical IV is assumed to be the first phi in the header; its
  // backedge must be the `iv + VFxUF` increment.
3362  auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3363  VPValue *Backedge = CanonicalIV->getIncomingValue(1);
3364  assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
3365                                 m_Specific(&Plan.getVFxUF()))) &&
3366         "Unexpected canonical iv");
3367  Backedge->replaceAllUsesWith(CurrentIterationIncr);
3368
3369  // Remove unused phi and increment.
3370  VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
3371  CanonicalIVIncrement->eraseFromParent();
3372  CanonicalIV->eraseFromParent();
3373}
3374
// For EVL tail-folded loops, rewrite the latch exit condition from comparing
// the canonical IV increment against the vector trip count to `AVL.next == 0`,
// the canonical exit test for variable-length stepping.
// NOTE(review): the function declarator and the EVL-phi cast line are elided
// in this extract; visible code is kept byte-for-byte.
3376  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3377  // The canonical IV may not exist at this stage.
3378  if (!LoopRegion ||
3380    return;
3381  VPCanonicalIVPHIRecipe *CanIV = LoopRegion->getCanonicalIV();
3382  if (std::next(CanIV->getIterator()) == CanIV->getParent()->end())
3383    return;
3384  // The EVL IV is always immediately after the canonical IV.
3386      std::next(CanIV->getIterator()));
3387  if (!EVLPhi)
3388    return;
3389
3390  // Bail if not an EVL tail folded loop.
3391  VPValue *AVL;
3392  if (!match(EVLPhi->getBackedgeValue(),
3393             m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))))
3394    return;
3395
3396  // The AVL may be capped to a safe distance.
3397  VPValue *SafeAVL;
3398  if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
3399    AVL = SafeAVL;
3400
3401  VPValue *AVLNext;
3402  [[maybe_unused]] bool FoundAVLNext =
3404                 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3405  assert(FoundAVLNext && "Didn't find AVL backedge?");
3406
3407  VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3408  auto *LatchBr = cast<VPInstruction>(Latch->getTerminator());
  // An unconditional (always-true) branch means a single-iteration loop;
  // nothing to rewrite.
3409  if (match(LatchBr, m_BranchOnCond(m_True())))
3410    return;
3411
3412  assert(
3413      match(LatchBr,
3416                               m_Specific(&Plan.getVectorTripCount())))) &&
3417      "Expected BranchOnCond with ICmp comparing CanIV increment with vector "
3418      "trip count");
3419
3420  Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3421  VPBuilder Builder(LatchBr);
3422  LatchBr->setOperand(0, Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
3423                                            Plan.getConstantInt(AVLTy, 0)));
3424}
3425
// Replace symbolic stride live-ins (and their sext/zext users) with the
// constant values guaranteed by SCEV predicates, and re-expand any
// VPExpandSCEVRecipe whose expression simplifies under that rewrite.
// NOTE(review): the function declarator and the sext/zext type check inside
// the users loop are elided in this extract; visible code is kept
// byte-for-byte.
3427    VPlan &Plan, PredicatedScalarEvolution &PSE,
3428    const DenseMap<Value *, const SCEV *> &StridesMap) {
3429  // Replace VPValues for known constant strides guaranteed by predicate scalar
3430  // evolution.
  // Only uses inside the vector loop region (or its direct predecessor) may
  // rely on the versioned stride; uses elsewhere run before the runtime
  // checks hold.
3431  auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3432    auto *R = cast<VPRecipeBase>(&U);
3433    return R->getRegion() ||
3434           R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3435  };
3436  ValueToSCEVMapTy RewriteMap;
3437  for (const SCEV *Stride : StridesMap.values()) {
3438    using namespace SCEVPatternMatch;
3439    auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3440    const APInt *StrideConst;
3441    if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3442      // Only handle constant strides for now.
3443      continue;
3444
3445    auto *CI = Plan.getConstantInt(*StrideConst);
3446    if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3447      StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3448
3449    // The versioned value may not be used in the loop directly but through a
3450    // sext/zext. Add new live-ins in those cases.
3451    for (Value *U : StrideV->users()) {
3453        continue;
3454      VPValue *StrideVPV = Plan.getLiveIn(U);
3455      if (!StrideVPV)
3456        continue;
      // Extend the constant the same way the IR extends the stride.
3457      unsigned BW = U->getType()->getScalarSizeInBits();
3458      APInt C =
3459          isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3460      VPValue *CI = Plan.getConstantInt(C);
3461      StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3462    }
3463    RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3464  }
3465
  // Re-write SCEV expansions in the entry block under the stride rewrite map;
  // keep the trip count pointer up to date if it was one of them.
3466  for (VPRecipeBase &R : *Plan.getEntry()) {
3467    auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3468    if (!ExpSCEV)
3469      continue;
3470    const SCEV *ScevExpr = ExpSCEV->getSCEV();
3471    auto *NewSCEV =
3472        SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3473    if (NewSCEV != ScevExpr) {
3474      VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3475      ExpSCEV->replaceAllUsesWith(NewExp);
3476      if (Plan.getTripCount() == ExpSCEV)
3477        Plan.resetTripCount(NewExp);
3478    }
3479  }
3480}
3481
// Drop poison-generating flags from recipes in the backward slice of every
// consecutive widened memory recipe (and predicated interleave group) address,
// since speculated address computation must not produce poison when the
// original block was predicated.
// NOTE(review): the function declarator and a few interior lines (worklist/
// visited-set declarations, isa<> condition, depth-first loop header) are
// elided in this extract; visible code is kept byte-for-byte.
3483    VPlan &Plan,
3484    const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3485  // Collect recipes in the backward slice of `Root` that may generate a poison
3486  // value that is used after vectorization.
3488  auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3490    Worklist.push_back(Root);
3491
3492    // Traverse the backward slice of Root through its use-def chain.
3493    while (!Worklist.empty()) {
3494      VPRecipeBase *CurRec = Worklist.pop_back_val();
3495
3496      if (!Visited.insert(CurRec).second)
3497        continue;
3498
3499      // Prune search if we find another recipe generating a widen memory
3500      // instruction. Widen memory instructions involved in address computation
3501      // will lead to gather/scatter instructions, which don't need to be
3502      // handled.
3504               VPHeaderPHIRecipe>(CurRec))
3505        continue;
3506
3507      // This recipe contributes to the address computation of a widen
3508      // load/store. If the underlying instruction has poison-generating flags,
3509      // drop them directly.
3510      if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3511        VPValue *A, *B;
3512        // Dropping disjoint from an OR may yield incorrect results, as some
3513        // analysis may have converted it to an Add implicitly (e.g. SCEV used
3514        // for dependence analysis). Instead, replace it with an equivalent Add.
3515        // This is possible as all users of the disjoint OR only access lanes
3516        // where the operands are disjoint or poison otherwise.
3517        if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3518            RecWithFlags->isDisjoint()) {
3519          VPBuilder Builder(RecWithFlags);
3520          VPInstruction *New =
3521              Builder.createAdd(A, B, RecWithFlags->getDebugLoc());
3522          New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3523          RecWithFlags->replaceAllUsesWith(New);
3524          RecWithFlags->eraseFromParent();
          // Continue the slice walk from the replacement recipe.
3525          CurRec = New;
3526        } else
3527          RecWithFlags->dropPoisonGeneratingFlags();
3528      } else {
3531        (void)Instr;
3532        assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3533               "found instruction with poison generating flags not covered by "
3534               "VPRecipeWithIRFlags");
3535      }
3536
3537      // Add new definitions to the worklist.
3538      for (VPValue *Operand : CurRec->operands())
3539        if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3540          Worklist.push_back(OpDef);
3541    }
3542  });
3543
3544  // Traverse all the recipes in the VPlan and collect the poison-generating
3545  // recipes in the backward slice starting at the address of a VPWidenRecipe or
3546  // VPInterleaveRecipe.
3547  auto Iter = vp_depth_first_deep(Plan.getEntry());
3549    for (VPRecipeBase &Recipe : *VPBB) {
3550      if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3551        Instruction &UnderlyingInstr = WidenRec->getIngredient();
3552        VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3553        if (AddrDef && WidenRec->isConsecutive() &&
3554            BlockNeedsPredication(UnderlyingInstr.getParent()))
3555          CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3556      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3557        VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3558        if (AddrDef) {
3559          // Check if any member of the interleave group needs predication.
3560          const InterleaveGroup<Instruction> *InterGroup =
3561              InterleaveRec->getInterleaveGroup();
3562          bool NeedPredication = false;
3563          for (int I = 0, NumMembers = InterGroup->getNumMembers();
3564               I < NumMembers; ++I) {
3565            Instruction *Member = InterGroup->getMember(I);
3566            if (Member)
3567              NeedPredication |= BlockNeedsPredication(Member->getParent());
3568          }
3569
3570          if (NeedPredication)
3571            CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3572        }
3573      }
3574    }
3575  }
3576}
3577
3579 VPlan &Plan,
3581 &InterleaveGroups,
3582 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3583 if (InterleaveGroups.empty())
3584 return;
3585
3586 // Interleave memory: for each Interleave Group we marked earlier as relevant
3587 // for this VPlan, replace the Recipes widening its memory instructions with a
3588 // single VPInterleaveRecipe at its insertion point.
3589 VPDominatorTree VPDT(Plan);
3590 for (const auto *IG : InterleaveGroups) {
3591 auto *Start =
3592 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3593 VPIRMetadata InterleaveMD(*Start);
3594 SmallVector<VPValue *, 4> StoredValues;
3595 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3596 StoredValues.push_back(StoreR->getStoredValue());
// Walk the remaining members: collect the values stored by store members (in
// member order) and intersect metadata so only metadata common to every
// member of the group is kept on the interleave recipe.
3597 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3598 Instruction *MemberI = IG->getMember(I);
3599 if (!MemberI)
3600 continue;
3601 VPWidenMemoryRecipe *MemoryR =
3602 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3603 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3604 StoredValues.push_back(StoreR->getStoredValue());
3605 InterleaveMD.intersect(*MemoryR);
3606 }
3607
// A gap mask is needed when the group requires a scalar epilogue that is not
// allowed, or when it contains stores but does not fill every slot of the
// interleave factor.
3608 bool NeedsMaskForGaps =
3609 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3610 (!StoredValues.empty() && !IG->isFull());
3611
3612 Instruction *IRInsertPos = IG->getInsertPos();
3613 auto *InsertPos =
3614 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3615
3617 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3618 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3619 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3620
3621 // Get or create the start address for the interleave group.
3622 VPValue *Addr = Start->getAddr();
3623 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3624 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3625 // We cannot re-use the address of member zero because it does not
3626 // dominate the insert position. Instead, use the address of the insert
3627 // position and create a PtrAdd adjusting it to the address of member
3628 // zero.
3629 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3630 // InsertPos or sink loads above zero members to join it.
3631 assert(IG->getIndex(IRInsertPos) != 0 &&
3632 "index of insert position shouldn't be zero");
3633 auto &DL = IRInsertPos->getDataLayout();
// Offset (in bytes) from the insert position's address back to member zero:
// element size times the insert position's index within the group.
3634 APInt Offset(32,
3635 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3636 IG->getIndex(IRInsertPos),
3637 /*IsSigned=*/true);
3638 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3639 VPBuilder B(InsertPos);
3640 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3641 }
3642 // If the group is reverse, adjust the index to refer to the last vector
3643 // lane instead of the first. We adjust the index from the first vector
3644 // lane, rather than directly getting the pointer for lane VF - 1, because
3645 // the pointer operand of the interleaved access is supposed to be uniform.
3646 if (IG->isReverse()) {
3647 auto *ReversePtr = new VPVectorEndPointerRecipe(
3648 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3649 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3650 ReversePtr->insertBefore(InsertPos);
3651 Addr = ReversePtr;
3652 }
3653 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3654 InsertPos->getMask(), NeedsMaskForGaps,
3655 InterleaveMD, InsertPos->getDebugLoc());
3656 VPIG->insertBefore(InsertPos);
3657
// Rewire users of each member's old recipe to the matching result of the new
// interleave recipe (J indexes only value-producing members), then erase the
// now-dead per-member recipes.
3658 unsigned J = 0;
3659 for (unsigned i = 0; i < IG->getFactor(); ++i)
3660 if (Instruction *Member = IG->getMember(i)) {
3661 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3662 if (!Member->getType()->isVoidTy()) {
3663 VPValue *OriginalV = MemberR->getVPSingleValue();
3664 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3665 J++;
3666 }
3667 MemberR->eraseFromParent();
3668 }
3669 }
3670}
3671
3672/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3673/// value, phi and backedge value. In the following example:
3674///
3675/// vector.ph:
3676/// Successor(s): vector loop
3677///
3678/// <x1> vector loop: {
3679/// vector.body:
3680/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3681/// ...
3682/// EMIT branch-on-count ...
3683/// No successors
3684/// }
3685///
3686/// WIDEN-INDUCTION will get expanded to:
3687///
3688/// vector.ph:
3689/// ...
3690/// vp<%induction.start> = ...
3691/// vp<%induction.increment> = ...
3692///
3693/// Successor(s): vector loop
3694///
3695/// <x1> vector loop: {
3696/// vector.body:
3697/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3698/// ...
3699/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3700/// EMIT branch-on-count ...
3701/// No successors
3702/// }
3703static void
3705 VPTypeAnalysis &TypeInfo) {
3706 VPlan *Plan = WidenIVR->getParent()->getPlan();
3707 VPValue *Start = WidenIVR->getStartValue();
3708 VPValue *Step = WidenIVR->getStepValue();
3709 VPValue *VF = WidenIVR->getVFValue();
3710 DebugLoc DL = WidenIVR->getDebugLoc();
3711
3712 // The value from the original loop to which we are mapping the new induction
3713 // variable.
3714 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3715
3716 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3719 VPIRFlags Flags = *WidenIVR;
// Integer inductions advance with integer add/mul; otherwise use the
// descriptor's induction opcode and fmul for floating-point inductions.
3720 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3721 AddOp = Instruction::Add;
3722 MulOp = Instruction::Mul;
3723 } else {
3724 AddOp = ID.getInductionOpcode();
3725 MulOp = Instruction::FMul;
3726 }
3727
3728 // If the phi is truncated, truncate the start and step values.
3729 VPBuilder Builder(Plan->getVectorPreheader());
3730 Type *StepTy = TypeInfo.inferScalarType(Step);
3731 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3732 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3733 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3734 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3735 // Truncation doesn't preserve WrapFlags.
3736 Flags.dropPoisonGeneratingFlags();
3737 StepTy = Ty;
3738 }
3739
3740 // Construct the initial value of the vector IV in the vector loop preheader.
3741 Type *IVIntTy =
3743 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3744 if (StepTy->isFloatingPointTy())
3745 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3746
3747 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3748 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3749
// Initial vector IV: splat(start) + <0, 1, ..., VF-1> * splat(step).
3750 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3751 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3752 DebugLoc::getUnknown(), "induction");
3753
3754 // Create the widened phi of the vector IV.
3755 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3756 WidenIVR->getDebugLoc(), "vec.ind");
3757 WidePHI->insertBefore(WidenIVR);
3758
3759 // Create the backedge value for the vector IV.
3760 VPValue *Inc;
3761 VPValue *Prev;
3762 // If unrolled, use the increment and prev value from the operands.
3763 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3764 Inc = SplatVF;
3765 Prev = WidenIVR->getLastUnrolledPartOperand();
3766 } else {
3767 if (VPRecipeBase *R = VF->getDefiningRecipe())
3768 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3769 // Multiply the vectorization factor by the step using integer or
3770 // floating-point arithmetic as appropriate.
3771 if (StepTy->isFloatingPointTy())
3772 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3773 DL);
3774 else
3775 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3776 TypeInfo.inferScalarType(VF), DL);
3777
3778 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3779 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3780 Prev = WidePHI;
3781 }
3782
// Emit the backedge value in the exiting block, right before its terminator.
3784 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3785 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3786 WidenIVR->getDebugLoc(), "vec.ind.next");
3787
3788 WidePHI->addOperand(Next);
3789
3790 WidenIVR->replaceAllUsesWith(WidePHI);
3791}
3792
3793/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3794/// initial value, phi and backedge value. In the following example:
3795///
3796/// <x1> vector loop: {
3797/// vector.body:
3798/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3799/// ...
3800/// EMIT branch-on-count ...
3801/// }
3802///
3803/// WIDEN-POINTER-INDUCTION will get expanded to:
3804///
3805/// <x1> vector loop: {
3806/// vector.body:
3807/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3808/// EMIT %mul = mul %stepvector, %step
3809/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3810/// ...
3811/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3812/// EMIT branch-on-count ...
3813/// }
3815 VPTypeAnalysis &TypeInfo) {
3816 VPlan *Plan = R->getParent()->getPlan();
3817 VPValue *Start = R->getStartValue();
3818 VPValue *Step = R->getStepValue();
3819 VPValue *VF = R->getVFValue();
3820
3821 assert(R->getInductionDescriptor().getKind() ==
3823 "Not a pointer induction according to InductionDescriptor!");
3824 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3825 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3826 "Recipe should have been replaced");
3827
3828 VPBuilder Builder(R);
3829 DebugLoc DL = R->getDebugLoc();
3830
3831 // Build a scalar pointer phi.
3832 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3833
3834 // Create actual address geps that use the pointer phi as base and a
3835 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3836 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3837 Type *StepTy = TypeInfo.inferScalarType(Step);
3838 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3839 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3840 VPValue *PtrAdd =
3841 Builder.createWidePtrAdd(ScalarPtrPhi, Offset, DL, "vector.gep");
3842 R->replaceAllUsesWith(PtrAdd);
3843
3844 // Create the backedge value for the scalar pointer phi.
3846 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3847 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3848 DL);
// The per-iteration pointer increment is step * VF (VF cast to the step type
// above), added to the scalar phi to form the next iteration's base pointer.
3849 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3850
3851 VPValue *InductionGEP =
3852 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3853 ScalarPtrPhi->addOperand(InductionGEP);
3854}
3855
3857 // Replace loop regions with explicit CFG.
// Collect the regions first; dissolving a region while the depth-first
// traversal below is still running would mutate the CFG being walked.
3858 SmallVector<VPRegionBlock *> LoopRegions;
3860 vp_depth_first_deep(Plan.getEntry()))) {
3861 if (!R->isReplicator())
3862 LoopRegions.push_back(R);
3863 }
3864 for (VPRegionBlock *R : LoopRegions)
3865 R->dissolveToCFGLoop();
3866}
3867
3870 // The transform runs after dissolving loop regions, so all VPBasicBlocks
3871 // terminated with BranchOnTwoConds are reached via a shallow traversal.
3874 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
3875 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
3876 }
3877
3878 // Expand BranchOnTwoConds instructions into explicit CFG with two new
3879 // single-condition branches:
3880 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
3881 // the first condition is true, and otherwise jumps to a new interim block.
3882 // 2. A branch that ends the interim block, jumps to the second successor if
3883 // the second condition is true, and otherwise jumps to the third
3884 // successor.
3885 for (VPInstruction *Br : WorkList) {
3886 assert(Br->getNumOperands() == 2 &&
3887 "BranchOnTwoConds must have exactly 2 conditions");
3888 DebugLoc DL = Br->getDebugLoc();
3889 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
3890 const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
3891 assert(Successors.size() == 3 &&
3892 "BranchOnTwoConds must have exactly 3 successors");
3893
// Drop all three successor edges first; they are re-established below via
// the new two-branch structure.
3894 for (VPBlockBase *Succ : Successors)
3895 VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
3896
3897 VPValue *Cond0 = Br->getOperand(0);
3898 VPValue *Cond1 = Br->getOperand(1);
3899 VPBlockBase *Succ0 = Successors[0];
3900 VPBlockBase *Succ1 = Successors[1];
3901 VPBlockBase *Succ2 = Successors[2];
3902 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
3903 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
3904
3905 VPBasicBlock *InterimBB =
3906 Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
3907
// Branch 1: on Cond0, go to Succ0, otherwise fall through to InterimBB.
3908 VPBuilder(BrOnTwoCondsBB)
3910 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
3911 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
3912
// Branch 2 (in InterimBB): on Cond1, go to Succ1, otherwise Succ2.
3914 VPBlockUtils::connectBlocks(InterimBB, Succ1);
3915 VPBlockUtils::connectBlocks(InterimBB, Succ2);
3916 Br->eraseFromParent();
3917 }
3918}
3919
3921 VPTypeAnalysis TypeInfo(Plan);
3924 vp_depth_first_deep(Plan.getEntry()))) {
3925 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3926 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3927 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3928 ToRemove.push_back(WidenIVR);
3929 continue;
3930 }
3931
3932 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3933 // If the recipe only generates scalars, scalarize it instead of
3934 // expanding it.
3935 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
3936 VPBuilder Builder(WidenIVR);
3937 VPValue *PtrAdd =
3938 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
3939 WidenIVR->replaceAllUsesWith(PtrAdd);
3940 ToRemove.push_back(WidenIVR);
3941 continue;
3942 }
3943 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3944 ToRemove.push_back(WidenIVR);
3945 continue;
3946 }
3947
3948 // Expand VPBlendRecipe into VPInstruction::Select.
3949 VPBuilder Builder(&R);
3950 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
// Fold the incoming values into a chain of selects, starting from incoming
// value 0 and selecting each later incoming value under its mask.
3951 VPValue *Select = Blend->getIncomingValue(0);
3952 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3953 Select = Builder.createSelect(Blend->getMask(I),
3954 Blend->getIncomingValue(I), Select,
3955 R.getDebugLoc(), "predphi", *Blend);
3956 Blend->replaceAllUsesWith(Select);
3957 ToRemove.push_back(Blend);
3958 }
3959
3960 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
3961 if (!VEPR->getOffset()) {
3962 assert(Plan.getConcreteUF() == 1 &&
3963 "Expected unroller to have materialized offset for UF != 1");
3964 VEPR->materializeOffset();
3965 }
3966 }
3967
3968 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3969 Expr->decompose();
3970 ToRemove.push_back(Expr);
3971 }
3972
3973 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
3974 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
3975 if (LastActiveL &&
3976 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
3977 // Create Not(Mask) for all operands.
3979 for (VPValue *Op : LastActiveL->operands()) {
3980 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
3981 NotMasks.push_back(NotMask);
3982 }
3983
3984 // Create FirstActiveLane on the inverted masks.
3985 VPValue *FirstInactiveLane = Builder.createNaryOp(
3987 LastActiveL->getDebugLoc(), "first.inactive.lane");
3988
3989 // Subtract 1 to get the last active lane.
3990 VPValue *One = Plan.getConstantInt(64, 1);
3991 VPValue *LastLane =
3992 Builder.createSub(FirstInactiveLane, One,
3993 LastActiveL->getDebugLoc(), "last.active.lane");
3994
3995 LastActiveL->replaceAllUsesWith(LastLane);
3996 ToRemove.push_back(LastActiveL);
3997 continue;
3998 }
3999
4000 // Lower MaskedCond with block mask to LogicalAnd.
4002 auto *VPI = cast<VPInstruction>(&R);
4003 assert(VPI->isMasked() &&
4004 "Unmasked MaskedCond should be simplified earlier");
4005 VPI->replaceAllUsesWith(Builder.createNaryOp(
4006 VPInstruction::LogicalAnd, {VPI->getOperand(0), VPI->getMask()}));
4007 ToRemove.push_back(VPI);
4008 continue;
4009 }
4010
4011 // Lower BranchOnCount to ICmp + BranchOnCond.
4012 VPValue *IV, *TC;
4013 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
4014 auto *BranchOnCountInst = cast<VPInstruction>(&R);
4015 DebugLoc DL = BranchOnCountInst->getDebugLoc();
4016 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
4017 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
4018 ToRemove.push_back(BranchOnCountInst);
4019 continue;
4020 }
4021
4022 VPValue *VectorStep;
4023 VPValue *ScalarStep;
4025 m_VPValue(VectorStep), m_VPValue(ScalarStep)))) 
4026 continue;
4027
4028 // Expand WideIVStep.
4029 auto *VPI = cast<VPInstruction>(&R);
4030 Type *IVTy = TypeInfo.inferScalarType(VPI);
// Bring both step operands to the IV's scalar type before multiplying.
4031 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
4033 ? Instruction::UIToFP
4034 : Instruction::Trunc;
4035 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
4036 }
4037
4038 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
4039 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
4040 ScalarStep =
4041 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
4042 }
4043
4044 VPIRFlags Flags;
4045 unsigned MulOpc;
4046 if (IVTy->isFloatingPointTy()) {
4047 MulOpc = Instruction::FMul;
4048 Flags = VPI->getFastMathFlags();
4049 } else {
4050 MulOpc = Instruction::Mul;
4051 Flags = VPIRFlags::getDefaultFlags(MulOpc);
4052 }
4053
4054 VPInstruction *Mul = Builder.createNaryOp(
4055 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
4056 VectorStep = Mul;
4057 VPI->replaceAllUsesWith(VectorStep);
4058 ToRemove.push_back(VPI);
4059 }
4060 }
4061
// Erase replaced recipes only after the walk, so the early-inc traversal
// above never touches freed recipes.
4062 for (VPRecipeBase *R : ToRemove)
4063 R->eraseFromParent();
4064}
4065
4067 VPBasicBlock *HeaderVPBB,
4068 VPBasicBlock *LatchVPBB,
4069 VPBasicBlock *MiddleVPBB) {
// Rewrites a vector loop with uncountable early exits so the latch exits via
// a single BranchOnTwoConds: condition 0 dispatches to per-exit blocks when
// any early exit is taken, condition 1 handles the counted latch exit.
4070 struct EarlyExitInfo {
4071 VPBasicBlock *EarlyExitingVPBB;
4072 VPIRBasicBlock *EarlyExitVPBB;
4073 VPValue *CondToExit;
4074 };
4075
4076 VPDominatorTree VPDT(Plan);
4077 VPBuilder Builder(LatchVPBB->getTerminator());
4079 for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
4080 for (VPBlockBase *Pred : to_vector(ExitBlock->getPredecessors())) {
4081 if (Pred == MiddleVPBB)
4082 continue;
4083 // Collect condition for this early exit.
4084 auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
4085 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
4086 VPValue *CondOfEarlyExitingVPBB;
4087 [[maybe_unused]] bool Matched =
4088 match(EarlyExitingVPBB->getTerminator(),
4089 m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
4090 assert(Matched && "Terminator must be BranchOnCond");
4091
4092 // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
4093 // the correct block mask.
4094 VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
// Normalize the condition so it is true when the exit is taken: negate it
// when the exit block is the false successor of the branch.
4095 auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
4097 TrueSucc == ExitBlock
4098 ? CondOfEarlyExitingVPBB
4099 : EarlyExitingBuilder.createNot(CondOfEarlyExitingVPBB));
4100 assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
4101 !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
4102 VPDT.properlyDominates(
4103 CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
4104 LatchVPBB)) &&
4105 "exit condition must dominate the latch");
4106 Exits.push_back({
4107 EarlyExitingVPBB,
4108 ExitBlock,
4109 CondToEarlyExit,
4110 });
4111 }
4112 }
4113
4114 assert(!Exits.empty() && "must have at least one early exit");
4115 // Sort exits by dominance to get the correct program order.
4116 llvm::sort(Exits, [&VPDT](const EarlyExitInfo &A, const EarlyExitInfo &B) {
4117 return VPDT.properlyDominates(A.EarlyExitingVPBB, B.EarlyExitingVPBB);
4118 });
4119
4120 // Build the AnyOf condition for the latch terminator using logical OR
4121 // to avoid poison propagation from later exit conditions when an earlier
4122 // exit is taken.
4123 VPValue *Combined = Exits[0].CondToExit;
4124 for (const EarlyExitInfo &Info : drop_begin(Exits))
4125 Combined = Builder.createLogicalOr(Combined, Info.CondToExit);
4126
4127 VPValue *IsAnyExitTaken =
4128 Builder.createNaryOp(VPInstruction::AnyOf, {Combined});
4129
4130 // Create the vector.early.exit blocks.
4131 SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
4132 for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
4133 Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
4134 VPBasicBlock *VectorEarlyExitVPBB =
4135 Plan.createVPBasicBlock("vector.early.exit" + BlockSuffix);
4136 VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
4137 }
4138
4139 // Create the dispatch block (or reuse the single exit block if only one
4140 // exit). The dispatch block computes the first active lane of the combined
4141 // condition and, for multiple exits, chains through conditions to determine
4142 // which exit to take.
4143 VPBasicBlock *DispatchVPBB =
4144 Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
4145 : Plan.createVPBasicBlock("vector.early.exit.check");
4146 VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
4147 VPValue *FirstActiveLane =
4148 DispatchBuilder.createNaryOp(VPInstruction::FirstActiveLane, {Combined},
4149 DebugLoc::getUnknown(), "first.active.lane");
4150
4151 // For each early exit, disconnect the original exiting block
4152 // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
4153 // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
4154 // values at the first active lane:
4155 //
4156 // Input:
4157 // early.exiting.I:
4158 // ...
4159 // EMIT branch-on-cond vp<%cond.I>
4160 // Successor(s): in.loop.succ, ir-bb<exit.I>
4161 //
4162 // ir-bb<exit.I>:
4163 // IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
4164 //
4165 // Output:
4166 // early.exiting.I:
4167 // ...
4168 // Successor(s): in.loop.succ
4169 //
4170 // vector.early.exit.I:
4171 // EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
4172 // Successor(s): ir-bb<exit.I>
4173 //
4174 // ir-bb<exit.I>:
4175 // IR %phi = phi ... (extra operand: vp<%exit.val> from
4176 // vector.early.exit.I)
4177 //
4178 for (auto [Exit, VectorEarlyExitVPBB] :
4179 zip_equal(Exits, VectorEarlyExitVPBBs)) {
4180 auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
4181 // Adjust the phi nodes in EarlyExitVPBB.
4182 // 1. remove incoming values from EarlyExitingVPBB,
4183 // 2. extract the incoming value at FirstActiveLane
4184 // 3. add back the extracts as last operands for the phis
4185 // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
4186 // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
4187 // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
4188 // values from VectorEarlyExitVPBB.
4189 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
4190 auto *ExitIRI = cast<VPIRPhi>(&R);
4191 VPValue *IncomingVal =
4192 ExitIRI->getIncomingValueForBlock(EarlyExitingVPBB);
4193 VPValue *NewIncoming = IncomingVal;
// Live-in (IR) values need no extract; only values defined inside the plan
// must be narrowed to the lane that actually took the exit.
4194 if (!isa<VPIRValue>(IncomingVal)) {
4195 VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
4196 NewIncoming = EarlyExitBuilder.createNaryOp(
4197 VPInstruction::ExtractLane, {FirstActiveLane, IncomingVal},
4198 DebugLoc::getUnknown(), "early.exit.value");
4199 }
4200 ExitIRI->removeIncomingValueFor(EarlyExitingVPBB);
4201 ExitIRI->addOperand(NewIncoming);
4202 }
4203
4204 EarlyExitingVPBB->getTerminator()->eraseFromParent();
4205 VPBlockUtils::disconnectBlocks(EarlyExitingVPBB, EarlyExitVPBB);
4206 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
4207 }
4208
4209 // Chain through exits: for each exit, check if its condition is true at
4210 // the first active lane. If so, take that exit; otherwise, try the next.
4211 // The last exit needs no check since it must be taken if all others fail.
4212 //
4213 // For 3 exits (cond.0, cond.1, cond.2), this creates:
4214 //
4215 // latch:
4216 // ...
4217 // EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
4218 // ...
4219 //
4220 // vector.early.exit.check:
4221 // EMIT vp<%first.lane> = first-active-lane vp<%combined>
4222 // EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
4223 // EMIT branch-on-cond vp<%at.cond.0>
4224 // Successor(s): vector.early.exit.0, vector.early.exit.check.0
4225 //
4226 // vector.early.exit.check.0:
4227 // EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
4228 // EMIT branch-on-cond vp<%at.cond.1>
4229 // Successor(s): vector.early.exit.1, vector.early.exit.2
4230 VPBasicBlock *CurrentBB = DispatchVPBB;
4231 for (auto [I, Exit] : enumerate(ArrayRef(Exits).drop_back())) {
4232 VPValue *LaneVal = DispatchBuilder.createNaryOp(
4233 VPInstruction::ExtractLane, {FirstActiveLane, Exit.CondToExit},
4234 DebugLoc::getUnknown(), "exit.cond.at.lane");
4235
4236 // For the last dispatch, branch directly to the last exit on false;
4237 // otherwise, create a new check block.
4238 bool IsLastDispatch = (I + 2 == Exits.size());
4239 VPBasicBlock *FalseBB =
4240 IsLastDispatch ? VectorEarlyExitVPBBs.back()
4241 : Plan.createVPBasicBlock(
4242 Twine("vector.early.exit.check.") + Twine(I));
4243
4244 DispatchBuilder.createNaryOp(VPInstruction::BranchOnCond, {LaneVal});
4245 CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
4246 VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
4247 FalseBB->setPredecessors({CurrentBB});
4248
4249 CurrentBB = FalseBB;
4250 DispatchBuilder.setInsertPoint(CurrentBB);
4251 }
4252
4253 // Replace the latch terminator with the new branching logic.
4254 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4255 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
4256 "Unexpected terminator");
4257 auto *IsLatchExitTaken =
4258 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
4259 LatchExitingBranch->getOperand(1));
4260
4261 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4262 LatchExitingBranch->eraseFromParent();
4263 Builder.setInsertPoint(LatchVPBB);
4264 Builder.createNaryOp(VPInstruction::BranchOnTwoConds,
4265 {IsAnyExitTaken, IsLatchExitTaken}, LatchDL);
4266 LatchVPBB->clearSuccessors();
4267 LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
4268 DispatchVPBB->setPredecessors({LatchVPBB});
4269}
4270
4271/// This function tries to convert extended in-loop reductions to
4272/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
4273/// valid. The created recipe must be decomposed to its constituent
4274/// recipes before execution.
4275static VPExpressionRecipe *
4277 VFRange &Range) {
4278 Type *RedTy = Ctx.Types.inferScalarType(Red);
4279 VPValue *VecOp = Red->getVecOp();
4280
4281 // Clamp the range if using extended-reduction is profitable.
4282 auto IsExtendedRedValidAndClampRange =
4283 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4285 [&](ElementCount VF) {
4286 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4288
4290 InstructionCost ExtCost =
4291 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4292 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4293
4294 if (Red->isPartialReduction()) {
4297 // FIXME: Move partial reduction creation, costing and clamping
4298 // here from LoopVectorize.cpp.
4299 ExtRedCost = Ctx.TTI.getPartialReductionCost(
4300 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
4301 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind,
4302 RedTy->isFloatingPointTy()
4303 ? std::optional{Red->getFastMathFlags()}
4304 : std::nullopt);
4305 } else if (!RedTy->isFloatingPointTy()) {
4306 // TTI::getExtendedReductionCost only supports integer types.
4307 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4308 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4309 Red->getFastMathFlags(), CostKind);
4310 }
// Only fold when the combined ext+reduce is strictly cheaper than the
// separate extend and reduction recipes.
4311 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4312 },
4313 Range);
4314 };
4315
4316 VPValue *A;
4317 // Match reduce(ext(A)).
4318 if (isa<VPWidenCastRecipe>(VecOp) &&
4319 (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) ||
4320 match(VecOp, m_FPExt(m_VPValue(A)))) &&
4321 IsExtendedRedValidAndClampRange(
4322 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4323 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
4324 Ctx.Types.inferScalarType(A)))
4325 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4326
4327 return nullptr;
4328}
4329
4330/// This function tries to convert extended in-loop reductions to
4331/// VPExpressionRecipe and clamp the \p Range if it is beneficial
4332/// and valid. The created VPExpressionRecipe must be decomposed to its
4333/// constituent recipes before execution. Patterns of the
4334/// VPExpressionRecipe:
4335/// reduce.add(mul(...)),
4336/// reduce.add(mul(ext(A), ext(B))),
4337/// reduce.add(ext(mul(ext(A), ext(B)))),
4338/// reduce.fadd(fmul(ext(A), ext(B))).
4339static VPExpressionRecipe *
4341 VPCostContext &Ctx, VFRange &Range) {
4342 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4343 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4344 Opcode != Instruction::FAdd)
4345 return nullptr;
4346
4347 Type *RedTy = Ctx.Types.inferScalarType(Red);
4348
4349 // Clamp the range if using multiply-accumulate-reduction is profitable.
4350 auto IsMulAccValidAndClampRange =
4352 VPWidenCastRecipe *OuterExt) -> bool {
4354 [&](ElementCount VF) {
4356 Type *SrcTy =
4357 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
4358 InstructionCost MulAccCost;
4359
4360 if (Red->isPartialReduction()) {
4361 Type *SrcTy2 =
4362 Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
4363 // FIXME: Move partial reduction creation, costing and clamping
4364 // here from LoopVectorize.cpp.
4365 MulAccCost = Ctx.TTI.getPartialReductionCost(
4366 Opcode, SrcTy, SrcTy2, RedTy, VF,
4368 Ext0->getOpcode())
4371 Ext1->getOpcode())
4373 Mul->getOpcode(), CostKind,
4374 RedTy->isFloatingPointTy()
4375 ? std::optional{Red->getFastMathFlags()}
4376 : std::nullopt);
4377 } else {
4378 // Only partial reductions support mixed or floating-point extends
4379 // at the moment.
4380 if (Ext0 && Ext1 &&
4381 (Ext0->getOpcode() != Ext1->getOpcode() ||
4382 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4383 return false;
4384
4385 bool IsZExt =
4386 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4387 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4388 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4389 SrcVecTy, CostKind);
4390 }
4391
4392 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4393 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4394 InstructionCost ExtCost = 0;
4395 if (Ext0)
4396 ExtCost += Ext0->computeCost(VF, Ctx);
4397 if (Ext1)
4398 ExtCost += Ext1->computeCost(VF, Ctx);
4399 if (OuterExt)
4400 ExtCost += OuterExt->computeCost(VF, Ctx);
4401
4402 return MulAccCost.isValid() &&
4403 MulAccCost < ExtCost + MulCost + RedCost;
4404 },
4405 Range);
4406 };
4407
4408 VPValue *VecOp = Red->getVecOp();
4409 VPRecipeBase *Sub = nullptr;
4410 VPValue *A, *B;
4411 VPValue *Tmp = nullptr;
4412
4413 // Try to match reduce.fadd(fmul(fpext(...), fpext(...))).
4414 if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue())))) {
4415 assert(Opcode == Instruction::FAdd &&
4416 "MulAccumulateReduction from an FMul must accumulate into an FAdd "
4417 "instruction");
4418 auto *FMul = dyn_cast<VPWidenRecipe>(VecOp);
4419 if (!FMul)
4420 return nullptr;
4421
4422 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(FMul->getOperand(0));
4423 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(FMul->getOperand(1));
4424
4425 if (RecipeA && RecipeB &&
4426 IsMulAccValidAndClampRange(FMul, RecipeA, RecipeB, nullptr)) {
4427 return new VPExpressionRecipe(RecipeA, RecipeB, FMul, Red);
4428 }
4429 }
4430 if (RedTy->isFloatingPointTy())
4431 return nullptr;
4432
4433 // Sub reductions could have a sub between the add reduction and vec op.
4434 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4435 Sub = VecOp->getDefiningRecipe();
4436 VecOp = Tmp;
4437 }
4438
4439 // If ValB is a constant and can be safely extended, truncate it to the same
4440 // type as ExtA's operand, then extend it to the same type as ExtA. This
4441 // creates two uniform extends that can more easily be matched by the rest of
4442 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4443 // replaced with the new extend of the constant.
4444 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
4445 VPWidenCastRecipe *&ExtB,
4446 VPValue *&ValB, VPWidenRecipe *Mul) {
4447 if (!ExtA || ExtB || !isa<VPIRValue>(ValB))
4448 return;
4449 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
4450 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4451 const APInt *Const;
4452 if (!match(ValB, m_APInt(Const)) ||
4454 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4455 return;
4456 // The truncate ensures that the type of each extended operand is the
4457 // same, and it's been proven that the constant can be extended from
4458 // NarrowTy safely. Necessary since ExtA's extended operand would be
4459 // e.g. an i8, while the const will likely be an i32. This will be
4460 // elided by later optimisations.
4461 VPBuilder Builder(Mul);
4462 auto *Trunc =
4463 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4464 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
4465 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4466 Mul->setOperand(1, ExtB);
4467 };
4468
4469 // Try to match reduce.add(mul(...)).
4470 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4473 auto *Mul = cast<VPWidenRecipe>(VecOp);
4474
4475 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4476 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4477
4478 // Match reduce.add/sub(mul(ext, ext)).
4479 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4480 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4481 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4482 if (Sub)
4483 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4484 cast<VPWidenRecipe>(Sub), Red);
4485 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4486 }
4487 // TODO: Add an expression type for this variant with a negated mul
4488 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4489 return new VPExpressionRecipe(Mul, Red);
4490 }
4491 // TODO: Add an expression type for negated versions of other expression
4492 // variants.
4493 if (Sub)
4494 return nullptr;
4495
4496 // Match reduce.add(ext(mul(A, B))).
4497 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4498 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4499 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4502
4503 // reduce.add(ext(mul(ext, const)))
4504 // -> reduce.add(ext(mul(ext, ext(const))))
4505 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4506
4507 // reduce.add(ext(mul(ext(A), ext(B))))
4508 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4509 // The inner extends must either have the same opcode as the outer extend or
4510 // be the same, in which case the multiply can never result in a negative
4511 // value and the outer extend can be folded away by doing wider
4512 // extends for the operands of the mul.
4513 if (Ext0 && Ext1 &&
4514 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4515 Ext0->getOpcode() == Ext1->getOpcode() &&
4516 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
4517 auto *NewExt0 = new VPWidenCastRecipe(
4518 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
4519 *Ext0, *Ext0, Ext0->getDebugLoc());
4520 NewExt0->insertBefore(Ext0);
4521
4522 VPWidenCastRecipe *NewExt1 = NewExt0;
4523 if (Ext0 != Ext1) {
4524 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4525 Ext->getResultType(), nullptr, *Ext1,
4526 *Ext1, Ext1->getDebugLoc());
4527 NewExt1->insertBefore(Ext1);
4528 }
4529 Mul->setOperand(0, NewExt0);
4530 Mul->setOperand(1, NewExt1);
4531 Red->setOperand(1, Mul);
4532 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4533 }
4534 }
4535 return nullptr;
4536}
4537
4538/// This function tries to create abstract recipes from the reduction recipe for
4539/// following optimizations and cost estimation.
/// Tries the multiply-accumulate pattern first and falls back to an extended
/// reduction; on success the abstract expression recipe is inserted right
/// after the original reduction and replaces all of its uses.
/// NOTE(review): the first signature line is not visible in this extract —
/// presumably the reduction recipe parameter (Red); confirm against upstream.
4541 VPCostContext &Ctx,
4542 VFRange &Range) {
4543 VPExpressionRecipe *AbstractR = nullptr;
// Remember the insertion point immediately after the original reduction.
4544 auto IP = std::next(Red->getIterator());
4545 auto *VPBB = Red->getParent();
// Prefer a mul-accumulate expression; otherwise try an extended reduction.
4546 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4547 AbstractR = MulAcc;
4548 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4549 AbstractR = ExtRed;
4550 // Cannot create abstract inloop reduction recipes.
4551 if (!AbstractR)
4552 return;
4553
// Insert the abstract recipe and redirect all users of Red to it.
4554 AbstractR->insertBefore(*VPBB, IP);
4555 Red->replaceAllUsesWith(AbstractR);
4556}
4557
4568
/// Materializes explicit Broadcast VPInstructions for live-ins and values
/// defined in the plan entry that have vector users, inserting each broadcast
/// at a point that dominates all of its vector users.
/// NOTE(review): function signature and the collection of the first candidate
/// values are elided in this extract — confirm against upstream source.
4570 if (Plan.hasScalarVFOnly())
4571 return;
4572
4573#ifndef NDEBUG
// Dominator tree is only needed for the assertion below.
4574 VPDominatorTree VPDT(Plan);
4575#endif
4576
4577 SmallVector<VPValue *> VPValues;
// Consider all live-ins plus everything defined in the entry block.
4580 append_range(VPValues, Plan.getLiveIns());
4581 for (VPRecipeBase &R : *Plan.getEntry())
4582 append_range(VPValues, R.definedValues());
4583
4584 auto *VectorPreheader = Plan.getVectorPreheader();
4585 for (VPValue *VPV : VPValues) {
// Skip values with only scalar users or constant live-ins (condition start
// elided in this extract).
4587 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
4588 continue;
4589
4590 // Add explicit broadcast at the insert point that dominates all users.
4591 VPBasicBlock *HoistBlock = VectorPreheader;
4592 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4593 for (VPUser *User : VPV->users()) {
4594 if (User->usesScalars(VPV))
4595 continue;
// If a vector user lives in the preheader itself, hoist to its start.
4596 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4597 HoistPoint = HoistBlock->begin();
4598 else
4599 assert(VPDT.dominates(VectorPreheader,
4600 cast<VPRecipeBase>(User)->getParent()) &&
4601 "All users must be in the vector preheader or dominated by it");
4602 }
4603
4604 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4605 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
// Redirect only vector users; the broadcast itself keeps using VPV.
4606 VPV->replaceUsesWithIf(Broadcast,
4607 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4608 return Broadcast != &U && !U.usesScalars(VPV);
4609 });
4610 }
4611}
4612
/// Hoists single-scalar, unpredicated loads with loop-invariant addresses and
/// noalias-scope metadata into the vector preheader, when the metadata proves
/// they cannot alias any memory-writing recipe in the loop.
/// NOTE(review): the signature and the declarations of the CandidateLoads /
/// Stores containers are elided in this extract — confirm against upstream.
4614 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4615
4616 // Collect candidate loads with invariant addresses and noalias scopes
4617 // metadata and memory-writing recipes with noalias metadata.
4621 vp_depth_first_shallow(LoopRegion->getEntry()))) {
4622 for (VPRecipeBase &R : *VPBB) {
4623 // Only handle single-scalar replicated loads with invariant addresses.
4624 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4625 if (RepR->isPredicated() || !RepR->isSingleScalar() ||
4626 RepR->getOpcode() != Instruction::Load)
4627 continue;
4628
4629 VPValue *Addr = RepR->getOperand(0);
4630 if (Addr->isDefinedOutsideLoopRegions()) {
// Only loads carrying alias-scope metadata can be reasoned about here.
4632 if (!Loc.AATags.Scope)
4633 continue;
4634 CandidateLoads.push_back({RepR, Loc});
4635 }
4636 }
4637 if (R.mayWriteToMemory()) {
// Bail out of the whole transform if any writer lacks usable metadata.
4639 if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
4640 return;
4641 Stores.push_back(*Loc);
4642 }
4643 }
4644 }
4645
4646 VPBasicBlock *Preheader = Plan.getVectorPreheader();
4647 for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
4648 // Hoist the load to the preheader if it doesn't alias with any stores
4649 // according to the noalias metadata. Other loads should have been hoisted
4650 // by other passes
4651 const AAMDNodes &LoadAA = LoadLoc.AATags;
4652 if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
4654 LoadAA.Scope, StoreLoc.AATags.NoAlias);
4655 })) {
4656 LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
4657 }
4658 }
4659}
4660
4661// Collect common metadata from a group of replicate recipes by intersecting
4662// metadata from all recipes in the group.
// Starts from the first recipe's metadata and keeps only entries present on
// every other recipe. Requires a non-empty group (front() is dereferenced).
// NOTE(review): the signature line is elided in this extract.
4664 VPIRMetadata CommonMetadata = *Recipes.front();
4665 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4666 CommonMetadata.intersect(*Recipe);
4667 return CommonMetadata;
4668}
4669
4670template <unsigned Opcode>
/// Groups predicated replicated loads/stores (selected by the Opcode template
/// parameter) that access the same address SCEV and have equal value types,
/// keeping only groups where at least one pair of masks is complementary
/// (M1 == NOT(M2)); each returned group is sorted in dominance order.
/// NOTE(review): the return type and leading parameters are elided in this
/// extract — confirm against upstream source.
4674 const Loop *L) {
4675 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4676 "Only Load and Store opcodes supported");
4677 constexpr bool IsLoad = (Opcode == Instruction::Load);
4678 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4679 VPDominatorTree VPDT(Plan);
4680 VPTypeAnalysis TypeInfo(Plan);
4681
4682 // Group predicated operations by their address SCEV.
4684 for (VPBlockBase *Block : vp_depth_first_shallow(LoopRegion->getEntry())) {
4685 auto *VPBB = cast<VPBasicBlock>(Block);
4686 for (VPRecipeBase &R : *VPBB) {
4687 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
4688 if (!RepR || RepR->getOpcode() != Opcode || !RepR->isPredicated())
4689 continue;
4690
4691 // For loads, operand 0 is address; for stores, operand 1 is address.
4692 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
4693 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
4694 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
4695 RecipesByAddress[AddrSCEV].push_back(RepR);
4696 }
4697 }
4698
4699 // For each address, collect operations with the same or complementary masks.
// For a load the value type is the result; for a store it is operand 0.
4701 auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4702 return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
4703 };
4704 for (auto &[Addr, Recipes] : RecipesByAddress) {
4705 if (Recipes.size() < 2)
4706 continue;
4707
4708 // Collect groups with the same or complementary masks.
// Entries are nulled out once claimed by a group, so each recipe is
// considered at most once.
4709 for (VPReplicateRecipe *&RecipeI : Recipes) {
4710 if (!RecipeI)
4711 continue;
4712
4713 VPValue *MaskI = RecipeI->getMask();
4714 Type *TypeI = GetLoadStoreValueType(RecipeI);
4716 Group.push_back(RecipeI);
4717 RecipeI = nullptr;
4718
4719 // Find all operations with the same or complementary masks.
4720 bool HasComplementaryMask = false;
4721 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4722 if (!RecipeJ)
4723 continue;
4724
4725 VPValue *MaskJ = RecipeJ->getMask();
4726 Type *TypeJ = GetLoadStoreValueType(RecipeJ);
4727 if (TypeI == TypeJ) {
4728 // Check if any operation in the group has a complementary mask with
4729 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4730 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4731 match(MaskJ, m_Not(m_Specific(MaskI)));
4732 Group.push_back(RecipeJ);
4733 RecipeJ = nullptr;
4734 }
4735 }
4736
4737 if (HasComplementaryMask) {
4738 assert(Group.size() >= 2 && "must have at least 2 entries");
4739 // Sort replicates by dominance order, with earliest (most dominating)
4740 // first.
4741 sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4742 return VPDT.properlyDominates(A, B);
4743 });
4744 AllGroups.push_back(std::move(Group));
4745 }
4746 }
4747 }
4748
4749 return AllGroups;
4750}
4751
4752// Find the recipe with minimum alignment in the group.
// InstType is the underlying IR instruction class (LoadInst/StoreInst) used to
// query the alignment. Requires a non-empty group.
// NOTE(review): the function name/parameter line is elided in this extract.
4753template <typename InstType>
4754static VPReplicateRecipe *
4756 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4757 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4758 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4759 });
4760}
4761
/// Replaces groups of predicated loads from the same address (with
/// complementary masks) by a single unpredicated load inserted at the
/// earliest (most dominating) member, after a metadata-based no-alias check.
/// NOTE(review): the signature and the call collecting the groups are elided
/// in this extract — confirm against upstream source.
4764 const Loop *L) {
4765 auto Groups =
4767 if (Groups.empty())
4768 return;
4769
4770 // Process each group of loads.
4771 for (auto &Group : Groups) {
4772 // Try to use the earliest (most dominating) load to replace all others.
4773 VPReplicateRecipe *EarliestLoad = Group[0];
4774 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4775 VPBasicBlock *LastBB = Group.back()->getParent();
4776
4777 // Check that the load doesn't alias with stores between first and last.
4778 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4779 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
4780 continue;
4781
4782 // Collect common metadata from all loads in the group.
4783 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4784
4785 // Find the load with minimum alignment to use.
// Using the smallest alignment is conservative and safe for all members.
4786 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4787
4788 // Create an unpredicated version of the earliest load with common
4789 // metadata.
4790 auto *UnpredicatedLoad = new VPReplicateRecipe(
4791 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4792 /*IsSingleScalar=*/false, /*Mask=*/nullptr, *EarliestLoad,
4793 CommonMetadata);
4794
4795 UnpredicatedLoad->insertBefore(EarliestLoad);
4796
4797 // Replace all loads in the group with the unpredicated load.
4798 for (VPReplicateRecipe *Load : Group) {
4799 Load->replaceAllUsesWith(UnpredicatedLoad);
4800 Load->eraseFromParent();
4801 }
4802 }
4803}
4804
/// Returns true if a group of predicated stores can be sunk past the recipes
/// between the first and last member, based on noalias-scope metadata.
/// Members of the group alias each other by construction and are excluded
/// from the check.
/// NOTE(review): the name/first-parameter line is elided in this extract.
4805static bool
4807 PredicatedScalarEvolution &PSE, const Loop &L,
4808 VPTypeAnalysis &TypeInfo) {
4809 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
// Without alias-scope metadata nothing can be proven; bail out.
4810 if (!StoreLoc || !StoreLoc->AATags.Scope)
4811 return false;
4812
4813 // When sinking a group of stores, all members of the group alias each other.
4814 // Skip them during the alias checks.
4815 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4816 StoresToSink.end());
4817
4818 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4819 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4820 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
4821 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
4822}
4823
/// Replaces groups of predicated stores to the same address (with
/// complementary masks) by one unconditional store of a select-chain over the
/// stored values, inserted at the last (most dominated) member.
/// NOTE(review): the signature and the call collecting the groups are elided
/// in this extract — confirm against upstream source.
4826 const Loop *L) {
4827 auto Groups =
4829 if (Groups.empty())
4830 return;
4831
4832 VPTypeAnalysis TypeInfo(Plan);
4833
4834 for (auto &Group : Groups) {
4835 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L, TypeInfo))
4836 continue;
4837
4838 // Use the last (most dominated) store's location for the unconditional
4839 // store.
4840 VPReplicateRecipe *LastStore = Group.back();
4841 VPBasicBlock *InsertBB = LastStore->getParent();
4842
4843 // Collect common alias metadata from all stores in the group.
4844 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4845
4846 // Build select chain for stored values.
// Later members take precedence: each select overrides the accumulated
// value where its mask is true.
4847 VPValue *SelectedValue = Group[0]->getOperand(0);
4848 VPBuilder Builder(InsertBB, LastStore->getIterator());
4849
4850 for (unsigned I = 1; I < Group.size(); ++I) {
4851 VPValue *Mask = Group[I]->getMask();
4852 VPValue *Value = Group[I]->getOperand(0);
4853 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4854 Group[I]->getDebugLoc());
4855 }
4856
4857 // Find the store with minimum alignment to use.
4858 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4859
4860 // Create unconditional store with selected value and common metadata.
4861 auto *UnpredicatedStore =
4862 new VPReplicateRecipe(StoreWithMinAlign->getUnderlyingInstr(),
4863 {SelectedValue, LastStore->getOperand(1)},
4864 /*IsSingleScalar=*/false,
4865 /*Mask=*/nullptr, *LastStore, CommonMetadata);
4866 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
4867
4868 // Remove all predicated stores from the group.
4869 for (VPReplicateRecipe *Store : Group)
4870 Store->eraseFromParent();
4871 }
4872}
4873
/// Sets the underlying IR value of the vector trip count when the original
/// trip count is a SCEV constant, computing it as (TC / (VF*UF)) * (VF*UF).
/// NOTE(review): the function name line and part of the early-exit condition
/// are elided in this extract — confirm against upstream source.
4875 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4877 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4878 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4879
4880 VPValue *TC = Plan.getTripCount();
4881 // Skip cases for which the trip count may be non-trivial to materialize.
4882 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
4883 // tail is required.
4884 if (!Plan.hasScalarTail() ||
4886 Plan.getScalarPreheader() ||
4887 !isa<VPIRValue>(TC))
4888 return;
4889
4890 // Materialize vector trip counts for constants early if it can simply
4891 // be computed as (Original TC / VF * UF) * VF * UF.
4892 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4893 // tail-folded loops.
4894 ScalarEvolution &SE = *PSE.getSE();
4895 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
4896 if (!isa<SCEVConstant>(TCScev))
4897 return;
// Round the constant trip count down to a multiple of VF*UF via SCEV.
4898 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
4899 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
4900 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
4901 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4902}
4903
/// Materializes the backedge-taken count as trip-count minus one in the
/// vector preheader, if it has any users.
/// NOTE(review): the function name line and the line obtaining BTC (presumably
/// Plan.getBackedgeTakenCount()) are elided in this extract.
4905 VPBasicBlock *VectorPH) {
4907 if (BTC->getNumUsers() == 0)
4908 return;
4909
4910 VPBuilder Builder(VectorPH, VectorPH->begin());
4911 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
// backedge-taken count == trip count - 1.
4912 auto *TCMO =
4913 Builder.createSub(Plan.getTripCount(), Plan.getConstantInt(TCTy, 1),
4914 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
4915 BTC->replaceAllUsesWith(TCMO);
4916}
4917
/// Materializes explicit Build(Struct)Vector (pack) instructions for
/// replicating recipes with vector users, and Unpack instructions for vector
/// definitions with scalar users, outside replicate regions.
/// NOTE(review): the function name line and several condition/opcode lines
/// are elided in this extract — confirm against upstream source.
4919 if (Plan.hasScalarVFOnly())
4920 return;
4921
4922 VPTypeAnalysis TypeInfo(Plan);
4923 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4924 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4926 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4927 vp_depth_first_shallow(LoopRegion->getEntry()));
4928 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
4929 // VPInstructions, excluding ones in replicate regions. Those are not
4930 // materialized explicitly yet. Those vector users are still handled in
4931 // VPReplicateRegion::execute(), via shouldPack().
4932 // TODO: materialize build vectors for replicating recipes in replicating
4933 // regions.
4934 for (VPBasicBlock *VPBB :
4935 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
4936 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4938 continue;
4939 auto *DefR = cast<VPRecipeWithIRFlags>(&R);
// A user needs the packed vector if it uses vectors or sits outside the
// top-level loop region (e.g. inside a replicate region).
4940 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
4941 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4942 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
4943 };
4944 if ((isa<VPReplicateRecipe>(DefR) &&
4945 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
4946 (isa<VPInstruction>(DefR) &&
4948 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
4949 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
4950 continue;
4951
// Struct-typed results need the struct-vector variant of the pack.
4952 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
4953 unsigned Opcode = ScalarTy->isStructTy()
4956 auto *BuildVector = new VPInstruction(Opcode, {DefR});
4957 BuildVector->insertAfter(DefR);
4958
4959 DefR->replaceUsesWithIf(
4960 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
4961 VPUser &U, unsigned) {
4962 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
4963 });
4964 }
4965 }
4966
4967 // Create explicit VPInstructions to convert vectors to scalars. The current
4968 // implementation is conservative - it may miss some cases that may or may not
4969 // be vector values. TODO: introduce Unpacks speculatively - remove them later
4970 // if they are known to operate on scalar values.
4971 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
4972 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4975 continue;
4976 for (VPValue *Def : R.definedValues()) {
4977 // Skip recipes that are single-scalar or only have their first lane
4978 // used.
4979 // TODO: The Defs skipped here may or may not be vector values.
4980 // Introduce Unpacks, and remove them later, if they are guaranteed to
4981 // produce scalar values.
4983 continue;
4984
4985 // At the moment, we create unpacks only for scalar users outside
4986 // replicate regions. Recipes inside replicate regions still extract the
4987 // required lanes implicitly.
4988 // TODO: Remove once replicate regions are unrolled completely.
4989 auto IsCandidateUnpackUser = [Def](VPUser *U) {
4990 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4991 return U->usesScalars(Def) &&
4992 (!ParentRegion || !ParentRegion->isReplicator());
4993 };
4994 if (none_of(Def->users(), IsCandidateUnpackUser))
4995 continue;
4996
4997 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
// Phi results must be unpacked after all phis in the block.
4998 if (R.isPhi())
4999 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
5000 else
5001 Unpack->insertAfter(&R);
5002 Def->replaceUsesWithIf(Unpack,
5003 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
5004 return IsCandidateUnpackUser(&U);
5005 });
5006 }
5007 }
5008 }
5009}
5010
/// Materializes the symbolic vector trip count in the vector preheader:
/// N - (N % Step) normally, with N first rounded up to a multiple of Step
/// when tail-folding, and the remainder bumped to Step when a scalar epilogue
/// is required. Fix: the assert message read "fail folding" instead of
/// "tail folding".
/// NOTE(review): the function name line is elided in this extract — presumably
/// materializeVectorTripCount(VPlan &Plan, ...); confirm against upstream.
5012 VPBasicBlock *VectorPHVPBB,
5013 bool TailByMasking,
5014 bool RequiresScalarEpilogue) {
5015 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
5016 // There's nothing to do if there are no users of the vector trip count or its
5017 // IR value has already been set.
5018 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
5019 return;
5020
5021 VPValue *TC = Plan.getTripCount();
5022 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
5023 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
5024 VPValue *Step = &Plan.getVFxUF();
5025
5026 // If the tail is to be folded by masking, round the number of iterations N
5027 // up to a multiple of Step instead of rounding down. This is done by first
5028 // adding Step-1 and then rounding down. Note that it's ok if this addition
5029 // overflows: the vector induction variable will eventually wrap to zero given
5030 // that it starts at zero and its Step is a power of two; the loop will then
5031 // exit, with the last early-exit vector comparison also producing all-true.
5032 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
5033 // is accounted for in emitIterationCountCheck that adds an overflow check.
5034 if (TailByMasking) {
5035 TC = Builder.createAdd(
5036 TC, Builder.createSub(Step, Plan.getConstantInt(TCTy, 1)),
5037 DebugLoc::getCompilerGenerated(), "n.rnd.up");
5038 }
5039
5040 // Now we need to generate the expression for the part of the loop that the
5041 // vectorized body will execute. This is equal to N - (N % Step) if scalar
5042 // iterations are not required for correctness, or N - Step, otherwise. Step
5043 // is equal to the vectorization factor (number of SIMD elements) times the
5044 // unroll factor (number of SIMD instructions).
5045 VPValue *R =
5046 Builder.createNaryOp(Instruction::URem, {TC, Step},
5047 DebugLoc::getCompilerGenerated(), "n.mod.vf");
5048
5049 // There are cases where we *must* run at least one iteration in the remainder
5050 // loop. See the cost model for when this can happen. If the step evenly
5051 // divides the trip count, we set the remainder to be equal to the step. If
5052 // the step does not evenly divide the trip count, no adjustment is necessary
5053 // since there will already be scalar iterations. Note that the minimum
5054 // iterations check ensures that N >= Step.
5055 if (RequiresScalarEpilogue) {
5056 assert(!TailByMasking &&
5057 "requiring scalar epilogue is not supported with tail folding");
5058 VPValue *IsZero =
5059 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0));
5060 R = Builder.createSelect(IsZero, Step, R);
5061 }
5062
5063 VPValue *Res =
5064 Builder.createSub(TC, R, DebugLoc::getCompilerGenerated(), "n.vec");
5065 VectorTC.replaceAllUsesWith(Res);
5066}
5067
/// Replaces the symbolic UF, VF and VFxUF values with concrete computations in
/// the vector preheader: UF becomes a constant, VF becomes an element-count
/// expression (VF * vscale for scalable VFs), and VFxUF their product.
/// NOTE(review): the function name line and the guard around broadcasting the
/// runtime VF are elided in this extract — confirm against upstream source.
5069 ElementCount VFEC) {
5070 VPBuilder Builder(VectorPH, VectorPH->begin());
5071 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
5072 VPValue &VF = Plan.getVF();
5073 VPValue &VFxUF = Plan.getVFxUF();
5074 // Note that after the transform, no further uses of Plan.getVF and
5075 // Plan.getVFxUF should be added.
5076 // TODO: Add assertions for this.
5077
5078 VPValue *UF =
5079 Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getConcreteUF()));
5080 Plan.getUF().replaceAllUsesWith(UF);
5081
5082 // If there are no users of the runtime VF, compute VFxUF by constant folding
5083 // the multiplication of VF and UF.
5084 if (VF.getNumUsers() == 0) {
5085 VPValue *RuntimeVFxUF =
5086 Builder.createElementCount(TCTy, VFEC * Plan.getConcreteUF());
5087 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
5088 return;
5089 }
5090
5091 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
5092 // vscale) * UF.
5093 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
// Vector users of VF get an explicit broadcast of the runtime VF.
5095 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
5097 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
5098 }
5099 VF.replaceAllUsesWith(RuntimeVF);
5100
// VFxUF = RuntimeVF * UF, with nuw set (no unsigned wrap).
5101 VPValue *MulByUF = Builder.createOverflowingOp(
5102 Instruction::Mul, {RuntimeVF, UF}, {true, false});
5103 VFxUF.replaceAllUsesWith(MulByUF);
5104}
5105
/// Expands all VPExpandSCEVRecipes in the plan entry into IR before the entry
/// block's terminator, replacing each with a live-in for the expanded value,
/// and returns the map from SCEV to expanded IR value. Afterwards, wraps any
/// newly created IR instructions of the entry block into the VPIRBasicBlock.
/// NOTE(review): the signature, the skip-condition inside the first loop, the
/// trailing assert condition and the wrapping call are elided in this extract.
5108 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
5109
5110 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
5111 BasicBlock *EntryBB = Entry->getIRBasicBlock();
5112 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
5113 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5115 continue;
// Expand-SCEV recipes are grouped at the start; stop at the first other
// recipe.
5116 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5117 if (!ExpSCEV)
5118 break;
5119 const SCEV *Expr = ExpSCEV->getSCEV();
5120 Value *Res =
5121 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
5122 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
5123 VPValue *Exp = Plan.getOrAddLiveIn(Res);
5124 ExpSCEV->replaceAllUsesWith(Exp);
// Keep the plan's trip count pointing at the expanded value.
5125 if (Plan.getTripCount() == ExpSCEV)
5126 Plan.resetTripCount(Exp);
5127 ExpSCEV->eraseFromParent();
5128 }
5130 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
5131 "before any VPIRInstructions");
5132 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
5133 // to the VPIRBasicBlock.
5134 auto EI = Entry->begin();
5135 for (Instruction &I : drop_end(*EntryBB)) {
5136 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
5137 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
5138 EI++;
5139 continue;
5140 }
5142 }
5143
5144 return ExpandedSCEVs;
5145}
5146
5147/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
5148/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
5149/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
5150/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
5151/// an index-independent load if it feeds all wide ops at all indices (\p OpV
5152/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
5153/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
5154/// is defined at \p Idx of a load interleave group.
5155static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5156 VPValue *OpV, unsigned Idx) {
5157 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
5158 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5159 if (!Member0OpR)
5160 return Member0Op == OpV;
5161 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
5162 return !W->getMask() && W->isConsecutive() && Member0Op == OpV;
5163 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
5164 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
5165 return false;
5166}
5167
/// Returns true if the recipes feeding a store interleave group can all be
/// narrowed: every member must be a single-def recipe with the same
/// opcode/intrinsic as the member at index 0, and each operand position must
/// either itself be narrowable recursively or satisfy canNarrowLoad.
/// NOTE(review): the signature, a per-member check inside the first loop and
/// the OpsI declaration are elided in this extract — confirm upstream.
5170 auto *WideMember0 = dyn_cast<VPSingleDefRecipe>(Ops[0]);
5171 if (!WideMember0)
5172 return false;
5173 for (VPValue *V : Ops) {
5175 return false;
// All members must share the opcode/intrinsic of member 0.
5176 auto *R = cast<VPSingleDefRecipe>(V);
5177 if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
5178 return false;
5179 }
5180
// Check each operand position across all members.
5181 for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
5183 for (VPValue *Op : Ops)
5184 OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));
5185
// Recursively narrowable operand sets are fine as-is.
5186 if (canNarrowOps(OpsI))
5187 continue;
5188
5189 if (any_of(enumerate(OpsI), [WideMember0, Idx](const auto &P) {
5190 const auto &[OpIdx, OpV] = P;
5191 return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx);
5192 }))
5193 return false;
5194 }
5195
5196 return true;
5197}
5198
5199/// Returns VF from \p VFs if \p IR is a full interleave group with factor and
5200/// number of members both equal to VF. The interleave group must also access
5201/// the full vector width.
/// All members (defined values for loads, stored values for stores) must
/// share a single scalar element type; the group must be unmasked.
/// NOTE(review): the first parameter line and the register-class arguments to
/// getRegisterBitWidth are elided in this extract — confirm upstream.
5202static std::optional<ElementCount> isConsecutiveInterleaveGroup(
5204 VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
5205 if (!InterleaveR || InterleaveR->getMask())
5206 return std::nullopt;
5207
5208 Type *GroupElementTy = nullptr;
// Load groups: element type comes from the defined values; store groups:
// from the stored values. All must agree.
5209 if (InterleaveR->getStoredValues().empty()) {
5210 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
5211 if (!all_of(InterleaveR->definedValues(),
5212 [&TypeInfo, GroupElementTy](VPValue *Op) {
5213 return TypeInfo.inferScalarType(Op) == GroupElementTy;
5214 }))
5215 return std::nullopt;
5216 } else {
5217 GroupElementTy =
5218 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
5219 if (!all_of(InterleaveR->getStoredValues(),
5220 [&TypeInfo, GroupElementTy](VPValue *Op) {
5221 return TypeInfo.inferScalarType(Op) == GroupElementTy;
5222 }))
5223 return std::nullopt;
5224 }
5225
5226 auto IG = InterleaveR->getInterleaveGroup();
5227 if (IG->getFactor() != IG->getNumMembers())
5228 return std::nullopt;
5229
5230 auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
5231 TypeSize Size = TTI.getRegisterBitWidth(
5234 assert(Size.isScalable() == VF.isScalable() &&
5235 "if Size is scalable, VF must be scalable and vice versa");
5236 return Size.getKnownMinValue();
5237 };
5238
// Pick the VF whose factor matches and whose group spans a full register.
5239 for (ElementCount VF : VFs) {
5240 unsigned MinVal = VF.getKnownMinValue();
5241 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
5242 if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
5243 return {VF};
5244 }
5245 return std::nullopt;
5246}
5247
5248/// Returns true if \p VPValue is a narrow VPValue.
5249static bool isAlreadyNarrow(VPValue *VPV) {
5250 if (isa<VPIRValue>(VPV))
5251 return true;
5252 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
5253 return RepR && RepR->isSingleScalar();
5254}
5255
5256// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
5257// a narrow variant.
// Recursively narrows operands of matching wide recipes; load interleave
// groups become plain wide loads and wide loads become uniform scalar loads,
// since the narrowed plan processes one original iteration per member.
// Already-narrowed values are tracked in NarrowedOps to avoid rework.
// NOTE(review): the function name/parameter line and the condition guarding
// the recursive-narrowing branch are elided in this extract.
5258static VPValue *
5260 auto *R = V->getDefiningRecipe();
5261 if (!R || NarrowedOps.contains(V))
5262 return V;
5263
5264 if (isAlreadyNarrow(V))
5265 return V;
5266
// Recursively narrow each operand of a matching wide recipe in place.
5268 auto *WideMember0 = cast<VPSingleDefRecipe>(R);
5269 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
5270 WideMember0->setOperand(
5271 Idx,
5272 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
5273 return V;
5274 }
5275
5276 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
5277 // Narrow interleave group to wide load, as transformed VPlan will only
5278 // process one original iteration.
5279 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
5280 auto *L = new VPWidenLoadRecipe(
5281 *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
5282 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
5283 L->insertBefore(LoadGroup);
5284 NarrowedOps.insert(L);
5285 return L;
5286 }
5287
// Single-scalar replicated loads are already narrow; just record them.
5288 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
5289 assert(RepR->isSingleScalar() &&
5290 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
5291 "must be a single scalar load");
5292 NarrowedOps.insert(RepR);
5293 return RepR;
5294 }
5295
5296 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
// Strip a vector-pointer wrapper to get the underlying scalar address.
5297 VPValue *PtrOp = WideLoad->getAddr();
5298 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
5299 PtrOp = VecPtr->getOperand(0);
5300 // Narrow wide load to uniform scalar load, as transformed VPlan will only
5301 // process one original iteration.
5302 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
5303 /*IsUniform*/ true,
5304 /*Mask*/ nullptr, {}, *WideLoad);
5305 N->insertBefore(WideLoad);
5306 NarrowedOps.insert(N);
5307 return N;
5308}
5309
// VPlanTransforms::narrowInterleaveGroups: try to narrow interleave-group
// loads/stores in the plan to plain wide (or uniform scalar) memory
// operations, so the transformed plan processes one original loop iteration
// per vector iteration. Returns a clone of the plan containing the remaining
// VFs when the original plan had more than one VF, nullptr if the transform
// does not apply.
// NOTE(review): doxygen-extracted source — lines that carried hyperlinked
// identifiers (the function-name line 5311, matcher fragments 5328/5333/
// 5336/5339-5340/5363/5444/5451/5456, ...) are missing; confirm against the
// original VPlanTransforms.cpp.
5310std::unique_ptr<VPlan>
5312                                          const TargetTransformInfo &TTI) {
5313  VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5314
5315  if (!VectorLoop)
5316    return nullptr;
5317
5318  // Only handle single-block loops for now.
5319  if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
5320    return nullptr;
5321
5322  // Skip plans when we may not be able to properly narrow.
5323  VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
5324  if (!match(&Exiting->back(), m_BranchOnCount()))
5325    return nullptr;
5326
5327  assert(match(&Exiting->back(),
5329                                m_Specific(&Plan.getVectorTripCount()))) &&
5330         "unexpected branch-on-count");
5331
5332  VPTypeAnalysis TypeInfo(Plan);
5334  std::optional<ElementCount> VFToOptimize;
5335  for (auto &R : *VectorLoop->getEntryBasicBlock()) {
5337      continue;
5338
5341      continue;
5342
5343    // Bail out on recipes not supported at the moment:
5344    //  * phi recipes other than the canonical induction
5345    //  * recipes writing to memory except interleave groups
5346    // Only support plans with a canonical induction phi.
5347    if (R.isPhi())
5348      return nullptr;
5349
5350    auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
5351    if (R.mayWriteToMemory() && !InterleaveR)
5352      return nullptr;
5353
5354    // All other ops are allowed, but we reject uses that cannot be converted
5355    // when checking all allowed consumers (store interleave groups) below.
5356    if (!InterleaveR)
5357      continue;
5358
5359    // Try to find a single VF, where all interleave groups are consecutive and
5360    // saturate the full vector width. If we already have a candidate VF, check
5361    // if it is applicable for the current InterleaveR, otherwise look for a
5362    // suitable VF across the Plan's VFs.
5364        VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
5365                     : to_vector(Plan.vectorFactors());
5366    std::optional<ElementCount> NarrowedVF =
5367        isConsecutiveInterleaveGroup(InterleaveR, VFs, TypeInfo, TTI);
5368    if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
5369      return nullptr;
5370    VFToOptimize = NarrowedVF;
5371
5372    // Skip read interleave groups.
5373    if (InterleaveR->getStoredValues().empty())
5374      continue;
5375
5376    // Narrow interleave groups, if all operands are already matching narrow
5377    // ops.
5378    auto *Member0 = InterleaveR->getStoredValues()[0];
5379    if (isAlreadyNarrow(Member0) &&
5380        all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
5381      StoreGroups.push_back(InterleaveR);
5382      continue;
5383    }
5384
5385    // For now, we only support full interleave groups storing load interleave
5386    // groups.
5387    if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
5388          VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
5389          if (!DefR)
5390            return false;
5391          auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
5392          return IR && IR->getInterleaveGroup()->isFull() &&
5393                 IR->getVPValue(Op.index()) == Op.value();
5394        })) {
5395      StoreGroups.push_back(InterleaveR);
5396      continue;
5397    }
5398
5399    // Check if all values feeding InterleaveR are matching wide recipes whose
5400    // operands can be narrowed.
5401    if (!canNarrowOps(InterleaveR->getStoredValues()))
5402      return nullptr;
5403    StoreGroups.push_back(InterleaveR);
5404  }
5405
5406  if (StoreGroups.empty())
5407    return nullptr;
5408
5409  // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
5410  // original Plan into 2: a) a new clone which contains all VFs of Plan, except
5411  // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
5412  // TODO: Handle cases where only some interleave groups can be narrowed.
5413  std::unique_ptr<VPlan> NewPlan;
5414  if (size(Plan.vectorFactors()) != 1) {
5415    NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
5416    Plan.setVF(*VFToOptimize);
5417    NewPlan->removeVF(*VFToOptimize);
5418  }
5419
5420  // Values already narrowed by narrowInterleaveGroupOp, to avoid re-narrowing.
5421  SmallPtrSet<VPValue *, 4> NarrowedOps;
5422  // Narrow operation tree rooted at store groups.
5423  for (auto *StoreGroup : StoreGroups) {
5424    VPValue *Res =
5425        narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
5426    auto *SI =
5427        cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
5428    auto *S = new VPWidenStoreRecipe(
5429        *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
5430        /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
5431    S->insertBefore(StoreGroup);
5432    StoreGroup->eraseFromParent();
5433  }
5434
5435  // Adjust induction to reflect that the transformed plan only processes one
5436  // original iteration.
5437  auto *CanIV = VectorLoop->getCanonicalIV();
5438  auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
5439  VPBuilder PHBuilder(Plan.getVectorPreheader());
5440
5441  VPValue *UF = &Plan.getUF();
5442  if (VFToOptimize->isScalable()) {
5443    // Scalable VF: the canonical IV now steps by vscale * UF per iteration.
5444    VPValue *VScale = PHBuilder.createElementCount(
5445    VPValue *VScaleUF = PHBuilder.createOverflowingOp(
5446        Instruction::Mul, {VScale, UF}, {true, false});
5447    Inc->setOperand(1, VScaleUF);
5448    Plan.getVF().replaceAllUsesWith(VScale);
5449  } else {
5450    Inc->setOperand(1, UF);
5452        Plan.getConstantInt(CanIV->getScalarType(), 1));
5453  }
5454  removeDeadRecipes(Plan);
5455  assert(none_of(*VectorLoop->getEntryBasicBlock(),
5457         "All VPVectorPointerRecipes should have been removed");
5458  return NewPlan;
5459}
5460
5461/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5462/// BranchOnCond recipe. The weights model the assumption that the remainder
5463/// loop is taken for all but one out of VectorStep iterations.
// NOTE(review): the function-name line (5463) and the terminator dyn_cast
// (5467) were lost in the doxygen extraction.
5464    VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5465  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5466  auto *MiddleTerm =
5468  // Only add branch metadata if there is a (conditional) terminator.
5469  if (!MiddleTerm)
5470    return;
5471
5472  assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5473         "must have a BranchOnCond");
5474  // Assume that `TripCount % VectorStep ` is equally distributed.
5475  unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
5476  if (VF.isScalable() && VScaleForTuning.has_value())
5477    VectorStep *= *VScaleForTuning;
5478  assert(VectorStep > 0 && "trip count should not be zero");
5479  MDBuilder MDB(Plan.getContext());
5480  MDNode *BranchWeights =
5481      MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
5482  MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
5483}
5484
5485/// Compute and return the end value for \p WideIV, unless it is truncated. If
5486/// the induction recipe is not canonical, creates a VPDerivedIVRecipe to
5487/// compute the end value of the induction.
// NOTE(review): the signature line (5488) and the induction-descriptor
// declaration (5500) were lost in the doxygen extraction; presumably the
// first parameter is the wide induction recipe itself — confirm upstream.
5489                                       VPBuilder &VectorPHBuilder,
5490                                       VPTypeAnalysis &TypeInfo,
5491                                       VPValue *VectorTC) {
5492  auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
5493  // Truncated wide inductions resume from the last lane of their vector value
5494  // in the last vector iteration which is handled elsewhere.
5495  if (WideIntOrFp && WideIntOrFp->getTruncInst())
5496    return nullptr;
5497
5498  VPIRValue *Start = WideIV->getStartValue();
5499  VPValue *Step = WideIV->getStepValue();
5501  VPValue *EndValue = VectorTC;
5502  if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
5503    // Non-canonical induction: derive the end value from start/step and the
5504    // vector trip count instead of using the trip count directly.
5503    EndValue = VectorPHBuilder.createDerivedIV(
5504        ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
5505        Start, VectorTC, Step);
5506  }
5507
5508  // EndValue is derived from the vector trip count (which has the same type as
5509  // the widest induction) and thus may be wider than the induction here.
5510  Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
5511  if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
5512    EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
5513                                                ScalarTypeOfWideIV,
5514                                                WideIV->getDebugLoc());
5515  }
5516
5517  return EndValue;
5518}
5519
// Populate the resume phis in the scalar preheader: for wide inductions use
// the pre-computed end value (recorded in IVEndValues), for other header
// phis (reductions, first-order recurrences) use the value coming out of the
// vector loop, extracting the last element for FORs.
// NOTE(review): the function-name line (5520) was lost in the doxygen
// extraction — presumably VPlanTransforms::addScalarResumePhis; confirm.
5521    VPlan &Plan, DenseMap<VPValue *, VPValue *> &IVEndValues, bool FoldTail) {
5522  VPTypeAnalysis TypeInfo(Plan);
5523  auto *ScalarPH = Plan.getScalarPreheader();
5524  auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
5525  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5526  VPBuilder VectorPHBuilder(
5527      cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
5528  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5529  for (VPRecipeBase &PhiR : Plan.getScalarPreheader()->phis()) {
5530    auto *ResumePhiR = cast<VPPhi>(&PhiR);
5531
5532    // TODO: Extract final value from induction recipe initially, optimize to
5533    // pre-computed end value together in optimizeInductionExitUsers.
5534    auto *VectorPhiR = cast<VPHeaderPHIRecipe>(ResumePhiR->getOperand(0));
5535    if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
5536      // TODO: Check if tail is folded directly in VPlan.
5537      VPValue *TC = !FoldTail
5538                        ? static_cast<VPValue *>(&Plan.getVectorTripCount())
5539                        : Plan.getTripCount();
5541              WideIVR, VectorPHBuilder, TypeInfo, TC)) {
5542        IVEndValues[WideIVR] = EndValue;
5543        ResumePhiR->setOperand(0, EndValue);
5544        ResumePhiR->setName("bc.resume.val");
5545        continue;
5546      }
5547      // TODO: Also handle truncated inductions here. Computing end-values
5548      // separately should be done as VPlan-to-VPlan optimization, after
5549      // legalizing all resume values to use the last lane from the loop.
5550      assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
5551             "should only skip truncated wide inductions");
5552      continue;
5553    }
5554
5555    // The backedge value provides the value to resume coming out of a loop,
5556    // which for FORs is a vector whose last element needs to be extracted. The
5557    // start value provides the value if the loop is bypassed.
5558    bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
5559    auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
5560    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5561           "Cannot handle loops with uncountable early exits");
5562    if (IsFOR) {
5563      auto *ExtractPart = MiddleBuilder.createNaryOp(
5564          VPInstruction::ExtractLastPart, ResumeFromVectorLoop);
5565      ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
5567          "vector.recur.extract");
5568    }
5569    ResumePhiR->setName(IsFOR ? "scalar.recur.init" : "bc.merge.rdx");
5570    ResumePhiR->setOperand(0, ResumeFromVectorLoop);
5571  }
5572}
5573
// Second phase of first-order-recurrence vectorization: create extracts in
// the middle block for recurrence users outside the loop (see the worked
// example in the comment below).
// NOTE(review): the function-name/signature line (5574) was lost in the
// doxygen extraction — presumably
// VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &, VFRange &).
5575                                                        VFRange &Range) {
5576  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5577  auto *ScalarPHVPBB = Plan.getScalarPreheader();
5578  auto *MiddleVPBB = Plan.getMiddleBlock();
5579  VPBuilder ScalarPHBuilder(ScalarPHVPBB);
5580  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5581
5582  auto IsScalableOne = [](ElementCount VF) -> bool {
5583    return VF == ElementCount::getScalable(1);
5584  };
5585
5586  for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5587    auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5588    if (!FOR)
5589      continue;
5590
5591    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5592           "Cannot handle loops with uncountable early exits");
5593
5594    // This is the second phase of vectorizing first-order recurrences, creating
5595    // extract for users outside the loop. An overview of the transformation is
5596    // described below. Suppose we have the following loop with some use after
5597    // the loop of the last a[i-1],
5598    //
5599    //   for (int i = 0; i < n; ++i) {
5600    //     t = a[i - 1];
5601    //     b[i] = a[i] - t;
5602    //   }
5603    //   use t;
5604    //
5605    // There is a first-order recurrence on "a". For this loop, the shorthand
5606    // scalar IR looks like:
5607    //
5608    //   scalar.ph:
5609    //     s.init = a[-1]
5610    //     br scalar.body
5611    //
5612    //   scalar.body:
5613    //     i = phi [0, scalar.ph], [i+1, scalar.body]
5614    //     s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5615    //     s2 = a[i]
5616    //     b[i] = s2 - s1
5617    //     br cond, scalar.body, exit.block
5618    //
5619    //   exit.block:
5620    //     use = lcssa.phi [s1, scalar.body]
5621    //
5622    // In this example, s1 is a recurrence because its value depends on the
5623    // previous iteration. In the first phase of vectorization, we created a
5624    // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5625    // for users in the scalar preheader and exit block.
5626    //
5627    //   vector.ph:
5628    //     v_init = vector(..., ..., ..., a[-1])
5629    //     br vector.body
5630    //
5631    //   vector.body
5632    //     i = phi [0, vector.ph], [i+4, vector.body]
5633    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
5634    //     v2 = a[i, i+1, i+2, i+3]
5635    //     b[i] = v2 - v1
5636    //     // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
5637    //     b[i, i+1, i+2, i+3] = v2 - v1
5638    //     br cond, vector.body, middle.block
5639    //
5640    //   middle.block:
5641    //     vector.recur.extract.for.phi = v2(2)
5642    //     vector.recur.extract = v2(3)
5643    //     br cond, scalar.ph, exit.block
5644    //
5645    //   scalar.ph:
5646    //     scalar.recur.init = phi [vector.recur.extract, middle.block],
5647    //                             [s.init, otherwise]
5648    //     br scalar.body
5649    //
5650    //   scalar.body:
5651    //     i = phi [0, scalar.ph], [i+1, scalar.body]
5652    //     s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5653    //     s2 = a[i]
5654    //     b[i] = s2 - s1
5655    //     br cond, scalar.body, exit.block
5656    //
5657    //   exit.block:
5658    //     lo = lcssa.phi [s1, scalar.body],
5659    //                    [vector.recur.extract.for.phi, middle.block]
5660    //
5661    // Now update VPIRInstructions modeling LCSSA phis in the exit block.
5662    // Extract the penultimate value of the recurrence and use it as operand for
5663    // the VPIRInstruction modeling the phi.
5665             make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
5667        continue;
5668
5669      // For VF vscale x 1, if vscale = 1, we are unable to extract the
5670      // penultimate value of the recurrence. Instead we rely on the existing
5671      // extract of the last element from the result of
5672      // VPInstruction::FirstOrderRecurrenceSplice.
5673      // TODO: Consider vscale_range info and UF.
5675                                                           Range))
5676        return;
5677      VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5678          VPInstruction::ExtractPenultimateElement, FOR->getBackedgeValue(), {},
5679          "vector.recur.extract.for.phi");
5680      cast<VPInstruction>(&R)->replaceAllUsesWith(PenultimateElement);
5681    }
5682  }
5683}
5684
// Transform FindLast-style reduction phis into FindIV reductions: track the
// last IV value for which the select condition held, either via a sentinel
// value outside the IV's range (preferred) or via an auxiliary boolean AnyOf
// reduction when no sentinel exists.
// NOTE(review): the function-name/signature lines (5685-5686) were lost in
// the doxygen extraction; parameters visible here are Plan, PSE and L.
5687                                                  Loop &L) {
5688  ScalarEvolution &SE = *PSE.getSE();
5689  VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();
5690
5691  // Helper lambda to check if the IV range excludes the sentinel value.
5692  auto CheckSentinel = [&SE](const SCEV *IVSCEV, bool UseMax,
5693                             bool Signed) -> std::optional<APInt> {
5694    unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
5695    APInt Sentinel =
5696        UseMax
5699
5700    ConstantRange IVRange =
5701        Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
5702    if (!IVRange.contains(Sentinel))
5703      return Sentinel;
5704    return std::nullopt;
5705  };
5706
5707  VPValue *HeaderMask = vputils::findHeaderMask(Plan);
5708  for (VPRecipeBase &Phi :
5709       make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
5710    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
5712            PhiR->getRecurrenceKind()))
5713      continue;
5714
5715    // If there's a header mask, the backedge select will not be the find-last
5716    // select.
5717    VPValue *BackedgeVal = PhiR->getBackedgeValue();
5718    VPValue *CondSelect = BackedgeVal;
5719    if (HeaderMask &&
5720        !match(BackedgeVal, m_Select(m_Specific(HeaderMask),
5721                                     m_VPValue(CondSelect), m_Specific(PhiR))))
5722      llvm_unreachable("expected header mask select");
5723
5724    // Get the IV from the conditional select of the reduction phi.
5725    // The conditional select should be a select between the phi and the IV.
5726    VPValue *Cond, *TrueVal, *FalseVal;
5727    if (!match(CondSelect, m_Select(m_VPValue(Cond), m_VPValue(TrueVal),
5728                                    m_VPValue(FalseVal))))
5729      continue;
5730
5731    // The non-phi operand of the select is the IV.
5732    assert(is_contained(CondSelect->getDefiningRecipe()->operands(), PhiR));
5733    VPValue *IV = TrueVal == PhiR ? FalseVal : TrueVal;
5734
5735    const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(IV, PSE, &L);
5736    const SCEV *Step;
5737    if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step))))
5738      continue;
5739
5740    // Determine direction from SCEV step.
5741    if (!SE.isKnownNonZero(Step))
5742      continue;
5743
5744    // Positive step means we need UMax/SMax to find the last IV value, and
5745    // UMin/SMin otherwise.
5746    bool UseMax = SE.isKnownPositive(Step);
5747    bool UseSigned = true;
5748    std::optional<APInt> SentinelVal =
5749        CheckSentinel(IVSCEV, UseMax, /*IsSigned=*/true);
5750    if (!SentinelVal) {
5751      SentinelVal = CheckSentinel(IVSCEV, UseMax, /*IsSigned=*/false);
5752      UseSigned = false;
5753    }
5754
5755    // If no sentinel was found, fall back to a boolean AnyOf reduction to track
5756    // if the condition was ever true. Requires the IV to not wrap, otherwise we
5757    // cannot use min/max.
5758    if (!SentinelVal) {
5759      auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
5760      if (AR->hasNoSignedWrap())
5761        UseSigned = true;
5762      else if (AR->hasNoUnsignedWrap())
5763        UseSigned = false;
5764      else
5765        continue;
5766    }
5767
5769        BackedgeVal,
5771
5772    RecurKind MinMaxKind =
5773        UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
5774               : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
5775    VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
5776                    FastMathFlags());
5777    DebugLoc ExitDL = RdxResult->getDebugLoc();
5778    VPBuilder MiddleBuilder(RdxResult);
5779    VPValue *ReducedIV =
5781                                    RdxResult->getOperand(0), Flags, ExitDL);
5782
5783    VPValue *NewRdxResult;
5784    VPValue *StartVPV = PhiR->getStartValue();
5785    if (SentinelVal) {
5786      // Sentinel-based approach: reduce IVs with min/max, compare against
5787      // sentinel to detect if condition was ever true, select accordingly.
5788      VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
5789      auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
5790                                           Sentinel, ExitDL);
5791      NewRdxResult =
5792          MiddleBuilder.createSelect(Cmp, ReducedIV, StartVPV, ExitDL);
5793      StartVPV = Sentinel;
5794    } else {
5795      // Introduce a boolean AnyOf reduction to track if the condition was ever
5796      // true in the loop. Use it to select the initial start value, if it was
5797      // never true.
5798      auto *AnyOfPhi = new VPReductionPHIRecipe(
5799          /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
5800          RdxUnordered{1}, {}, /*HasUsesOutsideReductionChain=*/false);
5801      AnyOfPhi->insertAfter(PhiR);
5802
5803      VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
5804      VPValue *AnyOfCond = Cond;
5805      if (TrueVal == PhiR)
5806        AnyOfCond = LoopBuilder.createNot(Cond);
5807      VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, AnyOfCond);
5808      AnyOfPhi->setOperand(1, OrVal);
5809
5810      NewRdxResult =
5812                                      {StartVPV, ReducedIV, OrVal}, {}, ExitDL);
5813
5814      // Initialize the IV reduction phi with the neutral element, not the
5815      // original start value, to ensure correct min/max reduction results.
5816      StartVPV = Plan.getOrAddLiveIn(
5817          getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
5818    }
5819    RdxResult->replaceAllUsesWith(NewRdxResult);
5820    RdxResult->eraseFromParent();
5821
5822    auto *NewPhiR = new VPReductionPHIRecipe(
5823        cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
5824        *CondSelect, RdxUnordered{1}, {}, PhiR->hasUsesOutsideReductionChain());
5825    NewPhiR->insertBefore(PhiR);
5826    PhiR->replaceAllUsesWith(NewPhiR);
5827    PhiR->eraseFromParent();
5828  }
5829}
5830
5831namespace {
5832
5833/// A chain of recipes that form a partial reduction. Matches either
5834/// reduction_bin_op (extend (A), accumulator), or
5835/// reduction_bin_op (bin_op (extend (A), (extend (B))), accumulator).
5836struct VPPartialReductionChain {
5837  /// The top-level binary operation that forms the reduction to a scalar
5838  /// after the loop body.
5839  VPWidenRecipe *ReductionBinOp;
5840  /// The extension of each of the inner binary operation's operands.
5841  VPWidenCastRecipe *ExtendA;
5842  VPWidenCastRecipe *ExtendB;
5843  /// The user of the extends that is then reduced.
5844  VPWidenRecipe *BinOp;
  /// Ratio between the PHI's scalar size and the extend operand's scalar
  /// size, i.e. how many input elements fold into one accumulator element.
5845  unsigned ScaleFactor;
5846  /// The recurrence kind for the entire partial reduction chain.
5847  /// This allows distinguishing between Sub and AddWithSub recurrences,
5848  /// when the ReductionBinOp is an Instruction::Sub.
5849  RecurKind RK;
5850};
5851
5852// Helper to transform a partial reduction chain into a partial reduction
5853// recipe. Assumes profitability has been checked.
// NOTE(review): doxygen-extracted source — several lines carrying linked
// identifiers (5864, 5890, 5898, 5908, 5945) are missing below.
5854static void transformToPartialReduction(const VPPartialReductionChain &Chain,
5855                                        VPTypeAnalysis &TypeInfo, VPlan &Plan,
5856                                        VPReductionPHIRecipe *RdxPhi) {
5857  VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
5858  assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
5859
5860  VPValue *BinOp = WidenRecipe->getOperand(0);
5861  VPValue *Accumulator = WidenRecipe->getOperand(1);
5862
5863  // Swap if needed to ensure Accumulator is the PHI or partial reduction.
5865    std::swap(BinOp, Accumulator);
5866
5867  // Sub-reductions can be implemented in two ways:
5868  // (1) negate the operand in the vector loop (the default way).
5869  // (2) subtract the reduced value from the init value in the middle block.
5870  // Both ways keep the reduction itself as an 'add' reduction.
5871  //
5872  // The ISD nodes for partial reductions don't support folding the
5873  // sub/negation into its operands because the following is not a valid
5874  // transformation:
5875  //   sub(0, mul(ext(a), ext(b)))
5876  //     -> mul(ext(a), ext(sub(0, b)))
5877  //
5878  // It's therefore better to choose option (2) such that the partial
5879  // reduction is always positive (starting at '0') and to do a final
5880  // subtract in the middle block.
5881  if (WidenRecipe->getOpcode() == Instruction::Sub &&
5882      Chain.RK != RecurKind::Sub) {
5883    VPBuilder Builder(WidenRecipe);
5884    Type *ElemTy = TypeInfo.inferScalarType(BinOp);
5885    auto *Zero = Plan.getConstantInt(ElemTy, 0);
5886    VPIRFlags Flags = WidenRecipe->getUnderlyingInstr()
5887                          ? VPIRFlags(*WidenRecipe->getUnderlyingInstr())
5888                          : VPIRFlags();
5889    auto *NegRecipe = new VPWidenRecipe(Instruction::Sub, {Zero, BinOp}, Flags,
5891    Builder.insert(NegRecipe);
5892    BinOp = NegRecipe;
5893  }
5894
5895  // Check if WidenRecipe is the final result of the reduction. If so look
5896  // through selects for predicated reductions.
5897  VPValue *Cond = nullptr;
5899      WidenRecipe,
5900      m_Select(m_VPValue(Cond), m_Specific(WidenRecipe), m_Specific(RdxPhi))));
5901  bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
5902                       RdxPhi->getBackedgeValue() == ExitValue;
5903  assert((!ExitValue || IsLastInChain) &&
5904         "if we found ExitValue, it must match RdxPhi's backedge value");
5905
5906  Type *PhiType = TypeInfo.inferScalarType(RdxPhi);
5907  RecurKind RdxKind =
5909  auto *PartialRed = new VPReductionRecipe(
5910      RdxKind,
5911      RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlags()
5912                                 : FastMathFlags(),
5913      WidenRecipe->getUnderlyingInstr(), Accumulator, BinOp, Cond,
5914      RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
5915  PartialRed->insertBefore(WidenRecipe);
5916
5917  if (Cond)
5918    ExitValue->replaceAllUsesWith(PartialRed);
5919  WidenRecipe->replaceAllUsesWith(PartialRed);
5920
5921  // We only need to update the PHI node once, which is when we find the
5922  // last reduction in the chain.
5923  if (!IsLastInChain)
5924    return;
5925
5926  // Scale the PHI and ReductionStartVector by the VFScaleFactor
5927  assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
5928  RdxPhi->setVFScaleFactor(Chain.ScaleFactor);
5929
5930  auto *StartInst = cast<VPInstruction>(RdxPhi->getStartValue());
5931  assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
5932  auto *NewScaleFactor = Plan.getConstantInt(32, Chain.ScaleFactor);
5933  StartInst->setOperand(2, NewScaleFactor);
5934
5935  // If this is the last value in a sub-reduction chain, then update the PHI
5936  // node to start at `0` and update the reduction-result to subtract from
5937  // the PHI's start value.
5938  if (Chain.RK != RecurKind::Sub)
5939    return;
5940
5941  VPValue *OldStartValue = StartInst->getOperand(0);
5942  StartInst->setOperand(0, StartInst->getOperand(1));
5943
5944  // Replace reduction_result by 'sub (startval, reductionresult)'.
5946  assert(RdxResult && "Could not find reduction result");
5947
5948  VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult);
5949  constexpr unsigned SubOpc = Instruction::BinaryOps::Sub;
5950  VPInstruction *NewResult = Builder.createNaryOp(
5951      SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc),
5952      RdxPhi->getDebugLoc());
5953  RdxResult->replaceUsesWithIf(
5954      NewResult,
5955      [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
5956}
5957
5958/// Check if a partial reduction chain is supported by the target (i.e. does
5959/// not have an invalid cost) for the given VF range. Clamps the range and
5960/// returns true if profitable for any VF.
5961static bool isValidPartialReduction(const VPPartialReductionChain &Chain,
5962                                    Type *PhiType, VPCostContext &CostCtx,
5963                                    VFRange &Range) {
  // Map an extend recipe to (operand type, extend kind); (nullptr, PR_None)
  // when there is no extend.
5964  auto GetExtInfo = [&CostCtx](VPWidenCastRecipe *Ext)
5965      -> std::pair<Type *, TargetTransformInfo::PartialReductionExtendKind> {
5966    if (!Ext)
5967      return {nullptr, TargetTransformInfo::PR_None};
5968    Type *ExtOpType = CostCtx.Types.inferScalarType(Ext->getOperand(0));
5970        static_cast<Instruction::CastOps>(Ext->getOpcode()));
5971    return {ExtOpType, ExtKind};
5972  };
5973  auto ExtInfoA = GetExtInfo(Chain.ExtendA);
5974  auto ExtInfoB = GetExtInfo(Chain.ExtendB);
5975  Type *ExtOpTypeA = ExtInfoA.first;
5976  Type *ExtOpTypeB = ExtInfoB.first;
5977  auto ExtKindA = ExtInfoA.second;
5978  auto ExtKindB = ExtInfoB.second;
5979
5980  // If ExtendB is nullptr but there's a separate BinOp, the second operand
5981  // was a constant that can use the same extend kind as the first.
5982  if (!Chain.ExtendB && Chain.BinOp && Chain.BinOp != Chain.ReductionBinOp) {
5983    const APInt *Const = nullptr;
5984    for (VPValue *Op : Chain.BinOp->operands()) {
5985      if (match(Op, m_APInt(Const)))
5986        break;
5987    }
5988    if (!Const || !canConstantBeExtended(Const, ExtOpTypeA, ExtKindA))
5989      return false;
5990    ExtOpTypeB = ExtOpTypeA;
5991    ExtKindB = ExtKindA;
5992  }
5993
5994  std::optional<unsigned> BinOpc =
5995      (Chain.BinOp && Chain.BinOp != Chain.ReductionBinOp)
5996          ? std::make_optional(Chain.BinOp->getOpcode())
5997          : std::nullopt;
5998  VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
  // Clamp the VF range to those VFs for which the target reports a valid
  // partial-reduction cost.
6000      [&](ElementCount VF) {
6001        return CostCtx.TTI
6003                WidenRecipe->getOpcode(), ExtOpTypeA, ExtOpTypeB, PhiType, VF,
6004                ExtKindA, ExtKindB, BinOpc, CostCtx.CostKind,
6005                PhiType->isFloatingPointTy()
6006                    ? std::optional{WidenRecipe->getFastMathFlags()}
6007                    : std::nullopt)
6008            .isValid();
6009      },
6010      Range);
6011}
6012
6013/// Examines reduction operations to see if the target can use a cheaper
6014/// operation with a wider per-iteration input VF and narrower PHI VF.
6015/// Recursively calls itself to identify chained scaled reductions.
6016/// Returns true if this invocation added an entry to Chains, otherwise false.
// NOTE(review): the Chains parameter line (6019,
// SmallVectorImpl of VPPartialReductionChain) was lost in the extraction.
6017static bool
6018getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPValue *PrevValue,
6020                    VPCostContext &CostCtx, VFRange &Range) {
6021  auto *UpdateR = dyn_cast<VPWidenRecipe>(PrevValue);
6022  if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
6023    return false;
6024
6025  VPValue *Op = UpdateR->getOperand(0);
6026  VPValue *PhiOp = UpdateR->getOperand(1);
6027  if (Op == RedPhiR)
6028    std::swap(Op, PhiOp);
6029
6030  // If Op is an extend, then it's still a valid partial reduction if the
6031  // extended mul fulfills the other requirements.
6032  // For example, reduce.add(ext(mul(ext(A), ext(B)))) is still a valid partial
6033  // reduction since the inner extends will be widened. We already have oneUse
6034  // checks on the inner extends so widening them is safe.
6035  std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
6038    auto *CastRecipe = dyn_cast<VPWidenCastRecipe>(Op);
6039    if (!CastRecipe)
6040      return false;
6041    auto CastOp = static_cast<Instruction::CastOps>(CastRecipe->getOpcode());
6042    OuterExtKind = TTI::getPartialReductionExtendKind(CastOp);
6043    Op = CastRecipe->getOperand(0);
6044  }
6045
6046  // Try and get a scaled reduction from the first non-phi operand.
6047  // If one is found, we use the discovered reduction instruction in
6048  // place of the accumulator for costing.
6049  if (getScaledReductions(RedPhiR, Op, Chains, CostCtx, Range)) {
6050    Op = UpdateR->getOperand(0);
6051    PhiOp = UpdateR->getOperand(1);
6052    if (Op == Chains.rbegin()->ReductionBinOp)
6053      std::swap(Op, PhiOp);
6054    assert(PhiOp == Chains.rbegin()->ReductionBinOp &&
6055           "PhiOp must be the chain value");
6056    assert(CostCtx.Types.inferScalarType(RedPhiR) ==
6057               CostCtx.Types.inferScalarType(PhiOp) &&
6058           "Unexpected type for chain values");
6059  } else if (RedPhiR != PhiOp) {
6060    // If neither operand of this instruction is the reduction PHI node or a
6061    // link in the reduction chain, then this is just an operand to the chain
6062    // and not a link in the chain itself.
6063    return false;
6064  }
6065
6066  // If the update is a binary op, check both of its operands to see if
6067  // they are extends. Otherwise, see if the update comes directly from an
6068  // extend.
6069  VPWidenCastRecipe *CastRecipes[2] = {nullptr};
6070
6071  // Match extends and populate CastRecipes. Returns false if matching fails.
6072  auto MatchExtends = [OuterExtKind,
6073                       &CastRecipes](ArrayRef<VPValue *> Operands) {
6074    assert(Operands.size() <= 2 && "expected at most 2 operands");
6075
6076    for (const auto &[I, OpVal] : enumerate(Operands)) {
6077      // Allow constant as second operand - validation happens in
6078      // isValidPartialReduction.
6079      const APInt *Unused;
6080      if (I > 0 && CastRecipes[0] && match(OpVal, m_APInt(Unused)))
6081        continue;
6082
6083      VPValue *ExtInput;
6084      if (!match(OpVal, m_ZExtOrSExt(m_VPValue(ExtInput))) &&
6085          !match(OpVal, m_FPExt(m_VPValue(ExtInput))))
6086        return false;
6087
6088      CastRecipes[I] = dyn_cast<VPWidenCastRecipe>(OpVal);
6089      if (!CastRecipes[I])
6090        return false;
6091
6092      // The outer extend kind must match the inner extends for folding.
6093      if (OuterExtKind) {
6094        auto CastOp =
6095            static_cast<Instruction::CastOps>(CastRecipes[I]->getOpcode());
6096        if (*OuterExtKind != TTI::getPartialReductionExtendKind(CastOp))
6097          return false;
6098      }
6099    }
6100    return CastRecipes[0] != nullptr;
6101  };
6102
6103  // If Op is a binary operator, check both of its operands to see if they are
6104  // extends. Otherwise, see if the update comes directly from an extend.
6105  auto *BinOp = dyn_cast<VPWidenRecipe>(Op);
6106  if (BinOp && Instruction::isBinaryOp(BinOp->getOpcode())) {
6107    if (!BinOp->hasOneUse())
6108      return false;
6109
6110    // Handle neg(binop(ext, ext)) pattern.
6111    VPValue *OtherOp = nullptr;
6112    if (match(BinOp, m_Sub(m_ZeroInt(), m_VPValue(OtherOp))))
6113      BinOp = dyn_cast<VPWidenRecipe>(OtherOp);
6114
6115    if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()) ||
6116        !MatchExtends(BinOp->operands()))
6117      return false;
6118  } else if (match(UpdateR, m_Add(m_VPValue(), m_VPValue())) ||
6119             match(UpdateR, m_FAdd(m_VPValue(), m_VPValue()))) {
6120    // We already know the operands for Update are Op and PhiOp.
6121    if (!MatchExtends({Op}))
6122      return false;
6123    BinOp = UpdateR;
6124  } else {
6125    return false;
6126  }
6127
  // The PHI's scalar width must be an exact multiple of the extend operand's
  // width; that ratio becomes the chain's scale factor.
6128  Type *PhiType = CostCtx.Types.inferScalarType(RedPhiR);
6129  TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6130  Type *ExtOpType =
6131      CostCtx.Types.inferScalarType(CastRecipes[0]->getOperand(0));
6132  TypeSize ASize = ExtOpType->getPrimitiveSizeInBits();
6133  if (!PHISize.hasKnownScalarFactor(ASize))
6134    return false;
6135
6136  RecurKind RK = cast<VPReductionPHIRecipe>(RedPhiR)->getRecurrenceKind();
6137  VPPartialReductionChain Chain(
6138      {UpdateR, CastRecipes[0], CastRecipes[1], BinOp,
6139       static_cast<unsigned>(PHISize.getKnownScalarFactor(ASize)), RK});
6140  if (!isValidPartialReduction(Chain, PhiType, CostCtx, Range))
6141    return false;
6142
6143  Chains.push_back(Chain);
6144  return true;
6145}
6146} // namespace
6147
// Driver for partial-reduction formation: discover per-PHI chains of scaled
// reductions, invalidate whole chains whose extends or reduction bin-ops
// have unsupported users, then transform the surviving chains.
// NOTE(review): the function-name/signature line (6148) was lost in the
// doxygen extraction — presumably a VPlanTransforms entry point taking
// (VPlan &Plan, VPCostContext &CostCtx, VFRange &Range); confirm upstream.
6149                                              VPCostContext &CostCtx,
6150                                              VFRange &Range) {
6151  // Find all possible valid partial reductions, grouping chains by their PHI.
6152  // This grouping allows invalidating the whole chain, if any link is not a
6153  // valid partial reduction.
6155      ChainsByPhi;
6156  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
6157  for (VPRecipeBase &R : HeaderVPBB->phis()) {
6158    auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
6159    if (!RedPhiR)
6160      continue;
6161
6162    // Get the backedge value from the reduction PHI and find the
6163    // ComputeReductionResult that uses it (directly or through a select for
6164    // predicated reductions).
6165    if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
6166      VPValue *ExitValue = RdxResult->getOperand(0);
6167      match(ExitValue,
6168            m_Select(m_VPValue(), m_VPValue(ExitValue), m_VPValue()));
6169      getScaledReductions(RedPhiR, ExitValue, ChainsByPhi[RedPhiR], CostCtx,
6170                          Range);
6171    }
6172  }
6173
6174  if (ChainsByPhi.empty())
6175    return;
6176
6177  // Build set of partial reduction operations for extend user validation and
6178  // a map of reduction bin ops to their scale factors for scale validation.
6179  SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
6180  DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
6181  for (const auto &[_, Chains] : ChainsByPhi)
6182    for (const VPPartialReductionChain &Chain : Chains) {
6183      PartialReductionOps.insert(Chain.BinOp);
6184      ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
6185    }
6186
6187  // A partial reduction is invalid if any of its extends are used by
6188  // something that isn't another partial reduction. This is because the
6189  // extends are intended to be lowered along with the reduction itself.
6190  auto ExtendUsersValid = [&](VPWidenCastRecipe *Ext) {
6191    return !Ext || all_of(Ext->users(), [&](VPUser *U) {
6192      return PartialReductionOps.contains(cast<VPRecipeBase>(U));
6193    });
6194  };
6195
6196  // Validate chains: check that extends are only used by partial reductions,
6197  // and that reduction bin ops are only used by other partial reductions with
6198  // matching scale factors, are outside the loop region or the select
6199  // introduced by tail-folding. Otherwise we would create users of scaled
6200  // reductions where the types of the other operands don't match.
6201  for (auto &[RedPhiR, Chains] : ChainsByPhi) {
6202    for (const VPPartialReductionChain &Chain : Chains) {
6203      if (!ExtendUsersValid(Chain.ExtendA) ||
6204          !ExtendUsersValid(Chain.ExtendB)) {
6205        Chains.clear();
6206        break;
6207      }
6208      auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
6209        if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
6210          return PhiR == RedPhiR;
6211        auto *R = cast<VPSingleDefRecipe>(U);
6212        return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
6214                        m_Specific(Chain.ReductionBinOp))) ||
6215               match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
6216                                 m_Specific(RedPhiR)));
6217      };
6218      if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
6219        Chains.clear();
6220        break;
6221      }
6222
6223      // Check if the compute-reduction-result is used by a sunk store.
6224      // TODO: Also form partial reductions in those cases.
6225      if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
6226        if (any_of(RdxResult->users(), [](VPUser *U) {
6227              auto *RepR = dyn_cast<VPReplicateRecipe>(U);
6228              return RepR && isa<StoreInst>(RepR->getUnderlyingInstr());
6229            })) {
6230          Chains.clear();
6231          break;
6232        }
6233      }
6234    }
6235  }
6236
6237  for (auto &[Phi, Chains] : ChainsByPhi)
6238    for (const VPPartialReductionChain &Chain : Chains)
6239      transformToPartialReduction(Chain, CostCtx.Types, Plan, Phi);
6240}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
#define _
iv Induction Variable Users
Definition IVUsers.cpp:48
iv users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize a function's Machine IR
Definition Legalizer.cpp:81
static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, ScalarEvolution &SE)
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool sinkScalarOperands(VPlan &Plan)
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R)
Return true if we do not know how to (mechanically) hoist or sink R out of a loop region.
static bool canNarrowOps(ArrayRef< VPValue * > Ops)
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck)
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with a EVL-based IV, fixup recipes that use VF to use the EVL instea...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static std::optional< ElementCount > isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ArrayRef< ElementCount > VFs, VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI)
Returns VF from VFs if IR is a full interleave group with factor and number of members both equal to ...
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute and return the end value for WideIV, unless it is truncated.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant ExpandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
From the definition of llvm.experimental.get.vector.length, VPInstruction::ExplicitVectorLength(AVL) ...
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static void recursivelyDeleteDeadRecipes(VPValue *V)
static void reassociateHeaderMask(VPlan &Plan)
Reassociate (headermask && x) && y -> headermask && (x && y) to allow the header mask to be simplifie...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations in blocks between FirstBB and LastBB...
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static void narrowToSingleScalarRecipes(VPlan &Plan)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1023
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1527
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:207
APInt abs() const
Get the absolute value.
Definition APInt.h:1810
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:210
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition APInt.h:217
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:220
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:996
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
@ NoAlias
The two locations do not alias at all.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
iterator begin() const
Definition ArrayRef.h:130
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This class represents a range of values.
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:215
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1569
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1080
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
bool empty() const
Definition MapVector.h:77
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
LLVM_ABI bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
static LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
op_range operands()
Definition User.h:267
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3814
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4181
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4256
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4208
iterator end()
Definition VPlan.h:4218
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4216
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4269
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:232
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:593
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:565
const VPRecipeBase & front() const
Definition VPlan.h:4228
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:639
const VPRecipeBase & back() const
Definition VPlan.h:4230
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2719
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2755
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2745
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2761
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2741
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:81
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccss as successor of this VPBlockBase.
Definition VPlan.h:300
VPRegionBlock * getParent()
Definition VPlan.h:173
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:202
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:291
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:204
VPlan * getPlan()
Definition VPlan.cpp:177
const std::string & getName() const
Definition VPlan.h:164
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:310
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:215
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:182
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:264
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:209
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:198
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:269
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:290
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:202
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:221
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:239
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3223
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRMetadata &Metadata={})
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3756
A recipe for generating the phi node tracking the current scalar iteration index.
Definition VPlan.h:3846
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:427
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:400
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:412
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:422
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3926
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3268
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2232
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2274
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2263
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4334
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:4358
Class to record and manage LLVM IR flags.
Definition VPlan.h:670
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for I, if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1105
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1160
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1262
@ ComputeAnyOfResult
Compute the final result of a AnyOf reduction with select(cmp(),x,y), where one of (x,...
Definition VPlan.h:1207
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1204
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1256
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1199
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1196
@ CanonicalIVIncrementForPart
Definition VPlan.h:1180
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2864
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2856
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2885
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2937
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2895
VPValue * getIncomingValue(unsigned Idx) const
Returns the incoming VPValue with index Idx.
Definition VPlan.h:1524
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3410
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:387
VPRegionBlock * getRegion()
Definition VPlan.h:4486
VPBasicBlock * getParent()
Definition VPlan.h:462
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:536
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3097
A recipe for handling reduction phis.
Definition VPlan.h:2625
void setVFScaleFactor(unsigned ScaleFactor)
Set the VFScaleFactor for this reduction phi.
Definition VPlan.h:2672
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2665
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:2988
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4369
const VPBlockBase * getEntry() const
Definition VPlan.h:4405
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4480
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4437
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4422
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4467
const VPBlockBase * getExiting() const
Definition VPlan.h:4417
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4430
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3142
bool isSingleScalar() const
Definition VPlan.h:3183
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3207
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3998
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:588
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:656
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:258
operand_range operands()
Definition VPlanValue.h:326
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:302
unsigned getNumOperands() const
Definition VPlanValue.h:296
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:297
void addOperand(VPValue *Operand)
Definition VPlanValue.h:291
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:46
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:137
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1400
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:127
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:71
bool hasOneUse() const
Definition VPlanValue.h:142
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:172
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1403
unsigned getNumUsers() const
Definition VPlanValue.h:104
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1407
user_range users()
Definition VPlanValue.h:125
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2080
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3889
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1766
Instruction::CastOps getOpcode() const
Definition VPlan.h:1804
A recipe for handling GEP instructions.
Definition VPlan.h:2016
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2298
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2326
PHINode * getPHINode() const
Returns the underlying PHINode if one exists, or null otherwise.
Definition VPlan.h:2344
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2329
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2349
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2380
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2427
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2431
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2458
A recipe for widening vector intrinsics.
Definition VPlan.h:1818
A common base class for widening memory operations.
Definition VPlan.h:3453
A recipe for widened phis.
Definition VPlan.h:2516
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1710
unsigned getOpcode() const
Definition VPlan.h:1747
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition VPlan.h:4499
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4795
bool hasVF(ElementCount VF) const
Definition VPlan.h:4708
LLVMContext & getContext() const
Definition VPlan.h:4690
VPBasicBlock * getEntry()
Definition VPlan.h:4591
bool hasScalableVF() const
Definition VPlan.h:4709
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4688
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4681
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4649
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4670
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4715
VPValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4685
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:4774
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4843
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4798
bool hasUF(unsigned UF) const
Definition VPlan.h:4726
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4639
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4678
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4751
void setVF(ElementCount VF)
Definition VPlan.h:4696
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4742
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1033
unsigned getConcreteUF() const
Returns the concrete UF of the plan, after unrolling.
Definition VPlan.h:4729
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4663
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4616
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4821
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:4771
bool hasScalarVFOnly() const
Definition VPlan.h:4719
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4630
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4635
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4596
void setUF(unsigned UF)
Definition VPlan.h:4734
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4875
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1181
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4777
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2774
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ? R : false.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
SCEVAffineAddRec_match< Op0_t, Op1_t, class_match< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
bool match(const SCEV *S, const Pattern &P)
class_match< const SCEV > m_SCEV()
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExitingIVValue, Op0_t > m_ExitingIVValue(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
bind_ty< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
specific_intval< 1 > m_True()
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, without matching its operands, as the number of operands is not fixed.
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:111
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
static VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
Definition VPlanUtils.h:132
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:262
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:289
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1152
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:282
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
Definition Casting.h:669
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:552
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1762
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
iterator_range< po_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_post_order_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order.
Definition VPlanCFG.h:275
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about an recurrence kind, return the identity for the @llvm.vector....
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2088
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2166
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2146
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:789
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2607
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
TargetTransformInfo::TargetCostKind CostKind
VPTypeAnalysis Types
const TargetTransformInfo & TTI
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2563
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:183
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:141
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:223
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3586
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3544
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3671
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3627
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB)
Update Plan to account for uncountable early exits by introducing appropriate branching logic in the ...
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first order reductions in the original exit block.
static void updateScalarResumePhis(VPlan &Plan, DenseMap< VPValue *, VPValue * > &IVEndValues, bool FoldTail)
Update the resume phis in the scalar preheader after creating wide recipes for first-order recurrence...
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replaces all uses except the canoni...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...