LLVM 23.0.0git
VPlanTransforms.cpp
Go to the documentation of this file.
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
45
46using namespace llvm;
47using namespace VPlanPatternMatch;
48using namespace SCEVPatternMatch;
49
51 VPlan &Plan, const TargetLibraryInfo &TLI) {
52
54 Plan.getVectorLoopRegion());
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
75 for (VPValue *Op : PhiR->operands())
76 NewRecipe->addOperand(Op);
77 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
78 assert(!isa<PHINode>(Inst) && "phis should be handled above");
79 // Create VPWidenMemoryRecipe for loads and stores.
80 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
81 NewRecipe = new VPWidenLoadRecipe(
82 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
83 false /*Consecutive*/, false /*Reverse*/, *VPI,
84 Ingredient.getDebugLoc());
85 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
86 NewRecipe = new VPWidenStoreRecipe(
87 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
88 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
89 Ingredient.getDebugLoc());
91 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
92 Ingredient.getDebugLoc());
93 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
94 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
95 if (VectorID == Intrinsic::not_intrinsic)
96 return false;
97 NewRecipe = new VPWidenIntrinsicRecipe(
98 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
99 drop_end(Ingredient.operands()), CI->getType(), VPIRFlags(*CI),
100 *VPI, CI->getDebugLoc());
101 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
102 NewRecipe = new VPWidenCastRecipe(
103 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
104 VPIRFlags(*CI), VPIRMetadata(*CI));
105 } else {
106 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
107 *VPI, Ingredient.getDebugLoc());
108 }
109 } else {
111 "inductions must be created earlier");
112 continue;
113 }
114
115 NewRecipe->insertBefore(&Ingredient);
116 if (NewRecipe->getNumDefinedValues() == 1)
117 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
118 else
119 assert(NewRecipe->getNumDefinedValues() == 0 &&
120 "Only recpies with zero or one defined values expected");
121 Ingredient.eraseFromParent();
122 }
123 }
124 return true;
125}
126
127/// Helper for extra no-alias checks via known-safe recipe and SCEV.
129 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
130 VPReplicateRecipe &GroupLeader;
132 const Loop &L;
133 VPTypeAnalysis &TypeInfo;
134
135 // Return true if \p A and \p B are known to not alias for all VFs in the
136 // plan, checked via the distance between the accesses
137 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
138 if (A->getOpcode() != Instruction::Store ||
139 B->getOpcode() != Instruction::Store)
140 return false;
141
142 VPValue *AddrA = A->getOperand(1);
143 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
144 VPValue *AddrB = B->getOperand(1);
145 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
147 return false;
148
149 const APInt *Distance;
150 ScalarEvolution &SE = *PSE.getSE();
151 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
152 return false;
153
154 const DataLayout &DL = SE.getDataLayout();
155 Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
156 uint64_t SizeA = DL.getTypeStoreSize(TyA);
157 Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
158 uint64_t SizeB = DL.getTypeStoreSize(TyB);
159
160 // Use the maximum store size to ensure no overlap from either direction.
161 // Currently only handles fixed sizes, as it is only used for
162 // replicating VPReplicateRecipes.
163 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
164
165 auto VFs = B->getParent()->getPlan()->vectorFactors();
167 if (MaxVF.isScalable())
168 return false;
169 return Distance->abs().uge(
170 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
171 }
172
173public:
176 const Loop &L, VPTypeAnalysis &TypeInfo)
177 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
178 L(L), TypeInfo(TypeInfo) {}
179
180 /// Return true if \p R should be skipped during alias checking, either
181 /// because it's in the exclude set or because no-alias can be proven via
182 /// SCEV.
183 bool shouldSkip(VPRecipeBase &R) const {
184 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
185 return ExcludeRecipes.contains(&R) ||
186 (Store && isNoAliasViaDistance(Store, &GroupLeader));
187 }
188};
189
190/// Check if a memory operation doesn't alias with memory operations in blocks
191/// between \p FirstBB and \p LastBB using scoped noalias metadata. If
192/// \p SinkInfo is std::nullopt, only recipes that may write to memory are
193/// checked (for load hoisting). Otherwise recipes that both read and write
194/// memory are checked, and SCEV is used to prove no-alias between the group
195/// leader and other replicate recipes (for store sinking).
196static bool
198 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
199 std::optional<SinkStoreInfo> SinkInfo = {}) {
200 bool CheckReads = SinkInfo.has_value();
201 if (!MemLoc.AATags.Scope)
202 return false;
203
204 for (VPBlockBase *Block = FirstBB; Block;
205 Block = Block->getSingleSuccessor()) {
206 assert(Block->getNumSuccessors() <= 1 &&
207 "Expected at most one successor in block chain");
208 auto *VPBB = cast<VPBasicBlock>(Block);
209 for (VPRecipeBase &R : *VPBB) {
210 if (SinkInfo && SinkInfo->shouldSkip(R))
211 continue;
212
213 // Skip recipes that don't need checking.
214 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
215 continue;
216
218 if (!Loc)
219 // Conservatively assume aliasing for memory operations without
220 // location.
221 return false;
222
224 return false;
225 }
226
227 if (Block == LastBB)
228 break;
229 }
230 return true;
231}
232
233/// Return true if we do not know how to (mechanically) hoist or sink \p R out
234/// of a loop region.
236 // Assumes don't alias anything or throw; as long as they're guaranteed to
237 // execute, they're safe to hoist.
239 return false;
240
241 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
242 // memory location is not modified in the vector loop.
243 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
244 return true;
245
246 // Allocas cannot be hoisted.
247 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
248 return RepR && RepR->getOpcode() == Instruction::Alloca;
249}
250
251static bool sinkScalarOperands(VPlan &Plan) {
252 auto Iter = vp_depth_first_deep(Plan.getEntry());
253 bool ScalarVFOnly = Plan.hasScalarVFOnly();
254 bool Changed = false;
255
257 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
258 VPBasicBlock *SinkTo, VPValue *Op) {
259 auto *Candidate =
260 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
261 if (!Candidate)
262 return;
263
264 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
265 // for now.
267 return;
268
269 if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
270 return;
271
272 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
273 if (!ScalarVFOnly && RepR->isSingleScalar())
274 return;
275
276 WorkList.insert({SinkTo, Candidate});
277 };
278
279 // First, collect the operands of all recipes in replicate blocks as seeds for
280 // sinking.
282 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
283 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
284 continue;
285 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
286 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
287 continue;
288 for (auto &Recipe : *VPBB)
289 for (VPValue *Op : Recipe.operands())
290 InsertIfValidSinkCandidate(VPBB, Op);
291 }
292
293 // Try to sink each replicate or scalar IV steps recipe in the worklist.
294 for (unsigned I = 0; I != WorkList.size(); ++I) {
295 VPBasicBlock *SinkTo;
296 VPSingleDefRecipe *SinkCandidate;
297 std::tie(SinkTo, SinkCandidate) = WorkList[I];
298
299 // All recipe users of SinkCandidate must be in the same block SinkTo or all
300 // users outside of SinkTo must only use the first lane of SinkCandidate. In
301 // the latter case, we need to duplicate SinkCandidate.
302 auto UsersOutsideSinkTo =
303 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
304 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
305 });
306 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
307 return !U->usesFirstLaneOnly(SinkCandidate);
308 }))
309 continue;
310 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
311
312 if (NeedsDuplicating) {
313 if (ScalarVFOnly)
314 continue;
315 VPSingleDefRecipe *Clone;
316 if (auto *SinkCandidateRepR =
317 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
318 // TODO: Handle converting to uniform recipes as separate transform,
319 // then cloning should be sufficient here.
320 Instruction *I = SinkCandidate->getUnderlyingInstr();
321 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
322 nullptr /*Mask*/, *SinkCandidateRepR,
323 *SinkCandidateRepR);
324 // TODO: add ".cloned" suffix to name of Clone's VPValue.
325 } else {
326 Clone = SinkCandidate->clone();
327 }
328
329 Clone->insertBefore(SinkCandidate);
330 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
331 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
332 });
333 }
334 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
335 for (VPValue *Op : SinkCandidate->operands())
336 InsertIfValidSinkCandidate(SinkTo, Op);
337 Changed = true;
338 }
339 return Changed;
340}
341
342/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
343/// the mask.
345 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
346 if (!EntryBB || EntryBB->size() != 1 ||
347 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
348 return nullptr;
349
350 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
351}
352
353/// If \p R is a triangle region, return the 'then' block of the triangle.
355 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
356 if (EntryBB->getNumSuccessors() != 2)
357 return nullptr;
358
359 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
360 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
361 if (!Succ0 || !Succ1)
362 return nullptr;
363
364 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
365 return nullptr;
366 if (Succ0->getSingleSuccessor() == Succ1)
367 return Succ0;
368 if (Succ1->getSingleSuccessor() == Succ0)
369 return Succ1;
370 return nullptr;
371}
372
373// Merge replicate regions in their successor region, if a replicate region
374// is connected to a successor replicate region with the same predicate by a
375// single, empty VPBasicBlock.
377 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
378
379 // Collect replicate regions followed by an empty block, followed by another
380 // replicate region with matching masks to process front. This is to avoid
381 // iterator invalidation issues while merging regions.
384 vp_depth_first_deep(Plan.getEntry()))) {
385 if (!Region1->isReplicator())
386 continue;
387 auto *MiddleBasicBlock =
388 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
389 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
390 continue;
391
392 auto *Region2 =
393 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
394 if (!Region2 || !Region2->isReplicator())
395 continue;
396
397 VPValue *Mask1 = getPredicatedMask(Region1);
398 VPValue *Mask2 = getPredicatedMask(Region2);
399 if (!Mask1 || Mask1 != Mask2)
400 continue;
401
402 assert(Mask1 && Mask2 && "both region must have conditions");
403 WorkList.push_back(Region1);
404 }
405
406 // Move recipes from Region1 to its successor region, if both are triangles.
407 for (VPRegionBlock *Region1 : WorkList) {
408 if (TransformedRegions.contains(Region1))
409 continue;
410 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
411 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
412
413 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
414 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
415 if (!Then1 || !Then2)
416 continue;
417
418 // Note: No fusion-preventing memory dependencies are expected in either
419 // region. Such dependencies should be rejected during earlier dependence
420 // checks, which guarantee accesses can be re-ordered for vectorization.
421 //
422 // Move recipes to the successor region.
423 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
424 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
425
426 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
427 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
428
429 // Move VPPredInstPHIRecipes from the merge block to the successor region's
430 // merge block. Update all users inside the successor region to use the
431 // original values.
432 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
433 VPValue *PredInst1 =
434 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
435 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
436 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
437 return cast<VPRecipeBase>(&U)->getParent() == Then2;
438 });
439
440 // Remove phi recipes that are unused after merging the regions.
441 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
442 Phi1ToMove.eraseFromParent();
443 continue;
444 }
445 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
446 }
447
448 // Remove the dead recipes in Region1's entry block.
449 for (VPRecipeBase &R :
450 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
451 R.eraseFromParent();
452
453 // Finally, remove the first region.
454 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
455 VPBlockUtils::disconnectBlocks(Pred, Region1);
456 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
457 }
458 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
459 TransformedRegions.insert(Region1);
460 }
461
462 return !TransformedRegions.empty();
463}
464
466 VPlan &Plan) {
467 Instruction *Instr = PredRecipe->getUnderlyingInstr();
468 // Build the triangular if-then region.
469 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
470 assert(Instr->getParent() && "Predicated instruction not in any basic block");
471 auto *BlockInMask = PredRecipe->getMask();
472 auto *MaskDef = BlockInMask->getDefiningRecipe();
473 auto *BOMRecipe = new VPBranchOnMaskRecipe(
474 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
475 auto *Entry =
476 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
477
478 // Replace predicated replicate recipe with a replicate recipe without a
479 // mask but in the replicate region.
480 auto *RecipeWithoutMask = new VPReplicateRecipe(
481 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
482 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
483 PredRecipe->getDebugLoc());
484 auto *Pred =
485 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
486
487 VPPredInstPHIRecipe *PHIRecipe = nullptr;
488 if (PredRecipe->getNumUsers() != 0) {
489 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
490 RecipeWithoutMask->getDebugLoc());
491 PredRecipe->replaceAllUsesWith(PHIRecipe);
492 PHIRecipe->setOperand(0, RecipeWithoutMask);
493 }
494 PredRecipe->eraseFromParent();
495 auto *Exiting =
496 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
498 Plan.createReplicateRegion(Entry, Exiting, RegionName);
499
500 // Note: first set Entry as region entry and then connect successors starting
501 // from it in order, to propagate the "parent" of each VPBasicBlock.
502 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
503 VPBlockUtils::connectBlocks(Pred, Exiting);
504
505 return Region;
506}
507
508static void addReplicateRegions(VPlan &Plan) {
511 vp_depth_first_deep(Plan.getEntry()))) {
512 for (VPRecipeBase &R : *VPBB)
513 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
514 if (RepR->isPredicated())
515 WorkList.push_back(RepR);
516 }
517 }
518
519 unsigned BBNum = 0;
520 for (VPReplicateRecipe *RepR : WorkList) {
521 VPBasicBlock *CurrentBlock = RepR->getParent();
522 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
523
524 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
525 SplitBlock->setName(
526 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
527 // Record predicated instructions for above packing optimizations.
529 Region->setParent(CurrentBlock->getParent());
531
532 VPRegionBlock *ParentRegion = Region->getParent();
533 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
534 ParentRegion->setExiting(SplitBlock);
535 }
536}
537
538/// Remove redundant VPBasicBlocks by merging them into their predecessor if
539/// the predecessor has a single successor.
543 vp_depth_first_deep(Plan.getEntry()))) {
544 // Don't fold the blocks in the skeleton of the Plan into their single
545 // predecessors for now.
546 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
547 if (!VPBB->getParent())
548 continue;
549 auto *PredVPBB =
550 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
551 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
552 isa<VPIRBasicBlock>(PredVPBB))
553 continue;
554 WorkList.push_back(VPBB);
555 }
556
557 for (VPBasicBlock *VPBB : WorkList) {
558 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
559 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
560 R.moveBefore(*PredVPBB, PredVPBB->end());
561 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
562 auto *ParentRegion = VPBB->getParent();
563 if (ParentRegion && ParentRegion->getExiting() == VPBB)
564 ParentRegion->setExiting(PredVPBB);
565 for (auto *Succ : to_vector(VPBB->successors())) {
567 VPBlockUtils::connectBlocks(PredVPBB, Succ);
568 }
569 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
570 }
571 return !WorkList.empty();
572}
573
575 // Convert masked VPReplicateRecipes to if-then region blocks.
577
578 bool ShouldSimplify = true;
579 while (ShouldSimplify) {
580 ShouldSimplify = sinkScalarOperands(Plan);
581 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
582 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
583 }
584}
585
586/// Remove redundant casts of inductions.
587///
588/// Such redundant casts are casts of induction variables that can be ignored,
589/// because we already proved that the casted phi is equal to the uncasted phi
590/// in the vectorized loop. There is no need to vectorize the cast - the same
591/// value can be used for both the phi and casts in the vector loop.
593 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
595 if (!IV || IV->getTruncInst())
596 continue;
597
598 // A sequence of IR Casts has potentially been recorded for IV, which
599 // *must be bypassed* when the IV is vectorized, because the vectorized IV
600 // will produce the desired casted value. This sequence forms a def-use
601 // chain and is provided in reverse order, ending with the cast that uses
602 // the IV phi. Search for the recipe of the last cast in the chain and
603 // replace it with the original IV. Note that only the final cast is
604 // expected to have users outside the cast-chain and the dead casts left
605 // over will be cleaned up later.
606 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
607 VPValue *FindMyCast = IV;
608 for (Instruction *IRCast : reverse(Casts)) {
609 VPSingleDefRecipe *FoundUserCast = nullptr;
610 for (auto *U : FindMyCast->users()) {
611 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
612 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
613 FoundUserCast = UserCast;
614 break;
615 }
616 }
617 FindMyCast = FoundUserCast;
618 }
619 FindMyCast->replaceAllUsesWith(IV);
620 }
621}
622
623/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
624/// recipe, if it exists.
626 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
627 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
628 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
629 for (VPUser *U : CanonicalIV->users()) {
631 if (WidenNewIV)
632 break;
633 }
634
635 if (!WidenNewIV)
636 return;
637
638 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
639 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
640 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
641
642 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
643 continue;
644
645 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
646 // everything WidenNewIV's users need. That is, WidenOriginalIV will
647 // generate a vector phi or all users of WidenNewIV demand the first lane
648 // only.
649 if (Plan.hasScalarVFOnly() ||
650 !vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
651 vputils::onlyFirstLaneUsed(WidenNewIV)) {
652 // We are replacing a wide canonical iv with a suitable wide induction.
653 // This is used to compute header mask, hence all lanes will be used and
654 // we need to drop wrap flags only applying to lanes guranteed to execute
655 // in the original scalar loop.
656 WidenOriginalIV->dropPoisonGeneratingFlags();
657 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
658 WidenNewIV->eraseFromParent();
659 return;
660 }
661 }
662}
663
664/// Returns true if \p R is dead and can be removed.
665static bool isDeadRecipe(VPRecipeBase &R) {
666 // Do remove conditional assume instructions as their conditions may be
667 // flattened.
668 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
669 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
671 if (IsConditionalAssume)
672 return true;
673
674 if (R.mayHaveSideEffects())
675 return false;
676
677 // Recipe is dead if no user keeps the recipe alive.
678 return all_of(R.definedValues(),
679 [](VPValue *V) { return V->getNumUsers() == 0; });
680}
681
684 vp_post_order_deep(Plan.getEntry()))) {
685 // The recipes in the block are processed in reverse order, to catch chains
686 // of dead recipes.
687 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
688 if (isDeadRecipe(R)) {
689 R.eraseFromParent();
690 continue;
691 }
692
693 // Check if R is a dead VPPhi <-> update cycle and remove it.
694 auto *PhiR = dyn_cast<VPPhi>(&R);
695 if (!PhiR || PhiR->getNumOperands() != 2)
696 continue;
697 VPUser *PhiUser = PhiR->getSingleUser();
698 if (!PhiUser)
699 continue;
700 VPValue *Incoming = PhiR->getOperand(1);
701 if (PhiUser != Incoming->getDefiningRecipe() ||
702 Incoming->getNumUsers() != 1)
703 continue;
704 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
705 PhiR->eraseFromParent();
706 Incoming->getDefiningRecipe()->eraseFromParent();
707 }
708 }
709}
710
713 Instruction::BinaryOps InductionOpcode,
714 FPMathOperator *FPBinOp, Instruction *TruncI,
715 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
716 VPBuilder &Builder) {
717 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
718 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
719 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
720 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
721 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
722
723 // Truncate base induction if needed.
724 VPTypeAnalysis TypeInfo(Plan);
725 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
726 if (TruncI) {
727 Type *TruncTy = TruncI->getType();
728 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
729 "Not truncating.");
730 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
731 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
732 ResultTy = TruncTy;
733 }
734
735 // Truncate step if needed.
736 Type *StepTy = TypeInfo.inferScalarType(Step);
737 if (ResultTy != StepTy) {
738 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
739 "Not truncating.");
740 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
741 auto *VecPreheader =
743 VPBuilder::InsertPointGuard Guard(Builder);
744 Builder.setInsertPoint(VecPreheader);
745 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
746 }
747 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
748 &Plan.getVF(), DL);
749}
750
753 for (unsigned I = 0; I != Users.size(); ++I) {
755 if (isa<VPHeaderPHIRecipe>(Cur))
756 continue;
757 for (VPValue *V : Cur->definedValues())
758 Users.insert_range(V->users());
759 }
760 return Users.takeVector();
761}
762
763/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
764/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
765/// generates scalar values.
766static VPValue *
768 VPlan &Plan, VPBuilder &Builder) {
770 VPIRValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0);
771 VPValue *StepV = PtrIV->getOperand(1);
773 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
774 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
775
776 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
777 PtrIV->getDebugLoc(), "next.gep");
778}
779
780/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
781/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
782/// VPWidenPointerInductionRecipe will generate vectors only. If some users
783/// require vectors while other require scalars, the scalar uses need to extract
784/// the scalars from the generated vectors (Note that this is different to how
785/// int/fp inductions are handled). Legalize extract-from-ends using uniform
786/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
787/// the correct end value is available. Also optimize
788/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
789/// providing them scalar steps built on the canonical scalar IV and update the
790/// original IV's users. This is an optional optimization to reduce the needs of
791/// vector extracts.
794 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
795 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
796 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
797 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
798 if (!PhiR)
799 continue;
800
801 // Try to narrow wide and replicating recipes to uniform recipes, based on
802 // VPlan analysis.
803 // TODO: Apply to all recipes in the future, to replace legacy uniformity
804 // analysis.
805 auto Users = collectUsersRecursively(PhiR);
806 for (VPUser *U : reverse(Users)) {
807 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
808 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
809 // Skip recipes that shouldn't be narrowed.
810 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
811 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
812 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
813 continue;
814
815 // Skip recipes that may have other lanes than their first used.
817 continue;
818
819 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
820 Def->operands(), /*IsUniform*/ true,
821 /*Mask*/ nullptr, /*Flags*/ *Def);
822 Clone->insertAfter(Def);
823 Def->replaceAllUsesWith(Clone);
824 }
825
826 // Replace wide pointer inductions which have only their scalars used by
827 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
828 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
829 if (!Plan.hasScalarVFOnly() &&
830 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
831 continue;
832
833 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
834 PtrIV->replaceAllUsesWith(PtrAdd);
835 continue;
836 }
837
838 // Replace widened induction with scalar steps for users that only use
839 // scalars.
840 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
841 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
842 return U->usesScalars(WideIV);
843 }))
844 continue;
845
846 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
848 Plan, ID.getKind(), ID.getInductionOpcode(),
849 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
850 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
851 WideIV->getDebugLoc(), Builder);
852
853 // Update scalar users of IV to use Step instead.
854 if (!HasOnlyVectorVFs) {
855 assert(!Plan.hasScalableVF() &&
856 "plans containing a scalar VF cannot also include scalable VFs");
857 WideIV->replaceAllUsesWith(Steps);
858 } else {
859 bool HasScalableVF = Plan.hasScalableVF();
860 WideIV->replaceUsesWithIf(Steps,
861 [WideIV, HasScalableVF](VPUser &U, unsigned) {
862 if (HasScalableVF)
863 return U.usesFirstLaneOnly(WideIV);
864 return U.usesScalars(WideIV);
865 });
866 }
867 }
868}
869
870/// Check if \p VPV is an untruncated wide induction, either before or after the
871/// increment. If so return the header IV (before the increment), otherwise
872/// return null.
875 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
876 if (WideIV) {
877 // VPV itself is a wide induction, separately compute the end value for exit
878 // users if it is not a truncated IV.
879 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
880 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
881 }
882
883 // Check if VPV is an optimizable induction increment.
884 VPRecipeBase *Def = VPV->getDefiningRecipe();
885 if (!Def || Def->getNumOperands() != 2)
886 return nullptr;
887 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
888 if (!WideIV)
889 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
890 if (!WideIV)
891 return nullptr;
892
893 auto IsWideIVInc = [&]() {
894 auto &ID = WideIV->getInductionDescriptor();
895
896 // Check if VPV increments the induction by the induction step.
897 VPValue *IVStep = WideIV->getStepValue();
898 switch (ID.getInductionOpcode()) {
899 case Instruction::Add:
900 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
901 case Instruction::FAdd:
902 return match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
903 case Instruction::FSub:
904 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
905 m_Specific(IVStep)));
906 case Instruction::Sub: {
907 // IVStep will be the negated step of the subtraction. Check if Step == -1
908 // * IVStep.
909 VPValue *Step;
910 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
911 return false;
912 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
913 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
914 ScalarEvolution &SE = *PSE.getSE();
915 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
916 !isa<SCEVCouldNotCompute>(StepSCEV) &&
917 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
918 }
919 default:
920 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
921 match(VPV, m_GetElementPtr(m_Specific(WideIV),
922 m_Specific(WideIV->getStepValue())));
923 }
924 llvm_unreachable("should have been covered by switch above");
925 };
926 return IsWideIVInc() ? WideIV : nullptr;
927}
928
929/// Attempts to optimize the induction variable exit values for users in the
930/// early exit block.
932 VPTypeAnalysis &TypeInfo,
933 VPBlockBase *PredVPBB,
934 VPValue *Op,
936 VPValue *Incoming, *Mask;
939 return nullptr;
940
941 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
942 if (!WideIV)
943 return nullptr;
944
945 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
946 if (WideIntOrFp && WideIntOrFp->getTruncInst())
947 return nullptr;
948
949 // Calculate the final index.
950 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
951 auto *CanonicalIV = LoopRegion->getCanonicalIV();
952 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
953 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
954
955 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
956 VPValue *FirstActiveLane =
957 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
958 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
959 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
960 FirstActiveLaneType, DL);
961 VPValue *EndValue = B.createAdd(CanonicalIV, FirstActiveLane, DL);
962
963 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
964 // changed it means the exit is using the incremented value, so we need to
965 // add the step.
966 if (Incoming != WideIV) {
967 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
968 EndValue = B.createAdd(EndValue, One, DL);
969 }
970
971 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
972 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
973 VPIRValue *Start = WideIV->getStartValue();
974 VPValue *Step = WideIV->getStepValue();
975 EndValue = B.createDerivedIV(
976 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
977 Start, EndValue, Step);
978 }
979
980 return EndValue;
981}
982
983/// Attempts to optimize the induction variable exit values for users in the
984/// exit block coming from the latch in the original scalar loop.
986 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
989 VPWidenInductionRecipe *WideIV = nullptr;
991 WideIV = getOptimizableIVOf(Incoming, PSE);
992 assert(WideIV && "must have an optimizable IV");
993 return EndValues.lookup(WideIV);
994 }
995
997 WideIV = getOptimizableIVOf(Incoming, PSE);
998
999 if (!WideIV)
1000 return nullptr;
1001
1002 VPValue *EndValue = EndValues.lookup(WideIV);
1003 assert(EndValue && "end value must have been pre-computed");
1004
1005 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1006 // changed it means the exit is using the incremented value, so we don't
1007 // need to subtract the step.
1008 if (Incoming != WideIV)
1009 return EndValue;
1010
1011 // Otherwise, subtract the step from the EndValue.
1012 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
1013 VPValue *Step = WideIV->getStepValue();
1014 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
1015 if (ScalarTy->isIntegerTy())
1016 return B.createSub(EndValue, Step, DebugLoc::getUnknown(), "ind.escape");
1017 if (ScalarTy->isPointerTy()) {
1018 Type *StepTy = TypeInfo.inferScalarType(Step);
1019 auto *Zero = Plan.getConstantInt(StepTy, 0);
1020 return B.createPtrAdd(EndValue, B.createSub(Zero, Step),
1021 DebugLoc::getUnknown(), "ind.escape");
1022 }
1023 if (ScalarTy->isFloatingPointTy()) {
1024 const auto &ID = WideIV->getInductionDescriptor();
1025 return B.createNaryOp(
1026 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1027 ? Instruction::FSub
1028 : Instruction::FAdd,
1029 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1030 }
1031 llvm_unreachable("all possible induction types must be handled");
1032 return nullptr;
1033}
1034
1036 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
1038 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
1039 VPTypeAnalysis TypeInfo(Plan);
1040 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1041 for (VPRecipeBase &R : ExitVPBB->phis()) {
1042 auto *ExitIRI = cast<VPIRPhi>(&R);
1043
1044 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1045 VPValue *Escape = nullptr;
1046 if (PredVPBB == MiddleVPBB)
1047 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
1048 ExitIRI->getOperand(Idx),
1049 EndValues, PSE);
1050 else
1052 Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), PSE);
1053 if (Escape)
1054 ExitIRI->setOperand(Idx, Escape);
1055 }
1056 }
1057 }
1058}
1059
1060/// Remove redundant EpxandSCEVRecipes in \p Plan's entry block by replacing
1061/// them with already existing recipes expanding the same SCEV expression.
1064
1065 for (VPRecipeBase &R :
1067 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1068 if (!ExpR)
1069 continue;
1070
1071 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1072 if (Inserted)
1073 continue;
1074 ExpR->replaceAllUsesWith(V->second);
1075 ExpR->eraseFromParent();
1076 }
1077}
1078
1080 SmallVector<VPValue *> WorkList;
1082 WorkList.push_back(V);
1083
1084 while (!WorkList.empty()) {
1085 VPValue *Cur = WorkList.pop_back_val();
1086 if (!Seen.insert(Cur).second)
1087 continue;
1088 VPRecipeBase *R = Cur->getDefiningRecipe();
1089 if (!R)
1090 continue;
1091 if (!isDeadRecipe(*R))
1092 continue;
1093 append_range(WorkList, R->operands());
1094 R->eraseFromParent();
1095 }
1096}
1097
1098/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1099/// Returns an optional pair, where the first element indicates whether it is
1100/// an intrinsic ID.
1101static std::optional<std::pair<bool, unsigned>>
1103 return TypeSwitch<const VPSingleDefRecipe *,
1104 std::optional<std::pair<bool, unsigned>>>(R)
1107 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1108 .Case([](const VPWidenIntrinsicRecipe *I) {
1109 return std::make_pair(true, I->getVectorIntrinsicID());
1110 })
1111 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
1112 // For recipes that do not directly map to LLVM IR instructions,
1113 // assign opcodes after the last VPInstruction opcode (which is also
1114 // after the last IR Instruction opcode), based on the VPRecipeID.
1115 return std::make_pair(false,
1116 VPInstruction::OpsEnd + 1 + I->getVPRecipeID());
1117 })
1118 .Default([](auto *) { return std::nullopt; });
1119}
1120
1121/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1122/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1123/// Operands are foldable live-ins.
1125 ArrayRef<VPValue *> Operands,
1126 const DataLayout &DL,
1127 VPTypeAnalysis &TypeInfo) {
1128 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1129 if (!OpcodeOrIID)
1130 return nullptr;
1131
1133 for (VPValue *Op : Operands) {
1134 if (!match(Op, m_LiveIn()))
1135 return nullptr;
1136 Value *V = Op->getUnderlyingValue();
1137 if (!V)
1138 return nullptr;
1139 Ops.push_back(V);
1140 }
1141
1142 auto FoldToIRValue = [&]() -> Value * {
1143 InstSimplifyFolder Folder(DL);
1144 if (OpcodeOrIID->first) {
1145 if (R.getNumOperands() != 2)
1146 return nullptr;
1147 unsigned ID = OpcodeOrIID->second;
1148 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1149 TypeInfo.inferScalarType(&R));
1150 }
1151 unsigned Opcode = OpcodeOrIID->second;
1152 if (Instruction::isBinaryOp(Opcode))
1153 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1154 Ops[0], Ops[1]);
1155 if (Instruction::isCast(Opcode))
1156 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1157 TypeInfo.inferScalarType(R.getVPSingleValue()));
1158 switch (Opcode) {
1160 return Folder.FoldSelect(Ops[0], Ops[1],
1162 case VPInstruction::Not:
1163 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1165 case Instruction::Select:
1166 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1167 case Instruction::ICmp:
1168 case Instruction::FCmp:
1169 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1170 Ops[1]);
1171 case Instruction::GetElementPtr: {
1172 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1173 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1174 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1175 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1176 }
1179 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1180 Ops[0], Ops[1],
1181 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1182 // An extract of a live-in is an extract of a broadcast, so return the
1183 // broadcasted element.
1184 case Instruction::ExtractElement:
1185 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1186 return Ops[0];
1187 }
1188 return nullptr;
1189 };
1190
1191 if (Value *V = FoldToIRValue())
1192 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1193 return nullptr;
1194}
1195
1196/// Try to simplify VPSingleDefRecipe \p Def.
1198 VPlan *Plan = Def->getParent()->getPlan();
1199
1200 // Simplification of live-in IR values for SingleDef recipes using
1201 // InstSimplifyFolder.
1202 const DataLayout &DL =
1204 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1205 return Def->replaceAllUsesWith(V);
1206
1207 // Fold PredPHI LiveIn -> LiveIn.
1208 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1209 VPValue *Op = PredPHI->getOperand(0);
1210 if (isa<VPIRValue>(Op))
1211 PredPHI->replaceAllUsesWith(Op);
1212 }
1213
1214 VPBuilder Builder(Def);
1215
1216 // Avoid replacing VPInstructions with underlying values with new
1217 // VPInstructions, as we would fail to create widen/replicate recpes from the
1218 // new VPInstructions without an underlying value, and miss out on some
1219 // transformations that only apply to widened/replicated recipes later, by
1220 // doing so.
1221 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1222 // VPInstructions without underlying values, as those will get skipped during
1223 // cost computation.
1224 bool CanCreateNewRecipe =
1225 !isa<VPInstruction>(Def) || !Def->getUnderlyingValue();
1226
1227 VPValue *A;
1228 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1229 Type *TruncTy = TypeInfo.inferScalarType(Def);
1230 Type *ATy = TypeInfo.inferScalarType(A);
1231 if (TruncTy == ATy) {
1232 Def->replaceAllUsesWith(A);
1233 } else {
1234 // Don't replace a non-widened cast recipe with a widened cast.
1235 if (!isa<VPWidenCastRecipe>(Def))
1236 return;
1237 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1238
1239 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1240 ? Instruction::SExt
1241 : Instruction::ZExt;
1242 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1243 TruncTy);
1244 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1245 // UnderlyingExt has distinct return type, used to retain legacy cost.
1246 Ext->setUnderlyingValue(UnderlyingExt);
1247 }
1248 Def->replaceAllUsesWith(Ext);
1249 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1250 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1251 Def->replaceAllUsesWith(Trunc);
1252 }
1253 }
1254#ifndef NDEBUG
1255 // Verify that the cached type info is for both A and its users is still
1256 // accurate by comparing it to freshly computed types.
1257 VPTypeAnalysis TypeInfo2(*Plan);
1258 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1259 for (VPUser *U : A->users()) {
1260 auto *R = cast<VPRecipeBase>(U);
1261 for (VPValue *VPV : R->definedValues())
1262 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1263 }
1264#endif
1265 }
1266
1267 // Simplify (X && Y) | (X && !Y) -> X.
1268 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1269 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1270 // recipes to be visited during simplification.
1271 VPValue *X, *Y, *Z;
1272 if (match(Def,
1275 Def->replaceAllUsesWith(X);
1276 Def->eraseFromParent();
1277 return;
1278 }
1279
1280 // x | 1 -> 1
1281 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1282 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1283
1284 // x | 0 -> x
1285 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1286 return Def->replaceAllUsesWith(X);
1287
1288 // x | !x -> AllOnes
1289 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_Not(m_Deferred(X))))) {
1290 return Def->replaceAllUsesWith(Plan->getOrAddLiveIn(
1292 }
1293
1294 // x & 0 -> 0
1295 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1296 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1297
1298 // x & AllOnes -> x
1299 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_AllOnes())))
1300 return Def->replaceAllUsesWith(X);
1301
1302 // x && false -> false
1303 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_False())))
1304 return Def->replaceAllUsesWith(Plan->getFalse());
1305
1306 // x && true -> x
1307 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_True())))
1308 return Def->replaceAllUsesWith(X);
1309
1310 // (x && y) | (x && z) -> x && (y | z)
1311 if (CanCreateNewRecipe &&
1314 // Simplify only if one of the operands has one use to avoid creating an
1315 // extra recipe.
1316 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1317 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1318 return Def->replaceAllUsesWith(
1319 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1320
1321 // x && !x -> 0
1323 return Def->replaceAllUsesWith(Plan->getFalse());
1324
1325 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1326 return Def->replaceAllUsesWith(X);
1327
1328 // select c, false, true -> not c
1329 VPValue *C;
1330 if (CanCreateNewRecipe &&
1331 match(Def, m_Select(m_VPValue(C), m_False(), m_True())))
1332 return Def->replaceAllUsesWith(Builder.createNot(C));
1333
1334 // select !c, x, y -> select c, y, x
1335 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1336 Def->setOperand(0, C);
1337 Def->setOperand(1, Y);
1338 Def->setOperand(2, X);
1339 return;
1340 }
1341
1342 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1343 return Def->replaceAllUsesWith(A);
1344
1345 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1346 return Def->replaceAllUsesWith(A);
1347
1348 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1349 return Def->replaceAllUsesWith(
1350 Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0));
1351
1352 const APInt *APC;
1353 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) &&
1354 APC->isPowerOf2())
1355 return Def->replaceAllUsesWith(Builder.createNaryOp(
1356 Instruction::Shl,
1357 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1358 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1359
1360 // Don't convert udiv to lshr inside a replicate region, as VPInstructions are
1361 // not allowed in them.
1362 const VPRegionBlock *ParentRegion = Def->getParent()->getParent();
1363 bool IsInReplicateRegion = ParentRegion && ParentRegion->isReplicator();
1364 if (CanCreateNewRecipe && !IsInReplicateRegion &&
1365 match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) && APC->isPowerOf2())
1366 return Def->replaceAllUsesWith(Builder.createNaryOp(
1367 Instruction::LShr,
1368 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1369 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1370
1371 if (match(Def, m_Not(m_VPValue(A)))) {
1372 if (match(A, m_Not(m_VPValue(A))))
1373 return Def->replaceAllUsesWith(A);
1374
1375 // Try to fold Not into compares by adjusting the predicate in-place.
1376 CmpPredicate Pred;
1377 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1378 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1379 if (all_of(Cmp->users(),
1381 m_Not(m_Specific(Cmp)),
1382 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1383 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1384 for (VPUser *U : to_vector(Cmp->users())) {
1385 auto *R = cast<VPSingleDefRecipe>(U);
1386 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1387 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1388 R->setOperand(1, Y);
1389 R->setOperand(2, X);
1390 } else {
1391 // not (cmp pred) -> cmp inv_pred
1392 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1393 R->replaceAllUsesWith(Cmp);
1394 }
1395 }
1396 // If Cmp doesn't have a debug location, use the one from the negation,
1397 // to preserve the location.
1398 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1399 Cmp->setDebugLoc(Def->getDebugLoc());
1400 }
1401 }
1402 }
1403
1404 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1405 // any-of (fcmp uno %A, %B), ...
1406 if (match(Def, m_AnyOf())) {
1408 VPRecipeBase *UnpairedCmp = nullptr;
1409 for (VPValue *Op : Def->operands()) {
1410 VPValue *X;
1411 if (Op->getNumUsers() > 1 ||
1413 m_Deferred(X)))) {
1414 NewOps.push_back(Op);
1415 } else if (!UnpairedCmp) {
1416 UnpairedCmp = Op->getDefiningRecipe();
1417 } else {
1418 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1419 UnpairedCmp->getOperand(0), X));
1420 UnpairedCmp = nullptr;
1421 }
1422 }
1423
1424 if (UnpairedCmp)
1425 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1426
1427 if (NewOps.size() < Def->getNumOperands()) {
1428 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1429 return Def->replaceAllUsesWith(NewAnyOf);
1430 }
1431 }
1432
1433 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1434 // This is useful for fmax/fmin without fast-math flags, where we need to
1435 // check if any operand is NaN.
1436 if (CanCreateNewRecipe &&
1438 m_Deferred(X)),
1440 m_Deferred(Y))))) {
1441 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1442 return Def->replaceAllUsesWith(NewCmp);
1443 }
1444
1445 // Remove redundant DerviedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1446 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1447 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1448 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1449 TypeInfo.inferScalarType(Def))
1450 return Def->replaceAllUsesWith(Def->getOperand(1));
1451
1453 m_One()))) {
1454 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1455 if (TypeInfo.inferScalarType(X) != WideStepTy)
1456 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1457 Def->replaceAllUsesWith(X);
1458 return;
1459 }
1460
1461 // For i1 vp.merges produced by AnyOf reductions:
1462 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1464 m_VPValue(X), m_VPValue())) &&
1466 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1467 Def->setOperand(1, Def->getOperand(0));
1468 Def->setOperand(0, Y);
1469 return;
1470 }
1471
1472 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1473 if (Phi->getOperand(0) == Phi->getOperand(1))
1474 Phi->replaceAllUsesWith(Phi->getOperand(0));
1475 return;
1476 }
1477
1478 // Simplify MaskedCond with no block mask to its single operand.
1480 !cast<VPInstruction>(Def)->isMasked())
1481 return Def->replaceAllUsesWith(Def->getOperand(0));
1482
1483 // Look through ExtractLastLane.
1484 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
1485 if (match(A, m_BuildVector())) {
1486 auto *BuildVector = cast<VPInstruction>(A);
1487 Def->replaceAllUsesWith(
1488 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1489 return;
1490 }
1491 if (Plan->hasScalarVFOnly())
1492 return Def->replaceAllUsesWith(A);
1493 }
1494
1495 // Look through ExtractPenultimateElement (BuildVector ....).
1497 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1498 Def->replaceAllUsesWith(
1499 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1500 return;
1501 }
1502
1503 uint64_t Idx;
1505 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1506 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1507 return;
1508 }
1509
1510 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1511 Def->replaceAllUsesWith(
1512 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1513 return;
1514 }
1515
1516 // Look through broadcast of single-scalar when used as select conditions; in
1517 // that case the scalar condition can be used directly.
1518 if (match(Def,
1521 "broadcast operand must be single-scalar");
1522 Def->setOperand(0, C);
1523 return;
1524 }
1525
1527 if (Def->getNumOperands() == 1)
1528 Def->replaceAllUsesWith(Def->getOperand(0));
1529 return;
1530 }
1531
1532 VPIRValue *IRV;
1533 if (Def->getNumOperands() == 1 &&
1535 return Def->replaceAllUsesWith(IRV);
1536
1537 // Some simplifications can only be applied after unrolling. Perform them
1538 // below.
1539 if (!Plan->isUnrolled())
1540 return;
1541
1542 // After unrolling, extract-lane may be used to extract values from multiple
1543 // scalar sources. Only simplify when extracting from a single scalar source.
1544 VPValue *LaneToExtract;
1545 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1546 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1548 return Def->replaceAllUsesWith(A);
1549
1550 // Simplify extract-lane with single source to extract-element.
1551 Def->replaceAllUsesWith(Builder.createNaryOp(
1552 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1553 return;
1554 }
1555
1556 // Hoist an invariant increment Y of a phi X, by having X start at Y.
1557 if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && isa<VPIRValue>(Y) &&
1558 isa<VPPhi>(X)) {
1559 auto *Phi = cast<VPPhi>(X);
1560 if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
1561 Phi->getSingleUser() == Def) {
1562 Phi->setOperand(0, Y);
1563 Def->replaceAllUsesWith(Phi);
1564 return;
1565 }
1566 }
1567
1568 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1569 // just the pointer operand.
1570 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1571 if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
1572 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1573
1574 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1575 // the start index is zero and only the first lane 0 is demanded.
1576 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1577 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1578 Steps->replaceAllUsesWith(Steps->getOperand(0));
1579 return;
1580 }
1581 }
1582 // Simplify redundant ReductionStartVector recipes after unrolling.
1583 VPValue *StartV;
1585 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1586 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1587 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1588 return PhiR && PhiR->isInLoop();
1589 });
1590 return;
1591 }
1592
1594 Def->replaceAllUsesWith(A);
1595 return;
1596 }
1597
1598 if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
1601 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1602 all_of(A->users(),
1603 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1604 return Def->replaceAllUsesWith(A);
1605 }
1606
1607 if (Plan->getConcreteUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1608 return Def->replaceAllUsesWith(A);
1609}
1610
1613 Plan.getEntry());
1614 VPTypeAnalysis TypeInfo(Plan);
1616 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1617 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1618 simplifyRecipe(Def, TypeInfo);
1619 }
1620}
1621
1622/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1623/// header mask to be simplified further when tail folding, e.g. in
1624/// optimizeEVLMasks.
1625static void reassociateHeaderMask(VPlan &Plan) {
1626 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1627 if (!HeaderMask)
1628 return;
1629
1630 SmallVector<VPUser *> Worklist;
1631 for (VPUser *U : HeaderMask->users())
1632 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
1634
1635 while (!Worklist.empty()) {
1636 auto *R = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
1637 VPValue *X, *Y;
1638 if (!R || !match(R, m_LogicalAnd(
1639 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
1640 m_VPValue(Y))))
1641 continue;
1642 append_range(Worklist, R->users());
1643 VPBuilder Builder(R);
1644 R->replaceAllUsesWith(
1645 Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
1646 }
1647}
1648
1650 if (Plan.hasScalarVFOnly())
1651 return;
1652
1653 // Try to narrow wide and replicating recipes to single scalar recipes,
1654 // based on VPlan analysis. Only process blocks in the loop region for now,
1655 // without traversing into nested regions, as recipes in replicate regions
1656 // cannot be converted yet.
1659 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1661 VPWidenStoreRecipe>(&R))
1662 continue;
1663 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1664 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1665 continue;
1666
1667 // Convert an unmasked scatter with an uniform address into
1668 // extract-last-lane + scalar store.
1669 // TODO: Add a profitability check comparing the cost of a scatter vs.
1670 // extract + scalar store.
1671 auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
1672 if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
1673 !WidenStoreR->isConsecutive()) {
1674 assert(!WidenStoreR->isReverse() &&
1675 "Not consecutive memory recipes shouldn't be reversed");
1676 VPValue *Mask = WidenStoreR->getMask();
1677
1678 // Only convert the scatter to a scalar store if it is unmasked.
1679 // TODO: Support converting scatter masked by the header mask to scalar
1680 // store.
1681 if (Mask)
1682 continue;
1683
1685 {WidenStoreR->getOperand(1)});
1686 Extract->insertBefore(WidenStoreR);
1687
1688 // TODO: Sink the scalar store recipe to middle block if possible.
1689 auto *ScalarStore = new VPReplicateRecipe(
1690 &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
1691 true /*IsSingleScalar*/, nullptr /*Mask*/, {},
1692 *WidenStoreR /*Metadata*/);
1693 ScalarStore->insertBefore(WidenStoreR);
1694 WidenStoreR->eraseFromParent();
1695 continue;
1696 }
1697
1698 auto *RepOrWidenR = dyn_cast<VPRecipeWithIRFlags>(&R);
1699 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1700 vputils::isSingleScalar(RepR->getOperand(1))) {
1701 auto *Clone = new VPReplicateRecipe(
1702 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1703 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1704 *RepR /*Metadata*/, RepR->getDebugLoc());
1705 Clone->insertBefore(RepOrWidenR);
1706 VPBuilder Builder(Clone);
1707 VPValue *ExtractOp = Clone->getOperand(0);
1708 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1709 ExtractOp =
1710 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1711 ExtractOp =
1712 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1713 Clone->setOperand(0, ExtractOp);
1714 RepR->eraseFromParent();
1715 continue;
1716 }
1717
1718 // Skip recipes that aren't single scalars.
1719 if (!RepOrWidenR || !vputils::isSingleScalar(RepOrWidenR))
1720 continue;
1721
1722 // Predicate to check if a user of Op introduces extra broadcasts.
1723 auto IntroducesBCastOf = [](const VPValue *Op) {
1724 return [Op](const VPUser *U) {
1725 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1729 VPI->getOpcode()))
1730 return false;
1731 }
1732 return !U->usesScalars(Op);
1733 };
1734 };
1735
1736 if (any_of(RepOrWidenR->users(), IntroducesBCastOf(RepOrWidenR)) &&
1737 none_of(RepOrWidenR->operands(), [&](VPValue *Op) {
1738 if (any_of(
1739 make_filter_range(Op->users(), not_equal_to(RepOrWidenR)),
1740 IntroducesBCastOf(Op)))
1741 return false;
1742 // Non-constant live-ins require broadcasts, while constants do not
1743 // need explicit broadcasts.
1744 auto *IRV = dyn_cast<VPIRValue>(Op);
1745 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1746 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1747 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1748 }))
1749 continue;
1750
1751 auto *Clone = new VPReplicateRecipe(
1752 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1753 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1754 Clone->insertBefore(RepOrWidenR);
1755 RepOrWidenR->replaceAllUsesWith(Clone);
1756 if (isDeadRecipe(*RepOrWidenR))
1757 RepOrWidenR->eraseFromParent();
1758 }
1759 }
1760}
1761
1762/// Try to see if all of \p Blend's masks share a common value logically and'ed
1763/// and remove it from the masks.
1765 if (Blend->isNormalized())
1766 return;
1767 VPValue *CommonEdgeMask;
1768 if (!match(Blend->getMask(0),
1769 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1770 return;
1771 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1772 if (!match(Blend->getMask(I),
1773 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1774 return;
1775 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1776 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1777}
1778
1779/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1780/// to make sure the masks are simplified.
1781static void simplifyBlends(VPlan &Plan) {
1784 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1785 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1786 if (!Blend)
1787 continue;
1788
1789 removeCommonBlendMask(Blend);
1790
1791 // Try to remove redundant blend recipes.
1792 SmallPtrSet<VPValue *, 4> UniqueValues;
1793 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1794 UniqueValues.insert(Blend->getIncomingValue(0));
1795 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1796 if (!match(Blend->getMask(I), m_False()))
1797 UniqueValues.insert(Blend->getIncomingValue(I));
1798
1799 if (UniqueValues.size() == 1) {
1800 Blend->replaceAllUsesWith(*UniqueValues.begin());
1801 Blend->eraseFromParent();
1802 continue;
1803 }
1804
1805 if (Blend->isNormalized())
1806 continue;
1807
1808 // Normalize the blend so its first incoming value is used as the initial
1809 // value with the others blended into it.
1810
1811 unsigned StartIndex = 0;
1812 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1813 // If a value's mask is used only by the blend then is can be deadcoded.
1814 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1815 // that's used by multiple blends where it can be removed from them all.
1816 VPValue *Mask = Blend->getMask(I);
1817 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1818 StartIndex = I;
1819 break;
1820 }
1821 }
1822
1823 SmallVector<VPValue *, 4> OperandsWithMask;
1824 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1825
1826 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1827 if (I == StartIndex)
1828 continue;
1829 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1830 OperandsWithMask.push_back(Blend->getMask(I));
1831 }
1832
1833 auto *NewBlend =
1834 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1835 OperandsWithMask, *Blend, Blend->getDebugLoc());
1836 NewBlend->insertBefore(&R);
1837
1838 VPValue *DeadMask = Blend->getMask(StartIndex);
1839 Blend->replaceAllUsesWith(NewBlend);
1840 Blend->eraseFromParent();
1842
1843 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1844 VPValue *NewMask;
1845 if (NewBlend->getNumOperands() == 3 &&
1846 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1847 VPValue *Inc0 = NewBlend->getOperand(0);
1848 VPValue *Inc1 = NewBlend->getOperand(1);
1849 VPValue *OldMask = NewBlend->getOperand(2);
1850 NewBlend->setOperand(0, Inc1);
1851 NewBlend->setOperand(1, Inc0);
1852 NewBlend->setOperand(2, NewMask);
1853 if (OldMask->getNumUsers() == 0)
1854 cast<VPInstruction>(OldMask)->eraseFromParent();
1855 }
1856 }
1857 }
1858}
1859
1860/// Optimize the width of vector induction variables in \p Plan based on a known
1861/// constant Trip Count, \p BestVF and \p BestUF.
1863 ElementCount BestVF,
1864 unsigned BestUF) {
1865 // Only proceed if we have not completely removed the vector region.
1866 if (!Plan.getVectorLoopRegion())
1867 return false;
1868
1869 const APInt *TC;
1870 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
1871 return false;
1872
1873 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1874 // and UF. Returns at least 8.
1875 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1876 APInt AlignedTC =
1879 APInt MaxVal = AlignedTC - 1;
1880 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1881 };
1882 unsigned NewBitWidth =
1883 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1884
1885 LLVMContext &Ctx = Plan.getContext();
1886 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1887
1888 bool MadeChange = false;
1889
1890 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1891 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1892 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1893
1894 // Currently only handle canonical IVs as it is trivial to replace the start
1895 // and stop values, and we currently only perform the optimization when the
1896 // IV has a single use.
1897 if (!WideIV || !WideIV->isCanonical() ||
1898 WideIV->hasMoreThanOneUniqueUser() ||
1899 NewIVTy == WideIV->getScalarType())
1900 continue;
1901
1902 // Currently only handle cases where the single user is a header-mask
1903 // comparison with the backedge-taken-count.
1904 VPUser *SingleUser = WideIV->getSingleUser();
1905 if (!SingleUser ||
1906 !match(SingleUser, m_ICmp(m_Specific(WideIV),
1909 continue;
1910
1911 // Update IV operands and comparison bound to use new narrower type.
1912 auto *NewStart = Plan.getConstantInt(NewIVTy, 0);
1913 WideIV->setStartValue(NewStart);
1914 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
1915 WideIV->setStepValue(NewStep);
1916
1917 auto *NewBTC = new VPWidenCastRecipe(
1918 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
1919 nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
1920 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1921 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
1922 Cmp->setOperand(1, NewBTC);
1923
1924 MadeChange = true;
1925 }
1926
1927 return MadeChange;
1928}
1929
1930/// Return true if \p Cond is known to be true for given \p BestVF and \p
1931/// BestUF.
1933 ElementCount BestVF, unsigned BestUF,
1936 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1937 &PSE](VPValue *C) {
1938 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
1939 });
1940
1941 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
1943 m_Specific(CanIV->getBackedgeValue()),
1944 m_Specific(&Plan.getVectorTripCount()))))
1945 return false;
1946
1947 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1948 // count is not conveniently available as SCEV so far, so we compare directly
1949 // against the original trip count. This is stricter than necessary, as we
1950 // will only return true if the trip count == vector trip count.
1951 const SCEV *VectorTripCount =
1953 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1954 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
1955 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1956 "Trip count SCEV must be computable");
1957 ScalarEvolution &SE = *PSE.getSE();
1958 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1959 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1960 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1961}
1962
1963/// Try to replace multiple active lane masks used for control flow with
1964/// a single, wide active lane mask instruction followed by multiple
1965/// extract subvector intrinsics. This applies to the active lane mask
1966/// instructions both in the loop and in the preheader.
1967/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1968/// new extracts from the first active lane mask, which has it's last
1969/// operand (multiplier) set to UF.
1971 unsigned UF) {
1972 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1973 return false;
1974
1975 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1976 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1977 auto *Term = &ExitingVPBB->back();
1978
1979 using namespace llvm::VPlanPatternMatch;
1981 m_VPValue(), m_VPValue(), m_VPValue())))))
1982 return false;
1983
1984 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1985 LLVMContext &Ctx = Plan.getContext();
1986
1987 auto ExtractFromALM = [&](VPInstruction *ALM,
1988 SmallVectorImpl<VPValue *> &Extracts) {
1989 DebugLoc DL = ALM->getDebugLoc();
1990 for (unsigned Part = 0; Part < UF; ++Part) {
1992 Ops.append({ALM, Plan.getConstantInt(64, VF.getKnownMinValue() * Part)});
1993 auto *Ext =
1994 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1995 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
1996 Extracts[Part] = Ext;
1997 Ext->insertAfter(ALM);
1998 }
1999 };
2000
2001 // Create a list of each active lane mask phi, ordered by unroll part.
2003 for (VPRecipeBase &R : Header->phis()) {
2005 if (!Phi)
2006 continue;
2007 VPValue *Index = nullptr;
2008 match(Phi->getBackedgeValue(),
2010 assert(Index && "Expected index from ActiveLaneMask instruction");
2011
2012 uint64_t Part;
2013 if (match(Index,
2015 m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
2016 Phis[Part] = Phi;
2017 else {
2018 // Anything other than a CanonicalIVIncrementForPart is part 0
2019 assert(!match(
2020 Index,
2022 Phis[0] = Phi;
2023 }
2024 }
2025
2026 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
2027 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2028
2029 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
2030 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
2031
2032 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2033 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2034 "Expected incoming values of Phi to be ActiveLaneMasks");
2035
2036 // When using wide lane masks, the return type of the get.active.lane.mask
2037 // intrinsic is VF x UF (last operand).
2038 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
2039 EntryALM->setOperand(2, ALMMultiplier);
2040 LoopALM->setOperand(2, ALMMultiplier);
2041
2042 // Create UF x extract vectors and insert into preheader.
2043 SmallVector<VPValue *> EntryExtracts(UF);
2044 ExtractFromALM(EntryALM, EntryExtracts);
2045
2046 // Create UF x extract vectors and insert before the loop compare & branch,
2047 // updating the compare to use the first extract.
2048 SmallVector<VPValue *> LoopExtracts(UF);
2049 ExtractFromALM(LoopALM, LoopExtracts);
2050 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2051 Not->setOperand(0, LoopExtracts[0]);
2052
2053 // Update the incoming values of active lane mask phis.
2054 for (unsigned Part = 0; Part < UF; ++Part) {
2055 Phis[Part]->setStartValue(EntryExtracts[Part]);
2056 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2057 }
2058
2059 return true;
2060}
2061
2062/// Try to simplify the branch condition of \p Plan. This may restrict the
2063/// resulting plan to \p BestVF and \p BestUF.
2065 unsigned BestUF,
2067 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2068 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2069 auto *Term = &ExitingVPBB->back();
2070 VPValue *Cond;
2071 if (match(Term,
2073 m_VPValue())) ||
2075 m_VPValue(), m_VPValue(), m_VPValue()))))) {
2076 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2077 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2078 const SCEV *VectorTripCount =
2080 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2081 VectorTripCount =
2083 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2084 "Trip count SCEV must be computable");
2085 ScalarEvolution &SE = *PSE.getSE();
2086 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2087 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2088 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2089 return false;
2090 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2092 // For BranchOnCond, check if we can prove the condition to be true using VF
2093 // and UF.
2094 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2095 return false;
2096 } else {
2097 return false;
2098 }
2099
2100 // The vector loop region only executes once. If possible, completely remove
2101 // the region, otherwise replace the terminator controlling the latch with
2102 // (BranchOnCond true).
2103 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
2104 // support for other non-canonical widen induction recipes (e.g.,
2105 // VPWidenPointerInductionRecipe).
2106 // TODO: fold branch-on-constant after dissolving region.
2107 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2108 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
2109 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
2110 return R->isCanonical();
2111 return isa<VPCanonicalIVPHIRecipe, VPCurrentIterationPHIRecipe,
2112 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
2113 })) {
2114 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
2115 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
2116 VPBuilder Builder(Plan.getVectorPreheader());
2117 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
2118 R->getScalarType());
2119 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
2120 HeaderR.eraseFromParent();
2121 continue;
2122 }
2123 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
2124 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
2125 HeaderR.eraseFromParent();
2126 }
2127
2128 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
2129 SmallVector<VPBlockBase *> Exits = to_vector(VectorRegion->getSuccessors());
2130 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
2131 for (VPBlockBase *Exit : Exits)
2132 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
2133
2134 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
2135 B->setParent(nullptr);
2136
2137 VPBlockUtils::connectBlocks(Preheader, Header);
2138
2139 for (VPBlockBase *Exit : Exits)
2140 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
2141
2142 // Replace terminating branch-on-two-conds with branch-on-cond to early
2143 // exit.
2144 if (Exits.size() != 1) {
2145 assert(match(Term, m_BranchOnTwoConds()) && Exits.size() == 2 &&
2146 "BranchOnTwoConds needs 2 remaining exits");
2148 Term->getOperand(0));
2149 }
2151 } else {
2152 // The vector region contains header phis for which we cannot remove the
2153 // loop region yet.
2154
2155 // For BranchOnTwoConds, set the latch exit condition to true directly.
2156 if (match(Term, m_BranchOnTwoConds())) {
2157 Term->setOperand(1, Plan.getTrue());
2158 return true;
2159 }
2160
2161 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
2162 {}, {}, Term->getDebugLoc());
2163 ExitingVPBB->appendRecipe(BOC);
2164 }
2165
2166 Term->eraseFromParent();
2167
2168 return true;
2169}
2170
2171/// From the definition of llvm.experimental.get.vector.length,
2172/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
2176 vp_depth_first_deep(Plan.getEntry()))) {
2177 for (VPRecipeBase &R : *VPBB) {
2178 VPValue *AVL;
2179 if (!match(&R, m_EVL(m_VPValue(AVL))))
2180 continue;
2181
2182 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2183 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2184 continue;
2185 ScalarEvolution &SE = *PSE.getSE();
2186 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2187 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2188 continue;
2189
2191 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2192 R.getDebugLoc());
2193 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2194 return true;
2195 }
2196 }
2197 return false;
2198}
2199
2201 unsigned BestUF,
2203 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2204 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2205
2206 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2207 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2208 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2209 MadeChange |= simplifyKnownEVL(Plan, BestVF, PSE);
2210
2211 if (MadeChange) {
2212 Plan.setVF(BestVF);
2213 assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2214 }
2215}
2216
2217/// Sink users of \p FOR after the recipe defining the previous value \p
2218/// Previous of the recurrence. \returns true if all users of \p FOR could be
2219/// re-arranged as needed or false if it is not possible.
2220static bool
2222 VPRecipeBase *Previous,
2223 VPDominatorTree &VPDT) {
2224 // If Previous is a live-in (no defining recipe), it naturally dominates all
2225 // recipes in the loop, so no sinking is needed.
2226 if (!Previous)
2227 return true;
2228
2229 // Collect recipes that need sinking.
2232 Seen.insert(Previous);
2233 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
2234 // The previous value must not depend on the users of the recurrence phi. In
2235 // that case, FOR is not a fixed order recurrence.
2236 if (SinkCandidate == Previous)
2237 return false;
2238
2239 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
2240 !Seen.insert(SinkCandidate).second ||
2241 VPDT.properlyDominates(Previous, SinkCandidate))
2242 return true;
2243
2244 if (cannotHoistOrSinkRecipe(*SinkCandidate))
2245 return false;
2246
2247 WorkList.push_back(SinkCandidate);
2248 return true;
2249 };
2250
2251 // Recursively sink users of FOR after Previous.
2252 WorkList.push_back(FOR);
2253 for (unsigned I = 0; I != WorkList.size(); ++I) {
2254 VPRecipeBase *Current = WorkList[I];
2255 assert(Current->getNumDefinedValues() == 1 &&
2256 "only recipes with a single defined value expected");
2257
2258 for (VPUser *User : Current->getVPSingleValue()->users()) {
2259 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
2260 return false;
2261 }
2262 }
2263
2264 // Keep recipes to sink ordered by dominance so earlier instructions are
2265 // processed first.
2266 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2267 return VPDT.properlyDominates(A, B);
2268 });
2269
2270 for (VPRecipeBase *SinkCandidate : WorkList) {
2271 if (SinkCandidate == FOR)
2272 continue;
2273
2274 SinkCandidate->moveAfter(Previous);
2275 Previous = SinkCandidate;
2276 }
2277 return true;
2278}
2279
2280/// Try to hoist \p Previous and its operands before all users of \p FOR.
2282 VPRecipeBase *Previous,
2283 VPDominatorTree &VPDT) {
2284 if (cannotHoistOrSinkRecipe(*Previous))
2285 return false;
2286
2287 // Collect recipes that need hoisting.
2288 SmallVector<VPRecipeBase *> HoistCandidates;
2290 VPRecipeBase *HoistPoint = nullptr;
2291 // Find the closest hoist point by looking at all users of FOR and selecting
2292 // the recipe dominating all other users.
2293 for (VPUser *U : FOR->users()) {
2294 auto *R = cast<VPRecipeBase>(U);
2295 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
2296 HoistPoint = R;
2297 }
2298 assert(all_of(FOR->users(),
2299 [&VPDT, HoistPoint](VPUser *U) {
2300 auto *R = cast<VPRecipeBase>(U);
2301 return HoistPoint == R ||
2302 VPDT.properlyDominates(HoistPoint, R);
2303 }) &&
2304 "HoistPoint must dominate all users of FOR");
2305
2306 auto NeedsHoisting = [HoistPoint, &VPDT,
2307 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2308 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2309 if (!HoistCandidate)
2310 return nullptr;
2311 VPRegionBlock *EnclosingLoopRegion =
2312 HoistCandidate->getParent()->getEnclosingLoopRegion();
2313 assert((!HoistCandidate->getRegion() ||
2314 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2315 "CFG in VPlan should still be flat, without replicate regions");
2316 // Hoist candidate was already visited, no need to hoist.
2317 if (!Visited.insert(HoistCandidate).second)
2318 return nullptr;
2319
2320 // Candidate is outside loop region or a header phi, dominates FOR users w/o
2321 // hoisting.
2322 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
2323 return nullptr;
2324
2325 // If we reached a recipe that dominates HoistPoint, we don't need to
2326 // hoist the recipe.
2327 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
2328 return nullptr;
2329 return HoistCandidate;
2330 };
2331
2332 if (!NeedsHoisting(Previous->getVPSingleValue()))
2333 return true;
2334
2335 // Recursively try to hoist Previous and its operands before all users of FOR.
2336 HoistCandidates.push_back(Previous);
2337
2338 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2339 VPRecipeBase *Current = HoistCandidates[I];
2340 assert(Current->getNumDefinedValues() == 1 &&
2341 "only recipes with a single defined value expected");
2342 if (cannotHoistOrSinkRecipe(*Current))
2343 return false;
2344
2345 for (VPValue *Op : Current->operands()) {
2346 // If we reach FOR, it means the original Previous depends on some other
2347 // recurrence that in turn depends on FOR. If that is the case, we would
2348 // also need to hoist recipes involving the other FOR, which may break
2349 // dependencies.
2350 if (Op == FOR)
2351 return false;
2352
2353 if (auto *R = NeedsHoisting(Op)) {
2354 // Bail out if the recipe defines multiple values.
2355 // TODO: Hoisting such recipes requires additional handling.
2356 if (R->getNumDefinedValues() != 1)
2357 return false;
2358 HoistCandidates.push_back(R);
2359 }
2360 }
2361 }
2362
2363 // Order recipes to hoist by dominance so earlier instructions are processed
2364 // first.
2365 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2366 return VPDT.properlyDominates(A, B);
2367 });
2368
2369 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2370 HoistCandidate->moveBefore(*HoistPoint->getParent(),
2371 HoistPoint->getIterator());
2372 }
2373
2374 return true;
2375}
2376
2378 VPBuilder &LoopBuilder) {
2379 VPDominatorTree VPDT(Plan);
2380
2382 for (VPRecipeBase &R :
2385 RecurrencePhis.push_back(FOR);
2386
2387 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2389 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2390 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2391 // to terminate.
2392 while (auto *PrevPhi =
2394 assert(PrevPhi->getParent() == FOR->getParent());
2395 assert(SeenPhis.insert(PrevPhi).second);
2396 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2397 }
2398
2399 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2400 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2401 return false;
2402
2403 // Introduce a recipe to combine the incoming and previous values of a
2404 // fixed-order recurrence.
2405 VPBasicBlock *InsertBlock =
2406 Previous ? Previous->getParent() : FOR->getParent();
2407 if (!Previous || isa<VPHeaderPHIRecipe>(Previous))
2408 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2409 else
2410 LoopBuilder.setInsertPoint(InsertBlock,
2411 std::next(Previous->getIterator()));
2412
2413 auto *RecurSplice =
2415 {FOR, FOR->getBackedgeValue()});
2416
2417 FOR->replaceAllUsesWith(RecurSplice);
2418 // Set the first operand of RecurSplice to FOR again, after replacing
2419 // all users.
2420 RecurSplice->setOperand(0, FOR);
2421
2422 // Check for users extracting at the penultimate active lane of the FOR.
2423 // If only a single lane is active in the current iteration, we need to
2424 // select the last element from the previous iteration (from the FOR phi
2425 // directly).
2426 for (VPUser *U : RecurSplice->users()) {
2428 m_Specific(RecurSplice))))
2429 continue;
2430
2432 VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
2433 VPValue *Zero = Plan.getConstantInt(64, 0);
2434 VPValue *One = Plan.getConstantInt(64, 1);
2435 VPValue *PenultimateIndex = B.createSub(LastActiveLane, One);
2436 VPValue *PenultimateLastIter =
2437 B.createNaryOp(VPInstruction::ExtractLane,
2438 {PenultimateIndex, FOR->getBackedgeValue()});
2439 VPValue *LastPrevIter =
2440 B.createNaryOp(VPInstruction::ExtractLastLane, FOR);
2441
2442 VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero);
2443 VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter);
2444 cast<VPInstruction>(U)->replaceAllUsesWith(Sel);
2445 }
2446 }
2447 return true;
2448}
2449
2451 for (VPRecipeBase &R :
2453 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2454 if (!PhiR)
2455 continue;
2456 RecurKind RK = PhiR->getRecurrenceKind();
2457 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2459 continue;
2460
2461 for (VPUser *U : collectUsersRecursively(PhiR))
2462 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2463 RecWithFlags->dropPoisonGeneratingFlags();
2464 }
2465 }
2466}
2467
2468namespace {
2469struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2470 static bool isSentinel(const VPSingleDefRecipe *Def) {
2471 return Def == getEmptyKey() || Def == getTombstoneKey();
2472 }
2473
2474 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2475 /// return that source element type.
2476 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2477 // All VPInstructions that lower to GEPs must have the i8 source element
2478 // type (as they are PtrAdds), so we omit it.
2480 .Case([](const VPReplicateRecipe *I) -> Type * {
2481 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2482 return GEP->getSourceElementType();
2483 return nullptr;
2484 })
2485 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2486 [](auto *I) { return I->getSourceElementType(); })
2487 .Default([](auto *) { return nullptr; });
2488 }
2489
2490 /// Returns true if recipe \p Def can be safely handed for CSE.
2491 static bool canHandle(const VPSingleDefRecipe *Def) {
2492 // We can extend the list of handled recipes in the future,
2493 // provided we account for the data embedded in them while checking for
2494 // equality or hashing.
2495 auto C = getOpcodeOrIntrinsicID(Def);
2496
2497 // The issue with (Insert|Extract)Value is that the index of the
2498 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2499 // VPlan.
2500 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2501 C->second == Instruction::ExtractValue)))
2502 return false;
2503
2504 // During CSE, we can only handle recipes that don't read from memory: if
2505 // they read from memory, there could be an intervening write to memory
2506 // before the next instance is CSE'd, leading to an incorrect result.
2507 return !Def->mayReadFromMemory();
2508 }
2509
2510 /// Hash the underlying data of \p Def.
2511 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2512 const VPlan *Plan = Def->getParent()->getPlan();
2513 VPTypeAnalysis TypeInfo(*Plan);
2514 hash_code Result = hash_combine(
2515 Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2516 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2518 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2519 if (RFlags->hasPredicate())
2520 return hash_combine(Result, RFlags->getPredicate());
2521 return Result;
2522 }
2523
2524 /// Check equality of underlying data of \p L and \p R.
2525 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2526 if (isSentinel(L) || isSentinel(R))
2527 return L == R;
2528 if (L->getVPRecipeID() != R->getVPRecipeID() ||
2530 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2532 !equal(L->operands(), R->operands()))
2533 return false;
2535 "must have valid opcode info for both recipes");
2536 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2537 if (LFlags->hasPredicate() &&
2538 LFlags->getPredicate() !=
2539 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2540 return false;
2541 // Recipes in replicate regions implicitly depend on predicate. If either
2542 // recipe is in a replicate region, only consider them equal if both have
2543 // the same parent.
2544 const VPRegionBlock *RegionL = L->getRegion();
2545 const VPRegionBlock *RegionR = R->getRegion();
2546 if (((RegionL && RegionL->isReplicator()) ||
2547 (RegionR && RegionR->isReplicator())) &&
2548 L->getParent() != R->getParent())
2549 return false;
2550 const VPlan *Plan = L->getParent()->getPlan();
2551 VPTypeAnalysis TypeInfo(*Plan);
2552 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2553 }
2554};
2555} // end anonymous namespace
2556
2557/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2558/// Plan.
2560 VPDominatorTree VPDT(Plan);
2562
2564 vp_depth_first_deep(Plan.getEntry()))) {
2565 for (VPRecipeBase &R : *VPBB) {
2566 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2567 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2568 continue;
2569 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2570 // V must dominate Def for a valid replacement.
2571 if (!VPDT.dominates(V->getParent(), VPBB))
2572 continue;
2573 // Only keep flags present on both V and Def.
2574 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2575 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2576 Def->replaceAllUsesWith(V);
2577 continue;
2578 }
2579 CSEMap[Def] = Def;
2580 }
2581 }
2582}
2583
2584/// Move loop-invariant recipes out of the vector loop region in \p Plan.
2585static void licm(VPlan &Plan) {
2586 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2587
2588 // Hoist any loop invariant recipes from the vector loop region to the
2589 // preheader. Preform a shallow traversal of the vector loop region, to
2590 // exclude recipes in replicate regions. Since the top-level blocks in the
2591 // vector loop region are guaranteed to execute if the vector pre-header is,
2592 // we don't need to check speculation safety.
2593 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2594 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2595 "Expected vector prehader's successor to be the vector loop region");
2597 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2598 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2600 continue;
2601 if (any_of(R.operands(), [](VPValue *Op) {
2602 return !Op->isDefinedOutsideLoopRegions();
2603 }))
2604 continue;
2605 R.moveBefore(*Preheader, Preheader->end());
2606 }
2607 }
2608
2609#ifndef NDEBUG
2610 VPDominatorTree VPDT(Plan);
2611#endif
2612 // Sink recipes with no users inside the vector loop region if all users are
2613 // in the same exit block of the region.
2614 // TODO: Extend to sink recipes from inner loops.
2616 vp_post_order_shallow(LoopRegion->getEntry()))) {
2617 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
2619 continue;
2620
2621 // TODO: Support sinking VPReplicateRecipe after ensuring replicateByVF
2622 // handles sunk recipes correctly.
2623 if (isa<VPReplicateRecipe>(&R))
2624 continue;
2625
2626 // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2627 // support recipes with multiple defined values (e.g., interleaved loads).
2628 auto *Def = cast<VPSingleDefRecipe>(&R);
2629 // Skip recipes without users as we cannot determine a sink block.
2630 // TODO: Clone sinkable recipes without users to all exit blocks to reduce
2631 // their execution frequency.
2632 if (Def->getNumUsers() == 0)
2633 continue;
2634
2635 VPBasicBlock *SinkBB = nullptr;
2636 // Cannot sink the recipe if any user
2637 // * is defined in any loop region, or
2638 // * is a phi, or
2639 // * multiple users in different blocks.
2640 if (any_of(Def->users(), [&SinkBB](VPUser *U) {
2641 auto *UserR = cast<VPRecipeBase>(U);
2642 VPBasicBlock *Parent = UserR->getParent();
2643 // TODO: If the user is a PHI node, we should check the block of
2644 // incoming value. Support PHI node users if needed.
2645 if (UserR->isPhi() || Parent->getEnclosingLoopRegion())
2646 return true;
2647 // TODO: Support sinking when users are in multiple blocks.
2648 if (SinkBB && SinkBB != Parent)
2649 return true;
2650 SinkBB = Parent;
2651 return false;
2652 }))
2653 continue;
2654
2655 // Only sink to dedicated exit blocks of the loop region.
2656 if (SinkBB->getSinglePredecessor() != LoopRegion)
2657 continue;
2658
2659 // TODO: This will need to be a check instead of a assert after
2660 // conditional branches in vectorized loops are supported.
2661 assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2662 "Defining block must dominate sink block");
2663 // TODO: Clone the recipe if users are on multiple exit paths, instead of
2664 // just moving.
2665 Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
2666 }
2667 }
2668}
2669
2671 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2672 if (Plan.hasScalarVFOnly())
2673 return;
2674 // Keep track of created truncates, so they can be re-used. Note that we
2675 // cannot use RAUW after creating a new truncate, as this would could make
2676 // other uses have different types for their operands, making them invalidly
2677 // typed.
2679 VPTypeAnalysis TypeInfo(Plan);
2680 VPBasicBlock *PH = Plan.getVectorPreheader();
2683 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2686 continue;
2687
2688 VPValue *ResultVPV = R.getVPSingleValue();
2689 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2690 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2691 if (!NewResSizeInBits)
2692 continue;
2693
2694 // If the value wasn't vectorized, we must maintain the original scalar
2695 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2696 // skip casts which do not need to be handled explicitly here, as
2697 // redundant casts will be removed during recipe simplification.
2699 continue;
2700
2701 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2702 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2703 assert(OldResTy->isIntegerTy() && "only integer types supported");
2704 (void)OldResSizeInBits;
2705
2706 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2707
2708 // Any wrapping introduced by shrinking this operation shouldn't be
2709 // considered undefined behavior. So, we can't unconditionally copy
2710 // arithmetic wrapping flags to VPW.
2711 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2712 VPW->dropPoisonGeneratingFlags();
2713
2714 if (OldResSizeInBits != NewResSizeInBits &&
2715 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2716 // Extend result to original width.
2717 auto *Ext = new VPWidenCastRecipe(
2718 Instruction::ZExt, ResultVPV, OldResTy, nullptr,
2719 VPIRFlags::getDefaultFlags(Instruction::ZExt));
2720 Ext->insertAfter(&R);
2721 ResultVPV->replaceAllUsesWith(Ext);
2722 Ext->setOperand(0, ResultVPV);
2723 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2724 } else {
2725 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2726 "Only ICmps should not need extending the result.");
2727 }
2728
2729 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2731 continue;
2732
2733 // Shrink operands by introducing truncates as needed.
2734 unsigned StartIdx =
2735 match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2736 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2737 auto *Op = R.getOperand(Idx);
2738 unsigned OpSizeInBits =
2740 if (OpSizeInBits == NewResSizeInBits)
2741 continue;
2742 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2743 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2744 if (!IterIsEmpty) {
2745 R.setOperand(Idx, ProcessedIter->second);
2746 continue;
2747 }
2748
2749 VPBuilder Builder;
2750 if (isa<VPIRValue>(Op))
2751 Builder.setInsertPoint(PH);
2752 else
2753 Builder.setInsertPoint(&R);
2754 VPWidenCastRecipe *NewOp =
2755 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2756 ProcessedIter->second = NewOp;
2757 R.setOperand(Idx, NewOp);
2758 }
2759
2760 }
2761 }
2762}
2763
2767 VPValue *Cond;
2768 // Skip blocks that are not terminated by BranchOnCond.
2769 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2770 continue;
2771
2772 assert(VPBB->getNumSuccessors() == 2 &&
2773 "Two successors expected for BranchOnCond");
2774 unsigned RemovedIdx;
2775 if (match(Cond, m_True()))
2776 RemovedIdx = 1;
2777 else if (match(Cond, m_False()))
2778 RemovedIdx = 0;
2779 else
2780 continue;
2781
2782 VPBasicBlock *RemovedSucc =
2783 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2784 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2785 "There must be a single edge between VPBB and its successor");
2786 // Values coming from VPBB into phi recipes of RemoveSucc are removed from
2787 // these recipes.
2788 for (VPRecipeBase &R : RemovedSucc->phis())
2789 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2790
2791 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2792 // automatically on VPlan destruction if it becomes unreachable.
2793 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2794 VPBB->back().eraseFromParent();
2795 }
2796}
2797
2819
2820// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2821// the loop terminator with a branch-on-cond recipe with the negated
2822// active-lane-mask as operand. Note that this turns the loop into an
2823// uncountable one. Only the existing terminator is replaced, all other existing
2824// recipes/users remain unchanged, except for poison-generating flags being
2825// dropped from the canonical IV increment. Return the created
2826// VPActiveLaneMaskPHIRecipe.
2827//
2828// The function adds the following recipes:
2829//
2830// vector.ph:
2831// %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
2832// %EntryALM = active-lane-mask %EntryInc, TC
2833//
2834// vector.body:
2835// ...
2836// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2837// ...
2838// %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
2839// %ALM = active-lane-mask %InLoopInc, TC
2840// %Negated = Not %ALM
2841// branch-on-cond %Negated
2842//
2845 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2846 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2847 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2848 VPValue *StartV = CanonicalIVPHI->getStartValue();
2849
2850 auto *CanonicalIVIncrement =
2851 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2852 // TODO: Check if dropping the flags is needed.
2853 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2854 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2855 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2856 // we have to take unrolling into account. Each part needs to start at
2857 // Part * VF
2858 auto *VecPreheader = Plan.getVectorPreheader();
2859 VPBuilder Builder(VecPreheader);
2860
2861 // Create the ActiveLaneMask instruction using the correct start values.
2862 VPValue *TC = Plan.getTripCount();
2863 VPValue *VF = &Plan.getVF();
2864
2865 auto *EntryIncrement = Builder.createOverflowingOp(
2866 VPInstruction::CanonicalIVIncrementForPart, {StartV, VF}, {false, false},
2867 DL, "index.part.next");
2868
2869 // Create the active lane mask instruction in the VPlan preheader.
2870 VPValue *ALMMultiplier =
2871 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2872 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2873 {EntryIncrement, TC, ALMMultiplier}, DL,
2874 "active.lane.mask.entry");
2875
2876 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2877 // preheader ActiveLaneMask instruction.
2878 auto *LaneMaskPhi =
2880 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2881
2882 // Create the active lane mask for the next iteration of the loop before the
2883 // original terminator.
2884 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2885 Builder.setInsertPoint(OriginalTerminator);
2886 auto *InLoopIncrement = Builder.createOverflowingOp(
2888 {CanonicalIVIncrement, &Plan.getVF()}, {false, false}, DL);
2889 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2890 {InLoopIncrement, TC, ALMMultiplier}, DL,
2891 "active.lane.mask.next");
2892 LaneMaskPhi->addOperand(ALM);
2893
2894 // Replace the original terminator with BranchOnCond. We have to invert the
2895 // mask here because a true condition means jumping to the exit block.
2896 auto *NotMask = Builder.createNot(ALM, DL);
2897 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2898 OriginalTerminator->eraseFromParent();
2899 return LaneMaskPhi;
2900}
2901
2903 bool UseActiveLaneMaskForControlFlow) {
2904 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2905 auto *FoundWidenCanonicalIVUser = find_if(
2907 assert(FoundWidenCanonicalIVUser &&
2908 "Must have widened canonical IV when tail folding!");
2909 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
2910 auto *WideCanonicalIV =
2911 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2912 VPSingleDefRecipe *LaneMask;
2913 if (UseActiveLaneMaskForControlFlow) {
2914 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
2915 } else {
2916 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2917 VPValue *ALMMultiplier =
2918 Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
2919 LaneMask =
2920 B.createNaryOp(VPInstruction::ActiveLaneMask,
2921 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2922 nullptr, "active.lane.mask");
2923 }
2924
2925 // Walk users of WideCanonicalIV and replace the header mask of the form
2926 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2927 // removing the old one to ensure there is always only a single header mask.
2928 HeaderMask->replaceAllUsesWith(LaneMask);
2929 HeaderMask->eraseFromParent();
2930}
2931
2932template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2933 Op0_t In;
2935
2936 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2937
2938 template <typename OpTy> bool match(OpTy *V) const {
2939 if (m_Specific(In).match(V)) {
2940 Out = nullptr;
2941 return true;
2942 }
2943 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
2944 }
2945};
2946
2947/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
2948/// Returns the remaining part \p Out if so, or nullptr otherwise.
2949template <typename Op0_t, typename Op1_t>
2950static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2951 Op1_t &Out) {
2952 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2953}
2954
2955/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2956/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2957/// recipe could be created.
2958/// \p HeaderMask Header Mask.
2959/// \p CurRecipe Recipe to be transform.
2960/// \p TypeInfo VPlan-based type analysis.
2961/// \p EVL The explicit vector length parameter of vector-predication
2962/// intrinsics.
2964 VPRecipeBase &CurRecipe,
2965 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
2966 VPlan *Plan = CurRecipe.getParent()->getPlan();
2967 DebugLoc DL = CurRecipe.getDebugLoc();
2968 VPValue *Addr, *Mask, *EndPtr;
2969
2970 /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
2971 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2972 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2973 EVLEndPtr->insertBefore(&CurRecipe);
2974 EVLEndPtr->setOperand(1, &EVL);
2975 return EVLEndPtr;
2976 };
2977
2978 if (match(&CurRecipe,
2979 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
2980 !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2981 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
2982 EVL, Mask);
2983
2984 VPValue *ReversedVal;
2985 if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
2986 match(ReversedVal,
2987 m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
2988 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2989 cast<VPWidenLoadRecipe>(ReversedVal)->isReverse()) {
2990 auto *LoadR = new VPWidenLoadEVLRecipe(
2991 *cast<VPWidenLoadRecipe>(ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask);
2992 LoadR->insertBefore(&CurRecipe);
2993 return new VPWidenIntrinsicRecipe(
2994 Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
2995 TypeInfo.inferScalarType(LoadR), {}, {}, DL);
2996 }
2997
2998 VPValue *StoredVal;
2999 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
3000 m_RemoveMask(HeaderMask, Mask))) &&
3001 !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
3002 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3003 StoredVal, EVL, Mask);
3004
3005 if (match(&CurRecipe,
3006 m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
3007 m_RemoveMask(HeaderMask, Mask))) &&
3008 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
3009 cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) {
3010 auto *NewReverse = new VPWidenIntrinsicRecipe(
3011 Intrinsic::experimental_vp_reverse,
3012 {ReversedVal, Plan->getTrue(), &EVL},
3013 TypeInfo.inferScalarType(ReversedVal), {}, {}, DL);
3014 NewReverse->insertBefore(&CurRecipe);
3015 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
3016 AdjustEndPtr(EndPtr), NewReverse, EVL,
3017 Mask);
3018 }
3019
3020 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
3021 if (Rdx->isConditional() &&
3022 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
3023 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
3024
3025 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
3026 if (Interleave->getMask() &&
3027 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
3028 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
3029
3030 VPValue *LHS, *RHS;
3031 if (match(&CurRecipe,
3032 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
3033 return new VPWidenIntrinsicRecipe(
3034 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
3035 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3036
3037 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
3038 m_VPValue(RHS))))
3039 return new VPWidenIntrinsicRecipe(
3040 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
3041 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3042
3043 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
3044 Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
3045 VPValue *ZExt =
3046 VPBuilder(&CurRecipe).createScalarCast(Instruction::ZExt, &EVL, Ty, DL);
3047 return new VPInstruction(
3048 Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, 1)},
3049 VPIRFlags::getDefaultFlags(Instruction::Sub), {}, DL);
3050 }
3051
3052 return nullptr;
3053}
3054
3055/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3056/// The transforms here need to preserve the original semantics.
3058 // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3059 VPValue *HeaderMask = nullptr, *EVL = nullptr;
3062 m_VPValue(EVL))) &&
3063 match(EVL, m_EVL(m_VPValue()))) {
3064 HeaderMask = R.getVPSingleValue();
3065 break;
3066 }
3067 }
3068 if (!HeaderMask)
3069 return;
3070
3071 VPTypeAnalysis TypeInfo(Plan);
3072 SmallVector<VPRecipeBase *> OldRecipes;
3073 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3075 if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, TypeInfo, *EVL)) {
3076 NewR->insertBefore(R);
3077 for (auto [Old, New] :
3078 zip_equal(R->definedValues(), NewR->definedValues()))
3079 Old->replaceAllUsesWith(New);
3080 OldRecipes.push_back(R);
3081 }
3082 }
3083 // Erase old recipes at the end so we don't invalidate TypeInfo.
3084 for (VPRecipeBase *R : reverse(OldRecipes)) {
3085 SmallVector<VPValue *> PossiblyDead(R->operands());
3086 R->eraseFromParent();
3087 for (VPValue *Op : PossiblyDead)
3089 }
3090}
3091
3092/// After replacing the canonical IV with a EVL-based IV, fixup recipes that use
3093/// VF to use the EVL instead to avoid incorrect updates on the penultimate
3094/// iteration.
3095static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3096 VPTypeAnalysis TypeInfo(Plan);
3097 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3098 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3099
3100 assert(all_of(Plan.getVF().users(),
3103 "User of VF that we can't transform to EVL.");
3104 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3106 });
3107
3108 assert(all_of(Plan.getVFxUF().users(),
3109 [&LoopRegion, &Plan](VPUser *U) {
3110 return match(U,
3111 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3112 m_Specific(&Plan.getVFxUF()))) ||
3113 isa<VPWidenPointerInductionRecipe>(U);
3114 }) &&
3115 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3116 "increment of the canonical induction.");
3117 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3118 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
3119 // canonical induction must not be updated.
3121 });
3122
3123 // Create a scalar phi to track the previous EVL if fixed-order recurrence is
3124 // contained.
3125 bool ContainsFORs =
3127 if (ContainsFORs) {
3128 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3129 VPValue *MaxEVL = &Plan.getVF();
3130 // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
3131 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3132 MaxEVL = Builder.createScalarZExtOrTrunc(
3133 MaxEVL, Type::getInt32Ty(Plan.getContext()),
3134 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
3135
3136 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3137 VPValue *PrevEVL = Builder.createScalarPhi(
3138 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3139
3142 for (VPRecipeBase &R : *VPBB) {
3143 VPValue *V1, *V2;
3144 if (!match(&R,
3146 m_VPValue(V1), m_VPValue(V2))))
3147 continue;
3148 VPValue *Imm = Plan.getOrAddLiveIn(
3151 Intrinsic::experimental_vp_splice,
3152 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3153 TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
3154 R.getDebugLoc());
3155 VPSplice->insertBefore(&R);
3156 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3157 }
3158 }
3159 }
3160
3161 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
3162 if (!HeaderMask)
3163 return;
3164
3165 // Replace header masks with a mask equivalent to predicating by EVL:
3166 //
3167 // icmp ule widen-canonical-iv backedge-taken-count
3168 // ->
3169 // icmp ult step-vector, EVL
3170 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3171 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3172 Type *EVLType = TypeInfo.inferScalarType(&EVL);
3173 VPValue *EVLMask = Builder.createICmp(
3175 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3176 HeaderMask->replaceAllUsesWith(EVLMask);
3177}
3178
3179/// Converts a tail folded vector loop region to step by
3180/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3181/// iteration.
3182///
3183/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3184/// replaces all uses except the canonical IV increment of
3185/// VPCanonicalIVPHIRecipe with a VPCurrentIterationPHIRecipe.
3186/// VPCanonicalIVPHIRecipe is used only for loop iterations counting after
3187/// this transformation.
3188///
3189/// - The header mask is replaced with a header mask based on the EVL.
3190///
3191/// - Plans with FORs have a new phi added to keep track of the EVL of the
3192/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3193/// @llvm.vp.splice.
3194///
3195/// The function uses the following definitions:
3196/// %StartV is the canonical induction start value.
3197///
3198/// The function adds the following recipes:
3199///
3200/// vector.ph:
3201/// ...
3202///
3203/// vector.body:
3204/// ...
3205/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3206/// [ %NextIter, %vector.body ]
3207/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3208/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3209/// ...
3210/// %OpEVL = cast i32 %VPEVL to IVSize
3211/// %NextIter = add IVSize %OpEVL, %CurrentIter
3212/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3213/// ...
3214///
3215/// If MaxSafeElements is provided, the function adds the following recipes:
3216/// vector.ph:
3217/// ...
3218///
3219/// vector.body:
3220/// ...
3221/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3222/// [ %NextIter, %vector.body ]
3223/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3224/// %cmp = cmp ult %AVL, MaxSafeElements
3225/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3226/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3227/// ...
3228/// %OpEVL = cast i32 %VPEVL to IVSize
3229/// %NextIter = add IVSize %OpEVL, %CurrentIter
3230/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3231/// ...
3232///
3234 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3235 if (Plan.hasScalarVFOnly())
3236 return;
3237 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3238 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3239
3240 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
3241 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3242 VPValue *StartV = CanonicalIVPHI->getStartValue();
3243
3244 // Create the CurrentIteration recipe in the vector loop.
3245 auto *CurrentIteration =
3247 CurrentIteration->insertAfter(CanonicalIVPHI);
3248 VPBuilder Builder(Header, Header->getFirstNonPhi());
3249 // Create the AVL (application vector length), starting from TC -> 0 in steps
3250 // of EVL.
3251 VPPhi *AVLPhi = Builder.createScalarPhi(
3252 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3253 VPValue *AVL = AVLPhi;
3254
3255 if (MaxSafeElements) {
3256 // Support for MaxSafeDist for correct loop emission.
3257 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3258 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3259 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3260 "safe_avl");
3261 }
3262 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3263 DebugLoc::getUnknown(), "evl");
3264
3265 auto *CanonicalIVIncrement =
3266 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
3267 Builder.setInsertPoint(CanonicalIVIncrement);
3268 VPValue *OpVPEVL = VPEVL;
3269
3270 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3271 OpVPEVL = Builder.createScalarZExtOrTrunc(
3272 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3273
3274 auto *NextIter = Builder.createAdd(OpVPEVL, CurrentIteration,
3275 CanonicalIVIncrement->getDebugLoc(),
3276 "current.iteration.next",
3277 {CanonicalIVIncrement->hasNoUnsignedWrap(),
3278 CanonicalIVIncrement->hasNoSignedWrap()});
3279 CurrentIteration->addOperand(NextIter);
3280
3281 VPValue *NextAVL =
3282 Builder.createSub(AVLPhi, OpVPEVL, DebugLoc::getCompilerGenerated(),
3283 "avl.next", {/*NUW=*/true, /*NSW=*/false});
3284 AVLPhi->addOperand(NextAVL);
3285
3286 fixupVFUsersForEVL(Plan, *VPEVL);
3287 removeDeadRecipes(Plan);
3288
3289 // Replace all uses of VPCanonicalIVPHIRecipe by
3290 // VPCurrentIterationPHIRecipe except for the canonical IV increment.
3291 CanonicalIVPHI->replaceAllUsesWith(CurrentIteration);
3292 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
3293 // TODO: support unroll factor > 1.
3294 Plan.setUF(1);
3295}
3296
3298 // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
3299 // There should be only one VPCurrentIteration in the entire plan.
3300 VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
3301
3304 for (VPRecipeBase &R : VPBB->phis())
3305 if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
3306 assert(!CurrentIteration &&
3307 "Found multiple CurrentIteration. Only one expected");
3308 CurrentIteration = PhiR;
3309 }
3310
3311 // Early return if it is not variable-length stepping.
3312 if (!CurrentIteration)
3313 return;
3314
3315 VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3316 VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3317
3318 // Convert CurrentIteration to concrete recipe.
3319 auto *ScalarR =
3320 VPBuilder(CurrentIteration)
3322 {CurrentIteration->getStartValue(), CurrentIterationIncr},
3323 CurrentIteration->getDebugLoc(), "current.iteration.iv");
3324 CurrentIteration->replaceAllUsesWith(ScalarR);
3325 CurrentIteration->eraseFromParent();
3326
3327 // Replace CanonicalIVInc with CurrentIteration increment.
3328 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3329 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
3330 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
3331 m_Specific(&Plan.getVFxUF()))) &&
3332 "Unexpected canonical iv");
3333 Backedge->replaceAllUsesWith(CurrentIterationIncr);
3334
3335 // Remove unused phi and increment.
3336 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
3337 CanonicalIVIncrement->eraseFromParent();
3338 CanonicalIV->eraseFromParent();
3339}
3340
3342 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3343 // The canonical IV may not exist at this stage.
3344 if (!LoopRegion ||
3346 return;
3347 VPCanonicalIVPHIRecipe *CanIV = LoopRegion->getCanonicalIV();
3348 if (std::next(CanIV->getIterator()) == CanIV->getParent()->end())
3349 return;
3350 // The EVL IV is always immediately after the canonical IV.
3352 std::next(CanIV->getIterator()));
3353 if (!EVLPhi)
3354 return;
3355
3356 // Bail if not an EVL tail folded loop.
3357 VPValue *AVL;
3358 if (!match(EVLPhi->getBackedgeValue(),
3359 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))))
3360 return;
3361
3362 // The AVL may be capped to a safe distance.
3363 VPValue *SafeAVL;
3364 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
3365 AVL = SafeAVL;
3366
3367 VPValue *AVLNext;
3368 [[maybe_unused]] bool FoundAVLNext =
3370 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3371 assert(FoundAVLNext && "Didn't find AVL backedge?");
3372
3373 VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3374 auto *LatchBr = cast<VPInstruction>(Latch->getTerminator());
3375 if (match(LatchBr, m_BranchOnCond(m_True())))
3376 return;
3377
3378 assert(
3379 match(LatchBr,
3382 m_Specific(&Plan.getVectorTripCount())))) &&
3383 "Expected BranchOnCond with ICmp comparing CanIV increment with vector "
3384 "trip count");
3385
3386 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3387 VPBuilder Builder(LatchBr);
3388 LatchBr->setOperand(0, Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
3389 Plan.getConstantInt(AVLTy, 0)));
3390}
3391
3393 VPlan &Plan, PredicatedScalarEvolution &PSE,
3394 const DenseMap<Value *, const SCEV *> &StridesMap) {
3395 // Replace VPValues for known constant strides guaranteed by predicate scalar
3396 // evolution.
3397 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3398 auto *R = cast<VPRecipeBase>(&U);
3399 return R->getRegion() ||
3400 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3401 };
3402 ValueToSCEVMapTy RewriteMap;
3403 for (const SCEV *Stride : StridesMap.values()) {
3404 using namespace SCEVPatternMatch;
3405 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3406 const APInt *StrideConst;
3407 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3408 // Only handle constant strides for now.
3409 continue;
3410
3411 auto *CI = Plan.getConstantInt(*StrideConst);
3412 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3413 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3414
3415 // The versioned value may not be used in the loop directly but through a
3416 // sext/zext. Add new live-ins in those cases.
3417 for (Value *U : StrideV->users()) {
3419 continue;
3420 VPValue *StrideVPV = Plan.getLiveIn(U);
3421 if (!StrideVPV)
3422 continue;
3423 unsigned BW = U->getType()->getScalarSizeInBits();
3424 APInt C =
3425 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3426 VPValue *CI = Plan.getConstantInt(C);
3427 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3428 }
3429 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3430 }
3431
3432 for (VPRecipeBase &R : *Plan.getEntry()) {
3433 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3434 if (!ExpSCEV)
3435 continue;
3436 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3437 auto *NewSCEV =
3438 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3439 if (NewSCEV != ScevExpr) {
3440 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3441 ExpSCEV->replaceAllUsesWith(NewExp);
3442 if (Plan.getTripCount() == ExpSCEV)
3443 Plan.resetTripCount(NewExp);
3444 }
3445 }
3446}
3447
3449 VPlan &Plan,
3450 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3451 // Collect recipes in the backward slice of `Root` that may generate a poison
3452 // value that is used after vectorization.
3454 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3456 Worklist.push_back(Root);
3457
3458 // Traverse the backward slice of Root through its use-def chain.
3459 while (!Worklist.empty()) {
3460 VPRecipeBase *CurRec = Worklist.pop_back_val();
3461
3462 if (!Visited.insert(CurRec).second)
3463 continue;
3464
3465 // Prune search if we find another recipe generating a widen memory
3466 // instruction. Widen memory instructions involved in address computation
3467 // will lead to gather/scatter instructions, which don't need to be
3468 // handled.
3470 VPHeaderPHIRecipe>(CurRec))
3471 continue;
3472
3473 // This recipe contributes to the address computation of a widen
3474 // load/store. If the underlying instruction has poison-generating flags,
3475 // drop them directly.
3476 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3477 VPValue *A, *B;
3478 // Dropping disjoint from an OR may yield incorrect results, as some
3479 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3480 // for dependence analysis). Instead, replace it with an equivalent Add.
3481 // This is possible as all users of the disjoint OR only access lanes
3482 // where the operands are disjoint or poison otherwise.
3483 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3484 RecWithFlags->isDisjoint()) {
3485 VPBuilder Builder(RecWithFlags);
3486 VPInstruction *New =
3487 Builder.createAdd(A, B, RecWithFlags->getDebugLoc());
3488 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3489 RecWithFlags->replaceAllUsesWith(New);
3490 RecWithFlags->eraseFromParent();
3491 CurRec = New;
3492 } else
3493 RecWithFlags->dropPoisonGeneratingFlags();
3494 } else {
3497 (void)Instr;
3498 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3499 "found instruction with poison generating flags not covered by "
3500 "VPRecipeWithIRFlags");
3501 }
3502
3503 // Add new definitions to the worklist.
3504 for (VPValue *Operand : CurRec->operands())
3505 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3506 Worklist.push_back(OpDef);
3507 }
3508 });
3509
3510 // Traverse all the recipes in the VPlan and collect the poison-generating
3511 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3512 // VPInterleaveRecipe.
3513 auto Iter = vp_depth_first_deep(Plan.getEntry());
3515 for (VPRecipeBase &Recipe : *VPBB) {
3516 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3517 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3518 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3519 if (AddrDef && WidenRec->isConsecutive() &&
3520 BlockNeedsPredication(UnderlyingInstr.getParent()))
3521 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3522 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3523 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3524 if (AddrDef) {
3525 // Check if any member of the interleave group needs predication.
3526 const InterleaveGroup<Instruction> *InterGroup =
3527 InterleaveRec->getInterleaveGroup();
3528 bool NeedPredication = false;
3529 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3530 I < NumMembers; ++I) {
3531 Instruction *Member = InterGroup->getMember(I);
3532 if (Member)
3533 NeedPredication |= BlockNeedsPredication(Member->getParent());
3534 }
3535
3536 if (NeedPredication)
3537 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3538 }
3539 }
3540 }
3541 }
3542}
3543
3545 VPlan &Plan,
3547 &InterleaveGroups,
3548 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3549 if (InterleaveGroups.empty())
3550 return;
3551
3552 // Interleave memory: for each Interleave Group we marked earlier as relevant
3553 // for this VPlan, replace the Recipes widening its memory instructions with a
3554 // single VPInterleaveRecipe at its insertion point.
3555 VPDominatorTree VPDT(Plan);
3556 for (const auto *IG : InterleaveGroups) {
3557 auto *Start =
3558 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3559 VPIRMetadata InterleaveMD(*Start);
3560 SmallVector<VPValue *, 4> StoredValues;
3561 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3562 StoredValues.push_back(StoreR->getStoredValue());
3563 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3564 Instruction *MemberI = IG->getMember(I);
3565 if (!MemberI)
3566 continue;
3567 VPWidenMemoryRecipe *MemoryR =
3568 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3569 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3570 StoredValues.push_back(StoreR->getStoredValue());
3571 InterleaveMD.intersect(*MemoryR);
3572 }
3573
3574 bool NeedsMaskForGaps =
3575 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3576 (!StoredValues.empty() && !IG->isFull());
3577
3578 Instruction *IRInsertPos = IG->getInsertPos();
3579 auto *InsertPos =
3580 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3581
3583 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3584 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3585 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3586
3587 // Get or create the start address for the interleave group.
3588 VPValue *Addr = Start->getAddr();
3589 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3590 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3591 // We cannot re-use the address of member zero because it does not
3592 // dominate the insert position. Instead, use the address of the insert
3593 // position and create a PtrAdd adjusting it to the address of member
3594 // zero.
3595 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3596 // InsertPos or sink loads above zero members to join it.
3597 assert(IG->getIndex(IRInsertPos) != 0 &&
3598 "index of insert position shouldn't be zero");
3599 auto &DL = IRInsertPos->getDataLayout();
3600 APInt Offset(32,
3601 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3602 IG->getIndex(IRInsertPos),
3603 /*IsSigned=*/true);
3604 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3605 VPBuilder B(InsertPos);
3606 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3607 }
3608 // If the group is reverse, adjust the index to refer to the last vector
3609 // lane instead of the first. We adjust the index from the first vector
3610 // lane, rather than directly getting the pointer for lane VF - 1, because
3611 // the pointer operand of the interleaved access is supposed to be uniform.
3612 if (IG->isReverse()) {
3613 auto *ReversePtr = new VPVectorEndPointerRecipe(
3614 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3615 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3616 ReversePtr->insertBefore(InsertPos);
3617 Addr = ReversePtr;
3618 }
3619 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3620 InsertPos->getMask(), NeedsMaskForGaps,
3621 InterleaveMD, InsertPos->getDebugLoc());
3622 VPIG->insertBefore(InsertPos);
3623
3624 unsigned J = 0;
3625 for (unsigned i = 0; i < IG->getFactor(); ++i)
3626 if (Instruction *Member = IG->getMember(i)) {
3627 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3628 if (!Member->getType()->isVoidTy()) {
3629 VPValue *OriginalV = MemberR->getVPSingleValue();
3630 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3631 J++;
3632 }
3633 MemberR->eraseFromParent();
3634 }
3635 }
3636}
3637
3638/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3639/// value, phi and backedge value. In the following example:
3640///
3641/// vector.ph:
3642/// Successor(s): vector loop
3643///
3644/// <x1> vector loop: {
3645/// vector.body:
3646/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3647/// ...
3648/// EMIT branch-on-count ...
3649/// No successors
3650/// }
3651///
3652/// WIDEN-INDUCTION will get expanded to:
3653///
3654/// vector.ph:
3655/// ...
3656/// vp<%induction.start> = ...
3657/// vp<%induction.increment> = ...
3658///
3659/// Successor(s): vector loop
3660///
3661/// <x1> vector loop: {
3662/// vector.body:
3663/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3664/// ...
3665/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3666/// EMIT branch-on-count ...
3667/// No successors
3668/// }
3669static void
3671 VPTypeAnalysis &TypeInfo) {
3672 VPlan *Plan = WidenIVR->getParent()->getPlan();
3673 VPValue *Start = WidenIVR->getStartValue();
3674 VPValue *Step = WidenIVR->getStepValue();
3675 VPValue *VF = WidenIVR->getVFValue();
3676 DebugLoc DL = WidenIVR->getDebugLoc();
3677
3678 // The value from the original loop to which we are mapping the new induction
3679 // variable.
3680 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3681
3682 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3685 VPIRFlags Flags = *WidenIVR;
3686 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3687 AddOp = Instruction::Add;
3688 MulOp = Instruction::Mul;
3689 } else {
3690 AddOp = ID.getInductionOpcode();
3691 MulOp = Instruction::FMul;
3692 }
3693
3694 // If the phi is truncated, truncate the start and step values.
3695 VPBuilder Builder(Plan->getVectorPreheader());
3696 Type *StepTy = TypeInfo.inferScalarType(Step);
3697 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3698 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3699 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3700 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3701 // Truncation doesn't preserve WrapFlags.
3702 Flags.dropPoisonGeneratingFlags();
3703 StepTy = Ty;
3704 }
3705
3706 // Construct the initial value of the vector IV in the vector loop preheader.
3707 Type *IVIntTy =
3709 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3710 if (StepTy->isFloatingPointTy())
3711 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3712
3713 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3714 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3715
3716 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3717 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3718 DebugLoc::getUnknown(), "induction");
3719
3720 // Create the widened phi of the vector IV.
3721 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3722 WidenIVR->getDebugLoc(), "vec.ind");
3723 WidePHI->insertBefore(WidenIVR);
3724
3725 // Create the backedge value for the vector IV.
3726 VPValue *Inc;
3727 VPValue *Prev;
3728 // If unrolled, use the increment and prev value from the operands.
3729 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3730 Inc = SplatVF;
3731 Prev = WidenIVR->getLastUnrolledPartOperand();
3732 } else {
3733 if (VPRecipeBase *R = VF->getDefiningRecipe())
3734 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3735 // Multiply the vectorization factor by the step using integer or
3736 // floating-point arithmetic as appropriate.
3737 if (StepTy->isFloatingPointTy())
3738 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3739 DL);
3740 else
3741 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3742 TypeInfo.inferScalarType(VF), DL);
3743
3744 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3745 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3746 Prev = WidePHI;
3747 }
3748
3750 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3751 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3752 WidenIVR->getDebugLoc(), "vec.ind.next");
3753
3754 WidePHI->addOperand(Next);
3755
3756 WidenIVR->replaceAllUsesWith(WidePHI);
3757}
3758
3759/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3760/// initial value, phi and backedge value. In the following example:
3761///
3762/// <x1> vector loop: {
3763/// vector.body:
3764/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3765/// ...
3766/// EMIT branch-on-count ...
3767/// }
3768///
3769/// WIDEN-POINTER-INDUCTION will get expanded to:
3770///
3771/// <x1> vector loop: {
3772/// vector.body:
3773/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3774/// EMIT %mul = mul %stepvector, %step
3775/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3776/// ...
3777/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3778/// EMIT branch-on-count ...
3779/// }
3781 VPTypeAnalysis &TypeInfo) {
3782 VPlan *Plan = R->getParent()->getPlan();
3783 VPValue *Start = R->getStartValue();
3784 VPValue *Step = R->getStepValue();
3785 VPValue *VF = R->getVFValue();
3786
3787 assert(R->getInductionDescriptor().getKind() ==
3789 "Not a pointer induction according to InductionDescriptor!");
3790 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3791 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3792 "Recipe should have been replaced");
3793
3794 VPBuilder Builder(R);
3795 DebugLoc DL = R->getDebugLoc();
3796
3797 // Build a scalar pointer phi.
3798 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3799
3800 // Create actual address geps that use the pointer phi as base and a
3801 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3802 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3803 Type *StepTy = TypeInfo.inferScalarType(Step);
3804 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3805 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3806 VPValue *PtrAdd =
3807 Builder.createWidePtrAdd(ScalarPtrPhi, Offset, DL, "vector.gep");
3808 R->replaceAllUsesWith(PtrAdd);
3809
3810 // Create the backedge value for the scalar pointer phi.
3812 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3813 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3814 DL);
3815 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3816
3817 VPValue *InductionGEP =
3818 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3819 ScalarPtrPhi->addOperand(InductionGEP);
3820}
3821
3823 // Replace loop regions with explicity CFG.
3824 SmallVector<VPRegionBlock *> LoopRegions;
3826 vp_depth_first_deep(Plan.getEntry()))) {
3827 if (!R->isReplicator())
3828 LoopRegions.push_back(R);
3829 }
3830 for (VPRegionBlock *R : LoopRegions)
3831 R->dissolveToCFGLoop();
3832}
3833
3836 // The transform runs after dissolving loop regions, so all VPBasicBlocks
3837 // terminated with BranchOnTwoConds are reached via a shallow traversal.
3840 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
3841 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
3842 }
3843
3844 // Expand BranchOnTwoConds instructions into explicit CFG with two new
3845 // single-condition branches:
3846 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
3847 // the first condition is true, and otherwise jumps to a new interim block.
3848 // 2. A branch that ends the interim block, jumps to the second successor if
3849 // the second condition is true, and otherwise jumps to the third
3850 // successor.
3851 for (VPInstruction *Br : WorkList) {
3852 assert(Br->getNumOperands() == 2 &&
3853 "BranchOnTwoConds must have exactly 2 conditions");
3854 DebugLoc DL = Br->getDebugLoc();
3855 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
3856 const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
3857 assert(Successors.size() == 3 &&
3858 "BranchOnTwoConds must have exactly 3 successors");
3859
3860 for (VPBlockBase *Succ : Successors)
3861 VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
3862
3863 VPValue *Cond0 = Br->getOperand(0);
3864 VPValue *Cond1 = Br->getOperand(1);
3865 VPBlockBase *Succ0 = Successors[0];
3866 VPBlockBase *Succ1 = Successors[1];
3867 VPBlockBase *Succ2 = Successors[2];
3868 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
3869 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
3870
3871 VPBasicBlock *InterimBB =
3872 Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
3873
3874 VPBuilder(BrOnTwoCondsBB)
3876 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
3877 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
3878
3880 VPBlockUtils::connectBlocks(InterimBB, Succ1);
3881 VPBlockUtils::connectBlocks(InterimBB, Succ2);
3882 Br->eraseFromParent();
3883 }
3884}
3885
3887 VPTypeAnalysis TypeInfo(Plan);
3890 vp_depth_first_deep(Plan.getEntry()))) {
3891 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3892 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3893 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3894 ToRemove.push_back(WidenIVR);
3895 continue;
3896 }
3897
3898 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3899 // If the recipe only generates scalars, scalarize it instead of
3900 // expanding it.
3901 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
3902 VPBuilder Builder(WidenIVR);
3903 VPValue *PtrAdd =
3904 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
3905 WidenIVR->replaceAllUsesWith(PtrAdd);
3906 ToRemove.push_back(WidenIVR);
3907 continue;
3908 }
3909 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3910 ToRemove.push_back(WidenIVR);
3911 continue;
3912 }
3913
3914 // Expand VPBlendRecipe into VPInstruction::Select.
3915 VPBuilder Builder(&R);
3916 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3917 VPValue *Select = Blend->getIncomingValue(0);
3918 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3919 Select = Builder.createSelect(Blend->getMask(I),
3920 Blend->getIncomingValue(I), Select,
3921 R.getDebugLoc(), "predphi", *Blend);
3922 Blend->replaceAllUsesWith(Select);
3923 ToRemove.push_back(Blend);
3924 }
3925
3926 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
3927 if (!VEPR->getOffset()) {
3928 assert(Plan.getConcreteUF() == 1 &&
3929 "Expected unroller to have materialized offset for UF != 1");
3930 VEPR->materializeOffset();
3931 }
3932 }
3933
3934 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3935 Expr->decompose();
3936 ToRemove.push_back(Expr);
3937 }
3938
3939 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
3940 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
3941 if (LastActiveL &&
3942 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
3943 // Create Not(Mask) for all operands.
3945 for (VPValue *Op : LastActiveL->operands()) {
3946 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
3947 NotMasks.push_back(NotMask);
3948 }
3949
3950 // Create FirstActiveLane on the inverted masks.
3951 VPValue *FirstInactiveLane = Builder.createNaryOp(
3953 LastActiveL->getDebugLoc(), "first.inactive.lane");
3954
3955 // Subtract 1 to get the last active lane.
3956 VPValue *One = Plan.getConstantInt(64, 1);
3957 VPValue *LastLane =
3958 Builder.createSub(FirstInactiveLane, One,
3959 LastActiveL->getDebugLoc(), "last.active.lane");
3960
3961 LastActiveL->replaceAllUsesWith(LastLane);
3962 ToRemove.push_back(LastActiveL);
3963 continue;
3964 }
3965
3966 // Lower MaskedCond with block mask to LogicalAnd.
3968 auto *VPI = cast<VPInstruction>(&R);
3969 assert(VPI->isMasked() &&
3970 "Unmasked MaskedCond should be simplified earlier");
3971 VPI->replaceAllUsesWith(Builder.createNaryOp(
3972 VPInstruction::LogicalAnd, {VPI->getOperand(0), VPI->getMask()}));
3973 ToRemove.push_back(VPI);
3974 continue;
3975 }
3976
3977 // Lower BranchOnCount to ICmp + BranchOnCond.
3978 VPValue *IV, *TC;
3979 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
3980 auto *BranchOnCountInst = cast<VPInstruction>(&R);
3981 DebugLoc DL = BranchOnCountInst->getDebugLoc();
3982 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
3983 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
3984 ToRemove.push_back(BranchOnCountInst);
3985 continue;
3986 }
3987
3988 VPValue *VectorStep;
3989 VPValue *ScalarStep;
3991 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
3992 continue;
3993
3994 // Expand WideIVStep.
3995 auto *VPI = cast<VPInstruction>(&R);
3996 Type *IVTy = TypeInfo.inferScalarType(VPI);
3997 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
3999 ? Instruction::UIToFP
4000 : Instruction::Trunc;
4001 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
4002 }
4003
4004 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
4005 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
4006 ScalarStep =
4007 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
4008 }
4009
4010 VPIRFlags Flags;
4011 unsigned MulOpc;
4012 if (IVTy->isFloatingPointTy()) {
4013 MulOpc = Instruction::FMul;
4014 Flags = VPI->getFastMathFlags();
4015 } else {
4016 MulOpc = Instruction::Mul;
4017 Flags = VPIRFlags::getDefaultFlags(MulOpc);
4018 }
4019
4020 VPInstruction *Mul = Builder.createNaryOp(
4021 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
4022 VectorStep = Mul;
4023 VPI->replaceAllUsesWith(VectorStep);
4024 ToRemove.push_back(VPI);
4025 }
4026 }
4027
4028 for (VPRecipeBase *R : ToRemove)
4029 R->eraseFromParent();
4030}
4031
4033 VPBasicBlock *HeaderVPBB,
4034 VPBasicBlock *LatchVPBB,
4035 VPBasicBlock *MiddleVPBB) {
4036 struct EarlyExitInfo {
4037 VPBasicBlock *EarlyExitingVPBB;
4038 VPIRBasicBlock *EarlyExitVPBB;
4039 VPValue *CondToExit;
4040 };
4041
4042 VPDominatorTree VPDT(Plan);
4043 VPBuilder Builder(LatchVPBB->getTerminator());
4045 for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
4046 for (VPBlockBase *Pred : to_vector(ExitBlock->getPredecessors())) {
4047 if (Pred == MiddleVPBB)
4048 continue;
4049 // Collect condition for this early exit.
4050 auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
4051 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
4052 VPValue *CondOfEarlyExitingVPBB;
4053 [[maybe_unused]] bool Matched =
4054 match(EarlyExitingVPBB->getTerminator(),
4055 m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
4056 assert(Matched && "Terminator must be BranchOnCond");
4057
4058 // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
4059 // the correct block mask.
4060 VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
4061 auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
4063 TrueSucc == ExitBlock
4064 ? CondOfEarlyExitingVPBB
4065 : EarlyExitingBuilder.createNot(CondOfEarlyExitingVPBB));
4066 assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
4067 !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
4068 VPDT.properlyDominates(
4069 CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
4070 LatchVPBB)) &&
4071 "exit condition must dominate the latch");
4072 Exits.push_back({
4073 EarlyExitingVPBB,
4074 ExitBlock,
4075 CondToEarlyExit,
4076 });
4077 }
4078 }
4079
4080 assert(!Exits.empty() && "must have at least one early exit");
4081 // Sort exits by RPO order to get correct program order. RPO gives a
4082 // topological ordering of the CFG, ensuring upstream exits are checked
4083 // before downstream exits in the dispatch chain.
4085 HeaderVPBB);
4087 for (const auto &[Num, VPB] : enumerate(RPOT))
4088 RPOIdx[VPB] = Num;
4089 llvm::sort(Exits, [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
4090 return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
4091 });
4092#ifndef NDEBUG
4093 // After RPO sorting, verify that for any pair where one exit dominates
4094 // another, the dominating exit comes first. This is guaranteed by RPO
4095 // (topological order) and is required for the dispatch chain correctness.
4096 for (unsigned I = 0; I + 1 < Exits.size(); ++I)
4097 for (unsigned J = I + 1; J < Exits.size(); ++J)
4098 assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
4099 Exits[I].EarlyExitingVPBB) &&
4100 "RPO sort must place dominating exits before dominated ones");
4101#endif
4102
4103 // Build the AnyOf condition for the latch terminator using logical OR
4104 // to avoid poison propagation from later exit conditions when an earlier
4105 // exit is taken.
4106 VPValue *Combined = Exits[0].CondToExit;
4107 for (const EarlyExitInfo &Info : drop_begin(Exits))
4108 Combined = Builder.createLogicalOr(Combined, Info.CondToExit);
4109
4110 VPValue *IsAnyExitTaken =
4111 Builder.createNaryOp(VPInstruction::AnyOf, {Combined});
4112
4113 // Create the vector.early.exit blocks.
4114 SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
4115 for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
4116 Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
4117 VPBasicBlock *VectorEarlyExitVPBB =
4118 Plan.createVPBasicBlock("vector.early.exit" + BlockSuffix);
4119 VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
4120 }
4121
4122 // Create the dispatch block (or reuse the single exit block if only one
4123 // exit). The dispatch block computes the first active lane of the combined
4124 // condition and, for multiple exits, chains through conditions to determine
4125 // which exit to take.
4126 VPBasicBlock *DispatchVPBB =
4127 Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
4128 : Plan.createVPBasicBlock("vector.early.exit.check");
4129 VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
4130 VPValue *FirstActiveLane =
4131 DispatchBuilder.createNaryOp(VPInstruction::FirstActiveLane, {Combined},
4132 DebugLoc::getUnknown(), "first.active.lane");
4133
4134 // For each early exit, disconnect the original exiting block
4135 // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
4136 // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
4137 // values at the first active lane:
4138 //
4139 // Input:
4140 // early.exiting.I:
4141 // ...
4142 // EMIT branch-on-cond vp<%cond.I>
4143 // Successor(s): in.loop.succ, ir-bb<exit.I>
4144 //
4145 // ir-bb<exit.I>:
4146 // IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
4147 //
4148 // Output:
4149 // early.exiting.I:
4150 // ...
4151 // Successor(s): in.loop.succ
4152 //
4153 // vector.early.exit.I:
4154 // EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
4155 // Successor(s): ir-bb<exit.I>
4156 //
4157 // ir-bb<exit.I>:
4158 // IR %phi = phi ... (extra operand: vp<%exit.val> from
4159 // vector.early.exit.I)
4160 //
4161 for (auto [Exit, VectorEarlyExitVPBB] :
4162 zip_equal(Exits, VectorEarlyExitVPBBs)) {
4163 auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
4164 // Adjust the phi nodes in EarlyExitVPBB.
4165 // 1. remove incoming values from EarlyExitingVPBB,
4166 // 2. extract the incoming value at FirstActiveLane
4167 // 3. add back the extracts as last operands for the phis
4168 // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
4169 // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
4170 // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
4171 // values from VectorEarlyExitVPBB.
4172 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
4173 auto *ExitIRI = cast<VPIRPhi>(&R);
4174 VPValue *IncomingVal =
4175 ExitIRI->getIncomingValueForBlock(EarlyExitingVPBB);
4176 VPValue *NewIncoming = IncomingVal;
4177 if (!isa<VPIRValue>(IncomingVal)) {
4178 VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
4179 NewIncoming = EarlyExitBuilder.createNaryOp(
4180 VPInstruction::ExtractLane, {FirstActiveLane, IncomingVal},
4181 DebugLoc::getUnknown(), "early.exit.value");
4182 }
4183 ExitIRI->removeIncomingValueFor(EarlyExitingVPBB);
4184 ExitIRI->addOperand(NewIncoming);
4185 }
4186
4187 EarlyExitingVPBB->getTerminator()->eraseFromParent();
4188 VPBlockUtils::disconnectBlocks(EarlyExitingVPBB, EarlyExitVPBB);
4189 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
4190 }
4191
4192 // Chain through exits: for each exit, check if its condition is true at
4193 // the first active lane. If so, take that exit; otherwise, try the next.
4194 // The last exit needs no check since it must be taken if all others fail.
4195 //
4196 // For 3 exits (cond.0, cond.1, cond.2), this creates:
4197 //
4198 // latch:
4199 // ...
4200 // EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
4201 // ...
4202 //
4203 // vector.early.exit.check:
4204 // EMIT vp<%first.lane> = first-active-lane vp<%combined>
4205 // EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
4206 // EMIT branch-on-cond vp<%at.cond.0>
4207 // Successor(s): vector.early.exit.0, vector.early.exit.check.0
4208 //
4209 // vector.early.exit.check.0:
4210 // EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
4211 // EMIT branch-on-cond vp<%at.cond.1>
4212 // Successor(s): vector.early.exit.1, vector.early.exit.2
4213 VPBasicBlock *CurrentBB = DispatchVPBB;
4214 for (auto [I, Exit] : enumerate(ArrayRef(Exits).drop_back())) {
4215 VPValue *LaneVal = DispatchBuilder.createNaryOp(
4216 VPInstruction::ExtractLane, {FirstActiveLane, Exit.CondToExit},
4217 DebugLoc::getUnknown(), "exit.cond.at.lane");
4218
4219 // For the last dispatch, branch directly to the last exit on false;
4220 // otherwise, create a new check block.
4221 bool IsLastDispatch = (I + 2 == Exits.size());
4222 VPBasicBlock *FalseBB =
4223 IsLastDispatch ? VectorEarlyExitVPBBs.back()
4224 : Plan.createVPBasicBlock(
4225 Twine("vector.early.exit.check.") + Twine(I));
4226
4227 DispatchBuilder.createNaryOp(VPInstruction::BranchOnCond, {LaneVal});
4228 CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
4229 VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
4230 FalseBB->setPredecessors({CurrentBB});
4231
4232 CurrentBB = FalseBB;
4233 DispatchBuilder.setInsertPoint(CurrentBB);
4234 }
4235
4236 // Replace the latch terminator with the new branching logic.
4237 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4238 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
4239 "Unexpected terminator");
4240 auto *IsLatchExitTaken =
4241 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
4242 LatchExitingBranch->getOperand(1));
4243
4244 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4245 LatchExitingBranch->eraseFromParent();
4246 Builder.setInsertPoint(LatchVPBB);
4247 Builder.createNaryOp(VPInstruction::BranchOnTwoConds,
4248 {IsAnyExitTaken, IsLatchExitTaken}, LatchDL);
4249 LatchVPBB->clearSuccessors();
4250 LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
4251 DispatchVPBB->setPredecessors({LatchVPBB});
4252}
4253
4254/// This function tries convert extended in-loop reductions to
4255/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
4256/// valid. The created recipe must be decomposed to its constituent
4257/// recipes before execution.
4258static VPExpressionRecipe *
4260 VFRange &Range) {
4261 Type *RedTy = Ctx.Types.inferScalarType(Red);
4262 VPValue *VecOp = Red->getVecOp();
4263
4264 // Clamp the range if using extended-reduction is profitable.
4265 auto IsExtendedRedValidAndClampRange =
4266 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4268 [&](ElementCount VF) {
4269 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4271
4273 InstructionCost ExtCost =
4274 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4275 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4276
4277 if (Red->isPartialReduction()) {
4280 // FIXME: Move partial reduction creation, costing and clamping
4281 // here from LoopVectorize.cpp.
4282 ExtRedCost = Ctx.TTI.getPartialReductionCost(
4283 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
4284 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind,
4285 RedTy->isFloatingPointTy()
4286 ? std::optional{Red->getFastMathFlags()}
4287 : std::nullopt);
4288 } else if (!RedTy->isFloatingPointTy()) {
4289 // TTI::getExtendedReductionCost only supports integer types.
4290 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4291 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4292 Red->getFastMathFlags(), CostKind);
4293 }
4294 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4295 },
4296 Range);
4297 };
4298
4299 VPValue *A;
4300 // Match reduce(ext)).
4301 if (isa<VPWidenCastRecipe>(VecOp) &&
4302 (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) ||
4303 match(VecOp, m_FPExt(m_VPValue(A)))) &&
4304 IsExtendedRedValidAndClampRange(
4305 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4306 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
4307 Ctx.Types.inferScalarType(A)))
4308 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4309
4310 return nullptr;
4311}
4312
4313/// This function tries convert extended in-loop reductions to
4314/// VPExpressionRecipe and clamp the \p Range if it is beneficial
4315/// and valid. The created VPExpressionRecipe must be decomposed to its
4316/// constituent recipes before execution. Patterns of the
4317/// VPExpressionRecipe:
4318/// reduce.add(mul(...)),
4319/// reduce.add(mul(ext(A), ext(B))),
4320/// reduce.add(ext(mul(ext(A), ext(B)))).
4321/// reduce.fadd(fmul(ext(A), ext(B)))
4322static VPExpressionRecipe *
4324 VPCostContext &Ctx, VFRange &Range) {
4325 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4326 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4327 Opcode != Instruction::FAdd)
4328 return nullptr;
4329
4330 Type *RedTy = Ctx.Types.inferScalarType(Red);
4331
4332 // Clamp the range if using multiply-accumulate-reduction is profitable.
4333 auto IsMulAccValidAndClampRange =
4335 VPWidenCastRecipe *OuterExt) -> bool {
4337 [&](ElementCount VF) {
4339 Type *SrcTy =
4340 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
4341 InstructionCost MulAccCost;
4342
4343 if (Red->isPartialReduction()) {
4344 Type *SrcTy2 =
4345 Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
4346 // FIXME: Move partial reduction creation, costing and clamping
4347 // here from LoopVectorize.cpp.
4348 MulAccCost = Ctx.TTI.getPartialReductionCost(
4349 Opcode, SrcTy, SrcTy2, RedTy, VF,
4351 Ext0->getOpcode())
4354 Ext1->getOpcode())
4356 Mul->getOpcode(), CostKind,
4357 RedTy->isFloatingPointTy()
4358 ? std::optional{Red->getFastMathFlags()}
4359 : std::nullopt);
4360 } else {
4361 // Only partial reductions support mixed or floating-point extends
4362 // at the moment.
4363 if (Ext0 && Ext1 &&
4364 (Ext0->getOpcode() != Ext1->getOpcode() ||
4365 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4366 return false;
4367
4368 bool IsZExt =
4369 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4370 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4371 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4372 SrcVecTy, CostKind);
4373 }
4374
4375 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4376 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4377 InstructionCost ExtCost = 0;
4378 if (Ext0)
4379 ExtCost += Ext0->computeCost(VF, Ctx);
4380 if (Ext1)
4381 ExtCost += Ext1->computeCost(VF, Ctx);
4382 if (OuterExt)
4383 ExtCost += OuterExt->computeCost(VF, Ctx);
4384
4385 return MulAccCost.isValid() &&
4386 MulAccCost < ExtCost + MulCost + RedCost;
4387 },
4388 Range);
4389 };
4390
4391 VPValue *VecOp = Red->getVecOp();
4392 VPRecipeBase *Sub = nullptr;
4393 VPValue *A, *B;
4394 VPValue *Tmp = nullptr;
4395
4396 // Try to match reduce.fadd(fmul(fpext(...), fpext(...))).
4397 if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue())))) {
4398 assert(Opcode == Instruction::FAdd &&
4399 "MulAccumulateReduction from an FMul must accumulate into an FAdd "
4400 "instruction");
4401 auto *FMul = dyn_cast<VPWidenRecipe>(VecOp);
4402 if (!FMul)
4403 return nullptr;
4404
4405 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(FMul->getOperand(0));
4406 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(FMul->getOperand(1));
4407
4408 if (RecipeA && RecipeB &&
4409 IsMulAccValidAndClampRange(FMul, RecipeA, RecipeB, nullptr)) {
4410 return new VPExpressionRecipe(RecipeA, RecipeB, FMul, Red);
4411 }
4412 }
4413 if (RedTy->isFloatingPointTy())
4414 return nullptr;
4415
4416 // Sub reductions could have a sub between the add reduction and vec op.
4417 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4418 Sub = VecOp->getDefiningRecipe();
4419 VecOp = Tmp;
4420 }
4421
4422 // If ValB is a constant and can be safely extended, truncate it to the same
4423 // type as ExtA's operand, then extend it to the same type as ExtA. This
4424 // creates two uniform extends that can more easily be matched by the rest of
4425 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4426 // replaced with the new extend of the constant.
4427 auto ExtendAndReplaceConstantOp = [&Ctx, &Red](VPWidenCastRecipe *ExtA,
4428 VPWidenCastRecipe *&ExtB,
4429 VPValue *&ValB,
4430 VPWidenRecipe *Mul) {
4431 if (!ExtA || ExtB || !isa<VPIRValue>(ValB) || Red->isPartialReduction())
4432 return;
4433 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
4434 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4435 const APInt *Const;
4436 if (!match(ValB, m_APInt(Const)) ||
4438 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4439 return;
4440 // The truncate ensures that the type of each extended operand is the
4441 // same, and it's been proven that the constant can be extended from
4442 // NarrowTy safely. Necessary since ExtA's extended operand would be
4443 // e.g. an i8, while the const will likely be an i32. This will be
4444 // elided by later optimisations.
4445 VPBuilder Builder(Mul);
4446 auto *Trunc =
4447 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4448 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
4449 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4450 Mul->setOperand(1, ExtB);
4451 };
4452
4453 // Try to match reduce.add(mul(...)).
4454 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4457 auto *Mul = cast<VPWidenRecipe>(VecOp);
4458
4459 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4460 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4461
4462 // Match reduce.add/sub(mul(ext, ext)).
4463 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4464 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4465 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4466 if (Sub)
4467 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4468 cast<VPWidenRecipe>(Sub), Red);
4469 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4470 }
4471 // TODO: Add an expression type for this variant with a negated mul
4472 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4473 return new VPExpressionRecipe(Mul, Red);
4474 }
4475 // TODO: Add an expression type for negated versions of other expression
4476 // variants.
4477 if (Sub)
4478 return nullptr;
4479
4480 // Match reduce.add(ext(mul(A, B))).
4481 if (!Red->isPartialReduction() &&
4482 match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4483 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4484 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4487
4488 // reduce.add(ext(mul(ext, const)))
4489 // -> reduce.add(ext(mul(ext, ext(const))))
4490 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4491
4492 // reduce.add(ext(mul(ext(A), ext(B))))
4493 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4494 // The inner extends must either have the same opcode as the outer extend or
4495 // be the same, in which case the multiply can never result in a negative
4496 // value and the outer extend can be folded away by doing wider
4497 // extends for the operands of the mul.
4498 if (Ext0 && Ext1 &&
4499 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4500 Ext0->getOpcode() == Ext1->getOpcode() &&
4501 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
4502 auto *NewExt0 = new VPWidenCastRecipe(
4503 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
4504 *Ext0, *Ext0, Ext0->getDebugLoc());
4505 NewExt0->insertBefore(Ext0);
4506
4507 VPWidenCastRecipe *NewExt1 = NewExt0;
4508 if (Ext0 != Ext1) {
4509 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4510 Ext->getResultType(), nullptr, *Ext1,
4511 *Ext1, Ext1->getDebugLoc());
4512 NewExt1->insertBefore(Ext1);
4513 }
4514 Mul->setOperand(0, NewExt0);
4515 Mul->setOperand(1, NewExt1);
4516 Red->setOperand(1, Mul);
4517 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4518 }
4519 }
4520 return nullptr;
4521}
4522
4523/// This function tries to create abstract recipes from the reduction recipe for
4524/// following optimizations and cost estimation.
4526 VPCostContext &Ctx,
4527 VFRange &Range) {
4528 VPExpressionRecipe *AbstractR = nullptr;
4529 auto IP = std::next(Red->getIterator());
4530 auto *VPBB = Red->getParent();
4531 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4532 AbstractR = MulAcc;
4533 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4534 AbstractR = ExtRed;
4535 // Cannot create abstract inloop reduction recipes.
4536 if (!AbstractR)
4537 return;
4538
4539 AbstractR->insertBefore(*VPBB, IP);
4540 Red->replaceAllUsesWith(AbstractR);
4541}
4542
4553
4555 if (Plan.hasScalarVFOnly())
4556 return;
4557
4558#ifndef NDEBUG
4559 VPDominatorTree VPDT(Plan);
4560#endif
4561
4562 SmallVector<VPValue *> VPValues;
4565 append_range(VPValues, Plan.getLiveIns());
4566 for (VPRecipeBase &R : *Plan.getEntry())
4567 append_range(VPValues, R.definedValues());
4568
4569 auto *VectorPreheader = Plan.getVectorPreheader();
4570 for (VPValue *VPV : VPValues) {
4572 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
4573 continue;
4574
4575 // Add explicit broadcast at the insert point that dominates all users.
4576 VPBasicBlock *HoistBlock = VectorPreheader;
4577 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4578 for (VPUser *User : VPV->users()) {
4579 if (User->usesScalars(VPV))
4580 continue;
4581 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4582 HoistPoint = HoistBlock->begin();
4583 else
4584 assert(VPDT.dominates(VectorPreheader,
4585 cast<VPRecipeBase>(User)->getParent()) &&
4586 "All users must be in the vector preheader or dominated by it");
4587 }
4588
4589 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4590 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
4591 VPV->replaceUsesWithIf(Broadcast,
4592 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4593 return Broadcast != &U && !U.usesScalars(VPV);
4594 });
4595 }
4596}
4597
4599 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4600
4601 // Collect candidate loads with invariant addresses and noalias scopes
4602 // metadata and memory-writing recipes with noalias metadata.
4606 vp_depth_first_shallow(LoopRegion->getEntry()))) {
4607 for (VPRecipeBase &R : *VPBB) {
4608 // Only handle single-scalar replicated loads with invariant addresses.
4609 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4610 if (RepR->isPredicated() || !RepR->isSingleScalar() ||
4611 RepR->getOpcode() != Instruction::Load)
4612 continue;
4613
4614 VPValue *Addr = RepR->getOperand(0);
4615 if (Addr->isDefinedOutsideLoopRegions()) {
4617 if (!Loc.AATags.Scope)
4618 continue;
4619 CandidateLoads.push_back({RepR, Loc});
4620 }
4621 }
4622 if (R.mayWriteToMemory()) {
4624 if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
4625 return;
4626 Stores.push_back(*Loc);
4627 }
4628 }
4629 }
4630
4631 VPBasicBlock *Preheader = Plan.getVectorPreheader();
4632 for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
4633 // Hoist the load to the preheader if it doesn't alias with any stores
4634 // according to the noalias metadata. Other loads should have been hoisted
4635 // by other passes
4636 const AAMDNodes &LoadAA = LoadLoc.AATags;
4637 if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
4639 LoadAA.Scope, StoreLoc.AATags.NoAlias);
4640 })) {
4641 LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
4642 }
4643 }
4644}
4645
4646// Collect common metadata from a group of replicate recipes by intersecting
4647// metadata from all recipes in the group.
4649 VPIRMetadata CommonMetadata = *Recipes.front();
4650 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4651 CommonMetadata.intersect(*Recipe);
4652 return CommonMetadata;
4653}
4654
4655template <unsigned Opcode>
4659 const Loop *L) {
4660 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4661 "Only Load and Store opcodes supported");
4662 constexpr bool IsLoad = (Opcode == Instruction::Load);
4663 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4664 VPDominatorTree VPDT(Plan);
4665 VPTypeAnalysis TypeInfo(Plan);
4666
4667 // Group predicated operations by their address SCEV.
4669 for (VPBlockBase *Block : vp_depth_first_shallow(LoopRegion->getEntry())) {
4670 auto *VPBB = cast<VPBasicBlock>(Block);
4671 for (VPRecipeBase &R : *VPBB) {
4672 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
4673 if (!RepR || RepR->getOpcode() != Opcode || !RepR->isPredicated())
4674 continue;
4675
4676 // For loads, operand 0 is address; for stores, operand 1 is address.
4677 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
4678 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
4679 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
4680 RecipesByAddress[AddrSCEV].push_back(RepR);
4681 }
4682 }
4683
4684 // For each address, collect operations with the same or complementary masks.
4686 auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4687 return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
4688 };
4689 for (auto &[Addr, Recipes] : RecipesByAddress) {
4690 if (Recipes.size() < 2)
4691 continue;
4692
4693 // Collect groups with the same or complementary masks.
4694 for (VPReplicateRecipe *&RecipeI : Recipes) {
4695 if (!RecipeI)
4696 continue;
4697
4698 VPValue *MaskI = RecipeI->getMask();
4699 Type *TypeI = GetLoadStoreValueType(RecipeI);
4701 Group.push_back(RecipeI);
4702 RecipeI = nullptr;
4703
4704 // Find all operations with the same or complementary masks.
4705 bool HasComplementaryMask = false;
4706 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4707 if (!RecipeJ)
4708 continue;
4709
4710 VPValue *MaskJ = RecipeJ->getMask();
4711 Type *TypeJ = GetLoadStoreValueType(RecipeJ);
4712 if (TypeI == TypeJ) {
4713 // Check if any operation in the group has a complementary mask with
4714 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4715 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4716 match(MaskJ, m_Not(m_Specific(MaskI)));
4717 Group.push_back(RecipeJ);
4718 RecipeJ = nullptr;
4719 }
4720 }
4721
4722 if (HasComplementaryMask) {
4723 assert(Group.size() >= 2 && "must have at least 2 entries");
4724 // Sort replicates by dominance order, with earliest (most dominating)
4725 // first.
4726 sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4727 return VPDT.properlyDominates(A, B);
4728 });
4729 AllGroups.push_back(std::move(Group));
4730 }
4731 }
4732 }
4733
4734 return AllGroups;
4735}
4736
4737// Find the recipe with minimum alignment in the group.
4738template <typename InstType>
4739static VPReplicateRecipe *
4741 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4742 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4743 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4744 });
4745}
4746
4749 const Loop *L) {
4750 auto Groups =
4752 if (Groups.empty())
4753 return;
4754
4755 // Process each group of loads.
4756 for (auto &Group : Groups) {
4757 // Try to use the earliest (most dominating) load to replace all others.
4758 VPReplicateRecipe *EarliestLoad = Group[0];
4759 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4760 VPBasicBlock *LastBB = Group.back()->getParent();
4761
4762 // Check that the load doesn't alias with stores between first and last.
4763 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4764 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
4765 continue;
4766
4767 // Collect common metadata from all loads in the group.
4768 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4769
4770 // Find the load with minimum alignment to use.
4771 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4772
4773 // Create an unpredicated version of the earliest load with common
4774 // metadata.
4775 auto *UnpredicatedLoad = new VPReplicateRecipe(
4776 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4777 /*IsSingleScalar=*/false, /*Mask=*/nullptr, *EarliestLoad,
4778 CommonMetadata);
4779
4780 UnpredicatedLoad->insertBefore(EarliestLoad);
4781
4782 // Replace all loads in the group with the unpredicated load.
4783 for (VPReplicateRecipe *Load : Group) {
4784 Load->replaceAllUsesWith(UnpredicatedLoad);
4785 Load->eraseFromParent();
4786 }
4787 }
4788}
4789
4790static bool
4792 PredicatedScalarEvolution &PSE, const Loop &L,
4793 VPTypeAnalysis &TypeInfo) {
4794 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
4795 if (!StoreLoc || !StoreLoc->AATags.Scope)
4796 return false;
4797
4798 // When sinking a group of stores, all members of the group alias each other.
4799 // Skip them during the alias checks.
4800 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4801 StoresToSink.end());
4802
4803 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4804 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4805 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
4806 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
4807}
4808
4811 const Loop *L) {
4812 auto Groups =
4814 if (Groups.empty())
4815 return;
4816
4817 VPTypeAnalysis TypeInfo(Plan);
4818
4819 for (auto &Group : Groups) {
4820 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L, TypeInfo))
4821 continue;
4822
4823 // Use the last (most dominated) store's location for the unconditional
4824 // store.
4825 VPReplicateRecipe *LastStore = Group.back();
4826 VPBasicBlock *InsertBB = LastStore->getParent();
4827
4828 // Collect common alias metadata from all stores in the group.
4829 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4830
4831 // Build select chain for stored values.
4832 VPValue *SelectedValue = Group[0]->getOperand(0);
4833 VPBuilder Builder(InsertBB, LastStore->getIterator());
4834
4835 for (unsigned I = 1; I < Group.size(); ++I) {
4836 VPValue *Mask = Group[I]->getMask();
4837 VPValue *Value = Group[I]->getOperand(0);
4838 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4839 Group[I]->getDebugLoc());
4840 }
4841
4842 // Find the store with minimum alignment to use.
4843 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4844
4845 // Create unconditional store with selected value and common metadata.
4846 auto *UnpredicatedStore =
4847 new VPReplicateRecipe(StoreWithMinAlign->getUnderlyingInstr(),
4848 {SelectedValue, LastStore->getOperand(1)},
4849 /*IsSingleScalar=*/false,
4850 /*Mask=*/nullptr, *LastStore, CommonMetadata);
4851 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
4852
4853 // Remove all predicated stores from the group.
4854 for (VPReplicateRecipe *Store : Group)
4855 Store->eraseFromParent();
4856 }
4857}
4858
4860 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4862 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4863 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4864
4865 VPValue *TC = Plan.getTripCount();
4866 // Skip cases for which the trip count may be non-trivial to materialize.
4867 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
4868 // tail is required.
4869 if (!Plan.hasScalarTail() ||
4871 Plan.getScalarPreheader() ||
4872 !isa<VPIRValue>(TC))
4873 return;
4874
4875 // Materialize vector trip counts for constants early if it can simply
4876 // be computed as (Original TC / VF * UF) * VF * UF.
4877 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4878 // tail-folded loops.
4879 ScalarEvolution &SE = *PSE.getSE();
4880 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
4881 if (!isa<SCEVConstant>(TCScev))
4882 return;
4883 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
4884 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
4885 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
4886 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4887}
4888
4890 VPBasicBlock *VectorPH) {
4892 if (BTC->getNumUsers() == 0)
4893 return;
4894
4895 VPBuilder Builder(VectorPH, VectorPH->begin());
4896 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4897 auto *TCMO =
4898 Builder.createSub(Plan.getTripCount(), Plan.getConstantInt(TCTy, 1),
4899 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
4900 BTC->replaceAllUsesWith(TCMO);
4901}
4902
4904 if (Plan.hasScalarVFOnly())
4905 return;
4906
4907 VPTypeAnalysis TypeInfo(Plan);
4908 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4909 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4911 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4912 vp_depth_first_shallow(LoopRegion->getEntry()));
4913 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
4914 // VPInstructions, excluding ones in replicate regions. Those are not
4915 // materialized explicitly yet. Those vector users are still handled in
4916 // VPReplicateRegion::execute(), via shouldPack().
4917 // TODO: materialize build vectors for replicating recipes in replicating
4918 // regions.
4919 for (VPBasicBlock *VPBB :
4920 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
4921 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4923 continue;
4924 auto *DefR = cast<VPRecipeWithIRFlags>(&R);
4925 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
4926 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4927 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
4928 };
4929 if ((isa<VPReplicateRecipe>(DefR) &&
4930 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
4931 (isa<VPInstruction>(DefR) &&
4933 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
4934 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
4935 continue;
4936
4937 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
4938 unsigned Opcode = ScalarTy->isStructTy()
4941 auto *BuildVector = new VPInstruction(Opcode, {DefR});
4942 BuildVector->insertAfter(DefR);
4943
4944 DefR->replaceUsesWithIf(
4945 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
4946 VPUser &U, unsigned) {
4947 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
4948 });
4949 }
4950 }
4951
4952 // Create explicit VPInstructions to convert vectors to scalars. The current
4953 // implementation is conservative - it may miss some cases that may or may not
4954 // be vector values. TODO: introduce Unpacks speculatively - remove them later
4955 // if they are known to operate on scalar values.
4956 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
4957 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4960 continue;
4961 for (VPValue *Def : R.definedValues()) {
4962 // Skip recipes that are single-scalar or only have their first lane
4963 // used.
4964 // TODO: The Defs skipped here may or may not be vector values.
4965 // Introduce Unpacks, and remove them later, if they are guaranteed to
4966 // produce scalar values.
4968 continue;
4969
4970 // At the moment, we create unpacks only for scalar users outside
4971 // replicate regions. Recipes inside replicate regions still extract the
4972 // required lanes implicitly.
4973 // TODO: Remove once replicate regions are unrolled completely.
4974 auto IsCandidateUnpackUser = [Def](VPUser *U) {
4975 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4976 return U->usesScalars(Def) &&
4977 (!ParentRegion || !ParentRegion->isReplicator());
4978 };
4979 if (none_of(Def->users(), IsCandidateUnpackUser))
4980 continue;
4981
4982 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
4983 if (R.isPhi())
4984 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
4985 else
4986 Unpack->insertAfter(&R);
4987 Def->replaceUsesWithIf(Unpack,
4988 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
4989 return IsCandidateUnpackUser(&U);
4990 });
4991 }
4992 }
4993 }
4994}
4995
4997 VPBasicBlock *VectorPHVPBB,
4998 bool TailByMasking,
4999 bool RequiresScalarEpilogue) {
5000 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
5001 // There's nothing to do if there are no users of the vector trip count or its
5002 // IR value has already been set.
5003 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
5004 return;
5005
5006 VPValue *TC = Plan.getTripCount();
5007 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
5008 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
5009 VPValue *Step = &Plan.getVFxUF();
5010
5011 // If the tail is to be folded by masking, round the number of iterations N
5012 // up to a multiple of Step instead of rounding down. This is done by first
5013 // adding Step-1 and then rounding down. Note that it's ok if this addition
5014 // overflows: the vector induction variable will eventually wrap to zero given
5015 // that it starts at zero and its Step is a power of two; the loop will then
5016 // exit, with the last early-exit vector comparison also producing all-true.
5017 if (TailByMasking) {
5018 TC = Builder.createAdd(
5019 TC, Builder.createSub(Step, Plan.getConstantInt(TCTy, 1)),
5020 DebugLoc::getCompilerGenerated(), "n.rnd.up");
5021 }
5022
5023 // Now we need to generate the expression for the part of the loop that the
5024 // vectorized body will execute. This is equal to N - (N % Step) if scalar
5025 // iterations are not required for correctness, or N - Step, otherwise. Step
5026 // is equal to the vectorization factor (number of SIMD elements) times the
5027 // unroll factor (number of SIMD instructions).
5028 VPValue *R =
5029 Builder.createNaryOp(Instruction::URem, {TC, Step},
5030 DebugLoc::getCompilerGenerated(), "n.mod.vf");
5031
5032 // There are cases where we *must* run at least one iteration in the remainder
5033 // loop. See the cost model for when this can happen. If the step evenly
5034 // divides the trip count, we set the remainder to be equal to the step. If
5035 // the step does not evenly divide the trip count, no adjustment is necessary
5036 // since there will already be scalar iterations. Note that the minimum
5037 // iterations check ensures that N >= Step.
5038 if (RequiresScalarEpilogue) {
5039 assert(!TailByMasking &&
5040 "requiring scalar epilogue is not supported with fail folding");
5041 VPValue *IsZero =
5042 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0));
5043 R = Builder.createSelect(IsZero, Step, R);
5044 }
5045
5046 VPValue *Res =
5047 Builder.createSub(TC, R, DebugLoc::getCompilerGenerated(), "n.vec");
5048 VectorTC.replaceAllUsesWith(Res);
5049}
5050
5052 ElementCount VFEC) {
5053 VPBuilder Builder(VectorPH, VectorPH->begin());
5054 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
5055 VPValue &VF = Plan.getVF();
5056 VPValue &VFxUF = Plan.getVFxUF();
5057 // Note that after the transform, no further uses of Plan.getVF and
5058 // Plan.getVFxUF should be added.
5059 // TODO: Add assertions for this.
5060
5061 // If there are no users of the runtime VF, compute VFxUF by constant folding
5062 // the multiplication of VF and UF.
5063 if (VF.getNumUsers() == 0) {
5064 VPValue *RuntimeVFxUF =
5065 Builder.createElementCount(TCTy, VFEC * Plan.getConcreteUF());
5066 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
5067 return;
5068 }
5069
5070 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
5071 // vscale) * UF.
5072 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
5074 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
5076 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
5077 }
5078 VF.replaceAllUsesWith(RuntimeVF);
5079
5080 VPValue *MulByUF = Builder.createOverflowingOp(
5081 Instruction::Mul,
5082 {RuntimeVF, Plan.getConstantInt(TCTy, Plan.getConcreteUF())},
5083 {true, false});
5084 VFxUF.replaceAllUsesWith(MulByUF);
5085}
5086
5089 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
5090
5091 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
5092 BasicBlock *EntryBB = Entry->getIRBasicBlock();
5093 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
5094 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5096 continue;
5097 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5098 if (!ExpSCEV)
5099 break;
5100 const SCEV *Expr = ExpSCEV->getSCEV();
5101 Value *Res =
5102 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
5103 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
5104 VPValue *Exp = Plan.getOrAddLiveIn(Res);
5105 ExpSCEV->replaceAllUsesWith(Exp);
5106 if (Plan.getTripCount() == ExpSCEV)
5107 Plan.resetTripCount(Exp);
5108 ExpSCEV->eraseFromParent();
5109 }
5111 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
5112 "before any VPIRInstructions");
5113 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
5114 // to the VPIRBasicBlock.
5115 auto EI = Entry->begin();
5116 for (Instruction &I : drop_end(*EntryBB)) {
5117 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
5118 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
5119 EI++;
5120 continue;
5121 }
5123 }
5124
5125 return ExpandedSCEVs;
5126}
5127
5128/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
5129/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
5130/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
5131/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
5132/// an index-independent load if it feeds all wide ops at all indices (\p OpV
5133/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
5134/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
5135/// is defined at \p Idx of a load interleave group.
5136static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5137 VPValue *OpV, unsigned Idx) {
5138 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
5139 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5140 if (!Member0OpR)
5141 return Member0Op == OpV;
5142 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
5143 return !W->getMask() && W->isConsecutive() && Member0Op == OpV;
5144 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
5145 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
5146 return false;
5147}
5148
5151 auto *WideMember0 = dyn_cast<VPSingleDefRecipe>(Ops[0]);
5152 if (!WideMember0)
5153 return false;
5154 for (VPValue *V : Ops) {
5156 return false;
5157 auto *R = cast<VPSingleDefRecipe>(V);
5158 if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
5159 return false;
5160 }
5161
5162 for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
5164 for (VPValue *Op : Ops)
5165 OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));
5166
5167 if (canNarrowOps(OpsI))
5168 continue;
5169
5170 if (any_of(enumerate(OpsI), [WideMember0, Idx](const auto &P) {
5171 const auto &[OpIdx, OpV] = P;
5172 return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx);
5173 }))
5174 return false;
5175 }
5176
5177 return true;
5178}
5179
5180/// Returns VF from \p VFs if \p IR is a full interleave group with factor and
5181/// number of members both equal to VF. The interleave group must also access
5182/// the full vector width.
5183static std::optional<ElementCount> isConsecutiveInterleaveGroup(
5185 VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
5186 if (!InterleaveR || InterleaveR->getMask())
5187 return std::nullopt;
5188
5189 Type *GroupElementTy = nullptr;
5190 if (InterleaveR->getStoredValues().empty()) {
5191 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
5192 if (!all_of(InterleaveR->definedValues(),
5193 [&TypeInfo, GroupElementTy](VPValue *Op) {
5194 return TypeInfo.inferScalarType(Op) == GroupElementTy;
5195 }))
5196 return std::nullopt;
5197 } else {
5198 GroupElementTy =
5199 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
5200 if (!all_of(InterleaveR->getStoredValues(),
5201 [&TypeInfo, GroupElementTy](VPValue *Op) {
5202 return TypeInfo.inferScalarType(Op) == GroupElementTy;
5203 }))
5204 return std::nullopt;
5205 }
5206
5207 auto IG = InterleaveR->getInterleaveGroup();
5208 if (IG->getFactor() != IG->getNumMembers())
5209 return std::nullopt;
5210
5211 auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
5212 TypeSize Size = TTI.getRegisterBitWidth(
5215 assert(Size.isScalable() == VF.isScalable() &&
5216 "if Size is scalable, VF must be scalable and vice versa");
5217 return Size.getKnownMinValue();
5218 };
5219
5220 for (ElementCount VF : VFs) {
5221 unsigned MinVal = VF.getKnownMinValue();
5222 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
5223 if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
5224 return {VF};
5225 }
5226 return std::nullopt;
5227}
5228
5229/// Returns true if \p VPValue is a narrow VPValue.
5230static bool isAlreadyNarrow(VPValue *VPV) {
5231 if (isa<VPIRValue>(VPV))
5232 return true;
5233 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
5234 return RepR && RepR->isSingleScalar();
5235}
5236
5237// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
5238// a narrow variant.
5239static VPValue *
5241 auto *R = V->getDefiningRecipe();
5242 if (!R || NarrowedOps.contains(V))
5243 return V;
5244
5245 if (isAlreadyNarrow(V))
5246 return V;
5247
5249 auto *WideMember0 = cast<VPSingleDefRecipe>(R);
5250 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
5251 WideMember0->setOperand(
5252 Idx,
5253 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
5254 return V;
5255 }
5256
5257 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
5258 // Narrow interleave group to wide load, as transformed VPlan will only
5259 // process one original iteration.
5260 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
5261 auto *L = new VPWidenLoadRecipe(
5262 *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
5263 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
5264 L->insertBefore(LoadGroup);
5265 NarrowedOps.insert(L);
5266 return L;
5267 }
5268
5269 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
5270 assert(RepR->isSingleScalar() &&
5271 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
5272 "must be a single scalar load");
5273 NarrowedOps.insert(RepR);
5274 return RepR;
5275 }
5276
5277 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
5278 VPValue *PtrOp = WideLoad->getAddr();
5279 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
5280 PtrOp = VecPtr->getOperand(0);
5281 // Narrow wide load to uniform scalar load, as transformed VPlan will only
5282 // process one original iteration.
5283 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
5284 /*IsUniform*/ true,
5285 /*Mask*/ nullptr, {}, *WideLoad);
5286 N->insertBefore(WideLoad);
5287 NarrowedOps.insert(N);
5288 return N;
5289}
5290
5291std::unique_ptr<VPlan>
5293 const TargetTransformInfo &TTI) {
5294 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5295
5296 if (!VectorLoop)
5297 return nullptr;
5298
5299 // Only handle single-block loops for now.
5300 if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
5301 return nullptr;
5302
5303 // Skip plans when we may not be able to properly narrow.
5304 VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
5305 if (!match(&Exiting->back(), m_BranchOnCount()))
5306 return nullptr;
5307
5308 assert(match(&Exiting->back(),
5310 m_Specific(&Plan.getVectorTripCount()))) &&
5311 "unexpected branch-on-count");
5312
5313 VPTypeAnalysis TypeInfo(Plan);
5315 std::optional<ElementCount> VFToOptimize;
5316 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
5318 continue;
5319
5322 continue;
5323
5324 // Bail out on recipes not supported at the moment:
5325 // * phi recipes other than the canonical induction
5326 // * recipes writing to memory except interleave groups
5327 // Only support plans with a canonical induction phi.
5328 if (R.isPhi())
5329 return nullptr;
5330
5331 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
5332 if (R.mayWriteToMemory() && !InterleaveR)
5333 return nullptr;
5334
5335 // All other ops are allowed, but we reject uses that cannot be converted
5336 // when checking all allowed consumers (store interleave groups) below.
5337 if (!InterleaveR)
5338 continue;
5339
5340 // Try to find a single VF, where all interleave groups are consecutive and
5341 // saturate the full vector width. If we already have a candidate VF, check
5342 // if it is applicable for the current InterleaveR, otherwise look for a
5343 // suitable VF across the Plan's VFs.
5345 VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
5346 : to_vector(Plan.vectorFactors());
5347 std::optional<ElementCount> NarrowedVF =
5348 isConsecutiveInterleaveGroup(InterleaveR, VFs, TypeInfo, TTI);
5349 if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
5350 return nullptr;
5351 VFToOptimize = NarrowedVF;
5352
5353 // Skip read interleave groups.
5354 if (InterleaveR->getStoredValues().empty())
5355 continue;
5356
5357 // Narrow interleave groups, if all operands are already matching narrow
5358 // ops.
5359 auto *Member0 = InterleaveR->getStoredValues()[0];
5360 if (isAlreadyNarrow(Member0) &&
5361 all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
5362 StoreGroups.push_back(InterleaveR);
5363 continue;
5364 }
5365
5366 // For now, we only support full interleave groups storing load interleave
5367 // groups.
5368 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
5369 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
5370 if (!DefR)
5371 return false;
5372 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
5373 return IR && IR->getInterleaveGroup()->isFull() &&
5374 IR->getVPValue(Op.index()) == Op.value();
5375 })) {
5376 StoreGroups.push_back(InterleaveR);
5377 continue;
5378 }
5379
5380 // Check if all values feeding InterleaveR are matching wide recipes, which
5381 // operands that can be narrowed.
5382 if (!canNarrowOps(InterleaveR->getStoredValues()))
5383 return nullptr;
5384 StoreGroups.push_back(InterleaveR);
5385 }
5386
5387 if (StoreGroups.empty())
5388 return nullptr;
5389
5390 // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
5391 // original Plan into 2: a) a new clone which contains all VFs of Plan, except
5392 // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
5393 // TODO: Handle cases where only some interleave groups can be narrowed.
5394 std::unique_ptr<VPlan> NewPlan;
5395 if (size(Plan.vectorFactors()) != 1) {
5396 NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
5397 Plan.setVF(*VFToOptimize);
5398 NewPlan->removeVF(*VFToOptimize);
5399 }
5400
5401 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
5402 SmallPtrSet<VPValue *, 4> NarrowedOps;
5403 // Narrow operation tree rooted at store groups.
5404 for (auto *StoreGroup : StoreGroups) {
5405 VPValue *Res =
5406 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
5407 auto *SI =
5408 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
5409 auto *S = new VPWidenStoreRecipe(
5410 *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
5411 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
5412 S->insertBefore(StoreGroup);
5413 StoreGroup->eraseFromParent();
5414 }
5415
5416 // Adjust induction to reflect that the transformed plan only processes one
5417 // original iteration.
5418 auto *CanIV = VectorLoop->getCanonicalIV();
5419 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
5420 VPBuilder PHBuilder(Plan.getVectorPreheader());
5421
5422 VPValue *UF = &Plan.getUF();
5423 if (VFToOptimize->isScalable()) {
5424 VPValue *VScale = PHBuilder.createElementCount(
5426 VPValue *VScaleUF = PHBuilder.createOverflowingOp(
5427 Instruction::Mul, {VScale, UF}, {true, false});
5428 Inc->setOperand(1, VScaleUF);
5429 Plan.getVF().replaceAllUsesWith(VScale);
5430 } else {
5431 Inc->setOperand(1, UF);
5433 Plan.getConstantInt(CanIV->getScalarType(), 1));
5434 }
5435 removeDeadRecipes(Plan);
5436 assert(none_of(*VectorLoop->getEntryBasicBlock(),
5438 "All VPVectorPointerRecipes should have been removed");
5439 return NewPlan;
5440}
5441
5442/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5443/// BranchOnCond recipe.
5445 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5446 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5447 auto *MiddleTerm =
5449 // Only add branch metadata if there is a (conditional) terminator.
5450 if (!MiddleTerm)
5451 return;
5452
5453 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5454 "must have a BranchOnCond");
5455 // Assume that `TripCount % VectorStep ` is equally distributed.
5456 unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
5457 if (VF.isScalable() && VScaleForTuning.has_value())
5458 VectorStep *= *VScaleForTuning;
5459 assert(VectorStep > 0 && "trip count should not be zero");
5460 MDBuilder MDB(Plan.getContext());
5461 MDNode *BranchWeights =
5462 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
5463 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
5464}
5465
5466/// Compute and return the end value for \p WideIV, unless it is truncated. If
5467/// the induction recipe is not canonical, creates a VPDerivedIVRecipe to
5468/// compute the end value of the induction.
5470 VPBuilder &VectorPHBuilder,
5471 VPTypeAnalysis &TypeInfo,
5472 VPValue *VectorTC) {
5473 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
5474 // Truncated wide inductions resume from the last lane of their vector value
5475 // in the last vector iteration which is handled elsewhere.
5476 if (WideIntOrFp && WideIntOrFp->getTruncInst())
5477 return nullptr;
5478
5479 VPIRValue *Start = WideIV->getStartValue();
5480 VPValue *Step = WideIV->getStepValue();
5482 VPValue *EndValue = VectorTC;
5483 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
5484 EndValue = VectorPHBuilder.createDerivedIV(
5485 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
5486 Start, VectorTC, Step);
5487 }
5488
5489 // EndValue is derived from the vector trip count (which has the same type as
5490 // the widest induction) and thus may be wider than the induction here.
5491 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
5492 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
5493 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
5494 ScalarTypeOfWideIV,
5495 WideIV->getDebugLoc());
5496 }
5497
5498 return EndValue;
5499}
5500
5502 VPlan &Plan, DenseMap<VPValue *, VPValue *> &IVEndValues, bool FoldTail) {
5503 VPTypeAnalysis TypeInfo(Plan);
5504 auto *ScalarPH = Plan.getScalarPreheader();
5505 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
5506 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5507 VPBuilder VectorPHBuilder(
5508 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
5509 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5510 for (VPRecipeBase &PhiR : Plan.getScalarPreheader()->phis()) {
5511 auto *ResumePhiR = cast<VPPhi>(&PhiR);
5512
5513 // TODO: Extract final value from induction recipe initially, optimize to
5514 // pre-computed end value together in optimizeInductionExitUsers.
5515 auto *VectorPhiR = cast<VPHeaderPHIRecipe>(ResumePhiR->getOperand(0));
5516 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
5517 // TODO: Check if tail is folded directly in VPlan.
5518 VPValue *TC = !FoldTail
5519 ? static_cast<VPValue *>(&Plan.getVectorTripCount())
5520 : Plan.getTripCount();
5522 WideIVR, VectorPHBuilder, TypeInfo, TC)) {
5523 IVEndValues[WideIVR] = EndValue;
5524 ResumePhiR->setOperand(0, EndValue);
5525 ResumePhiR->setName("bc.resume.val");
5526 continue;
5527 }
5528 // TODO: Also handle truncated inductions here. Computing end-values
5529 // separately should be done as VPlan-to-VPlan optimization, after
5530 // legalizing all resume values to use the last lane from the loop.
5531 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
5532 "should only skip truncated wide inductions");
5533 continue;
5534 }
5535
5536 // The backedge value provides the value to resume coming out of a loop,
5537 // which for FORs is a vector whose last element needs to be extracted. The
5538 // start value provides the value if the loop is bypassed.
5539 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
5540 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
5541 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5542 "Cannot handle loops with uncountable early exits");
5543 if (IsFOR) {
5544 auto *ExtractPart = MiddleBuilder.createNaryOp(
5545 VPInstruction::ExtractLastPart, ResumeFromVectorLoop);
5546 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
5548 "vector.recur.extract");
5549 }
5550 ResumePhiR->setName(IsFOR ? "scalar.recur.init" : "bc.merge.rdx");
5551 ResumePhiR->setOperand(0, ResumeFromVectorLoop);
5552 }
5553}
5554
5556 VFRange &Range) {
5557 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5558 auto *ScalarPHVPBB = Plan.getScalarPreheader();
5559 auto *MiddleVPBB = Plan.getMiddleBlock();
5560 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
5561 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5562
5563 auto IsScalableOne = [](ElementCount VF) -> bool {
5564 return VF == ElementCount::getScalable(1);
5565 };
5566
5567 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5568 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5569 if (!FOR)
5570 continue;
5571
5572 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5573 "Cannot handle loops with uncountable early exits");
5574
5575 // This is the second phase of vectorizing first-order recurrences, creating
5576 // extract for users outside the loop. An overview of the transformation is
5577 // described below. Suppose we have the following loop with some use after
5578 // the loop of the last a[i-1],
5579 //
5580 // for (int i = 0; i < n; ++i) {
5581 // t = a[i - 1];
5582 // b[i] = a[i] - t;
5583 // }
5584 // use t;
5585 //
5586 // There is a first-order recurrence on "a". For this loop, the shorthand
5587 // scalar IR looks like:
5588 //
5589 // scalar.ph:
5590 // s.init = a[-1]
5591 // br scalar.body
5592 //
5593 // scalar.body:
5594 // i = phi [0, scalar.ph], [i+1, scalar.body]
5595 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5596 // s2 = a[i]
5597 // b[i] = s2 - s1
5598 // br cond, scalar.body, exit.block
5599 //
5600 // exit.block:
5601 // use = lcssa.phi [s1, scalar.body]
5602 //
5603 // In this example, s1 is a recurrence because it's value depends on the
5604 // previous iteration. In the first phase of vectorization, we created a
5605 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5606 // for users in the scalar preheader and exit block.
5607 //
5608 // vector.ph:
5609 // v_init = vector(..., ..., ..., a[-1])
5610 // br vector.body
5611 //
5612 // vector.body
5613 // i = phi [0, vector.ph], [i+4, vector.body]
5614 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5615 // v2 = a[i, i+1, i+2, i+3]
5616 // b[i] = v2 - v1
5617 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
5618 // b[i, i+1, i+2, i+3] = v2 - v1
5619 // br cond, vector.body, middle.block
5620 //
5621 // middle.block:
5622 // vector.recur.extract.for.phi = v2(2)
5623 // vector.recur.extract = v2(3)
5624 // br cond, scalar.ph, exit.block
5625 //
5626 // scalar.ph:
5627 // scalar.recur.init = phi [vector.recur.extract, middle.block],
5628 // [s.init, otherwise]
5629 // br scalar.body
5630 //
5631 // scalar.body:
5632 // i = phi [0, scalar.ph], [i+1, scalar.body]
5633 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5634 // s2 = a[i]
5635 // b[i] = s2 - s1
5636 // br cond, scalar.body, exit.block
5637 //
5638 // exit.block:
5639 // lo = lcssa.phi [s1, scalar.body],
5640 // [vector.recur.extract.for.phi, middle.block]
5641 //
5642 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
5643 // Extract the penultimate value of the recurrence and use it as operand for
5644 // the VPIRInstruction modeling the phi.
5646 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
5648 continue;
5649
5650 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5651 // penultimate value of the recurrence. Instead we rely on the existing
5652 // extract of the last element from the result of
5653 // VPInstruction::FirstOrderRecurrenceSplice.
5654 // TODO: Consider vscale_range info and UF.
5656 Range))
5657 return;
5658 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5659 VPInstruction::ExtractPenultimateElement, FOR->getBackedgeValue(), {},
5660 "vector.recur.extract.for.phi");
5661 cast<VPInstruction>(&R)->replaceAllUsesWith(PenultimateElement);
5662 }
5663 }
5664}
5665
5668 Loop &L) {
5669 ScalarEvolution &SE = *PSE.getSE();
5670 VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();
5671
5672 // Helper lambda to check if the IV range excludes the sentinel value.
5673 auto CheckSentinel = [&SE](const SCEV *IVSCEV, bool UseMax,
5674 bool Signed) -> std::optional<APInt> {
5675 unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
5676 APInt Sentinel =
5677 UseMax
5680
5681 ConstantRange IVRange =
5682 Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
5683 if (!IVRange.contains(Sentinel))
5684 return Sentinel;
5685 return std::nullopt;
5686 };
5687
5688 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
5689 for (VPRecipeBase &Phi :
5690 make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
5691 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
5693 PhiR->getRecurrenceKind()))
5694 continue;
5695
5696 // If there's a header mask, the backedge select will not be the find-last
5697 // select.
5698 VPValue *BackedgeVal = PhiR->getBackedgeValue();
5699 VPValue *CondSelect = BackedgeVal;
5700 if (HeaderMask &&
5701 !match(BackedgeVal, m_Select(m_Specific(HeaderMask),
5702 m_VPValue(CondSelect), m_Specific(PhiR))))
5703 llvm_unreachable("expected header mask select");
5704
5705 // Get the IV from the conditional select of the reduction phi.
5706 // The conditional select should be a select between the phi and the IV.
5707 VPValue *Cond, *TrueVal, *FalseVal;
5708 if (!match(CondSelect, m_Select(m_VPValue(Cond), m_VPValue(TrueVal),
5709 m_VPValue(FalseVal))))
5710 continue;
5711
5712 // The non-phi operand of the select is the IV.
5713 assert(is_contained(CondSelect->getDefiningRecipe()->operands(), PhiR));
5714 VPValue *IV = TrueVal == PhiR ? FalseVal : TrueVal;
5715
5716 const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(IV, PSE, &L);
5717 const SCEV *Step;
5718 if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step))))
5719 continue;
5720
5721 // Determine direction from SCEV step.
5722 if (!SE.isKnownNonZero(Step))
5723 continue;
5724
5725 // Positive step means we need UMax/SMax to find the last IV value, and
5726 // UMin/SMin otherwise.
5727 bool UseMax = SE.isKnownPositive(Step);
5728 bool UseSigned = true;
5729 std::optional<APInt> SentinelVal =
5730 CheckSentinel(IVSCEV, UseMax, /*IsSigned=*/true);
5731 if (!SentinelVal) {
5732 SentinelVal = CheckSentinel(IVSCEV, UseMax, /*IsSigned=*/false);
5733 UseSigned = false;
5734 }
5735
5736 // If no sentinel was found, fall back to a boolean AnyOf reduction to track
5737 // if the condition was ever true. Requires the IV to not wrap, otherwise we
5738 // cannot use min/max.
5739 if (!SentinelVal) {
5740 auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
5741 if (AR->hasNoSignedWrap())
5742 UseSigned = true;
5743 else if (AR->hasNoUnsignedWrap())
5744 UseSigned = false;
5745 else
5746 continue;
5747 }
5748
5750 BackedgeVal,
5752
5753 RecurKind MinMaxKind =
5754 UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
5755 : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
5756 VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
5757 FastMathFlags());
5758 DebugLoc ExitDL = RdxResult->getDebugLoc();
5759 VPBuilder MiddleBuilder(RdxResult);
5760 VPValue *ReducedIV =
5762 RdxResult->getOperand(0), Flags, ExitDL);
5763
5764 VPValue *NewRdxResult;
5765 VPValue *StartVPV = PhiR->getStartValue();
5766 if (SentinelVal) {
5767 // Sentinel-based approach: reduce IVs with min/max, compare against
5768 // sentinel to detect if condition was ever true, select accordingly.
5769 VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
5770 auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
5771 Sentinel, ExitDL);
5772 NewRdxResult =
5773 MiddleBuilder.createSelect(Cmp, ReducedIV, StartVPV, ExitDL);
5774 StartVPV = Sentinel;
5775 } else {
5776 // Introduce a boolean AnyOf reduction to track if the condition was ever
5777 // true in the loop. Use it to select the initial start value, if it was
5778 // never true.
5779 auto *AnyOfPhi = new VPReductionPHIRecipe(
5780 /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
5781 RdxUnordered{1}, {}, /*HasUsesOutsideReductionChain=*/false);
5782 AnyOfPhi->insertAfter(PhiR);
5783
5784 VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
5785 VPValue *AnyOfCond = Cond;
5786 if (TrueVal == PhiR)
5787 AnyOfCond = LoopBuilder.createNot(Cond);
5788 VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, AnyOfCond);
5789 AnyOfPhi->setOperand(1, OrVal);
5790
5791 NewRdxResult =
5793 {StartVPV, ReducedIV, OrVal}, {}, ExitDL);
5794
5795 // Initialize the IV reduction phi with the neutral element, not the
5796 // original start value, to ensure correct min/max reduction results.
5797 StartVPV = Plan.getOrAddLiveIn(
5798 getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
5799 }
5800 RdxResult->replaceAllUsesWith(NewRdxResult);
5801 RdxResult->eraseFromParent();
5802
5803 auto *NewPhiR = new VPReductionPHIRecipe(
5804 cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
5805 *CondSelect, RdxUnordered{1}, {}, PhiR->hasUsesOutsideReductionChain());
5806 NewPhiR->insertBefore(PhiR);
5807 PhiR->replaceAllUsesWith(NewPhiR);
5808 PhiR->eraseFromParent();
5809 }
5810}
5811
5812namespace {
5813
5814/// A chain of recipes that form a partial reduction. Matches either
5815/// reduction_bin_op (extend (A), accumulator), or
5816/// reduction_bin_op (bin_op (extend (A), (extend (B))), accumulator).
5817struct VPPartialReductionChain {
5818 /// The top-level binary operation that forms the reduction to a scalar
5819 /// after the loop body.
5820 VPWidenRecipe *ReductionBinOp;
5821 /// The extension of each of the inner binary operation's operands.
5822 VPWidenCastRecipe *ExtendA;
5823 VPWidenCastRecipe *ExtendB;
5824 /// The user of the extends that is then reduced.
5825 VPWidenRecipe *BinOp;
5826 unsigned ScaleFactor;
5827 /// The recurrence kind for the entire partial reduction chain.
5828 /// This allows distinguishing between Sub and AddWithSub recurrences,
5829 /// when the ReductionBinOp is a Instruction::Sub.
5830 RecurKind RK;
5831};
5832
5833static VPSingleDefRecipe *
5834optimizeExtendsForPartialReduction(VPSingleDefRecipe *BinOp,
5835 VPTypeAnalysis &TypeInfo) {
5836 // reduce.add(mul(ext(A), C))
5837 // -> reduce.add(mul(ext(A), ext(trunc(C))))
5838 const APInt *Const;
5839 if (match(BinOp, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(Const)))) {
5840 auto *ExtA = cast<VPWidenCastRecipe>(BinOp->getOperand(0));
5841 Instruction::CastOps ExtOpc = ExtA->getOpcode();
5842 Type *NarrowTy = TypeInfo.inferScalarType(ExtA->getOperand(0));
5843 if (!BinOp->hasOneUse() ||
5845 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
5846 return BinOp;
5847
5848 VPBuilder Builder(BinOp);
5849 auto *Trunc = Builder.createWidenCast(Instruction::CastOps::Trunc,
5850 BinOp->getOperand(1), NarrowTy);
5851 Type *WideTy = TypeInfo.inferScalarType(ExtA);
5852 BinOp->setOperand(1, Builder.createWidenCast(ExtOpc, Trunc, WideTy));
5853 return BinOp;
5854 }
5855
5856 // reduce.add(ext(mul(ext(A), ext(B))))
5857 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
5859 m_ZExtOrSExt(m_VPValue()))))) {
5860 auto *Ext = cast<VPWidenCastRecipe>(BinOp);
5861 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
5862 auto *MulLHS = cast<VPWidenCastRecipe>(Mul->getOperand(0));
5863 auto *MulRHS = cast<VPWidenCastRecipe>(Mul->getOperand(1));
5864 if (!Mul->hasOneUse() ||
5865 (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
5866 MulLHS->getOpcode() != MulRHS->getOpcode())
5867 return BinOp;
5868 VPBuilder Builder(Mul);
5869 Mul->setOperand(0, Builder.createWidenCast(MulLHS->getOpcode(),
5870 MulLHS->getOperand(0),
5871 Ext->getResultType()));
5872 Mul->setOperand(1, MulLHS == MulRHS
5873 ? Mul->getOperand(0)
5874 : Builder.createWidenCast(MulRHS->getOpcode(),
5875 MulRHS->getOperand(0),
5876 Ext->getResultType()));
5877 return Mul;
5878 }
5879
5880 return BinOp;
5881}
5882
5883// Helper to transform a partial reduction chain into a partial reduction
5884// recipe. Assumes profitability has been checked.
5885static void transformToPartialReduction(const VPPartialReductionChain &Chain,
5886 VPTypeAnalysis &TypeInfo, VPlan &Plan,
5887 VPReductionPHIRecipe *RdxPhi) {
5888 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
5889 assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
5890
5891 VPValue *BinOpVal = WidenRecipe->getOperand(0);
5892 VPValue *Accumulator = WidenRecipe->getOperand(1);
5893
5894 // Swap if needed to ensure Accumulator is the PHI or partial reduction.
5896 isa<VPExpressionRecipe>(BinOpVal))
5897 std::swap(BinOpVal, Accumulator);
5898 auto *BinOp = cast<VPSingleDefRecipe>(BinOpVal->getDefiningRecipe());
5899
5900 // Sub-reductions can be implemented in two ways:
5901 // (1) negate the operand in the vector loop (the default way).
5902 // (2) subtract the reduced value from the init value in the middle block.
5903 // Both ways keep the reduction itself as an 'add' reduction.
5904 //
5905 // The ISD nodes for partial reductions don't support folding the
5906 // sub/negation into its operands because the following is not a valid
5907 // transformation:
5908 // sub(0, mul(ext(a), ext(b)))
5909 // -> mul(ext(a), ext(sub(0, b)))
5910 //
5911 // It's therefore better to choose option (2) such that the partial
5912 // reduction is always positive (starting at '0') and to do a final
5913 // subtract in the middle block.
5914 if (WidenRecipe->getOpcode() == Instruction::Sub &&
5915 Chain.RK != RecurKind::Sub) {
5916 VPBuilder Builder(WidenRecipe);
5917 Type *ElemTy = TypeInfo.inferScalarType(BinOp);
5918 auto *Zero = Plan.getConstantInt(ElemTy, 0);
5919 VPIRFlags Flags = WidenRecipe->getUnderlyingInstr()
5920 ? VPIRFlags(*WidenRecipe->getUnderlyingInstr())
5921 : VPIRFlags();
5922 auto *NegRecipe = new VPWidenRecipe(Instruction::Sub, {Zero, BinOp}, Flags,
5924 Builder.insert(NegRecipe);
5925 BinOp = NegRecipe;
5926 }
5927
5928 // FIXME: Do these transforms before invoking the cost-model.
5929 BinOp = optimizeExtendsForPartialReduction(BinOp, TypeInfo);
5930
5931 // Check if WidenRecipe is the final result of the reduction. If so look
5932 // through selects for predicated reductions.
5933 VPValue *Cond = nullptr;
5935 WidenRecipe,
5936 m_Select(m_VPValue(Cond), m_Specific(WidenRecipe), m_Specific(RdxPhi))));
5937 bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
5938 RdxPhi->getBackedgeValue() == ExitValue;
5939 assert((!ExitValue || IsLastInChain) &&
5940 "if we found ExitValue, it must match RdxPhi's backedge value");
5941
5942 Type *PhiType = TypeInfo.inferScalarType(RdxPhi);
5943 RecurKind RdxKind =
5945 auto *PartialRed = new VPReductionRecipe(
5946 RdxKind,
5947 RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlags()
5948 : FastMathFlags(),
5949 WidenRecipe->getUnderlyingInstr(), Accumulator, BinOp, Cond,
5950 RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
5951 PartialRed->insertBefore(WidenRecipe);
5952
5953 if (Cond)
5954 ExitValue->replaceAllUsesWith(PartialRed);
5955 WidenRecipe->replaceAllUsesWith(PartialRed);
5956
5957 // We only need to update the PHI node once, which is when we find the
5958 // last reduction in the chain.
5959 if (!IsLastInChain)
5960 return;
5961
5962 // Scale the PHI and ReductionStartVector by the VFScaleFactor
5963 assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
5964 RdxPhi->setVFScaleFactor(Chain.ScaleFactor);
5965
5966 auto *StartInst = cast<VPInstruction>(RdxPhi->getStartValue());
5967 assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
5968 auto *NewScaleFactor = Plan.getConstantInt(32, Chain.ScaleFactor);
5969 StartInst->setOperand(2, NewScaleFactor);
5970
5971 // If this is the last value in a sub-reduction chain, then update the PHI
5972 // node to start at `0` and update the reduction-result to subtract from
5973 // the PHI's start value.
5974 if (Chain.RK != RecurKind::Sub)
5975 return;
5976
5977 VPValue *OldStartValue = StartInst->getOperand(0);
5978 StartInst->setOperand(0, StartInst->getOperand(1));
5979
5980 // Replace reduction_result by 'sub (startval, reductionresult)'.
5982 assert(RdxResult && "Could not find reduction result");
5983
5984 VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult);
5985 constexpr unsigned SubOpc = Instruction::BinaryOps::Sub;
5986 VPInstruction *NewResult = Builder.createNaryOp(
5987 SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc),
5988 RdxPhi->getDebugLoc());
5989 RdxResult->replaceUsesWithIf(
5990 NewResult,
5991 [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
5992}
5993
5994/// Check if a partial reduction chain is is supported by the target (i.e. does
5995/// not have an invalid cost) for the given VF range. Clamps the range and
5996/// returns true if profitable for any VF.
5997static bool isValidPartialReduction(const VPPartialReductionChain &Chain,
5998 Type *PhiType, VPCostContext &CostCtx,
5999 VFRange &Range) {
6000 auto GetExtInfo = [&CostCtx](VPWidenCastRecipe *Ext)
6001 -> std::pair<Type *, TargetTransformInfo::PartialReductionExtendKind> {
6002 if (!Ext)
6003 return {nullptr, TargetTransformInfo::PR_None};
6004 Type *ExtOpType = CostCtx.Types.inferScalarType(Ext->getOperand(0));
6006 static_cast<Instruction::CastOps>(Ext->getOpcode()));
6007 return {ExtOpType, ExtKind};
6008 };
6009 auto ExtInfoA = GetExtInfo(Chain.ExtendA);
6010 auto ExtInfoB = GetExtInfo(Chain.ExtendB);
6011 Type *ExtOpTypeA = ExtInfoA.first;
6012 Type *ExtOpTypeB = ExtInfoB.first;
6013 auto ExtKindA = ExtInfoA.second;
6014 auto ExtKindB = ExtInfoB.second;
6015
6016 // If ExtendB is nullptr but there's a separate BinOp, the second operand
6017 // was a constant that can use the same extend kind as the first.
6018 if (!Chain.ExtendB && Chain.BinOp && Chain.BinOp != Chain.ReductionBinOp) {
6019 const APInt *Const = nullptr;
6020 for (VPValue *Op : Chain.BinOp->operands()) {
6021 if (match(Op, m_APInt(Const)))
6022 break;
6023 }
6024 if (!Const || !canConstantBeExtended(Const, ExtOpTypeA, ExtKindA))
6025 return false;
6026 ExtOpTypeB = ExtOpTypeA;
6027 ExtKindB = ExtKindA;
6028 }
6029
6030 std::optional<unsigned> BinOpc =
6031 (Chain.BinOp && Chain.BinOp != Chain.ReductionBinOp)
6032 ? std::make_optional(Chain.BinOp->getOpcode())
6033 : std::nullopt;
6034 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
6036 [&](ElementCount VF) {
6037 return CostCtx.TTI
6039 WidenRecipe->getOpcode(), ExtOpTypeA, ExtOpTypeB, PhiType, VF,
6040 ExtKindA, ExtKindB, BinOpc, CostCtx.CostKind,
6041 PhiType->isFloatingPointTy()
6042 ? std::optional{WidenRecipe->getFastMathFlags()}
6043 : std::nullopt)
6044 .isValid();
6045 },
6046 Range);
6047}
6048
6049/// Examines reduction operations to see if the target can use a cheaper
6050/// operation with a wider per-iteration input VF and narrower PHI VF.
6051/// Recursively calls itself to identify chained scaled reductions.
6052/// Returns true if this invocation added an entry to Chains, otherwise false.
6053static bool
6054getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPValue *PrevValue,
6056 VPCostContext &CostCtx, VFRange &Range) {
6057 auto *UpdateR = dyn_cast<VPWidenRecipe>(PrevValue);
6058 if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
6059 return false;
6060
6061 VPValue *Op = UpdateR->getOperand(0);
6062 VPValue *PhiOp = UpdateR->getOperand(1);
6063 if (Op == RedPhiR)
6064 std::swap(Op, PhiOp);
6065
6066 // If Op is an extend, then it's still a valid partial reduction if the
6067 // extended mul fulfills the other requirements.
6068 // For example, reduce.add(ext(mul(ext(A), ext(B)))) is still a valid partial
6069 // reduction since the inner extends will be widened. We already have oneUse
6070 // checks on the inner extends so widening them is safe.
6071 std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
6074 auto *CastRecipe = dyn_cast<VPWidenCastRecipe>(Op);
6075 if (!CastRecipe)
6076 return false;
6077 auto CastOp = static_cast<Instruction::CastOps>(CastRecipe->getOpcode());
6078 OuterExtKind = TTI::getPartialReductionExtendKind(CastOp);
6079 Op = CastRecipe->getOperand(0);
6080 }
6081
6082 // Try and get a scaled reduction from the first non-phi operand.
6083 // If one is found, we use the discovered reduction instruction in
6084 // place of the accumulator for costing.
6085 if (getScaledReductions(RedPhiR, Op, Chains, CostCtx, Range)) {
6086 Op = UpdateR->getOperand(0);
6087 PhiOp = UpdateR->getOperand(1);
6088 if (Op == Chains.rbegin()->ReductionBinOp)
6089 std::swap(Op, PhiOp);
6090 assert(PhiOp == Chains.rbegin()->ReductionBinOp &&
6091 "PhiOp must be the chain value");
6092 assert(CostCtx.Types.inferScalarType(RedPhiR) ==
6093 CostCtx.Types.inferScalarType(PhiOp) &&
6094 "Unexpected type for chain values");
6095 } else if (RedPhiR != PhiOp) {
6096 // If neither operand of this instruction is the reduction PHI node or a
6097 // link in the reduction chain, then this is just an operand to the chain
6098 // and not a link in the chain itself.
6099 return false;
6100 }
6101
6102 // If the update is a binary op, check both of its operands to see if
6103 // they are extends. Otherwise, see if the update comes directly from an
6104 // extend.
6105 VPWidenCastRecipe *CastRecipes[2] = {nullptr};
6106
6107 // Match extends and populate CastRecipes. Returns false if matching fails.
6108 auto MatchExtends = [OuterExtKind,
6109 &CastRecipes](ArrayRef<VPValue *> Operands) {
6110 assert(Operands.size() <= 2 && "expected at most 2 operands");
6111
6112 for (const auto &[I, OpVal] : enumerate(Operands)) {
6113 // Allow constant as second operand - validation happens in
6114 // isValidPartialReduction.
6115 const APInt *Unused;
6116 if (I > 0 && CastRecipes[0] && match(OpVal, m_APInt(Unused)))
6117 continue;
6118
6119 VPValue *ExtInput;
6120 if (!match(OpVal, m_ZExtOrSExt(m_VPValue(ExtInput))) &&
6121 !match(OpVal, m_FPExt(m_VPValue(ExtInput))))
6122 return false;
6123
6124 CastRecipes[I] = dyn_cast<VPWidenCastRecipe>(OpVal);
6125 if (!CastRecipes[I])
6126 return false;
6127
6128 // The outer extend kind must match the inner extends for folding.
6129 if (OuterExtKind) {
6130 auto CastOp =
6131 static_cast<Instruction::CastOps>(CastRecipes[I]->getOpcode());
6132 if (*OuterExtKind != TTI::getPartialReductionExtendKind(CastOp))
6133 return false;
6134 }
6135 }
6136 return CastRecipes[0] != nullptr;
6137 };
6138
6139 // If Op is a binary operator, check both of its operands to see if they are
6140 // extends. Otherwise, see if the update comes directly from an extend.
6141 auto *BinOp = dyn_cast<VPWidenRecipe>(Op);
6142 if (BinOp && Instruction::isBinaryOp(BinOp->getOpcode())) {
6143 if (!BinOp->hasOneUse())
6144 return false;
6145
6146 // Handle neg(binop(ext, ext)) pattern.
6147 VPValue *OtherOp = nullptr;
6148 if (match(BinOp, m_Sub(m_ZeroInt(), m_VPValue(OtherOp))))
6149 BinOp = dyn_cast<VPWidenRecipe>(OtherOp);
6150
6151 if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()) ||
6152 !MatchExtends(BinOp->operands()))
6153 return false;
6154 } else if (match(UpdateR, m_Add(m_VPValue(), m_VPValue())) ||
6155 match(UpdateR, m_FAdd(m_VPValue(), m_VPValue()))) {
6156 // We already know the operands for Update are Op and PhiOp.
6157 if (!MatchExtends({Op}))
6158 return false;
6159 BinOp = UpdateR;
6160 } else {
6161 return false;
6162 }
6163
6164 Type *PhiType = CostCtx.Types.inferScalarType(RedPhiR);
6165 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6166 Type *ExtOpType =
6167 CostCtx.Types.inferScalarType(CastRecipes[0]->getOperand(0));
6168 TypeSize ASize = ExtOpType->getPrimitiveSizeInBits();
6169 if (!PHISize.hasKnownScalarFactor(ASize))
6170 return false;
6171
6172 RecurKind RK = cast<VPReductionPHIRecipe>(RedPhiR)->getRecurrenceKind();
6173 VPPartialReductionChain Chain(
6174 {UpdateR, CastRecipes[0], CastRecipes[1], BinOp,
6175 static_cast<unsigned>(PHISize.getKnownScalarFactor(ASize)), RK});
6176 if (!isValidPartialReduction(Chain, PhiType, CostCtx, Range))
6177 return false;
6178
6179 Chains.push_back(Chain);
6180 return true;
6181}
6182} // namespace
6183
6185 VPCostContext &CostCtx,
6186 VFRange &Range) {
6187 // Find all possible valid partial reductions, grouping chains by their PHI.
6188 // This grouping allows invalidating the whole chain, if any link is not a
6189 // valid partial reduction.
6191 ChainsByPhi;
6192 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
6193 for (VPRecipeBase &R : HeaderVPBB->phis()) {
6194 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
6195 if (!RedPhiR)
6196 continue;
6197
6198 // Get the backedge value from the reduction PHI and find the
6199 // ComputeReductionResult that uses it (directly or through a select for
6200 // predicated reductions).
6201 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
6202 VPValue *ExitValue = RdxResult->getOperand(0);
6203 match(ExitValue,
6204 m_Select(m_VPValue(), m_VPValue(ExitValue), m_VPValue()));
6205 getScaledReductions(RedPhiR, ExitValue, ChainsByPhi[RedPhiR], CostCtx,
6206 Range);
6207 }
6208 }
6209
6210 if (ChainsByPhi.empty())
6211 return;
6212
6213 // Build set of partial reduction operations for extend user validation and
6214 // a map of reduction bin ops to their scale factors for scale validation.
6215 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
6216 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
6217 for (const auto &[_, Chains] : ChainsByPhi)
6218 for (const VPPartialReductionChain &Chain : Chains) {
6219 PartialReductionOps.insert(Chain.BinOp);
6220 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
6221 }
6222
6223 // A partial reduction is invalid if any of its extends are used by
6224 // something that isn't another partial reduction. This is because the
6225 // extends are intended to be lowered along with the reduction itself.
6226 auto ExtendUsersValid = [&](VPWidenCastRecipe *Ext) {
6227 return !Ext || all_of(Ext->users(), [&](VPUser *U) {
6228 return PartialReductionOps.contains(cast<VPRecipeBase>(U));
6229 });
6230 };
6231
6232 // Validate chains: check that extends are only used by partial reductions,
6233 // and that reduction bin ops are only used by other partial reductions with
6234 // matching scale factors, are outside the loop region or the select
6235 // introduced by tail-folding. Otherwise we would create users of scaled
6236 // reductions where the types of the other operands don't match.
6237 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
6238 for (const VPPartialReductionChain &Chain : Chains) {
6239 if (!ExtendUsersValid(Chain.ExtendA) ||
6240 !ExtendUsersValid(Chain.ExtendB)) {
6241 Chains.clear();
6242 break;
6243 }
6244 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
6245 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
6246 return PhiR == RedPhiR;
6247 auto *R = cast<VPSingleDefRecipe>(U);
6248 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
6250 m_Specific(Chain.ReductionBinOp))) ||
6251 match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
6252 m_Specific(RedPhiR)));
6253 };
6254 if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
6255 Chains.clear();
6256 break;
6257 }
6258
6259 // Check if the compute-reduction-result is used by a sunk store.
6260 // TODO: Also form partial reductions in those cases.
6261 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
6262 if (any_of(RdxResult->users(), [](VPUser *U) {
6263 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
6264 return RepR && isa<StoreInst>(RepR->getUnderlyingInstr());
6265 })) {
6266 Chains.clear();
6267 break;
6268 }
6269 }
6270 }
6271 }
6272
6273 for (auto &[Phi, Chains] : ChainsByPhi)
6274 for (const VPPartialReductionChain &Chain : Chains)
6275 transformToPartialReduction(Chain, CostCtx.Types, Plan, Phi);
6276}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
#define _
iv Induction Variable Users
Definition IVUsers.cpp:48
iv users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:81
static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, ScalarEvolution &SE)
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool sinkScalarOperands(VPlan &Plan)
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R)
Return true if we do not know how to (mechanically) hoist or sink R out of a loop region.
static bool canNarrowOps(ArrayRef< VPValue * > Ops)
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with a EVL-based IV, fixup recipes that use VF to use the EVL instea...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static std::optional< ElementCount > isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ArrayRef< ElementCount > VFs, VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI)
Returns VF from VFs if IR is a full interleave group with factor and number of members both equal to ...
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute and return the end value for WideIV, unless it is truncated.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
From the definition of llvm.experimental.get.vector.length, VPInstruction::ExplicitVectorLength(AVL) ...
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static void recursivelyDeleteDeadRecipes(VPValue *V)
static void reassociateHeaderMask(VPlan &Plan)
Reassociate (headermask && x) && y -> headermask && (x && y) to allow the header mask to be simplifie...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan)
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations in blocks between FirstBB and LastBB...
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static void narrowToSingleScalarRecipes(VPlan &Plan)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1023
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1527
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:207
APInt abs() const
Get the absolute value.
Definition APInt.h:1810
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:210
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition APInt.h:217
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:220
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:996
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
@ NoAlias
The two locations do not alias at all.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
iterator begin() const
Definition ArrayRef.h:130
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This class represents a range of values.
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:215
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1569
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1080
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
bool empty() const
Definition MapVector.h:77
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
LLVM_ABI bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
static LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
op_range operands()
Definition User.h:267
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3815
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4182
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4257
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4209
iterator end()
Definition VPlan.h:4219
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4217
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4270
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:232
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:593
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:565
const VPRecipeBase & front() const
Definition VPlan.h:4229
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:639
const VPRecipeBase & back() const
Definition VPlan.h:4231
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2720
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2756
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2746
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2762
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2742
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:81
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccss as successor of this VPBlockBase.
Definition VPlan.h:300
VPRegionBlock * getParent()
Definition VPlan.h:173
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:202
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:291
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:204
VPlan * getPlan()
Definition VPlan.cpp:177
const std::string & getName() const
Definition VPlan.h:164
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:310
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:215
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:182
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:264
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:209
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:198
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:269
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:290
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:202
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:221
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:239
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3224
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRMetadata &Metadata={})
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3757
A recipe for generating the phi node tracking the current scalar iteration index.
Definition VPlan.h:3847
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:427
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:400
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:412
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:422
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3927
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3269
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2233
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2275
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2264
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4335
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:4359
Class to record and manage LLVM IR flags.
Definition VPlan.h:670
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \I , if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1105
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1160
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1262
@ ComputeAnyOfResult
Compute the final result of a AnyOf reduction with select(cmp(),x,y), where one of (x,...
Definition VPlan.h:1207
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1204
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1256
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1199
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1196
@ CanonicalIVIncrementForPart
Definition VPlan.h:1180
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2865
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2857
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2886
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2938
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2896
VPValue * getIncomingValue(unsigned Idx) const
Returns the incoming VPValue with index Idx.
Definition VPlan.h:1525
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3411
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:387
VPRegionBlock * getRegion()
Definition VPlan.h:4487
VPBasicBlock * getParent()
Definition VPlan.h:462
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:536
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3098
A recipe for handling reduction phis.
Definition VPlan.h:2626
void setVFScaleFactor(unsigned ScaleFactor)
Set the VFScaleFactor for this reduction phi.
Definition VPlan.h:2673
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2666
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:2989
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4370
const VPBlockBase * getEntry() const
Definition VPlan.h:4406
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4481
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4438
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4423
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4468
const VPBlockBase * getExiting() const
Definition VPlan.h:4418
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4431
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3143
bool isSingleScalar() const
Definition VPlan.h:3184
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3208
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3999
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:588
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:656
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:258
operand_range operands()
Definition VPlanValue.h:326
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:302
unsigned getNumOperands() const
Definition VPlanValue.h:296
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:297
void addOperand(VPValue *Operand)
Definition VPlanValue.h:291
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:46
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:137
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1400
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:127
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:71
bool hasOneUse() const
Definition VPlanValue.h:142
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:172
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1403
unsigned getNumUsers() const
Definition VPlanValue.h:104
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1407
user_range users()
Definition VPlanValue.h:125
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2081
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3890
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1767
Instruction::CastOps getOpcode() const
Definition VPlan.h:1805
A recipe for handling GEP instructions.
Definition VPlan.h:2017
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2299
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2327
PHINode * getPHINode() const
Returns the underlying PHINode if one exists, or null otherwise.
Definition VPlan.h:2345
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2330
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2350
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2381
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2428
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2432
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2459
A recipe for widening vector intrinsics.
Definition VPlan.h:1819
A common base class for widening memory operations.
Definition VPlan.h:3454
A recipe for widened phis.
Definition VPlan.h:2517
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1711
unsigned getOpcode() const
Definition VPlan.h:1748
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition VPlan.h:4500
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4796
bool hasVF(ElementCount VF) const
Definition VPlan.h:4709
LLVMContext & getContext() const
Definition VPlan.h:4691
VPBasicBlock * getEntry()
Definition VPlan.h:4592
bool hasScalableVF() const
Definition VPlan.h:4710
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4689
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4682
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4650
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4671
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4716
VPValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4686
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:4775
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4844
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4799
bool hasUF(unsigned UF) const
Definition VPlan.h:4727
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4640
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4679
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4752
void setVF(ElementCount VF)
Definition VPlan.h:4697
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4743
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1033
unsigned getConcreteUF() const
Returns the concrete UF of the plan, after unrolling.
Definition VPlan.h:4730
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4664
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4617
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4822
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:4772
bool hasScalarVFOnly() const
Definition VPlan.h:4720
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4631
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4636
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4597
void setUF(unsigned UF)
Definition VPlan.h:4735
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4876
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1181
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4778
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:427
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2774
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
SCEVAffineAddRec_match< Op0_t, Op1_t, class_match< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
bool match(const SCEV *S, const Pattern &P)
class_match< const SCEV > m_SCEV()
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExitingIVValue, Op0_t > m_ExitingIVValue(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
bind_ty< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
specific_intval< 1 > m_True()
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector is matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:111
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
static VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
Definition VPlanUtils.h:132
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:262
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:289
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1152
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:282
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:552
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1762
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
iterator_range< po_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_post_order_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order.
Definition VPlanCFG.h:275
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about an recurrence kind, return the identity for the @llvm.vector....
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2088
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
Definition STLExtras.h:2166
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2146
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:789
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2608
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
TargetTransformInfo::TargetCostKind CostKind
VPTypeAnalysis Types
const TargetTransformInfo & TTI
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2564
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:183
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:141
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:223
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3587
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3545
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3672
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3628
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB)
Update Plan to account for uncountable early exits by introducing appropriate branching logic in the ...
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first order reductions in the original exit block.
static void updateScalarResumePhis(VPlan &Plan, DenseMap< VPValue *, VPValue * > &IVEndValues, bool FoldTail)
Update the resume phis in the scalar preheader after creating wide recipes for first-order recurrence...
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replaces all uses except the canoni...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...