//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements a set of utility VPlan to VPlan transformations.
///
//===----------------------------------------------------------------------===//

#include "VPlanTransforms.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
#include "VPlanUtils.h"
#include "VPlanVerifier.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"

using namespace llvm;
using namespace VPlanPatternMatch;

bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
    VPlan &Plan,
    function_ref<const InductionDescriptor *(PHINode *)>
        GetIntOrFpInductionDescriptor,
    const TargetLibraryInfo &TLI) {

  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getVectorLoopRegion());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    // Skip blocks outside region.
    if (!VPBB->getParent())
      break;
    VPRecipeBase *Term = VPBB->getTerminator();
    auto EndIter = Term ? Term->getIterator() : VPBB->end();
    // Introduce each ingredient into VPlan.
    for (VPRecipeBase &Ingredient :
         make_early_inc_range(make_range(VPBB->begin(), EndIter))) {

      VPValue *VPV = Ingredient.getVPSingleValue();
      if (!VPV->getUnderlyingValue())
        continue;

      Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());

      VPRecipeBase *NewRecipe = nullptr;
      if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
        auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
        const auto *II = GetIntOrFpInductionDescriptor(Phi);
        if (!II) {
          NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
          for (VPValue *Op : PhiR->operands())
            NewRecipe->addOperand(Op);
        } else {
          VPValue *Start = Plan.getOrAddLiveIn(II->getStartValue());
          VPValue *Step =
              vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep());
          // It is always safe to copy over the NoWrap and FastMath flags. In
          // particular, when folding tail by masking, the masked-off lanes are
          // never used, so it is safe.
          VPIRFlags Flags = vputils::getFlagsFromIndDesc(*II);
          NewRecipe = new VPWidenIntOrFpInductionRecipe(
              Phi, Start, Step, &Plan.getVF(), *II, Flags,
              Ingredient.getDebugLoc());
        }
      } else {
        auto *VPI = cast<VPInstruction>(&Ingredient);
        assert(!isa<PHINode>(Inst) && "phis should be handled above");
        // Create VPWidenMemoryRecipe for loads and stores.
        if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
          NewRecipe = new VPWidenLoadRecipe(
              *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
              false /*Consecutive*/, false /*Reverse*/, *VPI,
              Ingredient.getDebugLoc());
        } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
          NewRecipe = new VPWidenStoreRecipe(
              *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
              nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
              Ingredient.getDebugLoc());
        } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
          NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
                                           Ingredient.getDebugLoc());
        } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
          Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
          if (VectorID == Intrinsic::not_intrinsic)
            return false;
          NewRecipe = new VPWidenIntrinsicRecipe(
              *CI, getVectorIntrinsicIDForCall(CI, &TLI),
              drop_end(Ingredient.operands()), CI->getType(), VPIRFlags(*CI),
              *VPI, CI->getDebugLoc());
        } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
          NewRecipe = new VPWidenSelectRecipe(SI, Ingredient.operands(), *VPI,
                                              *VPI, Ingredient.getDebugLoc());
        } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
          NewRecipe = new VPWidenCastRecipe(
              CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
              VPIRFlags(*CI), VPIRMetadata(*CI));
        } else {
          NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
                                        *VPI, Ingredient.getDebugLoc());
        }
      }

      NewRecipe->insertBefore(&Ingredient);
      if (NewRecipe->getNumDefinedValues() == 1)
        VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
      else
        assert(NewRecipe->getNumDefinedValues() == 0 &&
               "Only recipes with zero or one defined values expected");
      Ingredient.eraseFromParent();
    }
  }
  return true;
}

// Check if a memory operation doesn't alias with memory operations in blocks
// between FirstBB and LastBB using scoped noalias metadata.
// For load hoisting, we only check writes in one direction.
// For store sinking, we check both reads and writes bidirectionally.
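// For example, a predicated load whose location carries !alias.scope metadata
// can be moved past intervening stores whose !noalias lists cover that scope,
// since the scoped-noalias metadata proves the accesses are disjoint.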
static bool doesNotAliasInBlockRange(
    const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
    bool CheckReads,
    const SmallPtrSetImpl<VPRecipeBase *> *ExcludeRecipes = nullptr) {
  if (!MemLoc.AATags.Scope)
    return false;

  const AAMDNodes &MemAA = MemLoc.AATags;

  for (VPBlockBase *Block = FirstBB; Block;
       Block = Block->getSingleSuccessor()) {
    assert(Block->getNumSuccessors() <= 1 &&
           "Expected at most one successor in block chain");
    auto *VPBB = cast<VPBasicBlock>(Block);
    for (VPRecipeBase &R : *VPBB) {
      if (ExcludeRecipes && ExcludeRecipes->contains(&R))
        continue;

      // Skip recipes that don't need checking.
      if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
        continue;

      std::optional<MemoryLocation> Loc = vputils::getMemoryLocation(R);
      if (!Loc)
        // Conservatively assume aliasing for memory operations without
        // location.
        return false;

      // For reads, check if they don't alias in the reverse direction and
      // skip if so.
      if (CheckReads && R.mayReadFromMemory() &&
          !ScopedNoAliasAAResult::mayAliasInScopes(Loc->AATags.Scope,
                                                   MemAA.NoAlias))
        continue;

      // Check if the memory operations may alias in the forward direction.
      if (ScopedNoAliasAAResult::mayAliasInScopes(MemAA.Scope,
                                                  Loc->AATags.NoAlias))
        return false;
    }

    if (Block == LastBB)
      break;
  }
  return true;
}

/// Return true if we do not know how to (mechanically) hoist or sink \p R out
/// of a loop region.
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) {
  // Assumes don't alias anything or throw; as long as they're guaranteed to
  // execute, they're safe to hoist.
  if (match(&R, m_Intrinsic<Intrinsic::assume>()))
    return false;

  // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
  // memory location is not modified in the vector loop.
  if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
    return true;

  // Allocas cannot be hoisted.
  auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
  return RepR && RepR->getOpcode() == Instruction::Alloca;
}

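/// Sink scalar operands of recipes in replicate regions into those regions if
/// all their users live there, duplicating the recipe when users outside the
/// region only need its first lane. For example, an address computation that
/// feeds only a predicated store can be moved into the store's replicate
/// region so it only executes when the mask is active.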
static bool sinkScalarOperands(VPlan &Plan) {
  auto Iter = vp_depth_first_deep(Plan.getEntry());
  bool ScalarVFOnly = Plan.hasScalarVFOnly();
  bool Changed = false;

  SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
  auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
                                        VPBasicBlock *SinkTo, VPValue *Op) {
    auto *Candidate =
        dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
    if (!Candidate)
      return;

    // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
    // for now.
    if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate))
      return;

    if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
      return;

    if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
      if (!ScalarVFOnly && RepR->isSingleScalar())
        return;

    WorkList.insert({SinkTo, Candidate});
  };

  // First, collect the operands of all recipes in replicate blocks as seeds for
  // sinking.
  for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
    VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
    if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
      continue;
    VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
    if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
      continue;
    for (auto &Recipe : *VPBB)
      for (VPValue *Op : Recipe.operands())
        InsertIfValidSinkCandidate(VPBB, Op);
  }

  // Try to sink each replicate or scalar IV steps recipe in the worklist.
  for (unsigned I = 0; I != WorkList.size(); ++I) {
    VPBasicBlock *SinkTo;
    VPSingleDefRecipe *SinkCandidate;
    std::tie(SinkTo, SinkCandidate) = WorkList[I];

    // All recipe users of SinkCandidate must be in the same block SinkTo or all
    // users outside of SinkTo must only use the first lane of SinkCandidate. In
    // the latter case, we need to duplicate SinkCandidate.
    auto UsersOutsideSinkTo =
        make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
          return cast<VPRecipeBase>(U)->getParent() != SinkTo;
        });
    if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
          return !U->usesFirstLaneOnly(SinkCandidate);
        }))
      continue;
    bool NeedsDuplicating = !UsersOutsideSinkTo.empty();

    if (NeedsDuplicating) {
      if (ScalarVFOnly)
        continue;
      VPSingleDefRecipe *Clone;
      if (auto *SinkCandidateRepR =
              dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
        // TODO: Handle converting to uniform recipes as separate transform,
        // then cloning should be sufficient here.
        Instruction *I = SinkCandidate->getUnderlyingInstr();
        Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
                                      nullptr /*Mask*/, *SinkCandidateRepR,
                                      *SinkCandidateRepR);
        // TODO: add ".cloned" suffix to name of Clone's VPValue.
      } else {
        Clone = SinkCandidate->clone();
      }

      Clone->insertBefore(SinkCandidate);
      SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
        return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
      });
    }
    SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
    for (VPValue *Op : SinkCandidate->operands())
      InsertIfValidSinkCandidate(SinkTo, Op);
    Changed = true;
  }
  return Changed;
}

/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
/// the mask.
static VPValue *getPredicatedMask(VPRegionBlock *R) {
  auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
  if (!EntryBB || EntryBB->size() != 1 ||
      !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
    return nullptr;

  return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
}

/// If \p R is a triangle region, return the 'then' block of the triangle.
static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
  auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
  if (EntryBB->getNumSuccessors() != 2)
    return nullptr;

  auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
  auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
  if (!Succ0 || !Succ1)
    return nullptr;

  if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
    return nullptr;
  if (Succ0->getSingleSuccessor() == Succ1)
    return Succ0;
  if (Succ1->getSingleSuccessor() == Succ0)
    return Succ1;
  return nullptr;
}

// Merge replicate regions in their successor region, if a replicate region
// is connected to a successor replicate region with the same predicate by a
// single, empty VPBasicBlock.
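// For example, two adjacent if-then replicate regions predicated on the same
// mask %m and separated only by an empty block become a single region: the
// first region's recipes are moved into the second region's 'then' block and
// execute under one branch-on-mask.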
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
  SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;

  // Collect replicate regions followed by an empty block, followed by another
  // replicate region with matching masks to process front. This is to avoid
  // iterator invalidation issues while merging regions.
  SmallVector<VPRegionBlock *, 8> WorkList;
  for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    if (!Region1->isReplicator())
      continue;
    auto *MiddleBasicBlock =
        dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
    if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
      continue;

    auto *Region2 =
        dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
    if (!Region2 || !Region2->isReplicator())
      continue;

    VPValue *Mask1 = getPredicatedMask(Region1);
    VPValue *Mask2 = getPredicatedMask(Region2);
    if (!Mask1 || Mask1 != Mask2)
      continue;

    assert(Mask1 && Mask2 && "both regions must have conditions");
    WorkList.push_back(Region1);
  }

  // Move recipes from Region1 to its successor region, if both are triangles.
  for (VPRegionBlock *Region1 : WorkList) {
    if (TransformedRegions.contains(Region1))
      continue;
    auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
    auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());

    VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
    VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
    if (!Then1 || !Then2)
      continue;

    // Note: No fusion-preventing memory dependencies are expected in either
    // region. Such dependencies should be rejected during earlier dependence
    // checks, which guarantee accesses can be re-ordered for vectorization.
    //
    // Move recipes to the successor region.
    for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
      ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());

    auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
    auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());

    // Move VPPredInstPHIRecipes from the merge block to the successor region's
    // merge block. Update all users inside the successor region to use the
    // original values.
    for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
      VPValue *PredInst1 =
          cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
      VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
      Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
        return cast<VPRecipeBase>(&U)->getParent() == Then2;
      });

      // Remove phi recipes that are unused after merging the regions.
      if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
        Phi1ToMove.eraseFromParent();
        continue;
      }
      Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
    }

    // Remove the dead recipes in Region1's entry block.
    for (VPRecipeBase &R :
         make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
      R.eraseFromParent();

    // Finally, remove the first region.
    for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
      VPBlockUtils::disconnectBlocks(Pred, Region1);
      VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
    }
    VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
    TransformedRegions.insert(Region1);
  }

  return !TransformedRegions.empty();
}

static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
                                            VPlan &Plan) {
  Instruction *Instr = PredRecipe->getUnderlyingInstr();
  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BlockInMask = PredRecipe->getMask();
  auto *MaskDef = BlockInMask->getDefiningRecipe();
  auto *BOMRecipe = new VPBranchOnMaskRecipe(
      BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
  auto *Entry =
      Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);

  // Replace predicated replicate recipe with a replicate recipe without a
  // mask but in the replicate region.
  auto *RecipeWithoutMask = new VPReplicateRecipe(
      PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
      PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
      PredRecipe->getDebugLoc());
  auto *Pred =
      Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);

  VPPredInstPHIRecipe *PHIRecipe = nullptr;
  if (PredRecipe->getNumUsers() != 0) {
    PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
                                        RecipeWithoutMask->getDebugLoc());
    PredRecipe->replaceAllUsesWith(PHIRecipe);
    PHIRecipe->setOperand(0, RecipeWithoutMask);
  }
  PredRecipe->eraseFromParent();
  auto *Exiting =
      Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  VPRegionBlock *Region =
      Plan.createReplicateRegion(Entry, Exiting, RegionName);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
  VPBlockUtils::connectBlocks(Pred, Exiting);

  return Region;
}

static void addReplicateRegions(VPlan &Plan) {
  SmallVector<VPReplicateRecipe *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    for (VPRecipeBase &R : *VPBB)
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isPredicated())
          WorkList.push_back(RepR);
      }
  }

  unsigned BBNum = 0;
  for (VPReplicateRecipe *RepR : WorkList) {
    VPBasicBlock *CurrentBlock = RepR->getParent();
    VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());

    BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
    SplitBlock->setName(
        OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
    // Record predicated instructions for above packing optimizations.
    VPRegionBlock *Region = createReplicateRegion(RepR, Plan);
    Region->setParent(CurrentBlock->getParent());
    VPBlockUtils::insertOnEdge(CurrentBlock, SplitBlock, Region);

    VPRegionBlock *ParentRegion = Region->getParent();
    if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
      ParentRegion->setExiting(SplitBlock);
  }
}

/// Remove redundant VPBasicBlocks by merging them into their predecessor if
/// the predecessor has a single successor.
static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
  SmallVector<VPBasicBlock *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    // Don't fold the blocks in the skeleton of the Plan into their single
    // predecessors for now.
    // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
    if (!VPBB->getParent())
      continue;
    auto *PredVPBB =
        dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
    if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
        isa<VPIRBasicBlock>(PredVPBB))
      continue;
    WorkList.push_back(VPBB);
  }

  for (VPBasicBlock *VPBB : WorkList) {
    VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
    for (VPRecipeBase &R : make_early_inc_range(*VPBB))
      R.moveBefore(*PredVPBB, PredVPBB->end());
    VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
    auto *ParentRegion = VPBB->getParent();
    if (ParentRegion && ParentRegion->getExiting() == VPBB)
      ParentRegion->setExiting(PredVPBB);
    for (auto *Succ : to_vector(VPBB->successors())) {
      VPBlockUtils::disconnectBlocks(VPBB, Succ);
      VPBlockUtils::connectBlocks(PredVPBB, Succ);
    }
    // VPBB is now dead and will be cleaned up when the plan gets destroyed.
  }
  return !WorkList.empty();
}

void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
  // Convert masked VPReplicateRecipes to if-then region blocks.
  addReplicateRegions(Plan);

  bool ShouldSimplify = true;
  while (ShouldSimplify) {
    ShouldSimplify = sinkScalarOperands(Plan);
    ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
    ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
  }
}

/// Remove redundant casts of inductions.
///
/// Such redundant casts are casts of induction variables that can be ignored,
/// because we already proved that the casted phi is equal to the uncasted phi
/// in the vectorized loop. There is no need to vectorize the cast - the same
/// value can be used for both the phi and casts in the vector loop.
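/// For example, if the scalar loop contains %iv.cast = trunc %iv and induction
/// analysis already proved %iv.cast equals %iv, users of the recipe for
/// %iv.cast are redirected to the widened induction, and the now-dead cast
/// recipes are removed by later dead-recipe cleanup.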
static void removeRedundantInductionCasts(VPlan &Plan) {
  for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (!IV || IV->getTruncInst())
      continue;

    // A sequence of IR Casts has potentially been recorded for IV, which
    // *must be bypassed* when the IV is vectorized, because the vectorized IV
    // will produce the desired casted value. This sequence forms a def-use
    // chain and is provided in reverse order, ending with the cast that uses
    // the IV phi. Search for the recipe of the last cast in the chain and
    // replace it with the original IV. Note that only the final cast is
    // expected to have users outside the cast-chain and the dead casts left
    // over will be cleaned up later.
    ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
    VPValue *FindMyCast = IV;
    for (Instruction *IRCast : reverse(Casts)) {
      VPSingleDefRecipe *FoundUserCast = nullptr;
      for (auto *U : FindMyCast->users()) {
        auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
        if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
          FoundUserCast = UserCast;
          break;
        }
      }
      FindMyCast = FoundUserCast;
    }
    FindMyCast->replaceAllUsesWith(IV);
  }
}

/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
/// recipe, if it exists.
static void removeRedundantCanonicalIVs(VPlan &Plan) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
  VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
  for (VPUser *U : CanonicalIV->users()) {
    WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
    if (WidenNewIV)
      break;
  }

  if (!WidenNewIV)
    return;

  VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);

    if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
      continue;

    // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
    // everything WidenNewIV's users need. That is, WidenOriginalIV will
    // generate a vector phi or all users of WidenNewIV demand the first lane
    // only.
    if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
        vputils::onlyFirstLaneUsed(WidenNewIV)) {
      // We are replacing a wide canonical iv with a suitable wide induction.
      // This is used to compute the header mask, hence all lanes will be used
      // and we need to drop wrap flags only applying to lanes guaranteed to
      // execute in the original scalar loop.
      WidenOriginalIV->dropPoisonGeneratingFlags();
      WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
      WidenNewIV->eraseFromParent();
      return;
    }
  }
}

/// Returns true if \p R is dead and can be removed.
static bool isDeadRecipe(VPRecipeBase &R) {
  // Do remove conditional assume instructions as their conditions may be
  // flattened.
  auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
  bool IsConditionalAssume = RepR && RepR->isPredicated() &&
                             match(RepR, m_Intrinsic<Intrinsic::assume>());
  if (IsConditionalAssume)
    return true;

  if (R.mayHaveSideEffects())
    return false;

  // Recipe is dead if no user keeps the recipe alive.
  return all_of(R.definedValues(),
                [](VPValue *V) { return V->getNumUsers() == 0; });
}

void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_post_order_deep(Plan.getEntry()))) {
    // The recipes in the block are processed in reverse order, to catch chains
    // of dead recipes.
    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
      if (isDeadRecipe(R)) {
        R.eraseFromParent();
        continue;
      }

      // Check if R is a dead VPPhi <-> update cycle and remove it.
      auto *PhiR = dyn_cast<VPPhi>(&R);
      if (!PhiR || PhiR->getNumOperands() != 2)
        continue;
      VPUser *PhiUser = PhiR->getSingleUser();
      if (!PhiUser)
        continue;
      VPValue *Incoming = PhiR->getOperand(1);
      if (PhiUser != Incoming->getDefiningRecipe() ||
          Incoming->getNumUsers() != 1)
        continue;
      PhiR->replaceAllUsesWith(PhiR->getOperand(0));
      PhiR->eraseFromParent();
      Incoming->getDefiningRecipe()->eraseFromParent();
    }
  }
}

static VPScalarIVStepsRecipe *
createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
                    Instruction::BinaryOps InductionOpcode,
                    FPMathOperator *FPBinOp, Instruction *TruncI,
                    VPValue *StartV, VPValue *Step, DebugLoc DL,
                    VPBuilder &Builder) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
  VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
  VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
      Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");

  // Truncate base induction if needed.
  VPTypeAnalysis TypeInfo(Plan);
  Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
  if (TruncI) {
    Type *TruncTy = TruncI->getType();
    assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
           "Not truncating.");
    assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
    BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
    ResultTy = TruncTy;
  }

  // Truncate step if needed.
  Type *StepTy = TypeInfo.inferScalarType(Step);
  if (ResultTy != StepTy) {
    assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
           "Not truncating.");
    assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
    auto *VecPreheader =
        cast<VPBasicBlock>(HeaderVPBB->getSingleHierarchicalPredecessor());
    VPBuilder::InsertPointGuard Guard(Builder);
    Builder.setInsertPoint(VecPreheader);
    Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
  }
  return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
                                     &Plan.getVF(), DL);
}

static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
  SetVector<VPUser *> Users(V->user_begin(), V->user_end());
  for (unsigned I = 0; I != Users.size(); ++I) {
    VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]);
    if (isa<VPHeaderPHIRecipe>(Cur))
      continue;
    for (VPValue *V : Cur->definedValues())
      Users.insert_range(V->users());
  }
  return Users.takeVector();
}

/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
/// generates scalar values.
static VPValue *
scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV,
                                 VPlan &Plan, VPBuilder &Builder) {
  const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
  VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0);
  VPValue *StepV = PtrIV->getOperand(1);
  VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
      Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
      nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);

  return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
                              PtrIV->getDebugLoc(), "next.gep");
}

/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
/// VPWidenPointerInductionRecipe will generate vectors only. If some users
/// require vectors while others require scalars, the scalar uses need to
/// extract the scalars from the generated vectors (note that this is different
/// to how int/fp inductions are handled). Legalize extract-from-ends using
/// uniform VPReplicateRecipe of wide inductions to use regular
/// VPReplicateRecipe, so the correct end value is available. Also optimize
/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
/// providing them scalar steps built on the canonical scalar IV and update the
/// original IV's users. This is an optional optimization to reduce the need
/// for vector extracts.
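/// For example, a wide pointer induction whose users only need scalar
/// addresses is rewritten as ptradd(%start, scalar-steps(0, %step)), and a
/// wide integer induction with scalar users gets companion scalar steps so
/// those users no longer have to extract lanes from the vector induction.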
static void legalizeAndOptimizeInductions(VPlan &Plan) {
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
  VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
    if (!PhiR)
      continue;

    // Try to narrow wide and replicating recipes to uniform recipes, based on
    // VPlan analysis.
    // TODO: Apply to all recipes in the future, to replace legacy uniformity
    // analysis.
    auto Users = collectUsersRecursively(PhiR);
    for (VPUser *U : reverse(Users)) {
      auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
      auto *RepR = dyn_cast<VPReplicateRecipe>(U);
      // Skip recipes that shouldn't be narrowed.
      if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
          Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
          (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
        continue;

      // Skip recipes that may have other lanes than their first used.
      if (!vputils::isSingleScalar(Def) && !vputils::onlyFirstLaneUsed(Def))
        continue;

      auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
                                          Def->operands(), /*IsUniform*/ true,
                                          /*Mask*/ nullptr, /*Flags*/ *Def);
      Clone->insertAfter(Def);
      Def->replaceAllUsesWith(Clone);
    }

    // Replace wide pointer inductions which have only their scalars used by
    // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
    if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
      if (!Plan.hasScalarVFOnly() &&
          !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
        continue;

      VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
      PtrIV->replaceAllUsesWith(PtrAdd);
      continue;
    }

    // Replace widened induction with scalar steps for users that only use
    // scalars.
    auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
          return U->usesScalars(WideIV);
        }))
      continue;

    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
        Plan, ID.getKind(), ID.getInductionOpcode(),
        dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
        WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
        WideIV->getDebugLoc(), Builder);

    // Update scalar users of IV to use Step instead.
    if (!HasOnlyVectorVFs) {
      assert(!Plan.hasScalableVF() &&
             "plans containing a scalar VF cannot also include scalable VFs");
      WideIV->replaceAllUsesWith(Steps);
    } else {
      bool HasScalableVF = Plan.hasScalableVF();
      WideIV->replaceUsesWithIf(Steps,
                                [WideIV, HasScalableVF](VPUser &U, unsigned) {
                                  if (HasScalableVF)
                                    return U.usesFirstLaneOnly(WideIV);
                                  return U.usesScalars(WideIV);
                                });
    }
  }
}

/// Check if \p VPV is an untruncated wide induction, either before or after
/// the increment. If so return the header IV (before the increment), otherwise
/// return null.
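/// For example, both the header phi %iv and its increment
/// %iv.next = add %iv, %step are recognized; in both cases the header phi %iv
/// is the value returned.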
static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV,
                                                  ScalarEvolution &SE) {
  auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
  if (WideIV) {
    // VPV itself is a wide induction, separately compute the end value for
    // exit users if it is not a truncated IV.
    auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
    return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
  }

  // Check if VPV is an optimizable induction increment.
  VPRecipeBase *Def = VPV->getDefiningRecipe();
  if (!Def || Def->getNumOperands() != 2)
    return nullptr;
  WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
  if (!WideIV)
    WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
  if (!WideIV)
    return nullptr;

  auto IsWideIVInc = [&]() {
    auto &ID = WideIV->getInductionDescriptor();

    // Check if VPV increments the induction by the induction step.
    VPValue *IVStep = WideIV->getStepValue();
    switch (ID.getInductionOpcode()) {
    case Instruction::Add:
      return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
    case Instruction::FAdd:
      return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
                                                      m_Specific(IVStep)));
    case Instruction::FSub:
      return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
                                                    m_Specific(IVStep)));
    case Instruction::Sub: {
      // IVStep will be the negated step of the subtraction. Check if Step == -1
      // * IVStep.
      VPValue *Step;
      if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
        return false;
      const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, SE);
      const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, SE);
      return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
             !isa<SCEVCouldNotCompute>(StepSCEV) &&
             IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
    }
    default:
      return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
             match(VPV, m_GetElementPtr(m_Specific(WideIV),
                                        m_Specific(WideIV->getStepValue())));
    }
    llvm_unreachable("should have been covered by switch above");
  };
  return IsWideIVInc() ? WideIV : nullptr;
}

/// Attempts to optimize the induction variable exit values for users in the
/// early exit block.
static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
                                               VPTypeAnalysis &TypeInfo,
                                               VPBlockBase *PredVPBB,
                                               VPValue *Op,
                                               ScalarEvolution &SE) {
  VPValue *Incoming, *Mask;
  if (!match(Op, m_VPInstruction<VPInstruction::ExtractLane>(
                     m_VPInstruction<VPInstruction::FirstActiveLane>(
                         m_VPValue(Mask)),
                     m_VPValue(Incoming))))
    return nullptr;

  auto *WideIV = getOptimizableIVOf(Incoming, SE);
  if (!WideIV)
    return nullptr;

  auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
  if (WideIntOrFp && WideIntOrFp->getTruncInst())
    return nullptr;

  // Calculate the final index.
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  auto *CanonicalIV = LoopRegion->getCanonicalIV();
  Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
  VPBuilder B(cast<VPBasicBlock>(PredVPBB));

  DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
  VPValue *FirstActiveLane =
      B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
  Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
  FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
                                              FirstActiveLaneType, DL);
  VPValue *EndValue =
      B.createNaryOp(Instruction::Add, {CanonicalIV, FirstActiveLane}, DL);

  // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
  // changed it means the exit is using the incremented value, so we need to
  // add the step.
  if (Incoming != WideIV) {
    VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
    EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
  }

  if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPValue *Start = WideIV->getStartValue();
    VPValue *Step = WideIV->getStepValue();
    EndValue = B.createDerivedIV(
        ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
        Start, EndValue, Step);
  }

  return EndValue;
}

/// Attempts to optimize the induction variable exit values for users in the
/// exit block coming from the latch in the original scalar loop.
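/// For example, an exit user of the incremented value %iv.next can use the
/// pre-computed end value directly, while an exit user of %iv itself needs the
/// end value with one step subtracted.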
static VPValue *optimizeLatchExitInductionUser(
    VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
    DenseMap<VPValue *, VPValue *> &EndValues, ScalarEvolution &SE) {
  VPValue *Incoming;
  if (!match(Op, m_ExtractLastElement(m_VPValue(Incoming))))
    return nullptr;

  auto *WideIV = getOptimizableIVOf(Incoming, SE);
  if (!WideIV)
    return nullptr;

  VPValue *EndValue = EndValues.lookup(WideIV);
  assert(EndValue && "end value must have been pre-computed");

  // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
  // changed it means the exit is using the incremented value, so we don't
  // need to subtract the step.
  if (Incoming != WideIV)
    return EndValue;

  // Otherwise, subtract the step from the EndValue.
  VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
  VPValue *Step = WideIV->getStepValue();
  Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
  if (ScalarTy->isIntegerTy())
    return B.createNaryOp(Instruction::Sub, {EndValue, Step},
                          DebugLoc::getUnknown(), "ind.escape");
  if (ScalarTy->isPointerTy()) {
    Type *StepTy = TypeInfo.inferScalarType(Step);
    auto *Zero = Plan.getConstantInt(StepTy, 0);
    return B.createPtrAdd(EndValue,
                          B.createNaryOp(Instruction::Sub, {Zero, Step}),
                          DebugLoc::getUnknown(), "ind.escape");
  }
  if (ScalarTy->isFloatingPointTy()) {
    const auto &ID = WideIV->getInductionDescriptor();
    return B.createNaryOp(
        ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
            ? Instruction::FSub
            : Instruction::FAdd,
        {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
  }
  llvm_unreachable("all possible induction types must be handled");
  return nullptr;
}

void VPlanTransforms::optimizeInductionExitUsers(
    VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
    ScalarEvolution &SE) {
  VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
  VPTypeAnalysis TypeInfo(Plan);
  for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
    for (VPRecipeBase &R : ExitVPBB->phis()) {
      auto *ExitIRI = cast<VPIRPhi>(&R);

      for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
        VPValue *Escape = nullptr;
        if (PredVPBB == MiddleVPBB)
          Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
                                                  ExitIRI->getOperand(Idx),
                                                  EndValues, SE);
        else
          Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
                                                  ExitIRI->getOperand(Idx), SE);
        if (Escape)
          ExitIRI->setOperand(Idx, Escape);
      }
    }
  }
}

/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
/// them with already existing recipes expanding the same SCEV expression.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
  DenseMap<const SCEV *, VPValue *> SCEV2VPV;

  for (VPRecipeBase &R :
       make_early_inc_range(*Plan.getEntry())) {
    auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
    if (!ExpR)
      continue;

    const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
    if (Inserted)
      continue;
    ExpR->replaceAllUsesWith(V->second);
    ExpR->eraseFromParent();
  }
}

static void recursivelyDeleteDeadRecipes(VPValue *V) {
  SmallVector<VPValue *> WorkList;
  SmallPtrSet<VPValue *, 8> Seen;
  WorkList.push_back(V);

  while (!WorkList.empty()) {
    VPValue *Cur = WorkList.pop_back_val();
    if (!Seen.insert(Cur).second)
      continue;
    VPRecipeBase *R = Cur->getDefiningRecipe();
    if (!R)
      continue;
    if (!isDeadRecipe(*R))
      continue;
    append_range(WorkList, R->operands());
    R->eraseFromParent();
  }
}

/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
/// Returns an optional pair, where the first element indicates whether it is
/// an intrinsic ID.
static std::optional<std::pair<bool, unsigned>>
getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
  return TypeSwitch<const VPSingleDefRecipe *,
                    std::optional<std::pair<bool, unsigned>>>(R)
      .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
            VPReplicateRecipe>(
          [](auto *I) { return std::make_pair(false, I->getOpcode()); })
      .Case<VPWidenIntrinsicRecipe>([](auto *I) {
        return std::make_pair(true, I->getVectorIntrinsicID());
      })
      .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
        // For recipes that do not directly map to LLVM IR instructions,
        // assign opcodes after the last VPInstruction opcode (which is also
        // after the last IR Instruction opcode), based on the VPDefID.
        return std::make_pair(false,
                              VPInstruction::OpsEnd + 1 + I->getVPDefID());
      })
      .Default([](auto *) { return std::nullopt; });
}

/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
/// Operands are foldable live-ins.
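/// For example, an add, compare or GEP whose operands are all constant
/// live-ins folds to a single value, which is then added to the plan as a
/// live-in.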
static VPValue *tryToFoldLiveIns(const VPSingleDefRecipe &R,
                                 ArrayRef<VPValue *> Operands,
                                 const DataLayout &DL,
                                 VPTypeAnalysis &TypeInfo) {
  auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
  if (!OpcodeOrIID)
    return nullptr;

  SmallVector<Value *, 4> Ops;
  for (VPValue *Op : Operands) {
    if (!Op->isLiveIn() || !Op->getLiveInIRValue())
      return nullptr;
    Ops.push_back(Op->getLiveInIRValue());
  }

  auto FoldToIRValue = [&]() -> Value * {
    InstSimplifyFolder Folder(DL);
    if (OpcodeOrIID->first) {
      if (R.getNumOperands() != 2)
        return nullptr;
      unsigned ID = OpcodeOrIID->second;
      return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
                                        TypeInfo.inferScalarType(&R));
    }
    unsigned Opcode = OpcodeOrIID->second;
    if (Instruction::isBinaryOp(Opcode))
      return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
                              Ops[0], Ops[1]);
    if (Instruction::isCast(Opcode))
      return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
                             TypeInfo.inferScalarType(R.getVPSingleValue()));
    switch (Opcode) {
    case VPInstruction::LogicalAnd:
      return Folder.FoldSelect(Ops[0], Ops[1],
                               ConstantInt::getNullValue(Ops[1]->getType()));
    case VPInstruction::Not:
      return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
                              Constant::getAllOnesValue(Ops[0]->getType()));
    case Instruction::Select:
      return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
    case Instruction::ICmp:
    case Instruction::FCmp:
      return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
                            Ops[1]);
    case Instruction::GetElementPtr: {
      auto &RFlags = cast<VPRecipeWithIRFlags>(R);
      auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
      return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
                            drop_begin(Ops), RFlags.getGEPNoWrapFlags());
    }
    case VPInstruction::PtrAdd:
    case VPInstruction::WidePtrAdd:
      return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
                            Ops[0], Ops[1],
                            cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
    // An extract of a live-in is an extract of a broadcast, so return the
    // broadcasted element.
    case Instruction::ExtractElement:
      assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
      return Ops[0];
    }
    return nullptr;
  };

  if (Value *V = FoldToIRValue())
    return R.getParent()->getPlan()->getOrAddLiveIn(V);
  return nullptr;
}

/// Try to simplify VPSingleDefRecipe \p Def.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
  VPlan *Plan = Def->getParent()->getPlan();

  // Simplification of live-in IR values for SingleDef recipes using
  // InstSimplifyFolder.
  const DataLayout &DL =
      Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout();
  if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
    return Def->replaceAllUsesWith(V);

  // Fold PredPHI LiveIn -> LiveIn.
  if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
    VPValue *Op = PredPHI->getOperand(0);
    if (Op->isLiveIn())
      PredPHI->replaceAllUsesWith(Op);
  }

  VPBuilder Builder(Def);
  VPValue *A;
  if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
    Type *TruncTy = TypeInfo.inferScalarType(Def);
    Type *ATy = TypeInfo.inferScalarType(A);
    if (TruncTy == ATy) {
      Def->replaceAllUsesWith(A);
    } else {
      // Don't replace a scalarizing recipe with a widened cast.
      if (isa<VPReplicateRecipe>(Def))
        return;
      if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {

        unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
                                 ? Instruction::SExt
                                 : Instruction::ZExt;
        auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
                                            TruncTy);
        if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
          // UnderlyingExt has distinct return type, used to retain legacy cost.
          Ext->setUnderlyingValue(UnderlyingExt);
        }
        Def->replaceAllUsesWith(Ext);
      } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
        auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
        Def->replaceAllUsesWith(Trunc);
      }
    }
#ifndef NDEBUG
    // Verify that the cached type info for both A and its users is still
    // accurate by comparing it to freshly computed types.
    VPTypeAnalysis TypeInfo2(*Plan);
    assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
    for (VPUser *U : A->users()) {
      auto *R = cast<VPRecipeBase>(U);
      for (VPValue *VPV : R->definedValues())
        assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
    }
#endif
  }

  // Simplify (X && Y) || (X && !Y) -> X.
  // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
  // && (Y || Z) and (X || !X) into true. This requires queuing newly created
  // recipes to be visited during simplification.
  VPValue *X, *Y, *Z;
  if (match(Def,
            m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
                         m_LogicalAnd(m_Deferred(X), m_Not(m_Deferred(Y)))))) {
    Def->replaceAllUsesWith(X);
    Def->eraseFromParent();
    return;
  }

  // x | 1 -> 1
  if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
    return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));

  // x | 0 -> x
  if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
    return Def->replaceAllUsesWith(X);

  // x & 0 -> 0
  if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
    return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));

  // x && false -> false
  if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
    return Def->replaceAllUsesWith(Def->getOperand(1));

  // (x && y) || (x && z) -> x && (y || z)
  if (match(Def, m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
                              m_LogicalAnd(m_Deferred(X), m_VPValue(Z)))) &&
      // Simplify only if one of the operands has one use to avoid creating an
      // extra recipe.
      (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
       !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
    return Def->replaceAllUsesWith(
        Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));

  // x && !x -> 0
  if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X)))))
    return Def->replaceAllUsesWith(Plan->getFalse());

  if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
    return Def->replaceAllUsesWith(X);

  // select !c, x, y -> select c, y, x
  VPValue *C;
  if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
    Def->setOperand(0, C);
    Def->setOperand(1, Y);
    Def->setOperand(2, X);
    return;
  }

  // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
  // tail folding it is likely that x is a header mask and can be simplified
  // further.
  if (match(Def, m_LogicalAnd(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
                              m_VPValue(Z))) &&
      X->hasMoreThanOneUniqueUser())
    return Def->replaceAllUsesWith(
        Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));

  if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
    return Def->replaceAllUsesWith(A);

  if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
    return Def->replaceAllUsesWith(A);

  if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
    return Def->replaceAllUsesWith(
        Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0));

  if (match(Def, m_Not(m_VPValue(A)))) {
    if (match(A, m_Not(m_VPValue(A))))
      return Def->replaceAllUsesWith(A);

    // Try to fold Not into compares by adjusting the predicate in-place.
    CmpPredicate Pred;
    if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
      auto *Cmp = cast<VPRecipeWithIRFlags>(A);
      if (all_of(Cmp->users(),
                 match_fn(m_CombineOr(
                     m_Not(m_Specific(Cmp)),
                     m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
        Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
        for (VPUser *U : to_vector(Cmp->users())) {
          auto *R = cast<VPSingleDefRecipe>(U);
          if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
            // select (cmp pred), x, y -> select (cmp inv_pred), y, x
            R->setOperand(1, Y);
            R->setOperand(2, X);
          } else {
            // not (cmp pred) -> cmp inv_pred
            assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
            R->replaceAllUsesWith(Cmp);
          }
        }
        // If Cmp doesn't have a debug location, use the one from the negation,
        // to preserve the location.
        if (!Cmp->getDebugLoc() && Def->getDebugLoc())
          Cmp->setDebugLoc(Def->getDebugLoc());
      }
    }
  }

  // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
  // any-of (fcmp uno %A, %B), ...
  if (match(Def, m_AnyOf())) {
    SmallVector<VPValue *> NewOps;
    VPRecipeBase *UnpairedCmp = nullptr;
    for (VPValue *Op : Def->operands()) {
      VPValue *X;
      if (Op->getNumUsers() > 1 ||
          !match(Op, m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(X),
                                   m_Deferred(X)))) {
        NewOps.push_back(Op);
      } else if (!UnpairedCmp) {
        UnpairedCmp = Op->getDefiningRecipe();
      } else {
        NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
                                            UnpairedCmp->getOperand(0), X));
        UnpairedCmp = nullptr;
      }
    }

    if (UnpairedCmp)
      NewOps.push_back(UnpairedCmp->getVPSingleValue());

    if (NewOps.size() < Def->getNumOperands()) {
      VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
      return Def->replaceAllUsesWith(NewAnyOf);
    }
  }

  // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
  // This is useful for fmax/fmin without fast-math flags, where we need to
  // check if any operand is NaN.
  if (match(Def, m_c_BinaryOr(m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(X),
                                            m_Deferred(X)),
                              m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(Y),
                                            m_Deferred(Y))))) {
    VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
    return Def->replaceAllUsesWith(NewCmp);
  }

  // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
  if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
       match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
      TypeInfo.inferScalarType(Def->getOperand(1)) ==
          TypeInfo.inferScalarType(Def))
    return Def->replaceAllUsesWith(Def->getOperand(1));

  if (match(Def, m_VPInstruction<VPInstruction::WideIVStep>(m_VPValue(X),
                                                            m_One()))) {
    Type *WideStepTy = TypeInfo.inferScalarType(Def);
    if (TypeInfo.inferScalarType(X) != WideStepTy)
      X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
    Def->replaceAllUsesWith(X);
    return;
  }

  // For i1 vp.merges produced by AnyOf reductions:
  // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
  if (match(Def, m_Intrinsic<Intrinsic::vp_merge>(m_True(), m_VPValue(A),
                                                  m_VPValue(X), m_VPValue())) &&
      match(A, m_c_BinaryOr(m_Specific(X), m_VPValue(Y))) &&
      TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
    Def->setOperand(1, Def->getOperand(0));
    Def->setOperand(0, Y);
    return;
  }

  if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
    if (Phi->getOperand(0) == Phi->getOperand(1))
      Phi->replaceAllUsesWith(Phi->getOperand(0));
    return;
  }

  // Look through ExtractLastElement (BuildVector ....).
  if (match(Def, m_ExtractLastElement(m_BuildVector()))) {
    auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
    Def->replaceAllUsesWith(
        BuildVector->getOperand(BuildVector->getNumOperands() - 1));
    return;
  }

  // Look through ExtractPenultimateElement (BuildVector ....).
  if (match(Def, m_ExtractPenultimateElement(m_BuildVector()))) {
    auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
    Def->replaceAllUsesWith(
        BuildVector->getOperand(BuildVector->getNumOperands() - 2));
    return;
  }

  uint64_t Idx;
  if (match(Def, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) {
    auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
    Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
    return;
  }

  if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
    Def->replaceAllUsesWith(
        Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
    return;
  }

  // Look through broadcast of single-scalar when used as select conditions; in
  // that case the scalar condition can be used directly.
  if (match(Def,
            m_Select(m_Broadcast(m_VPValue(C)), m_VPValue(), m_VPValue()))) {
    assert(vputils::isSingleScalar(C) &&
           "broadcast operand must be single-scalar");
    Def->setOperand(0, C);
    return;
  }

  if (auto *Phi = dyn_cast<VPPhi>(Def)) {
    if (Phi->getNumOperands() == 1)
      Phi->replaceAllUsesWith(Phi->getOperand(0));
    return;
  }

  // Some simplifications can only be applied after unrolling. Perform them
  // below.
  if (!Plan->isUnrolled())
    return;

  // Hoist an invariant increment Y of a phi X, by having X start at Y.
  if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && Y->isLiveIn() &&
      isa<VPPhi>(X)) {
    auto *Phi = cast<VPPhi>(X);
    if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
        Phi->getSingleUser() == Def) {
      Phi->setOperand(0, Y);
      Def->replaceAllUsesWith(Phi);
      return;
    }
  }

  // VPVectorPointer for part 0 can be replaced by their start pointer.
  if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Def)) {
    if (VecPtr->isFirstPart()) {
      VecPtr->replaceAllUsesWith(VecPtr->getOperand(0));
      return;
    }
  }

  // VPScalarIVSteps for part 0 can be replaced by their start value, if only
  // the first lane is demanded.
  if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
    if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
      Steps->replaceAllUsesWith(Steps->getOperand(0));
      return;
    }
  }
  // Simplify redundant ReductionStartVector recipes after unrolling.
  VPValue *StartV;
  if (match(Def, m_VPInstruction<VPInstruction::ReductionStartVector>(
                     m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
    Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
      auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
      return PhiR && PhiR->isInLoop();
    });
    return;
  }

  if (match(Def,
            m_ExtractLastElement(m_Broadcast(m_VPValue(A))))) {
    Def->replaceAllUsesWith(A);
    return;
  }

  if (match(Def, m_CombineOr(m_ExtractLastElement(m_VPValue(A)),
                             m_ExtractLastLanePerPart(m_VPValue(A)))) &&
      ((isa<VPInstruction>(A) && vputils::isSingleScalar(A)) ||
       (isa<VPReplicateRecipe>(A) &&
        cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
      all_of(A->users(),
             [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
    return Def->replaceAllUsesWith(A);
  }

  if (Plan->getUF() == 1 &&
      match(Def, m_ExtractLastLanePerPart(m_VPValue(A)))) {
    return Def->replaceAllUsesWith(
        Builder.createNaryOp(VPInstruction::ExtractLastElement, {A}));
  }
}

void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  VPTypeAnalysis TypeInfo(Plan);
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB))
      if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
        simplifyRecipe(Def, TypeInfo);
  }
}

void VPlanTransforms::narrowToSingleScalarRecipes(VPlan &Plan) {
  if (Plan.hasScalarVFOnly())
    return;

  // Try to narrow wide and replicating recipes to single scalar recipes,
  // based on VPlan analysis. Only process blocks in the loop region for now,
  // without traversing into nested regions, as recipes in replicate regions
  // cannot be converted yet.
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
      if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPWidenSelectRecipe,
               VPReplicateRecipe>(&R))
        continue;
      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
      if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
        continue;

      auto *RepOrWidenR = cast<VPRecipeWithIRFlags>(&R);
      if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
          vputils::isSingleScalar(RepR->getOperand(1))) {
        auto *Clone = new VPReplicateRecipe(
            RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
            true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
            *RepR /*Metadata*/, RepR->getDebugLoc());
        Clone->insertBefore(RepOrWidenR);
        unsigned ExtractOpc =
            vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1))
                ? VPInstruction::ExtractLastElement
                : VPInstruction::ExtractLastLanePerPart;
        auto *Ext = new VPInstruction(ExtractOpc, {Clone->getOperand(0)});
        Ext->insertBefore(Clone);
        Clone->setOperand(0, Ext);
        RepR->eraseFromParent();
        continue;
      }

      // Skip recipes that aren't single scalars.
      if (!vputils::isSingleScalar(RepOrWidenR))
        continue;

      // Skip recipes for which conversion to single-scalar would introduce
      // additional broadcasts. No extra broadcasts are needed, if either only
      // the scalars of the recipe are used, or at least one of the operands
      // would require a broadcast. In the latter case, the single-scalar may
      // need to be broadcasted, but another broadcast is removed.
      if (!all_of(RepOrWidenR->users(),
                  [RepOrWidenR](const VPUser *U) {
                    if (auto *VPI = dyn_cast<VPInstruction>(U)) {
                      unsigned Opcode = VPI->getOpcode();
                      if (Opcode == VPInstruction::ExtractLastElement ||
                          Opcode == VPInstruction::ExtractLastLanePerPart ||
                          Opcode == VPInstruction::ExtractPenultimateElement)
                        return true;
                    }

                    return U->usesScalars(RepOrWidenR);
                  }) &&
          none_of(RepOrWidenR->operands(), [RepOrWidenR](VPValue *Op) {
            if (Op->getSingleUser() != RepOrWidenR)
              return false;
            // Non-constant live-ins require broadcasts, while constants do not
            // need explicit broadcasts.
            bool LiveInNeedsBroadcast =
                Op->isLiveIn() && !isa<Constant>(Op->getLiveInIRValue());
            auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
            return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
          }))
        continue;

      auto *Clone = new VPReplicateRecipe(
          RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
          true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
      Clone->insertBefore(RepOrWidenR);
      RepOrWidenR->replaceAllUsesWith(Clone);
      if (isDeadRecipe(*RepOrWidenR))
        RepOrWidenR->eraseFromParent();
    }
  }
}

/// Try to see if all of \p Blend's masks share a common value logically
/// and'ed and remove it from the masks.
static void removeCommonBlendMask(VPBlendRecipe *Blend) {
  if (Blend->isNormalized())
    return;
  VPValue *CommonEdgeMask;
  if (!match(Blend->getMask(0),
             m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
    return;
  for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
    if (!match(Blend->getMask(I),
               m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
      return;
  for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
    Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
}

/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
/// to make sure the masks are simplified.
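/// For example, a blend whose masks all select the same incoming value
/// collapses to that value, and BLEND %a, %b, Not(%m) is rewritten as
/// BLEND %b, %a, %m.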
1613static void simplifyBlends(VPlan &Plan) {
1616 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1617 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1618 if (!Blend)
1619 continue;
1620
1621 removeCommonBlendMask(Blend);
1622
1623 // Try to remove redundant blend recipes.
1624 SmallPtrSet<VPValue *, 4> UniqueValues;
1625 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1626 UniqueValues.insert(Blend->getIncomingValue(0));
1627 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1628 if (!match(Blend->getMask(I), m_False()))
1629 UniqueValues.insert(Blend->getIncomingValue(I));
1630
1631 if (UniqueValues.size() == 1) {
1632 Blend->replaceAllUsesWith(*UniqueValues.begin());
1633 Blend->eraseFromParent();
1634 continue;
1635 }
1636
1637 if (Blend->isNormalized())
1638 continue;
1639
1640 // Normalize the blend so its first incoming value is used as the initial
1641 // value with the others blended into it.
1642
1643 unsigned StartIndex = 0;
1644 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1645        // If a value's mask is used only by the blend then it can be deadcoded.
1646 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1647 // that's used by multiple blends where it can be removed from them all.
1648 VPValue *Mask = Blend->getMask(I);
1649 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1650 StartIndex = I;
1651 break;
1652 }
1653 }
1654
1655 SmallVector<VPValue *, 4> OperandsWithMask;
1656 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1657
1658 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1659 if (I == StartIndex)
1660 continue;
1661 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1662 OperandsWithMask.push_back(Blend->getMask(I));
1663 }
1664
1665 auto *NewBlend =
1666 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1667 OperandsWithMask, Blend->getDebugLoc());
1668 NewBlend->insertBefore(&R);
1669
1670 VPValue *DeadMask = Blend->getMask(StartIndex);
1671 Blend->replaceAllUsesWith(NewBlend);
1672 Blend->eraseFromParent();
1674
1675 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1676 VPValue *NewMask;
1677 if (NewBlend->getNumOperands() == 3 &&
1678 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1679 VPValue *Inc0 = NewBlend->getOperand(0);
1680 VPValue *Inc1 = NewBlend->getOperand(1);
1681 VPValue *OldMask = NewBlend->getOperand(2);
1682 NewBlend->setOperand(0, Inc1);
1683 NewBlend->setOperand(1, Inc0);
1684 NewBlend->setOperand(2, NewMask);
1685 if (OldMask->getNumUsers() == 0)
1686 cast<VPInstruction>(OldMask)->eraseFromParent();
1687 }
1688 }
1689 }
1690}
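// A small illustrative sketch (hypothetical operands): a non-normalized
//   BLEND %res = %a/%m0 %b/%m1
// is rewritten so one incoming value becomes the start value and its mask is
// dropped, e.g.
//   BLEND %res = %a %b/%m1
// and a two-incoming blend whose remaining mask is a negation,
//   BLEND %res = %a %b/(not %m), is further canonicalized to
//   BLEND %res = %b %a/%m.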
1691
1692/// Optimize the width of vector induction variables in \p Plan based on a known
1693/// constant Trip Count, \p BestVF and \p BestUF.
1695 ElementCount BestVF,
1696 unsigned BestUF) {
1697 // Only proceed if we have not completely removed the vector region.
1698 if (!Plan.getVectorLoopRegion())
1699 return false;
1700
1701 const APInt *TC;
1702 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
1703 return false;
1704
1705 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1706 // and UF. Returns at least 8.
1707 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1708 APInt AlignedTC =
1711 APInt MaxVal = AlignedTC - 1;
1712 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1713 };
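  // Worked example (hypothetical numbers, assuming TC is first rounded up to
  // a multiple of VF * UF): for TC = 20 and VF * UF = 8, AlignedTC = 24 and
  // MaxVal = 23, which needs 5 active bits; PowerOf2Ceil(5) = 8, so an i8
  // induction is wide enough, matching the clamp to a minimum of 8 bits.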
1714 unsigned NewBitWidth =
1715 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1716
1717 LLVMContext &Ctx = Plan.getContext();
1718 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1719
1720 bool MadeChange = false;
1721
1722 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1723 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1724 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1725
1726    // Currently we only handle canonical IVs, as it is trivial to replace their
1727    // start and stop values, and we only perform the optimization when the
1728    // IV has a single user.
1729 if (!WideIV || !WideIV->isCanonical() ||
1730 WideIV->hasMoreThanOneUniqueUser() ||
1731 NewIVTy == WideIV->getScalarType())
1732 continue;
1733
1734 // Currently only handle cases where the single user is a header-mask
1735 // comparison with the backedge-taken-count.
1736 VPUser *SingleUser = WideIV->getSingleUser();
1737 if (!SingleUser ||
1738 !match(SingleUser, m_ICmp(m_Specific(WideIV),
1741 continue;
1742
1743 // Update IV operands and comparison bound to use new narrower type.
1744 auto *NewStart = Plan.getConstantInt(NewIVTy, 0);
1745 WideIV->setStartValue(NewStart);
1746 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
1747 WideIV->setStepValue(NewStep);
1748
1749 auto *NewBTC = new VPWidenCastRecipe(
1750 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1751 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1752 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
1753 Cmp->setOperand(1, NewBTC);
1754
1755 MadeChange = true;
1756 }
1757
1758 return MadeChange;
1759}
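// Sketch of the overall effect (hypothetical types): a canonical
//   WIDEN-INDUCTION %iv = phi 0, %iv.next (i64)
// whose only user is (icmp ule %iv, backedge-taken-count) can be narrowed to
// an i8 induction with start 0 and step 1, with the backedge-taken count
// truncated to i8 in the vector preheader for use in the comparison.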
1760
1761/// Return true if \p Cond is known to be true for given \p BestVF and \p
1762/// BestUF.
1764 ElementCount BestVF, unsigned BestUF,
1765 ScalarEvolution &SE) {
1767 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1768 &SE](VPValue *C) {
1769 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE);
1770 });
1771
1772 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
1774 m_Specific(CanIV->getBackedgeValue()),
1775 m_Specific(&Plan.getVectorTripCount()))))
1776 return false;
1777
1778 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1779 // count is not conveniently available as SCEV so far, so we compare directly
1780 // against the original trip count. This is stricter than necessary, as we
1781 // will only return true if the trip count == vector trip count.
1782 const SCEV *VectorTripCount =
1784 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1785 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1786 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1787 "Trip count SCEV must be computable");
1788 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1789 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1790 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1791}
1792
1793/// Try to replace multiple active lane masks used for control flow with
1794/// a single, wide active lane mask instruction followed by multiple
1795/// extract subvector intrinsics. This applies to the active lane mask
1796/// instructions both in the loop and in the preheader.
1797/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1798/// new extracts from the first active lane mask, which has its last
1799/// operand (multiplier) set to UF.
1801 unsigned UF) {
1802 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1803 return false;
1804
1805 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1806 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1807 auto *Term = &ExitingVPBB->back();
1808
1809 using namespace llvm::VPlanPatternMatch;
1811 m_VPValue(), m_VPValue(), m_VPValue())))))
1812 return false;
1813
1814 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1815 LLVMContext &Ctx = Plan.getContext();
1816
1817 auto ExtractFromALM = [&](VPInstruction *ALM,
1818 SmallVectorImpl<VPValue *> &Extracts) {
1819 DebugLoc DL = ALM->getDebugLoc();
1820 for (unsigned Part = 0; Part < UF; ++Part) {
1822 Ops.append({ALM, Plan.getOrAddLiveIn(
1823 ConstantInt::get(IntegerType::getInt64Ty(Ctx),
1824 VF.getKnownMinValue() * Part))});
1825 auto *Ext =
1826 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1827 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
1828 Extracts[Part] = Ext;
1829 Ext->insertAfter(ALM);
1830 }
1831 };
1832
1833 // Create a list of each active lane mask phi, ordered by unroll part.
1835 for (VPRecipeBase &R : Header->phis()) {
1837 if (!Phi)
1838 continue;
1839 VPValue *Index = nullptr;
1840 match(Phi->getBackedgeValue(),
1842 assert(Index && "Expected index from ActiveLaneMask instruction");
1843
1844 uint64_t Part;
1845 if (match(Index,
1847 m_VPValue(), m_ConstantInt(Part))))
1848 Phis[Part] = Phi;
1849 else
1850 // Anything other than a CanonicalIVIncrementForPart is part 0
1851 Phis[0] = Phi;
1852 }
1853
1854 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1855 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
1856
1857 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
1858 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
1859
1860 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
1861 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
1862 "Expected incoming values of Phi to be ActiveLaneMasks");
1863
1864 // When using wide lane masks, the return type of the get.active.lane.mask
1865 // intrinsic is VF x UF (last operand).
1866 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
1867 EntryALM->setOperand(2, ALMMultiplier);
1868 LoopALM->setOperand(2, ALMMultiplier);
1869
1870 // Create UF x extract vectors and insert into preheader.
1871 SmallVector<VPValue *> EntryExtracts(UF);
1872 ExtractFromALM(EntryALM, EntryExtracts);
1873
1874 // Create UF x extract vectors and insert before the loop compare & branch,
1875 // updating the compare to use the first extract.
1876 SmallVector<VPValue *> LoopExtracts(UF);
1877 ExtractFromALM(LoopALM, LoopExtracts);
1878 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
1879 Not->setOperand(0, LoopExtracts[0]);
1880
1881 // Update the incoming values of active lane mask phis.
1882 for (unsigned Part = 0; Part < UF; ++Part) {
1883 Phis[Part]->setStartValue(EntryExtracts[Part]);
1884 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
1885 }
1886
1887 return true;
1888}
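// Illustrative sketch for VF = vscale x 4 and UF = 2 (hypothetical names):
// instead of two <vscale x 4 x i1> lane masks, a single wide mask is created
// with its multiplier operand set to UF, and the per-part masks become
// extracts from it:
//   %wide  = active-lane-mask %index, %tc, 2   ; <vscale x 8 x i1>
//   %part0 = vector.extract %wide, 0
//   %part1 = vector.extract %wide, 4           ; VF's known minimum * part
// The active-lane-mask phis then use the extracts as their incoming values.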
1889
1890/// Try to simplify the branch condition of \p Plan. This may restrict the
1891/// resulting plan to \p BestVF and \p BestUF.
1893 unsigned BestUF,
1895 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1896 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1897 auto *Term = &ExitingVPBB->back();
1898 VPValue *Cond;
1899 ScalarEvolution &SE = *PSE.getSE();
1900 if (match(Term, m_BranchOnCount()) ||
1902 m_VPValue(), m_VPValue(), m_VPValue()))))) {
1903 // Try to simplify the branch condition if VectorTC <= VF * UF when the
1904 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
1905 const SCEV *VectorTripCount =
1907 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1908 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1909 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1910 "Trip count SCEV must be computable");
1911 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1912 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1913 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
1914 return false;
1915 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) {
1916 // For BranchOnCond, check if we can prove the condition to be true using VF
1917 // and UF.
1918 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE))
1919 return false;
1920 } else {
1921 return false;
1922 }
1923
1924 // The vector loop region only executes once. If possible, completely remove
1925 // the region, otherwise replace the terminator controlling the latch with
1926 // (BranchOnCond true).
1927 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
1928 // support for other non-canonical widen induction recipes (e.g.,
1929 // VPWidenPointerInductionRecipe).
1930 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1931 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
1932 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
1933 return R->isCanonical();
1934 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
1935 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
1936 })) {
1937 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
1938 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
1939 VPBuilder Builder(Plan.getVectorPreheader());
1940 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
1941 R->getScalarType());
1942 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
1943 HeaderR.eraseFromParent();
1944 continue;
1945 }
1946 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
1947 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
1948 HeaderR.eraseFromParent();
1949 }
1950
1951 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
1952 VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
1953 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
1954 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
1955
1956 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
1957 B->setParent(nullptr);
1958
1959 VPBlockUtils::connectBlocks(Preheader, Header);
1960 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
1962 } else {
1963 // The vector region contains header phis for which we cannot remove the
1964 // loop region yet.
1965 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
1966 {}, {}, Term->getDebugLoc());
1967 ExitingVPBB->appendRecipe(BOC);
1968 }
1969
1970 Term->eraseFromParent();
1971
1972 return true;
1973}
1974
1975/// From the definition of llvm.experimental.get.vector.length,
1976/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
1980 vp_depth_first_deep(Plan.getEntry()))) {
1981 for (VPRecipeBase &R : *VPBB) {
1982 VPValue *AVL;
1983 if (!match(&R, m_EVL(m_VPValue(AVL))))
1984 continue;
1985
1986 ScalarEvolution &SE = *PSE.getSE();
1987 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, SE);
1988 if (isa<SCEVCouldNotCompute>(AVLSCEV))
1989 continue;
1990 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
1991 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
1992 continue;
1993
1995 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
1996 R.getDebugLoc());
1997 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
1998 return true;
1999 }
2000 }
2001 return false;
2002}
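// For example (hypothetical values): if SCEV can prove AVL <= VF, say the AVL
// is the trip count of a loop known to need at most one vector iteration with
// trip count 7 and VF = 8, then
//   %evl = EXPLICIT-VECTOR-LENGTH %avl
// is replaced by a zext/trunc of %avl to i32, since
// llvm.experimental.get.vector.length(%avl) == %avl whenever %avl <= VF.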
2003
2005 unsigned BestUF,
2007 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2008 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2009
2010 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2011 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2012 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2013 MadeChange |= simplifyKnownEVL(Plan, BestVF, PSE);
2014
2015 if (MadeChange) {
2016 Plan.setVF(BestVF);
2017 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
2018 }
2019}
2020
2021/// Sink users of \p FOR after the recipe defining the previous value \p
2022/// Previous of the recurrence. \returns true if all users of \p FOR could be
2023/// re-arranged as needed or false if it is not possible.
2024static bool
2026 VPRecipeBase *Previous,
2027 VPDominatorTree &VPDT) {
2028 // Collect recipes that need sinking.
2031 Seen.insert(Previous);
2032 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
2033    // The previous value must not depend on the users of the recurrence phi;
2034    // if it did, FOR would not be a fixed-order recurrence.
2035 if (SinkCandidate == Previous)
2036 return false;
2037
2038 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
2039 !Seen.insert(SinkCandidate).second ||
2040 VPDT.properlyDominates(Previous, SinkCandidate))
2041 return true;
2042
2043 if (cannotHoistOrSinkRecipe(*SinkCandidate))
2044 return false;
2045
2046 WorkList.push_back(SinkCandidate);
2047 return true;
2048 };
2049
2050 // Recursively sink users of FOR after Previous.
2051 WorkList.push_back(FOR);
2052 for (unsigned I = 0; I != WorkList.size(); ++I) {
2053 VPRecipeBase *Current = WorkList[I];
2054 assert(Current->getNumDefinedValues() == 1 &&
2055 "only recipes with a single defined value expected");
2056
2057 for (VPUser *User : Current->getVPSingleValue()->users()) {
2058 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
2059 return false;
2060 }
2061 }
2062
2063 // Keep recipes to sink ordered by dominance so earlier instructions are
2064 // processed first.
2065 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2066 return VPDT.properlyDominates(A, B);
2067 });
2068
2069 for (VPRecipeBase *SinkCandidate : WorkList) {
2070 if (SinkCandidate == FOR)
2071 continue;
2072
2073 SinkCandidate->moveAfter(Previous);
2074 Previous = SinkCandidate;
2075 }
2076 return true;
2077}
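// Illustrative sketch (hypothetical recipes): given a recurrence phi %for
// whose previous value is defined by %prev, a user of %for that currently
// appears before %prev, e.g.
//   %use  = add %for, 1
//   %prev = load ...
// is moved after %prev (transitively, together with users of %use), so the
// FirstOrderRecurrenceSplice created right after %prev will dominate all of
// them.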
2078
2079/// Try to hoist \p Previous and its operands before all users of \p FOR.
2081 VPRecipeBase *Previous,
2082 VPDominatorTree &VPDT) {
2083 if (cannotHoistOrSinkRecipe(*Previous))
2084 return false;
2085
2086 // Collect recipes that need hoisting.
2087 SmallVector<VPRecipeBase *> HoistCandidates;
2089 VPRecipeBase *HoistPoint = nullptr;
2090 // Find the closest hoist point by looking at all users of FOR and selecting
2091 // the recipe dominating all other users.
2092 for (VPUser *U : FOR->users()) {
2093 auto *R = cast<VPRecipeBase>(U);
2094 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
2095 HoistPoint = R;
2096 }
2097 assert(all_of(FOR->users(),
2098 [&VPDT, HoistPoint](VPUser *U) {
2099 auto *R = cast<VPRecipeBase>(U);
2100 return HoistPoint == R ||
2101 VPDT.properlyDominates(HoistPoint, R);
2102 }) &&
2103 "HoistPoint must dominate all users of FOR");
2104
2105 auto NeedsHoisting = [HoistPoint, &VPDT,
2106 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2107 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2108 if (!HoistCandidate)
2109 return nullptr;
2110 VPRegionBlock *EnclosingLoopRegion =
2111 HoistCandidate->getParent()->getEnclosingLoopRegion();
2112 assert((!HoistCandidate->getRegion() ||
2113 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2114 "CFG in VPlan should still be flat, without replicate regions");
2115 // Hoist candidate was already visited, no need to hoist.
2116 if (!Visited.insert(HoistCandidate).second)
2117 return nullptr;
2118
2119    // Candidate is outside the loop region or a header phi; it dominates FOR's
2120    // users without needing to be hoisted.
2121 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
2122 return nullptr;
2123
2124 // If we reached a recipe that dominates HoistPoint, we don't need to
2125 // hoist the recipe.
2126 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
2127 return nullptr;
2128 return HoistCandidate;
2129 };
2130
2131 if (!NeedsHoisting(Previous->getVPSingleValue()))
2132 return true;
2133
2134 // Recursively try to hoist Previous and its operands before all users of FOR.
2135 HoistCandidates.push_back(Previous);
2136
2137 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2138 VPRecipeBase *Current = HoistCandidates[I];
2139 assert(Current->getNumDefinedValues() == 1 &&
2140 "only recipes with a single defined value expected");
2141 if (cannotHoistOrSinkRecipe(*Current))
2142 return false;
2143
2144 for (VPValue *Op : Current->operands()) {
2145 // If we reach FOR, it means the original Previous depends on some other
2146 // recurrence that in turn depends on FOR. If that is the case, we would
2147 // also need to hoist recipes involving the other FOR, which may break
2148 // dependencies.
2149 if (Op == FOR)
2150 return false;
2151
2152 if (auto *R = NeedsHoisting(Op)) {
2153 // Bail out if the recipe defines multiple values.
2154 // TODO: Hoisting such recipes requires additional handling.
2155 if (R->getNumDefinedValues() != 1)
2156 return false;
2157 HoistCandidates.push_back(R);
2158 }
2159 }
2160 }
2161
2162 // Order recipes to hoist by dominance so earlier instructions are processed
2163 // first.
2164 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2165 return VPDT.properlyDominates(A, B);
2166 });
2167
2168 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2169 HoistCandidate->moveBefore(*HoistPoint->getParent(),
2170 HoistPoint->getIterator());
2171 }
2172
2173 return true;
2174}
2175
2177 VPBuilder &LoopBuilder) {
2178 VPDominatorTree VPDT(Plan);
2179
2181 for (VPRecipeBase &R :
2184 RecurrencePhis.push_back(FOR);
2185
2186 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2188 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2189 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2190 // to terminate.
2191 while (auto *PrevPhi =
2193 assert(PrevPhi->getParent() == FOR->getParent());
2194 assert(SeenPhis.insert(PrevPhi).second);
2195 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2196 }
2197
2198 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2199 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2200 return false;
2201
2202 // Introduce a recipe to combine the incoming and previous values of a
2203 // fixed-order recurrence.
2204 VPBasicBlock *InsertBlock = Previous->getParent();
2205 if (isa<VPHeaderPHIRecipe>(Previous))
2206 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2207 else
2208 LoopBuilder.setInsertPoint(InsertBlock,
2209 std::next(Previous->getIterator()));
2210
2211 auto *RecurSplice =
2213 {FOR, FOR->getBackedgeValue()});
2214
2215 FOR->replaceAllUsesWith(RecurSplice);
2216 // Set the first operand of RecurSplice to FOR again, after replacing
2217 // all users.
2218 RecurSplice->setOperand(0, FOR);
2219
2220 // Check for users extracting at the penultimate active lane of the FOR.
2221 // If only a single lane is active in the current iteration, we need to
2222 // select the last element from the previous iteration (from the FOR phi
2223 // directly).
2224 for (VPUser *U : RecurSplice->users()) {
2226 m_Specific(RecurSplice))))
2227 continue;
2228
2230 VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
2231 Type *I64Ty = Type::getInt64Ty(Plan.getContext());
2232 VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 0));
2233 VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 1));
2234 VPValue *PenultimateIndex =
2235 B.createNaryOp(Instruction::Sub, {LastActiveLane, One});
2236 VPValue *PenultimateLastIter =
2237 B.createNaryOp(VPInstruction::ExtractLane,
2238 {PenultimateIndex, FOR->getBackedgeValue()});
2239 VPValue *LastPrevIter =
2240 B.createNaryOp(VPInstruction::ExtractLastElement, FOR);
2241 VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero);
2242 VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter);
2243 cast<VPInstruction>(U)->replaceAllUsesWith(Sel);
2244 }
2245 }
2246 return true;
2247}
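// Sketch of the main rewrite (hypothetical names): for a recurrence
//   %for = phi [ %init, vector.ph ], [ %prev, vector.body ]
// a splice combining the last value of the previous iteration with the
// current values is introduced and takes over all users of the phi:
//   %splice = first-order-recurrence-splice %for, %prev
// so each lane reads the value produced one scalar iteration earlier.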
2248
2250 for (VPRecipeBase &R :
2252 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2253 if (!PhiR)
2254 continue;
2255 RecurKind RK = PhiR->getRecurrenceKind();
2256 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2258 continue;
2259
2260 for (VPUser *U : collectUsersRecursively(PhiR))
2261 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2262 RecWithFlags->dropPoisonGeneratingFlags();
2263 }
2264 }
2265}
2266
2267namespace {
2268struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2269 static bool isSentinel(const VPSingleDefRecipe *Def) {
2270 return Def == getEmptyKey() || Def == getTombstoneKey();
2271 }
2272
2273 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2274 /// return that source element type.
2275 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2276 // All VPInstructions that lower to GEPs must have the i8 source element
2277 // type (as they are PtrAdds), so we omit it.
2279 .Case<VPReplicateRecipe>([](auto *I) -> Type * {
2280 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2281 return GEP->getSourceElementType();
2282 return nullptr;
2283 })
2284 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2285 [](auto *I) { return I->getSourceElementType(); })
2286 .Default([](auto *) { return nullptr; });
2287 }
2288
2289  /// Returns true if recipe \p Def can be safely handled for CSE.
2290 static bool canHandle(const VPSingleDefRecipe *Def) {
2291 // We can extend the list of handled recipes in the future,
2292 // provided we account for the data embedded in them while checking for
2293 // equality or hashing.
2294 auto C = getOpcodeOrIntrinsicID(Def);
2295
2296 // The issue with (Insert|Extract)Value is that the index of the
2297 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2298 // VPlan.
2299 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2300 C->second == Instruction::ExtractValue)))
2301 return false;
2302
2303 // During CSE, we can only handle recipes that don't read from memory: if
2304 // they read from memory, there could be an intervening write to memory
2305 // before the next instance is CSE'd, leading to an incorrect result.
2306 return !Def->mayReadFromMemory();
2307 }
2308
2309 /// Hash the underlying data of \p Def.
2310 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2311 const VPlan *Plan = Def->getParent()->getPlan();
2312 VPTypeAnalysis TypeInfo(*Plan);
2313 hash_code Result = hash_combine(
2314 Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
2315 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2317 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2318 if (RFlags->hasPredicate())
2319 return hash_combine(Result, RFlags->getPredicate());
2320 return Result;
2321 }
2322
2323 /// Check equality of underlying data of \p L and \p R.
2324 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2325 if (isSentinel(L) || isSentinel(R))
2326 return L == R;
2327 if (L->getVPDefID() != R->getVPDefID() ||
2329 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2331 !equal(L->operands(), R->operands()))
2332 return false;
2334 "must have valid opcode info for both recipes");
2335 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2336 if (LFlags->hasPredicate() &&
2337 LFlags->getPredicate() !=
2338 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2339 return false;
2340    // Recipes in replicate regions implicitly depend on the region's predicate.
2341    // If either recipe is in a replicate region, only consider them equal if
2342    // both have the same parent.
2343 const VPRegionBlock *RegionL = L->getRegion();
2344 const VPRegionBlock *RegionR = R->getRegion();
2345 if (((RegionL && RegionL->isReplicator()) ||
2346 (RegionR && RegionR->isReplicator())) &&
2347 L->getParent() != R->getParent())
2348 return false;
2349 const VPlan *Plan = L->getParent()->getPlan();
2350 VPTypeAnalysis TypeInfo(*Plan);
2351 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2352 }
2353};
2354} // end anonymous namespace
2355
2356/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2357/// Plan.
2359 VPDominatorTree VPDT(Plan);
2361
2363 vp_depth_first_deep(Plan.getEntry()))) {
2364 for (VPRecipeBase &R : *VPBB) {
2365 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2366 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2367 continue;
2368 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2369 // V must dominate Def for a valid replacement.
2370 if (!VPDT.dominates(V->getParent(), VPBB))
2371 continue;
2372 // Only keep flags present on both V and Def.
2373 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2374 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2375 Def->replaceAllUsesWith(V);
2376 continue;
2377 }
2378 CSEMap[Def] = Def;
2379 }
2380 }
2381}
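// For example (hypothetical recipes): two identical widened adds
//   %a = WIDEN add %x, %y
//   ...
//   %b = WIDEN add %x, %y
// hash and compare equal, so uses of %b are redirected to %a, provided %a's
// block dominates %b's; IR flags that are not present on both recipes are
// dropped via intersectFlags.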
2382
2383/// Move loop-invariant recipes out of the vector loop region in \p Plan.
2384static void licm(VPlan &Plan) {
2385 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2386
2387 // Hoist any loop invariant recipes from the vector loop region to the
2388  // preheader. Perform a shallow traversal of the vector loop region, to
2389 // exclude recipes in replicate regions. Since the top-level blocks in the
2390 // vector loop region are guaranteed to execute if the vector pre-header is,
2391 // we don't need to check speculation safety.
2392 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2393 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2394         "Expected vector preheader's successor to be the vector loop region");
2396 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2397 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2399 continue;
2400 if (any_of(R.operands(), [](VPValue *Op) {
2401 return !Op->isDefinedOutsideLoopRegions();
2402 }))
2403 continue;
2404 R.moveBefore(*Preheader, Preheader->end());
2405 }
2406 }
2407}
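// For example (hypothetical recipes): a recipe in the vector loop region
// whose operands are all defined outside any loop region, e.g.
//   %inv = WIDEN mul %live.in.a, %live.in.b
// is moved to the vector preheader so it executes only once, provided it is
// otherwise safe to hoist (in particular, it does not access memory).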
2408
2410 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2411 if (Plan.hasScalarVFOnly())
2412 return;
2413 // Keep track of created truncates, so they can be re-used. Note that we
2414  // cannot use RAUW after creating a new truncate, as this could make
2415 // other uses have different types for their operands, making them invalidly
2416 // typed.
2418 VPTypeAnalysis TypeInfo(Plan);
2419 VPBasicBlock *PH = Plan.getVectorPreheader();
2422 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2425 &R))
2426 continue;
2427
2428 VPValue *ResultVPV = R.getVPSingleValue();
2429 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2430 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2431 if (!NewResSizeInBits)
2432 continue;
2433
2434 // If the value wasn't vectorized, we must maintain the original scalar
2435 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2436 // skip casts which do not need to be handled explicitly here, as
2437 // redundant casts will be removed during recipe simplification.
2439 continue;
2440
2441 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2442 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2443 assert(OldResTy->isIntegerTy() && "only integer types supported");
2444 (void)OldResSizeInBits;
2445
2446 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2447
2448 // Any wrapping introduced by shrinking this operation shouldn't be
2449 // considered undefined behavior. So, we can't unconditionally copy
2450 // arithmetic wrapping flags to VPW.
2451 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2452 VPW->dropPoisonGeneratingFlags();
2453
2454 if (OldResSizeInBits != NewResSizeInBits &&
2455 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2456 // Extend result to original width.
2457 auto *Ext =
2458 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
2459 Ext->insertAfter(&R);
2460 ResultVPV->replaceAllUsesWith(Ext);
2461 Ext->setOperand(0, ResultVPV);
2462 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2463 } else {
2464 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2465 "Only ICmps should not need extending the result.");
2466 }
2467
2468 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2470 continue;
2471
2472 // Shrink operands by introducing truncates as needed.
2473 unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
2474 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2475 auto *Op = R.getOperand(Idx);
2476 unsigned OpSizeInBits =
2478 if (OpSizeInBits == NewResSizeInBits)
2479 continue;
2480 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2481 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2482 if (!IterIsEmpty) {
2483 R.setOperand(Idx, ProcessedIter->second);
2484 continue;
2485 }
2486
2487 VPBuilder Builder;
2488 if (Op->isLiveIn())
2489 Builder.setInsertPoint(PH);
2490 else
2491 Builder.setInsertPoint(&R);
2492 VPWidenCastRecipe *NewOp =
2493 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2494 ProcessedIter->second = NewOp;
2495 R.setOperand(Idx, NewOp);
2496 }
2497
2498 }
2499 }
2500}
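// Worked sketch (hypothetical MinBWs entry): if MinBWs records that a widened
// i32 add only needs 8 bits, its operands are truncated to i8, and a zext
// back to i32 is inserted after the (now narrower) add so existing users keep
// their original type; wrapping flags are dropped, since the narrowed
// operation may wrap where the original did not.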
2501
2505 VPValue *Cond;
2506 // Skip blocks that are not terminated by BranchOnCond.
2507 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2508 continue;
2509
2510 assert(VPBB->getNumSuccessors() == 2 &&
2511 "Two successors expected for BranchOnCond");
2512 unsigned RemovedIdx;
2513 if (match(Cond, m_True()))
2514 RemovedIdx = 1;
2515 else if (match(Cond, m_False()))
2516 RemovedIdx = 0;
2517 else
2518 continue;
2519
2520 VPBasicBlock *RemovedSucc =
2521 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2522 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2523 "There must be a single edge between VPBB and its successor");
2524    // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2525 // these recipes.
2526 for (VPRecipeBase &R : RemovedSucc->phis())
2527 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2528
2529 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2530 // automatically on VPlan destruction if it becomes unreachable.
2531 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2532 VPBB->back().eraseFromParent();
2533 }
2534}
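// For example (hypothetical blocks): a block ending in
//   branch-on-cond true, succ0, succ1
// always transfers control to succ0, so the edge to succ1 is disconnected,
// succ1's phis drop the incoming value for this block, and the terminator is
// erased, leaving a single unconditional edge to succ0.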
2535
2555
2556// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2557// the loop terminator with a branch-on-cond recipe with the negated
2558// active-lane-mask as operand. Note that this turns the loop into an
2559// uncountable one. Only the existing terminator is replaced, all other existing
2560// recipes/users remain unchanged, except for poison-generating flags being
2561// dropped from the canonical IV increment. Return the created
2562// VPActiveLaneMaskPHIRecipe.
2563//
2564// The function uses the following definitions:
2565//
2566// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
2567// calculate-trip-count-minus-VF (original TC) : original TC
2568// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
2569// CanonicalIVPhi : CanonicalIVIncrement
2570// %StartV is the canonical induction start value.
2571//
2572// The function adds the following recipes:
2573//
2574// vector.ph:
2575// %TripCount = calculate-trip-count-minus-VF (original TC)
2576// [if DataWithControlFlowWithoutRuntimeCheck]
2577// %EntryInc = canonical-iv-increment-for-part %StartV
2578// %EntryALM = active-lane-mask %EntryInc, %TripCount
2579//
2580// vector.body:
2581// ...
2582// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2583// ...
2584// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2585// %ALM = active-lane-mask %InLoopInc, TripCount
2586// %Negated = Not %ALM
2587// branch-on-cond %Negated
2588//
2591 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2592 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2593 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2594 VPValue *StartV = CanonicalIVPHI->getStartValue();
2595
2596 auto *CanonicalIVIncrement =
2597 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2598 // TODO: Check if dropping the flags is needed if
2599 // !DataAndControlFlowWithoutRuntimeCheck.
2600 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2601 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2602 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2603 // we have to take unrolling into account. Each part needs to start at
2604 // Part * VF
2605 auto *VecPreheader = Plan.getVectorPreheader();
2606 VPBuilder Builder(VecPreheader);
2607
2608 // Create the ActiveLaneMask instruction using the correct start values.
2609 VPValue *TC = Plan.getTripCount();
2610
2611 VPValue *TripCount, *IncrementValue;
2613 // When the loop is guarded by a runtime overflow check for the loop
2614 // induction variable increment by VF, we can increment the value before
2615    // the get.active.lane.mask intrinsic and use the unmodified trip count.
2616 IncrementValue = CanonicalIVIncrement;
2617 TripCount = TC;
2618 } else {
2619 // When avoiding a runtime check, the active.lane.mask inside the loop
2620 // uses a modified trip count and the induction variable increment is
2621 // done after the active.lane.mask intrinsic is called.
2622 IncrementValue = CanonicalIVPHI;
2623 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2624 {TC}, DL);
2625 }
2626 auto *EntryIncrement = Builder.createOverflowingOp(
2627 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2628 "index.part.next");
2629
2630 // Create the active lane mask instruction in the VPlan preheader.
2631 VPValue *ALMMultiplier =
2632 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2633 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2634 {EntryIncrement, TC, ALMMultiplier}, DL,
2635 "active.lane.mask.entry");
2636
2637 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2638 // preheader ActiveLaneMask instruction.
2639 auto *LaneMaskPhi =
2641 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2642
2643 // Create the active lane mask for the next iteration of the loop before the
2644 // original terminator.
2645 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2646 Builder.setInsertPoint(OriginalTerminator);
2647 auto *InLoopIncrement =
2648 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2649 {IncrementValue}, {false, false}, DL);
2650 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2651 {InLoopIncrement, TripCount, ALMMultiplier},
2652 DL, "active.lane.mask.next");
2653 LaneMaskPhi->addOperand(ALM);
2654
2655 // Replace the original terminator with BranchOnCond. We have to invert the
2656 // mask here because a true condition means jumping to the exit block.
2657 auto *NotMask = Builder.createNot(ALM, DL);
2658 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2659 OriginalTerminator->eraseFromParent();
2660 return LaneMaskPhi;
2661}
2662
2663/// Collect the header mask with the pattern:
2664/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2665/// TODO: Introduce explicit recipe for header-mask instead of searching
2666/// for the header-mask pattern manually.
2668 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2669 SmallVector<VPValue *> WideCanonicalIVs;
2670 auto *FoundWidenCanonicalIVUser = find_if(
2672 assert(count_if(LoopRegion->getCanonicalIV()->users(),
2674 "Must have at most one VPWideCanonicalIVRecipe");
2675 if (FoundWidenCanonicalIVUser !=
2676 LoopRegion->getCanonicalIV()->users().end()) {
2677 auto *WideCanonicalIV =
2678 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2679 WideCanonicalIVs.push_back(WideCanonicalIV);
2680 }
2681
2682 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2683 // version of the canonical induction.
2684 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
2685 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2686 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2687 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2688 WideCanonicalIVs.push_back(WidenOriginalIV);
2689 }
2690
2691 // Walk users of wide canonical IVs and find the single compare of the form
2692 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2693 VPSingleDefRecipe *HeaderMask = nullptr;
2694 for (auto *Wide : WideCanonicalIVs) {
2695 for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
2696 auto *VPI = dyn_cast<VPInstruction>(U);
2697 if (!VPI || !vputils::isHeaderMask(VPI, Plan))
2698 continue;
2699
2700 assert(VPI->getOperand(0) == Wide &&
2701 "WidenCanonicalIV must be the first operand of the compare");
2702 assert(!HeaderMask && "Multiple header masks found?");
2703 HeaderMask = VPI;
2704 }
2705 }
2706 return HeaderMask;
2707}
2708
2710 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2713 UseActiveLaneMaskForControlFlow) &&
2714 "DataAndControlFlowWithoutRuntimeCheck implies "
2715 "UseActiveLaneMaskForControlFlow");
2716
2717 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2718 auto *FoundWidenCanonicalIVUser = find_if(
2720 assert(FoundWidenCanonicalIVUser &&
2721 "Must have widened canonical IV when tail folding!");
2722 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2723 auto *WideCanonicalIV =
2724 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2725 VPSingleDefRecipe *LaneMask;
2726 if (UseActiveLaneMaskForControlFlow) {
2729 } else {
2730 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2731 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2732 ConstantInt::get(LoopRegion->getCanonicalIVType(), 1));
2733 LaneMask =
2734 B.createNaryOp(VPInstruction::ActiveLaneMask,
2735 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2736 nullptr, "active.lane.mask");
2737 }
2738
2739 // Walk users of WideCanonicalIV and replace the header mask of the form
2740 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2741 // removing the old one to ensure there is always only a single header mask.
2742 HeaderMask->replaceAllUsesWith(LaneMask);
2743 HeaderMask->eraseFromParent();
2744}
2745
2746template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2747 Op0_t In;
2749
2750 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2751
2752 template <typename OpTy> bool match(OpTy *V) const {
2753 if (m_Specific(In).match(V)) {
2754 Out = nullptr;
2755 return true;
2756 }
2757 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
2758 }
2759};
2760
2761/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
2762/// Returns the remaining part \p Out if so, or nullptr otherwise.
2763template <typename Op0_t, typename Op1_t>
2764static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2765 Op1_t &Out) {
2766 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2767}
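// A minimal usage sketch (hypothetical values): with header mask %hm,
//   match(V, m_RemoveMask(HM, Out))
// succeeds both for V == %hm (setting Out to nullptr) and for
// V == logical-and(%hm, %rest) (setting Out to %rest), which lets
// optimizeMaskToEVL peel the header mask off a recipe's mask operand.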
2768
2769/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2770/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2771/// recipe could be created.
2772/// \p HeaderMask Header Mask.
2773/// \p CurRecipe Recipe to be transformed.
2774/// \p TypeInfo VPlan-based type analysis.
2775/// \p EVL The explicit vector length parameter of vector-predication
2776/// intrinsics.
2778 VPRecipeBase &CurRecipe,
2779 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
2780 VPlan *Plan = CurRecipe.getParent()->getPlan();
2781 DebugLoc DL = CurRecipe.getDebugLoc();
2782 VPValue *Addr, *Mask, *EndPtr;
2783
2784  /// Adjust any end pointers so that they point to the end of EVL lanes, not VF.
2785 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2786 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2787 EVLEndPtr->insertBefore(&CurRecipe);
2788 EVLEndPtr->setOperand(1, &EVL);
2789 return EVLEndPtr;
2790 };
2791
2792 if (match(&CurRecipe,
2793 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
2794 !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2795 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
2796 EVL, Mask);
2797
2798 if (match(&CurRecipe,
2799 m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
2800 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2801 cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2802 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
2803 AdjustEndPtr(EndPtr), EVL, Mask);
2804
2805 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(),
2806 m_RemoveMask(HeaderMask, Mask))) &&
2807 !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2808 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
2809 EVL, Mask);
2810
2811 if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(),
2812 m_RemoveMask(HeaderMask, Mask))) &&
2813 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2814 cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2815 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
2816 AdjustEndPtr(EndPtr), EVL, Mask);
2817
2818 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
2819 if (Rdx->isConditional() &&
2820 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
2821 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
2822
2823 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
2824 if (Interleave->getMask() &&
2825 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
2826 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
2827
2828 VPValue *LHS, *RHS;
2829 if (match(&CurRecipe,
2830 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
2831 return new VPWidenIntrinsicRecipe(
2832 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
2833 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2834
2835 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
2836 m_VPValue(RHS))))
2837 return new VPWidenIntrinsicRecipe(
2838 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
2839 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2840
2841 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
2842 Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
2843 VPValue *ZExt =
2844 VPBuilder(&CurRecipe).createScalarCast(Instruction::ZExt, &EVL, Ty, DL);
2845 return new VPInstruction(Instruction::Sub,
2846 {ZExt, Plan->getConstantInt(Ty, 1)}, {}, {}, DL);
2847 }
2848
2849 return nullptr;
2850}
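// For example (hypothetical recipes): a widened load masked only by the
// header mask is rewritten to its EVL variant with the mask removed,
//   WIDEN load %addr, %header.mask  ->  WIDEN-LOAD-EVL %addr, %evl
// and a select whose condition is the header mask becomes a vp.merge whose
// length operand is %evl.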
2851
2852/// Replace recipes with their EVL variants.
2854 VPTypeAnalysis TypeInfo(Plan);
2855 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2856 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2857
2858 assert(all_of(Plan.getVF().users(),
2861 "User of VF that we can't transform to EVL.");
2862 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2864 });
2865
2866 assert(all_of(Plan.getVFxUF().users(),
2867 [&LoopRegion, &Plan](VPUser *U) {
2868 return match(U,
2869 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
2870 m_Specific(&Plan.getVFxUF()))) ||
2871 isa<VPWidenPointerInductionRecipe>(U);
2872 }) &&
2873 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
2874 "increment of the canonical induction.");
2875 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2876    // Only replace uses in VPWidenPointerInductionRecipe; the increment of the
2877 // canonical induction must not be updated.
2879 });
2880
2881 // Defer erasing recipes till the end so that we don't invalidate the
2882 // VPTypeAnalysis cache.
2884
2885  // Create a scalar phi to track the previous EVL if the plan contains any
2886  // fixed-order recurrences.
2887 bool ContainsFORs =
2889 if (ContainsFORs) {
2890 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
2891 VPValue *MaxEVL = &Plan.getVF();
2892  // Emit VPScalarCastRecipe in the preheader if VF is not a 32-bit integer.
2893 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
2894 MaxEVL = Builder.createScalarZExtOrTrunc(
2895 MaxEVL, Type::getInt32Ty(Plan.getContext()),
2896 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
2897
2898 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
2899 VPValue *PrevEVL = Builder.createScalarPhi(
2900 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
2901
2904 for (VPRecipeBase &R : *VPBB) {
2905 VPValue *V1, *V2;
2906 if (!match(&R,
2908 m_VPValue(V1), m_VPValue(V2))))
2909 continue;
2910 VPValue *Imm = Plan.getOrAddLiveIn(
2913 Intrinsic::experimental_vp_splice,
2914 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
2915 TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
2916 R.getDebugLoc());
2917 VPSplice->insertBefore(&R);
2918 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
2919 ToErase.push_back(&R);
2920 }
2921 }
2922 }
2923
2924 VPValue *HeaderMask = findHeaderMask(Plan);
2925 if (!HeaderMask)
2926 return;
2927
2928 // Replace header masks with a mask equivalent to predicating by EVL:
2929 //
2930 // icmp ule widen-canonical-iv backedge-taken-count
2931 // ->
2932 // icmp ult step-vector, EVL
2933 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
2934 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
2935 Type *EVLType = TypeInfo.inferScalarType(&EVL);
2936 VPValue *EVLMask = Builder.createICmp(
2938 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
2939 HeaderMask->replaceAllUsesWith(EVLMask);
2940 ToErase.push_back(HeaderMask->getDefiningRecipe());
2941
2942 // Try to optimize header mask recipes away to their EVL variants.
2943 // TODO: Split optimizeMaskToEVL out and move into
2944 // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
2945 // tryToBuildVPlanWithVPRecipes beforehand.
2946 for (VPUser *U : collectUsersRecursively(EVLMask)) {
2947 auto *CurRecipe = cast<VPRecipeBase>(U);
2948 VPRecipeBase *EVLRecipe =
2949 optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, EVL);
2950 if (!EVLRecipe)
2951 continue;
2952
2953 unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
2954 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
2955 "New recipe must define the same number of values as the "
2956 "original.");
2957 EVLRecipe->insertBefore(CurRecipe);
2959 EVLRecipe)) {
2960 for (unsigned I = 0; I < NumDefVal; ++I) {
2961 VPValue *CurVPV = CurRecipe->getVPValue(I);
2962 CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
2963 }
2964 }
2965 ToErase.push_back(CurRecipe);
2966 }
2967 // Remove dead EVL mask.
2968 if (EVLMask->getNumUsers() == 0)
2969 ToErase.push_back(EVLMask->getDefiningRecipe());
2970
2971 for (VPRecipeBase *R : reverse(ToErase)) {
2972 SmallVector<VPValue *> PossiblyDead(R->operands());
2973 R->eraseFromParent();
2974 for (VPValue *Op : PossiblyDead)
2976 }
2977}
2978
2979/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
2980/// replace all uses of VPCanonicalIVPHIRecipe, except the canonical IV
2981/// increment, with the VPEVLBasedIVPHIRecipe. The VPCanonicalIVPHIRecipe
2982/// is only used for counting loop iterations after this transformation.
2983///
2984/// The function uses the following definitions:
2985/// %StartV is the canonical induction start value.
2986///
2987/// The function adds the following recipes:
2988///
2989/// vector.ph:
2990/// ...
2991///
2992/// vector.body:
2993/// ...
2994/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2995/// [ %NextEVLIV, %vector.body ]
2996/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2997/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
2998/// ...
2999/// %OpEVL = cast i32 %VPEVL to IVSize
3000/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
3001/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3002/// ...
3003///
3004/// If MaxSafeElements is provided, the function adds the following recipes:
3005/// vector.ph:
3006/// ...
3007///
3008/// vector.body:
3009/// ...
3010/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
3011/// [ %NextEVLIV, %vector.body ]
3012/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3013/// %cmp = cmp ult %AVL, MaxSafeElements
3014/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3015/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3016/// ...
3017/// %OpEVL = cast i32 %VPEVL to IVSize
3018/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
3019/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3020/// ...
3021///
3023 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3024 if (Plan.hasScalarVFOnly())
3025 return;
3026 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3027 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3028
3029 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
3030 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3031 VPValue *StartV = CanonicalIVPHI->getStartValue();
3032
3033 // Create the ExplicitVectorLengthPhi recipe in the main loop.
3034 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
3035 EVLPhi->insertAfter(CanonicalIVPHI);
3036 VPBuilder Builder(Header, Header->getFirstNonPhi());
3037 // Create the AVL (application vector length), starting from TC -> 0 in steps
3038 // of EVL.
3039 VPPhi *AVLPhi = Builder.createScalarPhi(
3040 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3041 VPValue *AVL = AVLPhi;
3042
3043 if (MaxSafeElements) {
3044    // Cap the AVL at MaxSafeElements for correct loop emission.
3045 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3046 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3047 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3048 "safe_avl");
3049 }
3050 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3052
3053 auto *CanonicalIVIncrement =
3054 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
3055 Builder.setInsertPoint(CanonicalIVIncrement);
3056 VPValue *OpVPEVL = VPEVL;
3057
3058 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3059 OpVPEVL = Builder.createScalarZExtOrTrunc(
3060 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3061
3062 auto *NextEVLIV = Builder.createOverflowingOp(
3063 Instruction::Add, {OpVPEVL, EVLPhi},
3064 {CanonicalIVIncrement->hasNoUnsignedWrap(),
3065 CanonicalIVIncrement->hasNoSignedWrap()},
3066 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
3067 EVLPhi->addOperand(NextEVLIV);
3068
3069 VPValue *NextAVL = Builder.createOverflowingOp(
3070 Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},
3071 DebugLoc::getCompilerGenerated(), "avl.next");
3072 AVLPhi->addOperand(NextAVL);
3073
3074 transformRecipestoEVLRecipes(Plan, *VPEVL);
3075
3076 // Replace all uses of VPCanonicalIVPHIRecipe by
3077 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
3078 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
3079 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
3080 // TODO: support unroll factor > 1.
3081 Plan.setUF(1);
3082}
3083
3085 // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
3086 // There should be only one EVL PHI in the entire plan.
3087 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
3088
3091 for (VPRecipeBase &R : VPBB->phis())
3092 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
3093 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
3094 EVLPhi = PhiR;
3095 }
3096
3097 // Early return if no EVL PHI is found.
3098 if (!EVLPhi)
3099 return;
3100
3101 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
3102 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
3103 VPValue *AVL;
3104 [[maybe_unused]] bool FoundAVL =
3105 match(EVLIncrement,
3106 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
3107 assert(FoundAVL && "Didn't find AVL?");
3108
3109 // The AVL may be capped to a safe distance.
3110 VPValue *SafeAVL;
3111 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
3112 AVL = SafeAVL;
3113
3114 VPValue *AVLNext;
3115 [[maybe_unused]] bool FoundAVLNext =
3117 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3118 assert(FoundAVLNext && "Didn't find AVL backedge?");
3119
3120 // Convert EVLPhi to concrete recipe.
3121 auto *ScalarR =
3122 VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
3123 EVLPhi->getDebugLoc(), "evl.based.iv");
3124 EVLPhi->replaceAllUsesWith(ScalarR);
3125 EVLPhi->eraseFromParent();
3126
3127 // Replace CanonicalIVInc with EVL-PHI increment.
3128 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3129 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
3130 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
3131 m_Specific(&Plan.getVFxUF()))) &&
3132 "Unexpected canonical iv");
3133 Backedge->replaceAllUsesWith(EVLIncrement);
3134
3135 // Remove unused phi and increment.
3136 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
3137 CanonicalIVIncrement->eraseFromParent();
3138 CanonicalIV->eraseFromParent();
3139
3140 // Replace the use of VectorTripCount in the latch-exiting block.
3141 // Before: (branch-on-count EVLIVInc, VectorTripCount)
3142 // After: (branch-on-cond eq AVLNext, 0)
3143
3144 VPBasicBlock *LatchExiting =
3145 HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
3146 auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
3147 // Skip single-iteration loop region
3148 if (match(LatchExitingBr, m_BranchOnCond(m_True())))
3149 return;
3150 assert(LatchExitingBr &&
3151 match(LatchExitingBr,
3152 m_BranchOnCount(m_VPValue(EVLIncrement),
3153 m_Specific(&Plan.getVectorTripCount()))) &&
3154 "Unexpected terminator in EVL loop");
3155
3156 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3157 VPBuilder Builder(LatchExitingBr);
3158 VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
3159 Plan.getConstantInt(AVLTy, 0));
3160 Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
3161 LatchExitingBr->eraseFromParent();
3162}
3163
3165 VPlan &Plan, PredicatedScalarEvolution &PSE,
3166 const DenseMap<Value *, const SCEV *> &StridesMap) {
3167  // Replace VPValues for known constant strides guaranteed by predicated
3168  // scalar evolution.
3169 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3170 auto *R = cast<VPRecipeBase>(&U);
3171 return R->getRegion() ||
3172 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3173 };
3174 ValueToSCEVMapTy RewriteMap;
3175 for (const SCEV *Stride : StridesMap.values()) {
3176 using namespace SCEVPatternMatch;
3177 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3178 const APInt *StrideConst;
3179 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3180 // Only handle constant strides for now.
3181 continue;
3182
3183 auto *CI = Plan.getConstantInt(*StrideConst);
3184 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3185 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3186
3187 // The versioned value may not be used in the loop directly but through a
3188 // sext/zext. Add new live-ins in those cases.
3189 for (Value *U : StrideV->users()) {
3191 continue;
3192 VPValue *StrideVPV = Plan.getLiveIn(U);
3193 if (!StrideVPV)
3194 continue;
3195 unsigned BW = U->getType()->getScalarSizeInBits();
3196 APInt C =
3197 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3198 VPValue *CI = Plan.getConstantInt(C);
3199 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3200 }
3201 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3202 }
3203
3204 for (VPRecipeBase &R : *Plan.getEntry()) {
3205 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3206 if (!ExpSCEV)
3207 continue;
3208 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3209 auto *NewSCEV =
3210 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3211 if (NewSCEV != ScevExpr) {
3212 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3213 ExpSCEV->replaceAllUsesWith(NewExp);
3214 if (Plan.getTripCount() == ExpSCEV)
3215 Plan.resetTripCount(NewExp);
3216 }
3217 }
3218}
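
// Illustrative sketch (not part of VPlanTransforms.cpp): the effect of stride
// versioning modeled in plain C++. When the SCEV predicates of PSE guarantee
// Stride == 1 at runtime, uses of the stride in the vectorized loop body can be
// replaced by the literal constant, so A[I * Stride] becomes the consecutive
// access A[I]. The function below is hypothetical.
#include <cstddef>

void axpyStrided(float *A, const float *B, size_t N, size_t Stride) {
  if (Stride == 1) {
    // Versioned loop: the stride folds to the constant 1; accesses are
    // consecutive and suitable for wide loads and stores.
    for (size_t I = 0; I < N; ++I)
      A[I] += B[I];
  } else {
    // Fallback loop keeps the symbolic stride.
    for (size_t I = 0; I < N; ++I)
      A[I * Stride] += B[I * Stride];
  }
}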
3219
3221 VPlan &Plan,
3222 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3223 // Collect recipes in the backward slice of `Root` that may generate a poison
3224 // value that is used after vectorization.
3226 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3228 Worklist.push_back(Root);
3229
3230 // Traverse the backward slice of Root through its use-def chain.
3231 while (!Worklist.empty()) {
3232 VPRecipeBase *CurRec = Worklist.pop_back_val();
3233
3234 if (!Visited.insert(CurRec).second)
3235 continue;
3236
3237 // Prune search if we find another recipe generating a widen memory
3238 // instruction. Widen memory instructions involved in address computation
3239 // will lead to gather/scatter instructions, which don't need to be
3240 // handled.
3242 VPHeaderPHIRecipe>(CurRec))
3243 continue;
3244
3245 // This recipe contributes to the address computation of a widen
3246 // load/store. If the underlying instruction has poison-generating flags,
3247 // drop them directly.
3248 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3249 VPValue *A, *B;
3250 // Dropping disjoint from an OR may yield incorrect results, as some
3251 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3252 // for dependence analysis). Instead, replace it with an equivalent Add.
3253 // This is possible as all users of the disjoint OR only access lanes
3254 // where the operands are disjoint or poison otherwise.
3255 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3256 RecWithFlags->isDisjoint()) {
3257 VPBuilder Builder(RecWithFlags);
3258 VPInstruction *New = Builder.createOverflowingOp(
3259 Instruction::Add, {A, B}, {false, false},
3260 RecWithFlags->getDebugLoc());
3261 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3262 RecWithFlags->replaceAllUsesWith(New);
3263 RecWithFlags->eraseFromParent();
3264 CurRec = New;
3265 } else
3266 RecWithFlags->dropPoisonGeneratingFlags();
3267 } else {
3270 (void)Instr;
3271 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3272 "found instruction with poison generating flags not covered by "
3273 "VPRecipeWithIRFlags");
3274 }
3275
3276 // Add new definitions to the worklist.
3277 for (VPValue *Operand : CurRec->operands())
3278 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3279 Worklist.push_back(OpDef);
3280 }
3281 });
3282
3283 // Traverse all the recipes in the VPlan and collect the poison-generating
3284 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3285 // VPInterleaveRecipe.
3286 auto Iter = vp_depth_first_deep(Plan.getEntry());
3288 for (VPRecipeBase &Recipe : *VPBB) {
3289 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3290 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3291 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3292 if (AddrDef && WidenRec->isConsecutive() &&
3293 BlockNeedsPredication(UnderlyingInstr.getParent()))
3294 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3295 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3296 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3297 if (AddrDef) {
3298 // Check if any member of the interleave group needs predication.
3299 const InterleaveGroup<Instruction> *InterGroup =
3300 InterleaveRec->getInterleaveGroup();
3301 bool NeedPredication = false;
3302 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3303 I < NumMembers; ++I) {
3304 Instruction *Member = InterGroup->getMember(I);
3305 if (Member)
3306 NeedPredication |= BlockNeedsPredication(Member->getParent());
3307 }
3308
3309 if (NeedPredication)
3310 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3311 }
3312 }
3313 }
3314 }
3315}
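
// Illustrative sketch (not part of VPlanTransforms.cpp): why a disjoint OR in
// an address computation can be replaced by an Add rather than merely having
// its flag dropped. With no common set bits there is no carry, so OR and ADD
// produce the same value; analyses that already treated the OR as an Add
// (e.g. SCEV) therefore stay consistent.
#include <cassert>
#include <cstdint>

int main() {
  uint64_t Base = 0xF0; // hypothetical aligned base offset
  uint64_t Lane = 0x07; // low bits guaranteed disjoint from Base
  assert((Base & Lane) == 0 && "operands are disjoint");
  assert((Base | Lane) == (Base + Lane) && "or-disjoint equals add");
  return 0;
}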
3316
3318 VPlan &Plan,
3320 &InterleaveGroups,
3321 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3322 if (InterleaveGroups.empty())
3323 return;
3324
3325 // Interleave memory: for each Interleave Group we marked earlier as relevant
3326 // for this VPlan, replace the Recipes widening its memory instructions with a
3327 // single VPInterleaveRecipe at its insertion point.
3328 VPDominatorTree VPDT(Plan);
3329 for (const auto *IG : InterleaveGroups) {
3330 auto *Start =
3331 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3332 VPIRMetadata InterleaveMD(*Start);
3333 SmallVector<VPValue *, 4> StoredValues;
3334 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3335 StoredValues.push_back(StoreR->getStoredValue());
3336 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3337 Instruction *MemberI = IG->getMember(I);
3338 if (!MemberI)
3339 continue;
3340 VPWidenMemoryRecipe *MemoryR =
3341 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3342 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3343 StoredValues.push_back(StoreR->getStoredValue());
3344 InterleaveMD.intersect(*MemoryR);
3345 }
3346
3347 bool NeedsMaskForGaps =
3348 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3349 (!StoredValues.empty() && !IG->isFull());
3350
3351 Instruction *IRInsertPos = IG->getInsertPos();
3352 auto *InsertPos =
3353 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3354
3356 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3357 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3358 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3359
3360 // Get or create the start address for the interleave group.
3361 VPValue *Addr = Start->getAddr();
3362 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3363 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3364 // We cannot re-use the address of member zero because it does not
3365 // dominate the insert position. Instead, use the address of the insert
3366 // position and create a PtrAdd adjusting it to the address of member
3367 // zero.
3368 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3369 // InsertPos or sink loads above zero members to join it.
3370 assert(IG->getIndex(IRInsertPos) != 0 &&
3371 "index of insert position shouldn't be zero");
3372 auto &DL = IRInsertPos->getDataLayout();
3373 APInt Offset(32,
3374 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3375 IG->getIndex(IRInsertPos),
3376 /*IsSigned=*/true);
3377 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3378 VPBuilder B(InsertPos);
3379 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3380 }
3381 // If the group is reverse, adjust the index to refer to the last vector
3382 // lane instead of the first. We adjust the index from the first vector
3383 // lane, rather than directly getting the pointer for lane VF - 1, because
3384 // the pointer operand of the interleaved access is supposed to be uniform.
3385 if (IG->isReverse()) {
3386 auto *ReversePtr = new VPVectorEndPointerRecipe(
3387 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3388 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3389 ReversePtr->insertBefore(InsertPos);
3390 Addr = ReversePtr;
3391 }
3392 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3393 InsertPos->getMask(), NeedsMaskForGaps,
3394 InterleaveMD, InsertPos->getDebugLoc());
3395 VPIG->insertBefore(InsertPos);
3396
3397 unsigned J = 0;
3398 for (unsigned i = 0; i < IG->getFactor(); ++i)
3399 if (Instruction *Member = IG->getMember(i)) {
3400 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3401 if (!Member->getType()->isVoidTy()) {
3402 VPValue *OriginalV = MemberR->getVPSingleValue();
3403 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3404 J++;
3405 }
3406 MemberR->eraseFromParent();
3407 }
3408 }
3409}
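
// Illustrative sketch (not part of VPlanTransforms.cpp): the byte offset used
// above to rebase the group address from the insert position's member back to
// member 0. The element size and member index are hypothetical.
#include <cassert>
#include <cstdint>

int main() {
  uint64_t AllocSize = 4;   // DL.getTypeAllocSize(getLoadStoreType(IRInsertPos))
  uint64_t MemberIndex = 2; // IG->getIndex(IRInsertPos), non-zero by assertion
  int64_t Offset = -static_cast<int64_t>(AllocSize * MemberIndex);
  // Addr(member 0) = PtrAdd(Addr(insert position), Offset), 8 bytes lower here.
  assert(Offset == -8);
  return 0;
}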
3410
3411/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3412/// value, phi and backedge value. In the following example:
3413///
3414/// vector.ph:
3415/// Successor(s): vector loop
3416///
3417/// <x1> vector loop: {
3418/// vector.body:
3419/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3420/// ...
3421/// EMIT branch-on-count ...
3422/// No successors
3423/// }
3424///
3425/// WIDEN-INDUCTION will get expanded to:
3426///
3427/// vector.ph:
3428/// ...
3429/// vp<%induction.start> = ...
3430/// vp<%induction.increment> = ...
3431///
3432/// Successor(s): vector loop
3433///
3434/// <x1> vector loop: {
3435/// vector.body:
3436/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3437/// ...
3438/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3439/// EMIT branch-on-count ...
3440/// No successors
3441/// }
3442static void
3444 VPTypeAnalysis &TypeInfo) {
3445 VPlan *Plan = WidenIVR->getParent()->getPlan();
3446 VPValue *Start = WidenIVR->getStartValue();
3447 VPValue *Step = WidenIVR->getStepValue();
3448 VPValue *VF = WidenIVR->getVFValue();
3449 DebugLoc DL = WidenIVR->getDebugLoc();
3450
3451 // The value from the original loop to which we are mapping the new induction
3452 // variable.
3453 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3454
3455 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3458 VPIRFlags Flags = *WidenIVR;
3459 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3460 AddOp = Instruction::Add;
3461 MulOp = Instruction::Mul;
3462 } else {
3463 AddOp = ID.getInductionOpcode();
3464 MulOp = Instruction::FMul;
3465 }
3466
3467 // If the phi is truncated, truncate the start and step values.
3468 VPBuilder Builder(Plan->getVectorPreheader());
3469 Type *StepTy = TypeInfo.inferScalarType(Step);
3470 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3471 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3472 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3473 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3474 // Truncation doesn't preserve WrapFlags.
3475 Flags.dropPoisonGeneratingFlags();
3476 StepTy = Ty;
3477 }
3478
3479 // Construct the initial value of the vector IV in the vector loop preheader.
3480 Type *IVIntTy =
3482 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3483 if (StepTy->isFloatingPointTy())
3484 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3485
3486 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3487 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3488
3489 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3490 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3491 DebugLoc::getUnknown(), "induction");
3492
3493 // Create the widened phi of the vector IV.
3494 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3495 WidenIVR->getDebugLoc(), "vec.ind");
3496 WidePHI->insertBefore(WidenIVR);
3497
3498 // Create the backedge value for the vector IV.
3499 VPValue *Inc;
3500 VPValue *Prev;
3501 // If unrolled, use the increment and prev value from the operands.
3502 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3503 Inc = SplatVF;
3504 Prev = WidenIVR->getLastUnrolledPartOperand();
3505 } else {
3506 if (VPRecipeBase *R = VF->getDefiningRecipe())
3507 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3508 // Multiply the vectorization factor by the step using integer or
3509 // floating-point arithmetic as appropriate.
3510 if (StepTy->isFloatingPointTy())
3511 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3512 DL);
3513 else
3514 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3515 TypeInfo.inferScalarType(VF), DL);
3516
3517 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3518 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3519 Prev = WidePHI;
3520 }
3521
3523 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3524 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3525 WidenIVR->getDebugLoc(), "vec.ind.next");
3526
3527 WidePHI->addOperand(Next);
3528
3529 WidenIVR->replaceAllUsesWith(WidePHI);
3530}
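
// Illustrative sketch (not part of VPlanTransforms.cpp): the per-lane values
// produced by the expansion above for an integer induction, with hypothetical
// start, step and VF.
//   induction lane l    = start + l * step      (broadcast + stepvector * step)
//   vec.ind.next lane l = vec.ind lane l + VF * step
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const uint64_t VF = 4, Start = 10, Step = 3;
  std::vector<uint64_t> VecInd(VF);
  for (uint64_t L = 0; L < VF; ++L)
    VecInd[L] = Start + L * Step;       // vp<%induction.start>
  for (uint64_t L = 0; L < VF; ++L)
    VecInd[L] += VF * Step;             // one backedge trip: vp<%vec.ind.next>
  // Lane l now holds the IV value of original iteration VF + l.
  assert(VecInd[0] == Start + VF * Step && VecInd[3] == Start + 7 * Step);
  return 0;
}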
3531
3532/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3533/// initial value, phi and backedge value. In the following example:
3534///
3535/// <x1> vector loop: {
3536/// vector.body:
3537/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3538/// ...
3539/// EMIT branch-on-count ...
3540/// }
3541///
3542/// WIDEN-POINTER-INDUCTION will get expanded to:
3543///
3544/// <x1> vector loop: {
3545/// vector.body:
3546/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3547/// EMIT %mul = mul %stepvector, %step
3548/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3549/// ...
3550/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3551/// EMIT branch-on-count ...
3552/// }
3554 VPTypeAnalysis &TypeInfo) {
3555 VPlan *Plan = R->getParent()->getPlan();
3556 VPValue *Start = R->getStartValue();
3557 VPValue *Step = R->getStepValue();
3558 VPValue *VF = R->getVFValue();
3559
3560 assert(R->getInductionDescriptor().getKind() ==
3562 "Not a pointer induction according to InductionDescriptor!");
3563 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3564 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3565 "Recipe should have been replaced");
3566
3567 VPBuilder Builder(R);
3568 DebugLoc DL = R->getDebugLoc();
3569
3570 // Build a scalar pointer phi.
3571 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3572
3573 // Create actual address geps that use the pointer phi as base and a
3574 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3575 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3576 Type *StepTy = TypeInfo.inferScalarType(Step);
3577 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3578 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3579 VPValue *PtrAdd = Builder.createNaryOp(
3580 VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
3581 R->replaceAllUsesWith(PtrAdd);
3582
3583 // Create the backedge value for the scalar pointer phi.
3585 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3586 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3587 DL);
3588 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3589
3590 VPValue *InductionGEP =
3591 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3592 ScalarPtrPhi->addOperand(InductionGEP);
3593}
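
// Illustrative sketch (not part of VPlanTransforms.cpp): the addresses produced
// by the expanded pointer induction, using a hypothetical i32 array and step.
#include <cassert>
#include <cstdint>

int main() {
  const int64_t VF = 4, Step = 2; // elements each lane advances by
  int32_t Data[16] = {};
  int32_t *PointerPhi = Data;     // %pointer.phi
  // %vector.gep: lane l addresses PointerPhi + l * Step.
  assert(PointerPhi + 3 * Step == Data + 6);
  // %ptr.ind: the scalar phi advances by VF * Step each vector iteration.
  PointerPhi += VF * Step;
  assert(PointerPhi == Data + 8);
  return 0;
}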
3594
3596 // Replace loop regions with explicit CFG.
3597 SmallVector<VPRegionBlock *> LoopRegions;
3599 vp_depth_first_deep(Plan.getEntry()))) {
3600 if (!R->isReplicator())
3601 LoopRegions.push_back(R);
3602 }
3603 for (VPRegionBlock *R : LoopRegions)
3604 R->dissolveToCFGLoop();
3605}
3606
3608 VPTypeAnalysis TypeInfo(Plan);
3611 vp_depth_first_deep(Plan.getEntry()))) {
3612 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3613 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3614 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3615 ToRemove.push_back(WidenIVR);
3616 continue;
3617 }
3618
3619 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3620 // If the recipe only generates scalars, scalarize it instead of
3621 // expanding it.
3622 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
3623 VPBuilder Builder(WidenIVR);
3624 VPValue *PtrAdd =
3625 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
3626 WidenIVR->replaceAllUsesWith(PtrAdd);
3627 ToRemove.push_back(WidenIVR);
3628 continue;
3629 }
3630 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3631 ToRemove.push_back(WidenIVR);
3632 continue;
3633 }
3634
3635 // Expand VPBlendRecipe into VPInstruction::Select.
3636 VPBuilder Builder(&R);
3637 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3638 VPValue *Select = Blend->getIncomingValue(0);
3639 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3640 Select = Builder.createSelect(Blend->getMask(I),
3641 Blend->getIncomingValue(I), Select,
3642 R.getDebugLoc(), "predphi");
3643 Blend->replaceAllUsesWith(Select);
3644 ToRemove.push_back(Blend);
3645 }
3646
3647 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3648 Expr->decompose();
3649 ToRemove.push_back(Expr);
3650 }
3651
3652 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
3653 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
3654 if (LastActiveL &&
3655 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
3656 // Create Not(Mask) for all operands.
3658 for (VPValue *Op : LastActiveL->operands()) {
3659 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
3660 NotMasks.push_back(NotMask);
3661 }
3662
3663 // Create FirstActiveLane on the inverted masks.
3664 VPValue *FirstInactiveLane = Builder.createNaryOp(
3666 LastActiveL->getDebugLoc(), "first.inactive.lane");
3667
3668 // Subtract 1 to get the last active lane.
3669 VPValue *One = Plan.getOrAddLiveIn(
3670 ConstantInt::get(Type::getInt64Ty(Plan.getContext()), 1));
3671 VPValue *LastLane = Builder.createNaryOp(
3672 Instruction::Sub, {FirstInactiveLane, One},
3673 LastActiveL->getDebugLoc(), "last.active.lane");
3674
3675 LastActiveL->replaceAllUsesWith(LastLane);
3676 ToRemove.push_back(LastActiveL);
3677 continue;
3678 }
3679
3680 VPValue *VectorStep;
3681 VPValue *ScalarStep;
3683 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
3684 continue;
3685
3686 // Expand WideIVStep.
3687 auto *VPI = cast<VPInstruction>(&R);
3688 Type *IVTy = TypeInfo.inferScalarType(VPI);
3689 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
3691 ? Instruction::UIToFP
3692 : Instruction::Trunc;
3693 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
3694 }
3695
3696 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
3697 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
3698 ScalarStep =
3699 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
3700 }
3701
3702 VPIRFlags Flags;
3703 if (IVTy->isFloatingPointTy())
3704 Flags = {VPI->getFastMathFlags()};
3705
3706 unsigned MulOpc =
3707 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
3708 VPInstruction *Mul = Builder.createNaryOp(
3709 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
3710 VectorStep = Mul;
3711 VPI->replaceAllUsesWith(VectorStep);
3712 ToRemove.push_back(VPI);
3713 }
3714 }
3715
3716 for (VPRecipeBase *R : ToRemove)
3717 R->eraseFromParent();
3718}
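
// Illustrative sketch (not part of VPlanTransforms.cpp): LastActiveLane
// computed as FirstActiveLane of the negated mask minus one, assuming the mask
// is a header/tail mask of the form all-true lanes followed by all-false lanes.
#include <cassert>
#include <vector>

int lastActiveLane(const std::vector<bool> &Mask) {
  // FirstActiveLane(Not(Mask)): index of the first false lane, or the vector
  // length if every lane is active.
  int FirstInactive = 0;
  while (FirstInactive < static_cast<int>(Mask.size()) && Mask[FirstInactive])
    ++FirstInactive;
  return FirstInactive - 1; // last.active.lane
}

int main() {
  assert(lastActiveLane({true, true, true, false}) == 2);
  assert(lastActiveLane({true, true, true, true}) == 3);
  return 0;
}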
3719
3721 VPBasicBlock *EarlyExitVPBB,
3722 VPlan &Plan,
3723 VPBasicBlock *HeaderVPBB,
3724 VPBasicBlock *LatchVPBB) {
3725 VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
3726 if (!EarlyExitVPBB->getSinglePredecessor() &&
3727 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
3728 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
3729 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
3730 "unsupported early exit VPBB");
3731 // Early exit operand should always be last phi operand. If EarlyExitVPBB
3732 // has two predecessors and EarlyExitingVPBB is the first, swap the operands
3733 // of the phis.
3734 for (VPRecipeBase &R : EarlyExitVPBB->phis())
3735 cast<VPIRPhi>(&R)->swapOperands();
3736 }
3737
3738 VPBuilder Builder(LatchVPBB->getTerminator());
3739 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
3740 assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
3741 "Terminator must be be BranchOnCond");
3742 VPValue *CondOfEarlyExitingVPBB =
3743 EarlyExitingVPBB->getTerminator()->getOperand(0);
3744 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
3745 ? CondOfEarlyExitingVPBB
3746 : Builder.createNot(CondOfEarlyExitingVPBB);
3747
3748 // Split the middle block and have it conditionally branch to the early exit
3749 // block if CondToEarlyExit.
3750 VPValue *IsEarlyExitTaken =
3751 Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
3752 VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
3753 VPBasicBlock *VectorEarlyExitVPBB =
3754 Plan.createVPBasicBlock("vector.early.exit");
3755 VPBlockUtils::insertOnEdge(LatchVPBB, MiddleVPBB, NewMiddle);
3756 VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
3757 NewMiddle->swapSuccessors();
3758
3759 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
3760
3761 // Update the exit phis in the early exit block.
3762 VPBuilder MiddleBuilder(NewMiddle);
3763 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
3764 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
3765 auto *ExitIRI = cast<VPIRPhi>(&R);
3766 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
3767 // a single predecessor and 1 if it has two.
3768 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
3769 if (ExitIRI->getNumOperands() != 1) {
3770 // The first of two operands corresponds to the latch exit, via MiddleVPBB
3771 // predecessor. Extract its last lane.
3772 ExitIRI->extractLastLaneOfFirstOperand(MiddleBuilder);
3773 }
3774
3775 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
3776 if (!IncomingFromEarlyExit->isLiveIn()) {
3777 // Update the incoming value from the early exit.
3778 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
3779 VPInstruction::FirstActiveLane, {CondToEarlyExit},
3780 DebugLoc::getUnknown(), "first.active.lane");
3781 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
3782 VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
3783 DebugLoc::getUnknown(), "early.exit.value");
3784 ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
3785 }
3786 }
3787 MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
3788
3789 // Replace the condition controlling the non-early exit from the vector loop
3790 // with one exiting if either the original condition of the vector latch is
3791 // true or the early exit has been taken.
3792 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
3793 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
3794 "Unexpected terminator");
3795 auto *IsLatchExitTaken =
3796 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
3797 LatchExitingBranch->getOperand(1));
3798 auto *AnyExitTaken = Builder.createNaryOp(
3799 Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});
3800 Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
3801 LatchExitingBranch->eraseFromParent();
3802}
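
// Illustrative sketch (not part of VPlanTransforms.cpp): the combined exit
// condition installed above. The vector loop leaves when any lane took the
// uncountable early exit or when the counted latch condition fires.
#include <cassert>

bool shouldExitVectorLoop(bool AnyLaneTookEarlyExit, unsigned IVNext,
                          unsigned VectorTripCount) {
  bool IsEarlyExitTaken = AnyLaneTookEarlyExit;      // AnyOf(CondToEarlyExit)
  bool IsLatchExitTaken = IVNext == VectorTripCount; // former branch-on-count
  return IsEarlyExitTaken || IsLatchExitTaken;       // new branch-on-cond operand
}

int main() {
  assert(!shouldExitVectorLoop(false, 8, 16));
  assert(shouldExitVectorLoop(true, 8, 16));
  assert(shouldExitVectorLoop(false, 16, 16));
  return 0;
}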
3803
3804/// This function tries to convert extended in-loop reductions to
3805/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
3806/// valid. The created recipe must be decomposed to its constituent
3807/// recipes before execution.
3808static VPExpressionRecipe *
3810 VFRange &Range) {
3811 Type *RedTy = Ctx.Types.inferScalarType(Red);
3812 VPValue *VecOp = Red->getVecOp();
3813
3814 // Clamp the range if using extended-reduction is profitable.
3815 auto IsExtendedRedValidAndClampRange =
3816 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
3818 [&](ElementCount VF) {
3819 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3821
3822 InstructionCost ExtRedCost;
3823 InstructionCost ExtCost =
3824 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
3825 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3826
3827 if (Red->isPartialReduction()) {
3830 // FIXME: Move partial reduction creation, costing and clamping
3831 // here from LoopVectorize.cpp.
3832 ExtRedCost = Ctx.TTI.getPartialReductionCost(
3833 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
3834 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind);
3835 } else {
3836 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
3837 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
3838 Red->getFastMathFlags(), CostKind);
3839 }
3840 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
3841 },
3842 Range);
3843 };
3844
3845 VPValue *A;
3846 // Match reduce(ext(A)).
3847 if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
3848 IsExtendedRedValidAndClampRange(
3849 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
3850 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
3851 Ctx.Types.inferScalarType(A)))
3852 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
3853
3854 return nullptr;
3855}
3856
3857/// This function tries to convert extended in-loop reductions to
3858/// VPExpressionRecipe and clamp the \p Range if it is beneficial
3859/// and valid. The created VPExpressionRecipe must be decomposed to its
3860/// constituent recipes before execution. Patterns of the
3861/// VPExpressionRecipe:
3862/// reduce.add(mul(...)),
3863/// reduce.add(mul(ext(A), ext(B))),
3864/// reduce.add(ext(mul(ext(A), ext(B)))).
3865static VPExpressionRecipe *
3867 VPCostContext &Ctx, VFRange &Range) {
3868 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3869 if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
3870 return nullptr;
3871
3872 Type *RedTy = Ctx.Types.inferScalarType(Red);
3873
3874 // Clamp the range if using multiply-accumulate-reduction is profitable.
3875 auto IsMulAccValidAndClampRange =
3877 VPWidenCastRecipe *OuterExt) -> bool {
3879 [&](ElementCount VF) {
3881 Type *SrcTy =
3882 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
3883 InstructionCost MulAccCost;
3884
3885 if (Red->isPartialReduction()) {
3886 Type *SrcTy2 =
3887 Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
3888 // FIXME: Move partial reduction creation, costing and clamping
3889 // here from LoopVectorize.cpp.
3890 MulAccCost = Ctx.TTI.getPartialReductionCost(
3891 Opcode, SrcTy, SrcTy2, RedTy, VF,
3893 Ext0->getOpcode())
3896 Ext1->getOpcode())
3898 Mul->getOpcode(), CostKind);
3899 } else {
3900 // Only partial reductions support mixed extends at the moment.
3901 if (Ext0 && Ext1 && Ext0->getOpcode() != Ext1->getOpcode())
3902 return false;
3903
3904 bool IsZExt =
3905 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
3906 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3907 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
3908 SrcVecTy, CostKind);
3909 }
3910
3911 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
3912 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3913 InstructionCost ExtCost = 0;
3914 if (Ext0)
3915 ExtCost += Ext0->computeCost(VF, Ctx);
3916 if (Ext1)
3917 ExtCost += Ext1->computeCost(VF, Ctx);
3918 if (OuterExt)
3919 ExtCost += OuterExt->computeCost(VF, Ctx);
3920
3921 return MulAccCost.isValid() &&
3922 MulAccCost < ExtCost + MulCost + RedCost;
3923 },
3924 Range);
3925 };
3926
3927 VPValue *VecOp = Red->getVecOp();
3928 VPRecipeBase *Sub = nullptr;
3929 VPValue *A, *B;
3930 VPValue *Tmp = nullptr;
3931 // Sub reductions could have a sub between the add reduction and vec op.
3932 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
3933 Sub = VecOp->getDefiningRecipe();
3934 VecOp = Tmp;
3935 }
3936
3937 // If ValB is a constant and can be safely extended, truncate it to the same
3938 // type as ExtA's operand, then extend it to the same type as ExtA. This
3939 // creates two uniform extends that can more easily be matched by the rest of
3940 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
3941 // replaced with the new extend of the constant.
3942 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
3943 VPWidenCastRecipe *&ExtB,
3944 VPValue *&ValB, VPWidenRecipe *Mul) {
3945 if (!ExtA || ExtB || !ValB->isLiveIn())
3946 return;
3947 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
3948 Instruction::CastOps ExtOpc = ExtA->getOpcode();
3949 const APInt *Const;
3950 if (!match(ValB, m_APInt(Const)) ||
3952 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
3953 return;
3954 // The truncate ensures that the type of each extended operand is the
3955 // same, and it's been proven that the constant can be extended from
3956 // NarrowTy safely. Necessary since ExtA's extended operand would be
3957 // e.g. an i8, while the const will likely be an i32. This will be
3958 // elided by later optimisations.
3959 VPBuilder Builder(Mul);
3960 auto *Trunc =
3961 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
3962 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
3963 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
3964 Mul->setOperand(1, ExtB);
3965 };
3966
3967 // Try to match reduce.add(mul(...)).
3968 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
3971 auto *Mul = cast<VPWidenRecipe>(VecOp);
3972
3973 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
3974 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
3975
3976 // Match reduce.add/sub(mul(ext, ext)).
3977 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
3978 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
3979 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
3980 if (Sub)
3981 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
3982 cast<VPWidenRecipe>(Sub), Red);
3983 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
3984 }
3985 // TODO: Add an expression type for this variant with a negated mul
3986 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
3987 return new VPExpressionRecipe(Mul, Red);
3988 }
3989 // TODO: Add an expression type for negated versions of other expression
3990 // variants.
3991 if (Sub)
3992 return nullptr;
3993
3994 // Match reduce.add(ext(mul(A, B))).
3995 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
3996 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
3997 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4000
4001 // reduce.add(ext(mul(ext, const)))
4002 // -> reduce.add(ext(mul(ext, ext(const))))
4003 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4004
4005 // reduce.add(ext(mul(ext(A), ext(B))))
4006 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4007 // The inner extends must either have the same opcode as the outer extend
4008 // or be the same recipe, in which case the multiply is a square and can
4009 // never be negative, so the outer extend can be folded away by doing
4010 // wider extends for the operands of the mul.
4011 if (Ext0 && Ext1 &&
4012 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4013 Ext0->getOpcode() == Ext1->getOpcode() &&
4014 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
4015 auto *NewExt0 = new VPWidenCastRecipe(
4016 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
4017 *Ext0, *Ext0, Ext0->getDebugLoc());
4018 NewExt0->insertBefore(Ext0);
4019
4020 VPWidenCastRecipe *NewExt1 = NewExt0;
4021 if (Ext0 != Ext1) {
4022 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4023 Ext->getResultType(), nullptr, *Ext1,
4024 *Ext1, Ext1->getDebugLoc());
4025 NewExt1->insertBefore(Ext1);
4026 }
4027 Mul->setOperand(0, NewExt0);
4028 Mul->setOperand(1, NewExt1);
4029 Red->setOperand(1, Mul);
4030 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4031 }
4032 }
4033 return nullptr;
4034}
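
// Illustrative sketch (not part of VPlanTransforms.cpp): the scalar pattern
// behind reduce.add(mul(ext(A), ext(B))), here with both inputs sign-extended
// from i8 to i32 before the multiply and accumulated in i32.
#include <cassert>
#include <cstdint>
#include <vector>

int32_t dotProduct(const std::vector<int8_t> &A, const std::vector<int8_t> &B) {
  int32_t Red = 0;
  for (size_t I = 0; I < A.size(); ++I)
    Red += static_cast<int32_t>(A[I]) * static_cast<int32_t>(B[I]);
  return Red;
}

int main() {
  assert(dotProduct({1, -2, 3}, {4, 5, -6}) == -24);
  return 0;
}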
4035
4036/// This function tries to create abstract recipes from the reduction recipe for
4037/// subsequent optimizations and cost estimation.
4039 VPCostContext &Ctx,
4040 VFRange &Range) {
4041 VPExpressionRecipe *AbstractR = nullptr;
4042 auto IP = std::next(Red->getIterator());
4043 auto *VPBB = Red->getParent();
4044 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4045 AbstractR = MulAcc;
4046 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4047 AbstractR = ExtRed;
4048 // Could not create an abstract in-loop reduction recipe.
4049 if (!AbstractR)
4050 return;
4051
4052 AbstractR->insertBefore(*VPBB, IP);
4053 Red->replaceAllUsesWith(AbstractR);
4054}
4055
4066
4068 if (Plan.hasScalarVFOnly())
4069 return;
4070
4071#ifndef NDEBUG
4072 VPDominatorTree VPDT(Plan);
4073#endif
4074
4075 SmallVector<VPValue *> VPValues;
4078 append_range(VPValues, Plan.getLiveIns());
4079 for (VPRecipeBase &R : *Plan.getEntry())
4080 append_range(VPValues, R.definedValues());
4081
4082 auto *VectorPreheader = Plan.getVectorPreheader();
4083 for (VPValue *VPV : VPValues) {
4085 (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
4086 isa<Constant>(VPV->getLiveInIRValue())))
4087 continue;
4088
4089 // Add explicit broadcast at the insert point that dominates all users.
4090 VPBasicBlock *HoistBlock = VectorPreheader;
4091 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4092 for (VPUser *User : VPV->users()) {
4093 if (User->usesScalars(VPV))
4094 continue;
4095 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4096 HoistPoint = HoistBlock->begin();
4097 else
4098 assert(VPDT.dominates(VectorPreheader,
4099 cast<VPRecipeBase>(User)->getParent()) &&
4100 "All users must be in the vector preheader or dominated by it");
4101 }
4102
4103 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4104 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
4105 VPV->replaceUsesWithIf(Broadcast,
4106 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4107 return Broadcast != &U && !U.usesScalars(VPV);
4108 });
4109 }
4110}
4111
4113 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4114
4115 // Collect candidate loads with invariant addresses and noalias scopes
4116 // metadata and memory-writing recipes with noalias metadata.
4120 vp_depth_first_shallow(LoopRegion->getEntry()))) {
4121 for (VPRecipeBase &R : *VPBB) {
4122 // Only handle single-scalar replicated loads with invariant addresses.
4123 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4124 if (RepR->isPredicated() || !RepR->isSingleScalar() ||
4125 RepR->getOpcode() != Instruction::Load)
4126 continue;
4127
4128 VPValue *Addr = RepR->getOperand(0);
4129 if (Addr->isDefinedOutsideLoopRegions()) {
4131 if (!Loc.AATags.Scope)
4132 continue;
4133 CandidateLoads.push_back({RepR, Loc});
4134 }
4135 }
4136 if (R.mayWriteToMemory()) {
4138 if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
4139 return;
4140 Stores.push_back(*Loc);
4141 }
4142 }
4143 }
4144
4145 VPBasicBlock *Preheader = Plan.getVectorPreheader();
4146 for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
4147 // Hoist the load to the preheader if it doesn't alias with any stores
4148 // according to the noalias metadata. Other loads should have been hoisted
4149 // by other passes.
4150 const AAMDNodes &LoadAA = LoadLoc.AATags;
4151 if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
4153 LoadAA.Scope, StoreLoc.AATags.NoAlias);
4154 })) {
4155 LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
4156 }
4157 }
4158}
4159
4160// Collect common metadata from a group of replicate recipes by intersecting
4161// metadata from all recipes in the group.
4163 VPIRMetadata CommonMetadata = *Recipes.front();
4164 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4165 CommonMetadata.intersect(*Recipe);
4166 return CommonMetadata;
4167}
4168
4169template <unsigned Opcode>
4172 const Loop *L) {
4173 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4174 "Only Load and Store opcodes supported");
4175 constexpr bool IsLoad = (Opcode == Instruction::Load);
4176 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4177 VPTypeAnalysis TypeInfo(Plan);
4178
4179 // Group predicated operations by their address SCEV.
4181 for (VPBlockBase *Block : vp_depth_first_shallow(LoopRegion->getEntry())) {
4182 auto *VPBB = cast<VPBasicBlock>(Block);
4183 for (VPRecipeBase &R : *VPBB) {
4184 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
4185 if (!RepR || RepR->getOpcode() != Opcode || !RepR->isPredicated())
4186 continue;
4187
4188 // For loads, operand 0 is address; for stores, operand 1 is address.
4189 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
4190 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, SE, L);
4191 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
4192 RecipesByAddress[AddrSCEV].push_back(RepR);
4193 }
4194 }
4195
4196 // For each address, collect operations with the same or complementary masks.
4198 auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4199 return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
4200 };
4201 for (auto &[Addr, Recipes] : RecipesByAddress) {
4202 if (Recipes.size() < 2)
4203 continue;
4204
4205 // Collect groups with the same or complementary masks.
4206 for (VPReplicateRecipe *&RecipeI : Recipes) {
4207 if (!RecipeI)
4208 continue;
4209
4210 VPValue *MaskI = RecipeI->getMask();
4211 Type *TypeI = GetLoadStoreValueType(RecipeI);
4213 Group.push_back(RecipeI);
4214 RecipeI = nullptr;
4215
4216 // Find all operations with the same or complementary masks.
4217 bool HasComplementaryMask = false;
4218 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4219 if (!RecipeJ)
4220 continue;
4221
4222 VPValue *MaskJ = RecipeJ->getMask();
4223 Type *TypeJ = GetLoadStoreValueType(RecipeJ);
4224 if (TypeI == TypeJ) {
4225 // Check if any operation in the group has a complementary mask with
4226 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4227 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4228 match(MaskJ, m_Not(m_Specific(MaskI)));
4229 Group.push_back(RecipeJ);
4230 RecipeJ = nullptr;
4231 }
4232 }
4233
4234 if (HasComplementaryMask) {
4235 assert(Group.size() >= 2 && "must have at least 2 entries");
4236 AllGroups.push_back(std::move(Group));
4237 }
4238 }
4239 }
4240
4241 return AllGroups;
4242}
4243
4244// Find the recipe with minimum alignment in the group.
4245template <typename InstType>
4246static VPReplicateRecipe *
4248 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4249 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4250 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4251 });
4252}
4253
4255 const Loop *L) {
4256 auto Groups =
4258 if (Groups.empty())
4259 return;
4260
4261 VPDominatorTree VPDT(Plan);
4262
4263 // Process each group of loads.
4264 for (auto &Group : Groups) {
4265 // Sort loads by dominance order, with earliest (most dominating) first.
4266 sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4267 return VPDT.properlyDominates(A, B);
4268 });
4269
4270 // Try to use the earliest (most dominating) load to replace all others.
4271 VPReplicateRecipe *EarliestLoad = Group[0];
4272 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4273 VPBasicBlock *LastBB = Group.back()->getParent();
4274
4275 // Check that the load doesn't alias with stores between first and last.
4276 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4277 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB,
4278 /*CheckReads=*/false))
4279 continue;
4280
4281 // Collect common metadata from all loads in the group.
4282 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4283
4284 // Find the load with minimum alignment to use.
4285 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4286
4287 // Create an unpredicated version of the earliest load with common
4288 // metadata.
4289 auto *UnpredicatedLoad = new VPReplicateRecipe(
4290 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4291 /*IsSingleScalar=*/false, /*Mask=*/nullptr, *EarliestLoad,
4292 CommonMetadata);
4293
4294 UnpredicatedLoad->insertBefore(EarliestLoad);
4295
4296 // Replace all loads in the group with the unpredicated load.
4297 for (VPReplicateRecipe *Load : Group) {
4298 Load->replaceAllUsesWith(UnpredicatedLoad);
4299 Load->eraseFromParent();
4300 }
4301 }
4302}
4303
4304static bool
4306 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
4307 if (!StoreLoc || !StoreLoc->AATags.Scope)
4308 return false;
4309
4310 // When sinking a group of stores, all members of the group alias each other.
4311 // Skip them during the alias checks.
4312 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4313 StoresToSink.end());
4314
4315 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4316 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4317 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB,
4318 /*CheckReads=*/true, &StoresToSinkSet);
4319}
4320
4322 const Loop *L) {
4323 auto Groups =
4325 if (Groups.empty())
4326 return;
4327
4328 VPDominatorTree VPDT(Plan);
4329
4330 for (auto &Group : Groups) {
4331 sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4332 return VPDT.properlyDominates(A, B);
4333 });
4334
4335 if (!canSinkStoreWithNoAliasCheck(Group))
4336 continue;
4337
4338 // Use the last (most dominated) store's location for the unconditional
4339 // store.
4340 VPReplicateRecipe *LastStore = Group.back();
4341 VPBasicBlock *InsertBB = LastStore->getParent();
4342
4343 // Collect common alias metadata from all stores in the group.
4344 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4345
4346 // Build select chain for stored values.
4347 VPValue *SelectedValue = Group[0]->getOperand(0);
4348 VPBuilder Builder(InsertBB, LastStore->getIterator());
4349
4350 for (unsigned I = 1; I < Group.size(); ++I) {
4351 VPValue *Mask = Group[I]->getMask();
4352 VPValue *Value = Group[I]->getOperand(0);
4353 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4354 Group[I]->getDebugLoc());
4355 }
4356
4357 // Find the store with minimum alignment to use.
4358 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4359
4360 // Create unconditional store with selected value and common metadata.
4361 auto *UnpredicatedStore =
4362 new VPReplicateRecipe(StoreWithMinAlign->getUnderlyingInstr(),
4363 {SelectedValue, LastStore->getOperand(1)},
4364 /*IsSingleScalar=*/false,
4365 /*Mask=*/nullptr, *LastStore, CommonMetadata);
4366 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
4367
4368 // Remove all predicated stores from the group.
4369 for (VPReplicateRecipe *Store : Group)
4370 Store->eraseFromParent();
4371 }
4372}
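
// Illustrative sketch (not part of VPlanTransforms.cpp): two predicated stores
// to the same address under complementary masks collapse into a select feeding
// a single unconditional store.
#include <cassert>

void predicatedStores(int &Slot, bool Mask, int IfTrue, int IfFalse) {
  if (Mask)
    Slot = IfTrue;  // store guarded by M
  if (!Mask)
    Slot = IfFalse; // store guarded by Not(M)
}

void sunkStore(int &Slot, bool Mask, int IfTrue, int IfFalse) {
  Slot = Mask ? IfTrue : IfFalse; // select chain + one unpredicated store
}

int main() {
  int A = 0, B = 0;
  predicatedStores(A, true, 1, 2);
  sunkStore(B, true, 1, 2);
  assert(A == B && A == 1);
  return 0;
}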
4373
4375 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4377 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4378 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4379
4380 VPValue *TC = Plan.getTripCount();
4381 // Skip cases for which the trip count may be non-trivial to materialize.
4382 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
4383 // tail is required.
4384 if (!Plan.hasScalarTail() ||
4386 Plan.getScalarPreheader() ||
4387 !TC->isLiveIn())
4388 return;
4389
4390 // Materialize vector trip counts for constants early if it can simply
4391 // be computed as (Original TC / VF * UF) * VF * UF.
4392 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4393 // tail-folded loops.
4394 ScalarEvolution &SE = *PSE.getSE();
4395 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
4396 if (!isa<SCEVConstant>(TCScev))
4397 return;
4398 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
4399 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
4400 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
4401 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4402}
4403
4405 VPBasicBlock *VectorPH) {
4407 if (BTC->getNumUsers() == 0)
4408 return;
4409
4410 VPBuilder Builder(VectorPH, VectorPH->begin());
4411 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4412 auto *TCMO = Builder.createNaryOp(
4413 Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)},
4414 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
4415 BTC->replaceAllUsesWith(TCMO);
4416}
4417
4419 if (Plan.hasScalarVFOnly())
4420 return;
4421
4422 VPTypeAnalysis TypeInfo(Plan);
4423 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4424 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4426 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4427 vp_depth_first_shallow(LoopRegion->getEntry()));
4428 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
4429 // VPInstructions, excluding ones in replicate regions. Those are not
4430 // materialized explicitly yet. Those vector users are still handled in
4431 // VPReplicateRegion::execute(), via shouldPack().
4432 // TODO: materialize build vectors for replicating recipes in replicating
4433 // regions.
4434 for (VPBasicBlock *VPBB :
4435 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
4436 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4438 continue;
4439 auto *DefR = cast<VPRecipeWithIRFlags>(&R);
4440 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
4441 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4442 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
4443 };
4444 if ((isa<VPReplicateRecipe>(DefR) &&
4445 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
4446 (isa<VPInstruction>(DefR) &&
4448 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
4449 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
4450 continue;
4451
4452 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
4453 unsigned Opcode = ScalarTy->isStructTy()
4456 auto *BuildVector = new VPInstruction(Opcode, {DefR});
4457 BuildVector->insertAfter(DefR);
4458
4459 DefR->replaceUsesWithIf(
4460 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
4461 VPUser &U, unsigned) {
4462 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
4463 });
4464 }
4465 }
4466
4467 // Create explicit VPInstructions to convert vectors to scalars. The current
4468 // implementation is conservative - it may miss some cases that may or may not
4469 // be vector values. TODO: introduce Unpacks speculatively - remove them later
4470 // if they are known to operate on scalar values.
4471 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
4472 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4475 continue;
4476 for (VPValue *Def : R.definedValues()) {
4477 // Skip recipes that are single-scalar or only have their first lane
4478 // used.
4479 // TODO: The Defs skipped here may or may not be vector values.
4480 // Introduce Unpacks, and remove them later, if they are guaranteed to
4481 // produce scalar values.
4483 continue;
4484
4485 // At the moment, we create unpacks only for scalar users outside
4486 // replicate regions. Recipes inside replicate regions still extract the
4487 // required lanes implicitly.
4488 // TODO: Remove once replicate regions are unrolled completely.
4489 auto IsCandidateUnpackUser = [Def](VPUser *U) {
4490 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4491 return U->usesScalars(Def) &&
4492 (!ParentRegion || !ParentRegion->isReplicator());
4493 };
4494 if (none_of(Def->users(), IsCandidateUnpackUser))
4495 continue;
4496
4497 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
4498 if (R.isPhi())
4499 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
4500 else
4501 Unpack->insertAfter(&R);
4502 Def->replaceUsesWithIf(Unpack,
4503 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
4504 return IsCandidateUnpackUser(&U);
4505 });
4506 }
4507 }
4508 }
4509}
4510
4512 VPBasicBlock *VectorPHVPBB,
4513 bool TailByMasking,
4514 bool RequiresScalarEpilogue) {
4515 VPValue &VectorTC = Plan.getVectorTripCount();
4516 assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");
4517 // There's nothing to do if there are no users of the vector trip count or its
4518 // IR value has already been set.
4519 if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())
4520 return;
4521
4522 VPValue *TC = Plan.getTripCount();
4523 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
4524 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
4525 VPValue *Step = &Plan.getVFxUF();
4526
4527 // If the tail is to be folded by masking, round the number of iterations N
4528 // up to a multiple of Step instead of rounding down. This is done by first
4529 // adding Step-1 and then rounding down. Note that it's ok if this addition
4530 // overflows: the vector induction variable will eventually wrap to zero given
4531 // that it starts at zero and its Step is a power of two; the loop will then
4532 // exit, with the last early-exit vector comparison also producing all-true.
4533 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
4534 // is accounted for in emitIterationCountCheck that adds an overflow check.
4535 if (TailByMasking) {
4536 TC = Builder.createNaryOp(
4537 Instruction::Add,
4538 {TC, Builder.createNaryOp(Instruction::Sub,
4539 {Step, Plan.getConstantInt(TCTy, 1)})},
4540 DebugLoc::getCompilerGenerated(), "n.rnd.up");
4541 }
4542
4543 // Now we need to generate the expression for the part of the loop that the
4544 // vectorized body will execute. This is equal to N - (N % Step) if scalar
4545 // iterations are not required for correctness, or N - Step, otherwise. Step
4546 // is equal to the vectorization factor (number of SIMD elements) times the
4547 // unroll factor (number of SIMD instructions).
4548 VPValue *R =
4549 Builder.createNaryOp(Instruction::URem, {TC, Step},
4550 DebugLoc::getCompilerGenerated(), "n.mod.vf");
4551
4552 // There are cases where we *must* run at least one iteration in the remainder
4553 // loop. See the cost model for when this can happen. If the step evenly
4554 // divides the trip count, we set the remainder to be equal to the step. If
4555 // the step does not evenly divide the trip count, no adjustment is necessary
4556 // since there will already be scalar iterations. Note that the minimum
4557 // iterations check ensures that N >= Step.
4558 if (RequiresScalarEpilogue) {
4559 assert(!TailByMasking &&
4560 "requiring scalar epilogue is not supported with fail folding");
4561 VPValue *IsZero =
4562 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0));
4563 R = Builder.createSelect(IsZero, Step, R);
4564 }
4565
4566 VPValue *Res = Builder.createNaryOp(
4567 Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
4568 VectorTC.replaceAllUsesWith(Res);
4569}
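
// Illustrative sketch (not part of VPlanTransforms.cpp): the scalar arithmetic
// for the vector trip count n.vec, given the original trip count N and
// Step = VF * UF. TailByMasking and RequiresScalarEpilogue never hold together.
#include <cassert>
#include <cstdint>

uint64_t vectorTripCount(uint64_t N, uint64_t Step, bool TailByMasking,
                         bool RequiresScalarEpilogue) {
  if (TailByMasking)
    N += Step - 1;           // n.rnd.up: round up to a multiple of Step.
  uint64_t R = N % Step;     // n.mod.vf
  if (RequiresScalarEpilogue && R == 0)
    R = Step;                // keep at least one iteration for the epilogue.
  return N - R;              // n.vec
}

int main() {
  assert(vectorTripCount(17, 8, false, false) == 16);
  assert(vectorTripCount(17, 8, true, false) == 24);  // all 17 handled masked
  assert(vectorTripCount(16, 8, false, true) == 8);   // 8 left for the epilogue
  return 0;
}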
4570
4572 ElementCount VFEC) {
4573 VPBuilder Builder(VectorPH, VectorPH->begin());
4574 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4575 VPValue &VF = Plan.getVF();
4576 VPValue &VFxUF = Plan.getVFxUF();
4577 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
4578 // used.
4579 // TODO: Assert that they aren't used.
4580
4581 // If there are no users of the runtime VF, compute VFxUF by constant folding
4582 // the multiplication of VF and UF.
4583 if (VF.getNumUsers() == 0) {
4584 VPValue *RuntimeVFxUF =
4585 Builder.createElementCount(TCTy, VFEC * Plan.getUF());
4586 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
4587 return;
4588 }
4589
4590 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
4591 // vscale) * UF.
4592 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
4594 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
4596 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
4597 }
4598 VF.replaceAllUsesWith(RuntimeVF);
4599
4600 VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF());
4601 VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
4602 VFxUF.replaceAllUsesWith(MulByUF);
4603}
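
// Illustrative sketch (not part of VPlanTransforms.cpp): the runtime values
// materialized for VF and VF * UF of a scalable factor such as vscale x 4,
// with a hypothetical vscale of 2 and UF of 2.
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t VScale = 2;     // known only at runtime on the target
  const uint64_t KnownMinVF = 4; // VFEC.getKnownMinValue()
  const uint64_t UF = 2;
  uint64_t RuntimeVF = VScale * KnownMinVF; // createElementCount(TCTy, VFEC)
  uint64_t VFxUF = RuntimeVF * UF;          // Mul of RuntimeVF and UF
  assert(RuntimeVF == 8 && VFxUF == 16);
  return 0;
}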
4604
4607 const DataLayout &DL = SE.getDataLayout();
4608 SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/false);
4609
4610 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
4611 BasicBlock *EntryBB = Entry->getIRBasicBlock();
4612 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
4613 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
4615 continue;
4616 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
4617 if (!ExpSCEV)
4618 break;
4619 const SCEV *Expr = ExpSCEV->getSCEV();
4620 Value *Res =
4621 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
4622 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
4623 VPValue *Exp = Plan.getOrAddLiveIn(Res);
4624 ExpSCEV->replaceAllUsesWith(Exp);
4625 if (Plan.getTripCount() == ExpSCEV)
4626 Plan.resetTripCount(Exp);
4627 ExpSCEV->eraseFromParent();
4628 }
4630 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
4631 "after any VPIRInstructions");
4632 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
4633 // to the VPIRBasicBlock.
4634 auto EI = Entry->begin();
4635 for (Instruction &I : drop_end(*EntryBB)) {
4636 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
4637 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
4638 EI++;
4639 continue;
4640 }
4642 }
4643
4644 return ExpandedSCEVs;
4645}
4646
4647/// Returns true if \p OpV is a VPWidenLoadRecipe or VPInterleaveRecipe that
4648/// can be converted to a narrower recipe. \p OpV is used by a wide recipe that
4649/// feeds a store interleave group at index \p Idx; \p WideMember0 is the recipe
4650/// feeding the same interleave group at index 0. A VPWidenLoadRecipe can be
4651/// narrowed to an index-independent load if it feeds all wide ops at all
4652/// indices (\p OpV must be the operand at index \p OpIdx of \p WideMember0, the
4653/// recipe at lane 0). A VPInterleaveRecipe can be narrowed to a wide load if
4654/// \p OpV is defined at \p Idx of a load interleave group.
4655static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
4656 VPValue *OpV, unsigned Idx) {
4657 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
4658 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
4659 if (!Member0OpR)
4660 return Member0Op == OpV;
4661 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
4662 return !W->getMask() && Member0Op == OpV;
4663 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
4664 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
4665 return false;
4666}
4667
4668/// Returns true if \p IR is a full interleave group with factor and number of
4669/// members both equal to \p VF. The interleave group must also access the full
4670/// vector width \p VectorRegWidth.
4672 ElementCount VF,
4673 VPTypeAnalysis &TypeInfo,
4674 TypeSize VectorRegWidth) {
4675 if (!InterleaveR || InterleaveR->getMask())
4676 return false;
4677
4678 Type *GroupElementTy = nullptr;
4679 if (InterleaveR->getStoredValues().empty()) {
4680 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
4681 if (!all_of(InterleaveR->definedValues(),
4682 [&TypeInfo, GroupElementTy](VPValue *Op) {
4683 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4684 }))
4685 return false;
4686 } else {
4687 GroupElementTy =
4688 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
4689 if (!all_of(InterleaveR->getStoredValues(),
4690 [&TypeInfo, GroupElementTy](VPValue *Op) {
4691 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4692 }))
4693 return false;
4694 }
4695
4696 unsigned VFMin = VF.getKnownMinValue();
4697 TypeSize GroupSize = TypeSize::get(
4698 GroupElementTy->getScalarSizeInBits() * VFMin, VF.isScalable());
4699 const auto *IG = InterleaveR->getInterleaveGroup();
4700 return IG->getFactor() == VFMin && IG->getNumMembers() == VFMin &&
4701 GroupSize == VectorRegWidth;
4702}
4703
4704/// Returns true if \p VPValue is a narrow VPValue.
4705static bool isAlreadyNarrow(VPValue *VPV) {
4706 if (VPV->isLiveIn())
4707 return true;
4708 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
4709 return RepR && RepR->isSingleScalar();
4710}
4711
4712// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
4713// a narrow variant.
4714static VPValue *
4716 auto *R = V->getDefiningRecipe();
4717 if (!R || NarrowedOps.contains(V))
4718 return V;
4719
4720 if (isAlreadyNarrow(V))
4721 return V;
4722
4723 if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(R)) {
4724 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
4725 WideMember0->setOperand(
4726 Idx,
4727 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
4728 return V;
4729 }
4730
4731 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
4732 // Narrow interleave group to wide load, as transformed VPlan will only
4733 // process one original iteration.
4734 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
4735 auto *L = new VPWidenLoadRecipe(
4736 *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
4737 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
4738 L->insertBefore(LoadGroup);
4739 NarrowedOps.insert(L);
4740 return L;
4741 }
4742
4743 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
4744 assert(RepR->isSingleScalar() &&
4745 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
4746 "must be a single scalar load");
4747 NarrowedOps.insert(RepR);
4748 return RepR;
4749 }
4750
4751 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
4752 VPValue *PtrOp = WideLoad->getAddr();
4753 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
4754 PtrOp = VecPtr->getOperand(0);
4755 // Narrow wide load to uniform scalar load, as transformed VPlan will only
4756 // process one original iteration.
4757 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
4758 /*IsUniform*/ true,
4759 /*Mask*/ nullptr, {}, *WideLoad);
4760 N->insertBefore(WideLoad);
4761 NarrowedOps.insert(N);
4762 return N;
4763}
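In short, the helper above rewrites the operand tree as follows (summary only, using the recipe names from the listing):

  // VPWidenRecipe (e.g. a widened mul)   -> kept, operands narrowed recursively
  // full load VPInterleaveRecipe         -> replaced by a consecutive VPWidenLoadRecipe
  // single-scalar VPReplicateRecipe load -> kept as-is
  // any other VPWidenLoadRecipe          -> replaced by a uniform single-scalar
  //                                         VPReplicateRecipe load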
4764
4765 void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
4766 TypeSize VectorRegWidth) {
4767 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
4768 if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0)
4769 return;
4770
4771 VPTypeAnalysis TypeInfo(Plan);
4772
4773 SmallVector<VPInterleaveRecipe *> StoreGroups;
4774 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
4776 continue;
4777
4780 continue;
4781
4782 // Bail out on recipes not supported at the moment:
4783 // * phi recipes other than the canonical induction
4784 // * recipes writing to memory except interleave groups
4785 // Only support plans with a canonical induction phi.
4786 if (R.isPhi())
4787 return;
4788
4789 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
4790 if (R.mayWriteToMemory() && !InterleaveR)
4791 return;
4792
4793 // Do not narrow interleave groups if there are VectorPointer recipes and
4794 // the plan was unrolled. The recipe implicitly uses VF from
4795 // VPTransformState.
4796 // TODO: Remove restriction once the VF for the VectorPointer offset is
4797 // modeled explicitly as operand.
4798 if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
4799 return;
4800
4801 // All other ops are allowed, but we reject uses that cannot be converted
4802 // when checking all allowed consumers (store interleave groups) below.
4803 if (!InterleaveR)
4804 continue;
4805
4806 // Bail out on non-consecutive interleave groups.
4807 if (!isConsecutiveInterleaveGroup(InterleaveR, VF, TypeInfo,
4808 VectorRegWidth))
4809 return;
4810
4811 // Skip read interleave groups.
4812 if (InterleaveR->getStoredValues().empty())
4813 continue;
4814
4815 // Narrow interleave groups, if all operands are already matching narrow
4816 // ops.
4817 auto *Member0 = InterleaveR->getStoredValues()[0];
4818 if (isAlreadyNarrow(Member0) &&
4819 all_of(InterleaveR->getStoredValues(),
4820 [Member0](VPValue *VPV) { return Member0 == VPV; })) {
4821 StoreGroups.push_back(InterleaveR);
4822 continue;
4823 }
4824
4825 // For now, we only support full interleave groups storing load interleave
4826 // groups.
4827 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
4828 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
4829 if (!DefR)
4830 return false;
4831 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
4832 return IR && IR->getInterleaveGroup()->isFull() &&
4833 IR->getVPValue(Op.index()) == Op.value();
4834 })) {
4835 StoreGroups.push_back(InterleaveR);
4836 continue;
4837 }
4838
4839 // Check if all values feeding InterleaveR are matching wide recipes whose
4840 // operands can be narrowed.
4841 auto *WideMember0 =
4842 dyn_cast_or_null<VPWidenRecipe>(InterleaveR->getStoredValues()[0]);
4843 if (!WideMember0)
4844 return;
4845 for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
4846 auto *R = dyn_cast<VPWidenRecipe>(V);
4847 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
4848 R->getNumOperands() > 2)
4849 return;
4850 if (any_of(enumerate(R->operands()),
4851 [WideMember0, Idx = I](const auto &P) {
4852 const auto &[OpIdx, OpV] = P;
4853 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
4854 }))
4855 return;
4856 }
4857 StoreGroups.push_back(InterleaveR);
4858 }
4859
4860 if (StoreGroups.empty())
4861 return;
4862
4863 // Narrow the operation tree rooted at each store group, converting the load
4864 // interleave groups feeding it to single wide loads along the way.
4865 SmallPtrSet<VPValue *, 4> NarrowedOps;
4866 for (auto *StoreGroup : StoreGroups) {
4867 VPValue *Res =
4868 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
4869 auto *SI =
4870 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
4871 auto *S = new VPWidenStoreRecipe(
4872 *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
4873 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
4874 S->insertBefore(StoreGroup);
4875 StoreGroup->eraseFromParent();
4876 }
4877
4878 // Adjust induction to reflect that the transformed plan only processes one
4879 // original iteration.
4880 auto *CanIV = VectorLoop->getCanonicalIV();
4881 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
4882 VPBuilder PHBuilder(Plan.getVectorPreheader());
4883
4884 VPValue *UF = Plan.getOrAddLiveIn(
4885 ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF()));
4886 if (VF.isScalable()) {
4887 VPValue *VScale = PHBuilder.createElementCount(
4888 VectorLoop->getCanonicalIVType(), ElementCount::getScalable(1));
4889 VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
4890 Inc->setOperand(1, VScaleUF);
4891 Plan.getVF().replaceAllUsesWith(VScale);
4892 } else {
4893 Inc->setOperand(1, UF);
4894 Plan.getVF().replaceAllUsesWith(
4895 Plan.getConstantInt(CanIV->getScalarType(), 1));
4896 }
4897 removeDeadRecipes(Plan);
4898}
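A simplified end-to-end sketch of the transform (illustrative only, fixed VF = 2, UF = 1): consider a loop copying a two-field i32 struct stream,

  // for (int i = 0; i < n; ++i) {
  //   A[2*i]   = B[2*i];
  //   A[2*i+1] = B[2*i+1];
  // }
  //
  // The initial plan uses a factor-2 load interleave group and a factor-2 store
  // interleave group, together covering two original iterations per vector
  // iteration. After narrowing, the store group becomes a single consecutive
  // VPWidenStoreRecipe of <2 x i32> fed by a single consecutive
  // VPWidenLoadRecipe, and the canonical IV increment changes from VF * UF to
  // UF (scaled by vscale for scalable VFs), so each vector iteration now
  // processes one original iteration.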
4899
4900/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
4901/// BranchOnCond recipe.
4902 void VPlanTransforms::addBranchWeightToMiddleTerminator(
4903 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
4904 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
4905 auto *MiddleTerm =
4906 dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator());
4907 // Only add branch metadata if there is a (conditional) terminator.
4908 if (!MiddleTerm)
4909 return;
4910
4911 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
4912 "must have a BranchOnCond");
4913 // Assume that `TripCount % VectorStep` is uniformly distributed.
4914 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
4915 if (VF.isScalable() && VScaleForTuning.has_value())
4916 VectorStep *= *VScaleForTuning;
4917 assert(VectorStep > 0 && "trip count should not be zero");
4918 MDBuilder MDB(Plan.getContext());
4919 MDNode *BranchWeights =
4920 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
4921 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
4922}
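For example, with UF = 2 and a fixed VF of 8, VectorStep is 16 and the terminator gets weights {1, 15}, matching the assumption that TripCount % VectorStep takes each of its 16 possible values equally often. A minimal standalone sketch of the same computation (hypothetical helper, not the LoopVectorize API):

  #include <cassert>
  #include <cstdint>
  #include <optional>
  #include <utility>

  // Mirrors the weight computation above: one side gets weight 1 and the other
  // VectorStep - 1; a scalable VF is optionally scaled by an assumed
  // vscale-for-tuning value.
  static std::pair<uint32_t, uint32_t>
  middleBranchWeights(unsigned UF, unsigned MinVF, bool Scalable,
                      std::optional<unsigned> VScaleForTuning) {
    unsigned VectorStep = UF * MinVF;
    if (Scalable && VScaleForTuning)
      VectorStep *= *VScaleForTuning;
    return {1, VectorStep - 1};
  }

  int main() {
    auto W = middleBranchWeights(/*UF=*/2, /*MinVF=*/8, /*Scalable=*/false, {});
    assert(W.first == 1 && W.second == 15);
    auto WS = middleBranchWeights(/*UF=*/1, /*MinVF=*/4, /*Scalable=*/true, 2);
    assert(WS.second == 7); // vscale-for-tuning 2: step 8, weights {1, 7}
    return 0;
  }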
4923
4924/// Compute and return the end value for \p WideIV, unless it is truncated. If
4925/// the induction recipe is not canonical, creates a VPDerivedIVRecipe to
4926/// compute the end value of the induction.
4927 static VPValue *tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV,
4928 VPBuilder &VectorPHBuilder,
4929 VPTypeAnalysis &TypeInfo,
4930 VPValue *VectorTC) {
4931 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
4932 // Truncated wide inductions resume from the last lane of their vector value
4933 // in the last vector iteration, which is handled elsewhere.
4934 if (WideIntOrFp && WideIntOrFp->getTruncInst())
4935 return nullptr;
4936
4937 VPValue *Start = WideIV->getStartValue();
4938 VPValue *Step = WideIV->getStepValue();
4939 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
4940 VPValue *EndValue = VectorTC;
4941 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
4942 EndValue = VectorPHBuilder.createDerivedIV(
4943 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
4944 Start, VectorTC, Step);
4945 }
4946
4947 // EndValue is derived from the vector trip count (which has the same type as
4948 // the widest induction) and thus may be wider than the induction here.
4949 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
4950 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
4951 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
4952 ScalarTypeOfWideIV,
4953 WideIV->getDebugLoc());
4954 }
4955
4956 return EndValue;
4957}
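As a worked example (not from the listing): for a canonical induction (start 0, step 1) the end value is simply the vector trip count VTC, while a wide induction with start 5 and step 3 resumes from 5 + VTC * 3; if the induction is narrower than VTC, the result is truncated back to the induction type. A minimal integer-only sketch with hypothetical names:

  #include <cassert>
  #include <cstdint>

  // Sketch of the end-value rule for an integer induction: the value the scalar
  // loop resumes from after the vector loop has executed VTC iterations.
  static int64_t inductionEndValue(int64_t Start, int64_t Step, int64_t VTC) {
    return Start + VTC * Step; // derived-IV form; canonical IV: Start=0, Step=1
  }

  int main() {
    assert(inductionEndValue(0, 1, 8) == 8);  // canonical IV: end value == VTC
    assert(inductionEndValue(5, 3, 8) == 29); // i = 5, 8, 11, ... -> resume at 29
    return 0;
  }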
4958
4959 void VPlanTransforms::addScalarResumePhis(
4960 VPlan &Plan, DenseMap<VPValue *, VPValue *> &IVEndValues) {
4961 VPTypeAnalysis TypeInfo(Plan);
4962 auto *ScalarPH = Plan.getScalarPreheader();
4963 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
4964 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
4965 VPBuilder VectorPHBuilder(
4966 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
4967 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
4968 for (VPRecipeBase &PhiR : Plan.getScalarPreheader()->phis()) {
4969 auto *ResumePhiR = cast<VPPhi>(&PhiR);
4970
4971 // TODO: Extract final value from induction recipe initially, optimize to
4972 // pre-computed end value together in optimizeInductionExitUsers.
4973 auto *VectorPhiR = cast<VPHeaderPHIRecipe>(ResumePhiR->getOperand(0));
4974 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
4975 if (VPValue *EndValue = tryToComputeEndValueForInduction(
4976 WideIVR, VectorPHBuilder, TypeInfo, &Plan.getVectorTripCount())) {
4977 IVEndValues[WideIVR] = EndValue;
4978 ResumePhiR->setOperand(0, EndValue);
4979 ResumePhiR->setName("bc.resume.val");
4980 continue;
4981 }
4982 // TODO: Also handle truncated inductions here. Computing end-values
4983 // separately should be done as VPlan-to-VPlan optimization, after
4984 // legalizing all resume values to use the last lane from the loop.
4985 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
4986 "should only skip truncated wide inductions");
4987 continue;
4988 }
4989
4990 // The backedge value provides the value to resume coming out of a loop,
4991 // which for FORs is a vector whose last element needs to be extracted. The
4992 // start value provides the value if the loop is bypassed.
4993 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
4994 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
4995 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
4996 "Cannot handle loops with uncountable early exits");
4997 if (IsFOR)
4998 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
4999 VPInstruction::ExtractLastElement, {ResumeFromVectorLoop},
5000 DebugLoc::getUnknown(), "vector.recur.extract");
5001 ResumePhiR->setName(IsFOR ? "scalar.recur.init" : "bc.merge.rdx");
5002 ResumePhiR->setOperand(0, ResumeFromVectorLoop);
5003 }
5004}
5005
5006 void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
5007 VFRange &Range) {
5008 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5009 auto *ScalarPHVPBB = Plan.getScalarPreheader();
5010 auto *MiddleVPBB = Plan.getMiddleBlock();
5011 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
5012 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5013
5014 auto IsScalableOne = [](ElementCount VF) -> bool {
5015 return VF == ElementCount::getScalable(1);
5016 };
5017
5018 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5019 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5020 if (!FOR)
5021 continue;
5022
5023 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5024 "Cannot handle loops with uncountable early exits");
5025
5026 // This is the second phase of vectorizing first-order recurrences, creating
5027 // extracts for users outside the loop. An overview of the transformation is
5028 // described below. Suppose we have the following loop, with a use of the
5029 // last a[i-1] after the loop:
5030 //
5031 // for (int i = 0; i < n; ++i) {
5032 // t = a[i - 1];
5033 // b[i] = a[i] - t;
5034 // }
5035 // use t;
5036 //
5037 // There is a first-order recurrence on "a". For this loop, the shorthand
5038 // scalar IR looks like:
5039 //
5040 // scalar.ph:
5041 // s.init = a[-1]
5042 // br scalar.body
5043 //
5044 // scalar.body:
5045 // i = phi [0, scalar.ph], [i+1, scalar.body]
5046 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5047 // s2 = a[i]
5048 // b[i] = s2 - s1
5049 // br cond, scalar.body, exit.block
5050 //
5051 // exit.block:
5052 // use = lcssa.phi [s1, scalar.body]
5053 //
5054 // In this example, s1 is a recurrence because its value depends on the
5055 // previous iteration. In the first phase of vectorization, we created a
5056 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5057 // for users in the scalar preheader and exit block.
5058 //
5059 // vector.ph:
5060 // v_init = vector(..., ..., ..., a[-1])
5061 // br vector.body
5062 //
5063 // vector.body
5064 // i = phi [0, vector.ph], [i+4, vector.body]
5065 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5066 // v2 = a[i, i+1, i+2, i+3]
5067 // b[i] = v2 - v1
5068 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
5069 // b[i, i+1, i+2, i+3] = v2 - v1
5070 // br cond, vector.body, middle.block
5071 //
5072 // middle.block:
5073 // vector.recur.extract.for.phi = v2(2)
5074 // vector.recur.extract = v2(3)
5075 // br cond, scalar.ph, exit.block
5076 //
5077 // scalar.ph:
5078 // scalar.recur.init = phi [vector.recur.extract, middle.block],
5079 // [s.init, otherwise]
5080 // br scalar.body
5081 //
5082 // scalar.body:
5083 // i = phi [0, scalar.ph], [i+1, scalar.body]
5084 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5085 // s2 = a[i]
5086 // b[i] = s2 - s1
5087 // br cond, scalar.body, exit.block
5088 //
5089 // exit.block:
5090 // lo = lcssa.phi [s1, scalar.body],
5091 // [vector.recur.extract.for.phi, middle.block]
5092 //
5093 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
5094 // Extract the penultimate value of the recurrence and use it as operand for
5095 // the VPIRInstruction modeling the phi.
5096 for (VPUser *U : FOR->users()) {
5097 using namespace llvm::VPlanPatternMatch;
5098 if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
5099 continue;
5100 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5101 // penultimate value of the recurrence. Instead we rely on the existing
5102 // extract of the last element from the result of
5103 // VPInstruction::FirstOrderRecurrenceSplice.
5104 // TODO: Consider vscale_range info and UF.
5105 if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne,
5106 Range))
5107 return;
5108 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5109 VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
5110 {}, "vector.recur.extract.for.phi");
5111 cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement);
5112 }
5113 }
5114}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:80
static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, ScalarEvolution &SE)
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool sinkScalarOperands(VPlan &Plan)
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R)
Return true if we do not know how to (mechanically) hoist or sink R out of a loop region.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, ScalarEvolution &SE, const Loop *L)
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck)
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL)
Replace recipes with their EVL variants.
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink)
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static VPValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute and return the end value for WideIV, unless it is truncated.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
From the definition of llvm.experimental.get.vector.length, VPInstruction::ExplicitVectorLength(AVL) ...
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, ScalarEvolution &SE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, bool CheckReads, const SmallPtrSetImpl< VPRecipeBase * > *ExcludeRecipes=nullptr)
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static void recursivelyDeleteDeadRecipes(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, ScalarEvolution &SE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ElementCount VF, VPTypeAnalysis &TypeInfo, TypeSize VectorRegWidth)
Returns true if IR is a full interleave group with factor and number of members both equal to VF.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1513
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1489
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
iterator begin() const
Definition ArrayRef.h:130
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1541
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1078
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize get(ScalarTy Quantity, bool Scalable)
Definition TypeSize.h:340
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:88
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:97
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
op_range operands()
Definition User.h:292
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3607
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3968
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4043
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:3995
iterator end()
Definition VPlan.h:4005
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4003
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4056
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:216
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:578
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:550
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:623
const VPRecipeBase & back() const
Definition VPlan.h:4017
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2514
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2548
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2538
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2554
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2534
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:81
VPRegionBlock * getParent()
Definition VPlan.h:173
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:186
size_t getNumSuccessors() const
Definition VPlan.h:219
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition VPlan.h:322
size_t getNumPredecessors() const
Definition VPlan.h:220
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:204
VPlan * getPlan()
Definition VPlan.cpp:161
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:215
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:166
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:264
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:209
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:198
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:211
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:232
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:151
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:170
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:189
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3018
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags={}, const VPIRMetadata &Metadata={})
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3549
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:431
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:426
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:404
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:416
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3718
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition VPlan.h:3639
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3063
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2051
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2094
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2083
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4121
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:4145
Class to record and manage LLVM IR flags.
Definition VPlan.h:609
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \I , if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:982
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1036
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1129
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1071
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1066
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1063
@ CanonicalIVIncrementForPart
Definition VPlan.h:1056
void setName(StringRef NewName)
Set the symbolic name for the VPInstruction.
Definition VPlan.h:1243
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2656
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2648
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2677
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2730
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2688
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3205
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:387
VPRegionBlock * getRegion()
Definition VPlan.h:4273
VPBasicBlock * getParent()
Definition VPlan.h:408
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:479
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:2892
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:2781
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4156
const VPBlockBase * getEntry() const
Definition VPlan.h:4192
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4267
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4224
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4209
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4254
const VPBlockBase * getExiting() const
Definition VPlan.h:4204
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4217
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2937
bool isSingleScalar() const
Definition VPlan.h:2978
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3002
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3788
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:531
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:595
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:207
operand_range operands()
Definition VPlanValue.h:275
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:251
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:246
void addOperand(VPValue *Operand)
Definition VPlanValue.h:240
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:48
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1374
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:131
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition VPlanValue.h:183
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:85
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:193
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1377
unsigned getNumUsers() const
Definition VPlanValue.h:113
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition VPlanValue.h:178
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1381
user_range users()
Definition VPlanValue.h:134
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1913
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3681
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1554
Instruction::CastOps getOpcode() const
Definition VPlan.h:1590
A recipe for handling GEP instructions.
Definition VPlan.h:1850
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2118
PHINode * getPHINode() const
Definition VPlan.h:2160
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2146
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2163
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2194
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2270
A recipe for widening vector intrinsics.
Definition VPlan.h:1604
A common base class for widening memory operations.
Definition VPlan.h:3248
A recipe for widened phis.
Definition VPlan.h:2328
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1514
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition VPlan.h:4286
bool hasVF(ElementCount VF) const
Definition VPlan.h:4491
LLVMContext & getContext() const
Definition VPlan.h:4479
VPBasicBlock * getEntry()
Definition VPlan.h:4379
VPValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4470
bool hasScalableVF() const
Definition VPlan.h:4492
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4477
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4473
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4441
VPValue * getTrue()
Return a VPValue wrapping i1 true.
Definition VPlan.h:4548
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4462
unsigned getUF() const
Definition VPlan.h:4511
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4625
bool hasUF(unsigned UF) const
Definition VPlan.h:4509
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4431
VPValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4554
void setVF(ElementCount VF)
Definition VPlan.h:4485
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4524
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1011
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4455
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4404
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4603
VPValue * getFalse()
Return a VPValue wrapping i1 false.
Definition VPlan.h:4551
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4533
bool hasScalarVFOnly() const
Definition VPlan.h:4502
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4422
ArrayRef< VPValue * > getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4573
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4427
VPValue * getLiveIn(Value *V) const
Return the live-in VPValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4570
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4384
void setUF(unsigned UF)
Definition VPlan.h:4516
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4657
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2763
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
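These matchers compose with match(); a minimal sketch, assuming an llvm::Value *V in scope (the bound names X and C are illustrative):
using namespace llvm::PatternMatch;
Value *X = nullptr;
const APInt *C = nullptr;
// Does V compute X * C for some value X and constant integer (or splat) C?
if (match(V, m_Mul(m_Value(X), m_APInt(C)))) {
  // X is bound to the first operand, C to the constant multiplier.
}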
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Opcode, Op0_t, Op1_t > m_c_Binary(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLanePerPart, Op0_t > m_ExtractLastLanePerPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExtractLastElement, Op0_t > m_ExtractLastElement(const Op0_t &Op0)
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
specific_intval< 1 > m_True()
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, without matching its operands, as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
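The VPlan-side matchers follow the same shape; a hedged sketch, assuming a VPRecipeBase &R taken from some VPBasicBlock:
using namespace llvm::VPlanPatternMatch;
// Is R a conditional branch whose condition folded to the constant true?
if (match(&R, m_BranchOnCond(m_True()))) {
  // ...
}
// Is R an extract of the last element of some VPValue (operand ignored)?
if (match(&R, m_ExtractLastElement(m_VPValue()))) {
  // ...
}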
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPIRFlags getFlagsFromIndDesc(const InductionDescriptor &ID)
Extracts and returns NoWrap and FastMath flags from the induction binop in ID.
Definition VPlanUtils.h:85
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
const SCEV * getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE, const Loop *L=nullptr)
Return the SCEV expression for V.
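A small sketch of how these vputils queries are typically combined, assuming a VPValue *Def defined inside the plan; the surrounding decision is illustrative.
// Hypothetical check before narrowing Def to a scalar form.
if (vputils::isSingleScalar(Def) || vputils::onlyFirstLaneUsed(Def)) {
  // Only a single scalar value (lane 0) is needed; a full vector can be avoided.
}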
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2020
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns the intrinsic ID for the call.
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
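A self-contained illustration of enumerate with structured bindings; assumes the SmallVector, StringRef, and raw_ostream headers are included.
SmallVector<StringRef, 3> Names = {"a", "b", "c"};
for (const auto &[Idx, Name] : enumerate(Names))
  errs() << Idx << ": " << Name << "\n";   // 0: a, 1: b, 2: c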
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
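This is the idiom used when recipes are erased while walking a block; a sketch, assuming a VPBasicBlock *VPBB and a hypothetical predicate ShouldErase.
// The iterator is advanced before the element is yielded, so erasing the
// current recipe does not invalidate the traversal.
for (VPRecipeBase &R : make_early_inc_range(*VPBB))
  if (ShouldErase(R))      // hypothetical predicate
    R.eraseFromParent();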
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
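For reference, a few illustrative values:
PowerOf2Ceil(1);   // 1
PowerOf2Ceil(5);   // 8
PowerOf2Ceil(64);  // 64 (already a power of two)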
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:236
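A hedged sketch contrasting the shallow and deep traversals, assuming a VPlan &Plan; blocksOnly is the VPBlockUtils filter used to narrow the element type.
for (VPBlockBase *VPB : vp_depth_first_shallow(Plan.getEntry())) {
  // Visits top-level blocks and regions; does not descend into regions.
}
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
         vp_depth_first_deep(Plan.getEntry()))) {
  // Visits every VPBasicBlock, including those nested inside regions.
}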
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1622
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:550
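A self-contained example of the lazy filtering view; assumes the SmallVector and raw_ostream headers are included.
SmallVector<int, 8> Vals = {1, 2, 3, 4, 5, 6};
// Lazily skip odd values; the underlying container is not copied.
for (int V : make_filter_range(Vals, [](int V) { return V % 2 == 0; }))
  errs() << V << " ";   // prints: 2 4 6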
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1718
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
RecurKind
These are the kinds of recurrences that we support.
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1954
DWARFExpression::Operation Op
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1961
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
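A short sketch of the casting family, assuming an llvm::Value *V in scope:
if (auto *LI = dyn_cast<LoadInst>(V)) {   // nullptr if V is not a LoadInst
  (void)LI;
}
if (isa<StoreInst>(V)) {                  // type check only
  auto *SI = cast<StoreInst>(V);          // asserts if the type is wrong
  (void)SI;
}
Value *MaybeNull = nullptr;
auto *BO = dyn_cast_or_null<BinaryOperator>(MaybeNull);  // tolerates null input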
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1758
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
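Both helpers accept either a load or a store; a sketch, assuming an Instruction *I known to be one of the two.
Type *AccessTy = getLoadStoreType(I);              // loaded or stored type
const Value *Addr = getLoadStorePointerOperand(I); // pointer operand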
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2108
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2088
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
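An illustrative sketch, assuming an LLVMContext &Ctx:
Type *I32 = Type::getInt32Ty(Ctx);
Type *V4 = toVectorTy(I32, ElementCount::getFixed(4));      // <4 x i32>
Type *NxV4 = toVectorTy(I32, ElementCount::getScalable(4)); // <vscale x 4 x i32>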
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
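A minimal sketch of combining a scalar key with a range, the kind of keying a CSE-style pass might use; the specific fields are illustrative.
unsigned Opcode = 13;                      // hypothetical key component
SmallVector<int, 4> Operands = {1, 2, 3};  // hypothetical key component
hash_code H = hash_combine(
    Opcode, hash_combine_range(Operands.begin(), Operands.end()));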
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:761
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:784
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:787
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2370
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3380
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3339
A recipe for widening select instructions.
Definition VPlan.h:1803
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3464
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3421
static void hoistPredicatedLoads(VPlan &Plan, ScalarEvolution &SE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static void sinkPredicatedStores(VPlan &Plan, ScalarEvolution &SE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, TypeSize VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones that can be codegen'd.
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first order reductions in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replace all uses except the canonical IV...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.
static void updateScalarResumePhis(VPlan &Plan, DenseMap< VPValue *, VPValue * > &IVEndValues)
Update the resume phis in the scalar preheader after creating wide recipes for first-order recurrence...