1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
44
45using namespace llvm;
46using namespace VPlanPatternMatch;
47
48bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
49 VPlan &Plan,
50 function_ref<const InductionDescriptor *(PHINode *)>
51 GetIntOrFpInductionDescriptor,
52 const TargetLibraryInfo &TLI) {
53
54 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
55 Plan.getVectorLoopRegion());
56 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
57 // Skip blocks outside region
58 if (!VPBB->getParent())
59 break;
60 VPRecipeBase *Term = VPBB->getTerminator();
61 auto EndIter = Term ? Term->getIterator() : VPBB->end();
62 // Introduce each ingredient into VPlan.
63 for (VPRecipeBase &Ingredient :
64 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
65
66 VPValue *VPV = Ingredient.getVPSingleValue();
67 if (!VPV->getUnderlyingValue())
68 continue;
69
70 Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
71
72 VPRecipeBase *NewRecipe = nullptr;
73 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
74 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
75 const auto *II = GetIntOrFpInductionDescriptor(Phi);
76 if (!II) {
77 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
78 for (VPValue *Op : PhiR->operands())
79 NewRecipe->addOperand(Op);
80 } else {
81 VPValue *Start = Plan.getOrAddLiveIn(II->getStartValue());
82 VPValue *Step =
83 vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep());
84 // It is always safe to copy over the NoWrap and FastMath flags. In
85 // particular, when folding tail by masking, the masked-off lanes are
86 // never used, so it is safe.
88 NewRecipe = new VPWidenIntOrFpInductionRecipe(
89 Phi, Start, Step, &Plan.getVF(), *II, Flags,
90 Ingredient.getDebugLoc());
91 }
92 } else {
93 auto *VPI = cast<VPInstruction>(&Ingredient);
94 assert(!isa<PHINode>(Inst) && "phis should be handled above");
95 // Create VPWidenMemoryRecipe for loads and stores.
96 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
97 NewRecipe = new VPWidenLoadRecipe(
98 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
99 false /*Consecutive*/, false /*Reverse*/, *VPI,
100 Ingredient.getDebugLoc());
101 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
102 NewRecipe = new VPWidenStoreRecipe(
103 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
104 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
105 Ingredient.getDebugLoc());
106 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
107 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
108 Ingredient.getDebugLoc());
109 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
110 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
111 if (VectorID == Intrinsic::not_intrinsic)
112 return false;
113 NewRecipe = new VPWidenIntrinsicRecipe(
114 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
115 drop_end(Ingredient.operands()), CI->getType(), VPIRFlags(*CI),
116 *VPI, CI->getDebugLoc());
117 } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
118 NewRecipe = new VPWidenSelectRecipe(SI, Ingredient.operands(), *VPI,
119 *VPI, Ingredient.getDebugLoc());
120 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
121 NewRecipe = new VPWidenCastRecipe(
122 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
123 VPIRFlags(*CI), VPIRMetadata(*CI));
124 } else {
125 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
126 *VPI, Ingredient.getDebugLoc());
127 }
128 }
129
130 NewRecipe->insertBefore(&Ingredient);
131 if (NewRecipe->getNumDefinedValues() == 1)
132 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
133 else
134 assert(NewRecipe->getNumDefinedValues() == 0 &&
135 "Only recpies with zero or one defined values expected");
136 Ingredient.eraseFromParent();
137 }
138 }
139 return true;
140}
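// Illustrative example (for exposition only; the exact VPlan printing may
// differ): the conversion above rewrites plain VPInstructions wrapping IR
// instructions into widened recipes, e.g.
//
//   EMIT ir<%l> = load ir<%gep>            ; VPInstruction wrapping a LoadInst
//   EMIT ir<%sum> = add ir<%l>, ir<%x>
//
// becomes
//
//   WIDEN ir<%l> = load ir<%gep>           ; VPWidenLoadRecipe
//   WIDEN ir<%sum> = add ir<%l>, ir<%x>    ; VPWidenRecipe
//
// Calls are only converted when they map to a vector intrinsic; otherwise the
// whole conversion fails and returns false.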
141
142// Check if a memory operation doesn't alias with memory operations in blocks
143// between FirstBB and LastBB using scoped noalias metadata.
144// For load hoisting, we only check writes in one direction.
145// For store sinking, we check both reads and writes bidirectionally.
147 const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
148 bool CheckReads,
149 const SmallPtrSetImpl<VPRecipeBase *> *ExcludeRecipes = nullptr) {
150 if (!MemLoc.AATags.Scope)
151 return false;
152
153 const AAMDNodes &MemAA = MemLoc.AATags;
154
155 for (VPBlockBase *Block = FirstBB; Block;
156 Block = Block->getSingleSuccessor()) {
157 assert(Block->getNumSuccessors() <= 1 &&
158 "Expected at most one successor in block chain");
159 auto *VPBB = cast<VPBasicBlock>(Block);
160 for (VPRecipeBase &R : *VPBB) {
161 if (ExcludeRecipes && ExcludeRecipes->contains(&R))
162 continue;
163
164 // Skip recipes that don't need checking.
165 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
166 continue;
167
169 if (!Loc)
170 // Conservatively assume aliasing for memory operations without
171 // location.
172 return false;
173
174 // For reads, check if they don't alias in the reverse direction and
175 // skip if so.
176 if (CheckReads && R.mayReadFromMemory() &&
178 MemAA.NoAlias))
179 continue;
180
181 // Check if the memory operations may alias in the forward direction.
183 Loc->AATags.NoAlias))
184 return false;
185 }
186
187 if (Block == LastBB)
188 break;
189 }
190 return true;
191}
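// Illustrative example (for exposition; metadata node numbers are made up):
// the scoped-noalias reasoning above mirrors how such metadata is used in IR.
// Given
//
//   %l = load i32, ptr %p, !alias.scope !1
//   store i32 %v, ptr %q, !noalias !1
//
// the store is known not to alias the load because its !noalias list covers
// all of the load's scopes. The helper conservatively returns false (i.e.
// "may alias") as soon as it sees a memory recipe without a known location or
// without compatible scope metadata between FirstBB and LastBB.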
192
193/// Return true if we do not know how to (mechanically) hoist or sink \p R out
194/// of a loop region.
195static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) {
196 // Assumes don't alias anything or throw; as long as they're guaranteed to
197 // execute, they're safe to hoist.
199 return false;
200
201 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
202 // memory location is not modified in the vector loop.
203 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
204 return true;
205
206 // Allocas cannot be hoisted.
207 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
208 return RepR && RepR->getOpcode() == Instruction::Alloca;
209}
210
211static bool sinkScalarOperands(VPlan &Plan) {
212 auto Iter = vp_depth_first_deep(Plan.getEntry());
213 bool ScalarVFOnly = Plan.hasScalarVFOnly();
214 bool Changed = false;
215
217 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
218 VPBasicBlock *SinkTo, VPValue *Op) {
219 auto *Candidate =
220 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
221 if (!Candidate)
222 return;
223
224 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
225 // for now.
226 if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate))
227 return;
228
229 if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
230 return;
231
232 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
233 if (!ScalarVFOnly && RepR->isSingleScalar())
234 return;
235
236 WorkList.insert({SinkTo, Candidate});
237 };
238
239 // First, collect the operands of all recipes in replicate blocks as seeds for
240 // sinking.
241 for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
242 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
243 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
244 continue;
245 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
246 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
247 continue;
248 for (auto &Recipe : *VPBB)
249 for (VPValue *Op : Recipe.operands())
250 InsertIfValidSinkCandidate(VPBB, Op);
251 }
252
253 // Try to sink each replicate or scalar IV steps recipe in the worklist.
254 for (unsigned I = 0; I != WorkList.size(); ++I) {
255 VPBasicBlock *SinkTo;
256 VPSingleDefRecipe *SinkCandidate;
257 std::tie(SinkTo, SinkCandidate) = WorkList[I];
258
259 // All recipe users of SinkCandidate must be in the same block SinkTo or all
260 // users outside of SinkTo must only use the first lane of SinkCandidate. In
261 // the latter case, we need to duplicate SinkCandidate.
262 auto UsersOutsideSinkTo =
263 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
264 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
265 });
266 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
267 return !U->usesFirstLaneOnly(SinkCandidate);
268 }))
269 continue;
270 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
271
272 if (NeedsDuplicating) {
273 if (ScalarVFOnly)
274 continue;
275 VPSingleDefRecipe *Clone;
276 if (auto *SinkCandidateRepR =
277 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
278 // TODO: Handle converting to uniform recipes as separate transform,
279 // then cloning should be sufficient here.
280 Instruction *I = SinkCandidate->getUnderlyingInstr();
281 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
282 nullptr /*Mask*/, *SinkCandidateRepR,
283 *SinkCandidateRepR);
284 // TODO: add ".cloned" suffix to name of Clone's VPValue.
285 } else {
286 Clone = SinkCandidate->clone();
287 }
288
289 Clone->insertBefore(SinkCandidate);
290 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
291 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
292 });
293 }
294 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
295 for (VPValue *Op : SinkCandidate->operands())
296 InsertIfValidSinkCandidate(SinkTo, Op);
297 Changed = true;
298 }
299 return Changed;
300}
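// Illustrative example (for exposition; block and value names are made up):
// a scalar address computation used only inside a predicated region, e.g.
//
//   vector.body:     CLONE %gep = getelementptr %base, %iv
//   pred.store.if:   CLONE store %val, %gep
//
// is sunk into pred.store.if so it only runs for active lanes. If users
// outside the sink block remain and they only need the first lane, a
// single-scalar clone stays behind for them while the original recipe is sunk.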
301
302/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
303/// the mask.
305 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
306 if (!EntryBB || EntryBB->size() != 1 ||
307 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
308 return nullptr;
309
310 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
311}
312
313/// If \p R is a triangle region, return the 'then' block of the triangle.
315 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
316 if (EntryBB->getNumSuccessors() != 2)
317 return nullptr;
318
319 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
320 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
321 if (!Succ0 || !Succ1)
322 return nullptr;
323
324 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
325 return nullptr;
326 if (Succ0->getSingleSuccessor() == Succ1)
327 return Succ0;
328 if (Succ1->getSingleSuccessor() == Succ0)
329 return Succ1;
330 return nullptr;
331}
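// Illustrative sketch (for exposition): the "triangle" shape recognized above
// is
//
//          entry  (BranchOnMask %mask)
//          /    \
//       then     |
//          \    /
//          merge
//
// i.e. the region entry has two successors and exactly one of them has the
// other as its single successor; that block is returned as the 'then' block.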
332
333// Merge replicate regions in their successor region, if a replicate region
334// is connected to a successor replicate region with the same predicate by a
335// single, empty VPBasicBlock.
337 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
338
339 // Collect replicate regions followed by an empty block, followed by another
340 // replicate region with matching masks to process up front. This is to avoid
341 // iterator invalidation issues while merging regions.
342 SmallVector<VPRegionBlock *, 8> WorkList;
343 for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
344 vp_depth_first_deep(Plan.getEntry()))) {
345 if (!Region1->isReplicator())
346 continue;
347 auto *MiddleBasicBlock =
348 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
349 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
350 continue;
351
352 auto *Region2 =
353 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
354 if (!Region2 || !Region2->isReplicator())
355 continue;
356
357 VPValue *Mask1 = getPredicatedMask(Region1);
358 VPValue *Mask2 = getPredicatedMask(Region2);
359 if (!Mask1 || Mask1 != Mask2)
360 continue;
361
362 assert(Mask1 && Mask2 && "both regions must have conditions");
363 WorkList.push_back(Region1);
364 }
365
366 // Move recipes from Region1 to its successor region, if both are triangles.
367 for (VPRegionBlock *Region1 : WorkList) {
368 if (TransformedRegions.contains(Region1))
369 continue;
370 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
371 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
372
373 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
374 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
375 if (!Then1 || !Then2)
376 continue;
377
378 // Note: No fusion-preventing memory dependencies are expected in either
379 // region. Such dependencies should be rejected during earlier dependence
380 // checks, which guarantee accesses can be re-ordered for vectorization.
381 //
382 // Move recipes to the successor region.
383 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
384 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
385
386 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
387 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
388
389 // Move VPPredInstPHIRecipes from the merge block to the successor region's
390 // merge block. Update all users inside the successor region to use the
391 // original values.
392 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
393 VPValue *PredInst1 =
394 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
395 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
396 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
397 return cast<VPRecipeBase>(&U)->getParent() == Then2;
398 });
399
400 // Remove phi recipes that are unused after merging the regions.
401 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
402 Phi1ToMove.eraseFromParent();
403 continue;
404 }
405 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
406 }
407
408 // Remove the dead recipes in Region1's entry block.
409 for (VPRecipeBase &R :
410 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
411 R.eraseFromParent();
412
413 // Finally, remove the first region.
414 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
415 VPBlockUtils::disconnectBlocks(Pred, Region1);
416 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
417 }
418 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
419 TransformedRegions.insert(Region1);
420 }
421
422 return !TransformedRegions.empty();
423}
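// Illustrative sketch (for exposition): two replicate regions guarded by the
// same mask and separated only by an empty block,
//
//   region1 (pred.a) --> empty VPBB --> region2 (pred.b)
//
// are merged by moving region1's 'then' recipes and its VPPredInstPHI recipes
// into region2's corresponding blocks, after which region1 is disconnected
// from the CFG and only region2 remains.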
424
426 VPlan &Plan) {
427 Instruction *Instr = PredRecipe->getUnderlyingInstr();
428 // Build the triangular if-then region.
429 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
430 assert(Instr->getParent() && "Predicated instruction not in any basic block");
431 auto *BlockInMask = PredRecipe->getMask();
432 auto *MaskDef = BlockInMask->getDefiningRecipe();
433 auto *BOMRecipe = new VPBranchOnMaskRecipe(
434 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
435 auto *Entry =
436 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
437
438 // Replace predicated replicate recipe with a replicate recipe without a
439 // mask but in the replicate region.
440 auto *RecipeWithoutMask = new VPReplicateRecipe(
441 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
442 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
443 PredRecipe->getDebugLoc());
444 auto *Pred =
445 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
446
447 VPPredInstPHIRecipe *PHIRecipe = nullptr;
448 if (PredRecipe->getNumUsers() != 0) {
449 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
450 RecipeWithoutMask->getDebugLoc());
451 PredRecipe->replaceAllUsesWith(PHIRecipe);
452 PHIRecipe->setOperand(0, RecipeWithoutMask);
453 }
454 PredRecipe->eraseFromParent();
455 auto *Exiting =
456 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
458 Plan.createReplicateRegion(Entry, Exiting, RegionName);
459
460 // Note: first set Entry as region entry and then connect successors starting
461 // from it in order, to propagate the "parent" of each VPBasicBlock.
462 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
463 VPBlockUtils::connectBlocks(Pred, Exiting);
464
465 return Region;
466}
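// Illustrative sketch (for exposition): for a predicated replicate recipe of
// an instruction %i, the region built above has the shape
//
//   pred.i.entry:     BranchOnMask <mask>
//   pred.i.if:        unmasked VPReplicateRecipe for %i
//   pred.i.continue:  VPPredInstPHI (only created if the result has users)
//
// with the entry branching to both .if and .continue, and .if falling through
// to .continue.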
467
468static void addReplicateRegions(VPlan &Plan) {
469 SmallVector<VPReplicateRecipe *> WorkList;
470 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
471 vp_depth_first_deep(Plan.getEntry()))) {
472 for (VPRecipeBase &R : *VPBB)
473 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
474 if (RepR->isPredicated())
475 WorkList.push_back(RepR);
476 }
477 }
478
479 unsigned BBNum = 0;
480 for (VPReplicateRecipe *RepR : WorkList) {
481 VPBasicBlock *CurrentBlock = RepR->getParent();
482 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
483
484 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
485 SplitBlock->setName(
486 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
487 // Record predicated instructions for above packing optimizations.
488 VPRegionBlock *Region = createReplicateRegion(RepR, Plan);
489 Region->setParent(CurrentBlock->getParent());
490 VPBlockUtils::insertOnEdge(CurrentBlock, SplitBlock, Region);
491
492 VPRegionBlock *ParentRegion = Region->getParent();
493 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
494 ParentRegion->setExiting(SplitBlock);
495 }
496}
497
498/// Remove redundant VPBasicBlocks by merging them into their predecessor if
499/// the predecessor has a single successor.
500static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
501 SmallVector<VPBasicBlock *> WorkList;
502 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
503 vp_depth_first_deep(Plan.getEntry()))) {
504 // Don't fold the blocks in the skeleton of the Plan into their single
505 // predecessors for now.
506 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
507 if (!VPBB->getParent())
508 continue;
509 auto *PredVPBB =
510 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
511 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
512 isa<VPIRBasicBlock>(PredVPBB))
513 continue;
514 WorkList.push_back(VPBB);
515 }
516
517 for (VPBasicBlock *VPBB : WorkList) {
518 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
519 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
520 R.moveBefore(*PredVPBB, PredVPBB->end());
521 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
522 auto *ParentRegion = VPBB->getParent();
523 if (ParentRegion && ParentRegion->getExiting() == VPBB)
524 ParentRegion->setExiting(PredVPBB);
525 for (auto *Succ : to_vector(VPBB->successors())) {
526 VPBlockUtils::disconnectBlocks(VPBB, Succ);
527 VPBlockUtils::connectBlocks(PredVPBB, Succ);
528 }
529 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
530 }
531 return !WorkList.empty();
532}
533
534void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
535 // Convert masked VPReplicateRecipes to if-then region blocks.
536 addReplicateRegions(Plan);
537
538 bool ShouldSimplify = true;
539 while (ShouldSimplify) {
540 ShouldSimplify = sinkScalarOperands(Plan);
541 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
542 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
543 }
544}
545
546/// Remove redundant casts of inductions.
547///
548/// Such redundant casts are casts of induction variables that can be ignored,
549/// because we already proved that the casted phi is equal to the uncasted phi
550/// in the vectorized loop. There is no need to vectorize the cast - the same
551/// value can be used for both the phi and casts in the vector loop.
552static void removeRedundantInductionCasts(VPlan &Plan) {
553 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
554 auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
555 if (!IV || IV->getTruncInst())
556 continue;
557
558 // A sequence of IR Casts has potentially been recorded for IV, which
559 // *must be bypassed* when the IV is vectorized, because the vectorized IV
560 // will produce the desired casted value. This sequence forms a def-use
561 // chain and is provided in reverse order, ending with the cast that uses
562 // the IV phi. Search for the recipe of the last cast in the chain and
563 // replace it with the original IV. Note that only the final cast is
564 // expected to have users outside the cast-chain and the dead casts left
565 // over will be cleaned up later.
566 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
567 VPValue *FindMyCast = IV;
568 for (Instruction *IRCast : reverse(Casts)) {
569 VPSingleDefRecipe *FoundUserCast = nullptr;
570 for (auto *U : FindMyCast->users()) {
571 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
572 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
573 FoundUserCast = UserCast;
574 break;
575 }
576 }
577 FindMyCast = FoundUserCast;
578 }
579 FindMyCast->replaceAllUsesWith(IV);
580 }
581}
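// Illustrative example (for exposition; types are made up): for an induction
// phi with a recorded cast chain such as
//
//   %iv   = phi i32 [ 0, %ph ], [ %iv.next, %loop ]
//   %cast = sext i32 %iv to i64
//
// where the induction descriptor has recorded %cast as redundant, the recipe
// created for %cast is replaced by the widened induction recipe for %iv
// itself; the now-unused cast recipes are removed by later dead-recipe
// cleanup.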
582
583/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
584/// recipe, if it exists.
585static void removeRedundantCanonicalIVs(VPlan &Plan) {
586 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
587 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
588 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
589 for (VPUser *U : CanonicalIV->users()) {
590 WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
591 if (WidenNewIV)
592 break;
593 }
594
595 if (!WidenNewIV)
596 return;
597
598 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
599 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
600 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
601
602 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
603 continue;
604
605 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
606 // everything WidenNewIV's users need. That is, WidenOriginalIV will
607 // generate a vector phi or all users of WidenNewIV demand the first lane
608 // only.
609 if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
610 vputils::onlyFirstLaneUsed(WidenNewIV)) {
611 // We are replacing a wide canonical iv with a suitable wide induction.
612 // This is used to compute the header mask, hence all lanes will be used and
613 // we need to drop wrap flags only applying to lanes guaranteed to execute
614 // in the original scalar loop.
615 WidenOriginalIV->dropPoisonGeneratingFlags();
616 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
617 WidenNewIV->eraseFromParent();
618 return;
619 }
620 }
621}
622
623/// Returns true if \p R is dead and can be removed.
624static bool isDeadRecipe(VPRecipeBase &R) {
625 // Do remove conditional assume instructions as their conditions may be
626 // flattened.
627 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
628 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
630 if (IsConditionalAssume)
631 return true;
632
633 if (R.mayHaveSideEffects())
634 return false;
635
636 // Recipe is dead if no user keeps the recipe alive.
637 return all_of(R.definedValues(),
638 [](VPValue *V) { return V->getNumUsers() == 0; });
639}
640
641void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
642 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
643 vp_post_order_deep(Plan.getEntry()))) {
644 // The recipes in the block are processed in reverse order, to catch chains
645 // of dead recipes.
646 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
647 if (isDeadRecipe(R)) {
648 R.eraseFromParent();
649 continue;
650 }
651
652 // Check if R is a dead VPPhi <-> update cycle and remove it.
653 auto *PhiR = dyn_cast<VPPhi>(&R);
654 if (!PhiR || PhiR->getNumOperands() != 2)
655 continue;
656 VPUser *PhiUser = PhiR->getSingleUser();
657 if (!PhiUser)
658 continue;
659 VPValue *Incoming = PhiR->getOperand(1);
660 if (PhiUser != Incoming->getDefiningRecipe() ||
661 Incoming->getNumUsers() != 1)
662 continue;
663 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
664 PhiR->eraseFromParent();
665 Incoming->getDefiningRecipe()->eraseFromParent();
666 }
667 }
668}
669
672 Instruction::BinaryOps InductionOpcode,
673 FPMathOperator *FPBinOp, Instruction *TruncI,
674 VPValue *StartV, VPValue *Step, DebugLoc DL,
675 VPBuilder &Builder) {
676 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
677 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
678 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
679 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
680 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
681
682 // Truncate base induction if needed.
683 VPTypeAnalysis TypeInfo(Plan);
684 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
685 if (TruncI) {
686 Type *TruncTy = TruncI->getType();
687 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
688 "Not truncating.");
689 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
690 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
691 ResultTy = TruncTy;
692 }
693
694 // Truncate step if needed.
695 Type *StepTy = TypeInfo.inferScalarType(Step);
696 if (ResultTy != StepTy) {
697 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
698 "Not truncating.");
699 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
700 auto *VecPreheader =
702 VPBuilder::InsertPointGuard Guard(Builder);
703 Builder.setInsertPoint(VecPreheader);
704 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
705 }
706 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
707 &Plan.getVF(), DL);
708}
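// Illustrative usage (for exposition; mirrors the wide-induction call site
// further down in this file):
//
//   const InductionDescriptor &ID = WideIV->getInductionDescriptor();
//   VPValue *Steps = createScalarIVSteps(
//       Plan, ID.getKind(), ID.getInductionOpcode(),
//       dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
//       WideIV->getTruncInst(), WideIV->getStartValue(),
//       WideIV->getStepValue(), WideIV->getDebugLoc(), Builder);
//
// The result is a scalar-steps recipe producing per-lane scalar values of the
// induction, derived from the canonical IV.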
709
712 for (unsigned I = 0; I != Users.size(); ++I) {
714 if (isa<VPHeaderPHIRecipe>(Cur))
715 continue;
716 for (VPValue *V : Cur->definedValues())
717 Users.insert_range(V->users());
718 }
719 return Users.takeVector();
720}
721
722/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
723/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
724/// generates scalar values.
725static VPValue *
727 VPlan &Plan, VPBuilder &Builder) {
729 VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0);
730 VPValue *StepV = PtrIV->getOperand(1);
732 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
733 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
734
735 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
736 PtrIV->getDebugLoc(), "next.gep");
737}
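// Illustrative sketch (for exposition; names approximate): a pointer
// induction whose users only need scalar addresses, e.g.
//
//   WIDEN-POINTER-INDUCTION %ptr.iv = phi %start, %step
//
// is replaced by scalar steps over the canonical IV plus a pointer add:
//
//   %steps    = SCALAR-STEPS 0, %step
//   %next.gep = ptradd %start, %steps
//
// so no vector of pointers needs to be materialized.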
738
739/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
740/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
741/// VPWidenPointerInductionRecipe will generate vectors only. If some users
742/// require vectors while others require scalars, the scalar uses need to extract
743/// the scalars from the generated vectors (Note that this is different to how
744/// int/fp inductions are handled). Legalize extract-from-ends using uniform
745/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
746/// the correct end value is available. Also optimize
747/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
748/// providing them scalar steps built on the canonical scalar IV and update the
749/// original IV's users. This is an optional optimization to reduce the needs of
750/// vector extracts.
753 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
754 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
755 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
756 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
757 if (!PhiR)
758 continue;
759
760 // Try to narrow wide and replicating recipes to uniform recipes, based on
761 // VPlan analysis.
762 // TODO: Apply to all recipes in the future, to replace legacy uniformity
763 // analysis.
764 auto Users = collectUsersRecursively(PhiR);
765 for (VPUser *U : reverse(Users)) {
766 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
767 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
768 // Skip recipes that shouldn't be narrowed.
769 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
770 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
771 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
772 continue;
773
774 // Skip recipes that may have other lanes than their first used.
776 continue;
777
778 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
779 Def->operands(), /*IsUniform*/ true,
780 /*Mask*/ nullptr, /*Flags*/ *Def);
781 Clone->insertAfter(Def);
782 Def->replaceAllUsesWith(Clone);
783 }
784
785 // Replace wide pointer inductions which have only their scalars used by
786 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
787 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
788 if (!Plan.hasScalarVFOnly() &&
789 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
790 continue;
791
792 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
793 PtrIV->replaceAllUsesWith(PtrAdd);
794 continue;
795 }
796
797 // Replace widened induction with scalar steps for users that only use
798 // scalars.
799 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
800 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
801 return U->usesScalars(WideIV);
802 }))
803 continue;
804
805 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
807 Plan, ID.getKind(), ID.getInductionOpcode(),
808 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
809 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
810 WideIV->getDebugLoc(), Builder);
811
812 // Update scalar users of IV to use Step instead.
813 if (!HasOnlyVectorVFs) {
814 assert(!Plan.hasScalableVF() &&
815 "plans containing a scalar VF cannot also include scalable VFs");
816 WideIV->replaceAllUsesWith(Steps);
817 } else {
818 bool HasScalableVF = Plan.hasScalableVF();
819 WideIV->replaceUsesWithIf(Steps,
820 [WideIV, HasScalableVF](VPUser &U, unsigned) {
821 if (HasScalableVF)
822 return U.usesFirstLaneOnly(WideIV);
823 return U.usesScalars(WideIV);
824 });
825 }
826 }
827}
828
829/// Check if \p VPV is an untruncated wide induction, either before or after the
830/// increment. If so return the header IV (before the increment), otherwise
831/// return null.
833 ScalarEvolution &SE) {
834 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
835 if (WideIV) {
836 // VPV itself is a wide induction, separately compute the end value for exit
837 // users if it is not a truncated IV.
838 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
839 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
840 }
841
842 // Check if VPV is an optimizable induction increment.
843 VPRecipeBase *Def = VPV->getDefiningRecipe();
844 if (!Def || Def->getNumOperands() != 2)
845 return nullptr;
846 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
847 if (!WideIV)
848 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
849 if (!WideIV)
850 return nullptr;
851
852 auto IsWideIVInc = [&]() {
853 auto &ID = WideIV->getInductionDescriptor();
854
855 // Check if VPV increments the induction by the induction step.
856 VPValue *IVStep = WideIV->getStepValue();
857 switch (ID.getInductionOpcode()) {
858 case Instruction::Add:
859 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
860 case Instruction::FAdd:
862 m_Specific(IVStep)));
863 case Instruction::FSub:
864 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
865 m_Specific(IVStep)));
866 case Instruction::Sub: {
867 // IVStep will be the negated step of the subtraction. Check if Step == -1
868 // * IVStep.
869 VPValue *Step;
870 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
871 return false;
872 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, SE);
873 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, SE);
874 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
875 !isa<SCEVCouldNotCompute>(StepSCEV) &&
876 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
877 }
878 default:
879 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
880 match(VPV, m_GetElementPtr(m_Specific(WideIV),
881 m_Specific(WideIV->getStepValue())));
882 }
883 llvm_unreachable("should have been covered by switch above");
884 };
885 return IsWideIVInc() ? WideIV : nullptr;
886}
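// Illustrative examples (for exposition): increments recognized above for a
// wide IV %iv with step %s include
//
//   %inc = add %iv, %s              ; integer IVs
//   %inc = fadd %iv, %s             ; FP IVs (fsub analogously)
//   %inc = sub %iv, %t              ; if SCEV proves %t == -%s
//   %inc = getelementptr %iv, %s    ; pointer IVs
//
// In each case the header phi %iv (the pre-increment value) is returned.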
887
888/// Attempts to optimize the induction variable exit values for users in the
889/// early exit block.
891 VPTypeAnalysis &TypeInfo,
892 VPBlockBase *PredVPBB,
893 VPValue *Op,
894 ScalarEvolution &SE) {
895 VPValue *Incoming, *Mask;
898 return nullptr;
899
900 auto *WideIV = getOptimizableIVOf(Incoming, SE);
901 if (!WideIV)
902 return nullptr;
903
904 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
905 if (WideIntOrFp && WideIntOrFp->getTruncInst())
906 return nullptr;
907
908 // Calculate the final index.
909 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
910 auto *CanonicalIV = LoopRegion->getCanonicalIV();
911 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
912 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
913
914 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
915 VPValue *FirstActiveLane =
916 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
917 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
918 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
919 FirstActiveLaneType, DL);
920 VPValue *EndValue =
921 B.createNaryOp(Instruction::Add, {CanonicalIV, FirstActiveLane}, DL);
922
923 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if
924 // Incoming differs from it, the exit is using the incremented value and we
925 // need to add the step.
926 if (Incoming != WideIV) {
927 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
928 EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
929 }
930
931 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
932 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
933 VPValue *Start = WideIV->getStartValue();
934 VPValue *Step = WideIV->getStepValue();
935 EndValue = B.createDerivedIV(
936 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
937 Start, EndValue, Step);
938 }
939
940 return EndValue;
941}
942
943/// Attempts to optimize the induction variable exit values for users in the
944/// exit block coming from the latch in the original scalar loop.
946 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
950 return nullptr;
951
952 auto *WideIV = getOptimizableIVOf(Incoming, SE);
953 if (!WideIV)
954 return nullptr;
955
956 VPValue *EndValue = EndValues.lookup(WideIV);
957 assert(EndValue && "end value must have been pre-computed");
958
959 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if
960 // Incoming differs from it, the exit is using the incremented value and we
961 // don't need to subtract the step.
962 if (Incoming != WideIV)
963 return EndValue;
964
965 // Otherwise, subtract the step from the EndValue.
966 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
967 VPValue *Step = WideIV->getStepValue();
968 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
969 if (ScalarTy->isIntegerTy())
970 return B.createNaryOp(Instruction::Sub, {EndValue, Step},
971 DebugLoc::getUnknown(), "ind.escape");
972 if (ScalarTy->isPointerTy()) {
973 Type *StepTy = TypeInfo.inferScalarType(Step);
974 auto *Zero = Plan.getConstantInt(StepTy, 0);
975 return B.createPtrAdd(EndValue,
976 B.createNaryOp(Instruction::Sub, {Zero, Step}),
977 DebugLoc::getUnknown(), "ind.escape");
978 }
979 if (ScalarTy->isFloatingPointTy()) {
980 const auto &ID = WideIV->getInductionDescriptor();
981 return B.createNaryOp(
982 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
983 ? Instruction::FSub
984 : Instruction::FAdd,
985 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
986 }
987 llvm_unreachable("all possible induction types must be handled");
988 return nullptr;
989}
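// Illustrative sketch (for exposition): when the scalar loop exits with the
// pre-increment IV value, the escaping value is recomputed from the
// pre-computed end value as
//
//   escape = EndValue - Step                 ; integer IVs
//   escape = ptradd(EndValue, 0 - Step)      ; pointer IVs
//   escape = EndValue fsub/fadd Step         ; FP IVs, depending on the
//                                            ; induction opcode
//
// which matches the three cases handled above.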
990
992 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
993 ScalarEvolution &SE) {
994 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
995 VPTypeAnalysis TypeInfo(Plan);
996 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
997 for (VPRecipeBase &R : ExitVPBB->phis()) {
998 auto *ExitIRI = cast<VPIRPhi>(&R);
999
1000 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1001 VPValue *Escape = nullptr;
1002 if (PredVPBB == MiddleVPBB)
1003 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
1004 ExitIRI->getOperand(Idx),
1005 EndValues, SE);
1006 else
1007 Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
1008 ExitIRI->getOperand(Idx), SE);
1009 if (Escape)
1010 ExitIRI->setOperand(Idx, Escape);
1011 }
1012 }
1013 }
1014}
1015
1016/// Remove redundant EpxandSCEVRecipes in \p Plan's entry block by replacing
1017/// them with already existing recipes expanding the same SCEV expression.
1020
1021 for (VPRecipeBase &R :
1023 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1024 if (!ExpR)
1025 continue;
1026
1027 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1028 if (Inserted)
1029 continue;
1030 ExpR->replaceAllUsesWith(V->second);
1031 ExpR->eraseFromParent();
1032 }
1033}
1034
1036 SmallVector<VPValue *> WorkList;
1038 WorkList.push_back(V);
1039
1040 while (!WorkList.empty()) {
1041 VPValue *Cur = WorkList.pop_back_val();
1042 if (!Seen.insert(Cur).second)
1043 continue;
1044 VPRecipeBase *R = Cur->getDefiningRecipe();
1045 if (!R)
1046 continue;
1047 if (!isDeadRecipe(*R))
1048 continue;
1049 append_range(WorkList, R->operands());
1050 R->eraseFromParent();
1051 }
1052}
1053
1054/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1055/// Returns an optional pair, where the first element indicates whether it is
1056/// an intrinsic ID.
1057static std::optional<std::pair<bool, unsigned>>
1059 return TypeSwitch<const VPSingleDefRecipe *,
1060 std::optional<std::pair<bool, unsigned>>>(R)
1063 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1064 .Case<VPWidenIntrinsicRecipe>([](auto *I) {
1065 return std::make_pair(true, I->getVectorIntrinsicID());
1066 })
1067 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
1068 // For recipes that do not directly map to LLVM IR instructions,
1069 // assign opcodes after the last VPInstruction opcode (which is also
1070 // after the last IR Instruction opcode), based on the VPDefID.
1071 return std::make_pair(false,
1072 VPInstruction::OpsEnd + 1 + I->getVPDefID());
1073 })
1074 .Default([](auto *) { return std::nullopt; });
1075}
1076
1077/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1078/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1079/// Operands are foldable live-ins.
1081 ArrayRef<VPValue *> Operands,
1082 const DataLayout &DL,
1083 VPTypeAnalysis &TypeInfo) {
1084 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1085 if (!OpcodeOrIID)
1086 return nullptr;
1087
1089 for (VPValue *Op : Operands) {
1090 if (!Op->isLiveIn() || !Op->getLiveInIRValue())
1091 return nullptr;
1092 Ops.push_back(Op->getLiveInIRValue());
1093 }
1094
1095 auto FoldToIRValue = [&]() -> Value * {
1096 InstSimplifyFolder Folder(DL);
1097 if (OpcodeOrIID->first) {
1098 if (R.getNumOperands() != 2)
1099 return nullptr;
1100 unsigned ID = OpcodeOrIID->second;
1101 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1102 TypeInfo.inferScalarType(&R));
1103 }
1104 unsigned Opcode = OpcodeOrIID->second;
1105 if (Instruction::isBinaryOp(Opcode))
1106 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1107 Ops[0], Ops[1]);
1108 if (Instruction::isCast(Opcode))
1109 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1110 TypeInfo.inferScalarType(R.getVPSingleValue()));
1111 switch (Opcode) {
1113 return Folder.FoldSelect(Ops[0], Ops[1],
1115 case VPInstruction::Not:
1116 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1118 case Instruction::Select:
1119 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1120 case Instruction::ICmp:
1121 case Instruction::FCmp:
1122 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1123 Ops[1]);
1124 case Instruction::GetElementPtr: {
1125 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1126 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1127 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1128 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1129 }
1132 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1133 Ops[0], Ops[1],
1134 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1135 // An extract of a live-in is an extract of a broadcast, so return the
1136 // broadcasted element.
1137 case Instruction::ExtractElement:
1138 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1139 return Ops[0];
1140 }
1141 return nullptr;
1142 };
1143
1144 if (Value *V = FoldToIRValue())
1145 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1146 return nullptr;
1147}
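// Illustrative example (for exposition; syntax approximate): when every
// operand of a recipe is a constant live-in, the folding above can turn e.g.
//
//   EMIT vp<%r> = add ir<4>, ir<8>
//
// into the live-in constant ir<12>; simplifyRecipe below then replaces all
// uses of vp<%r> with that live-in.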
1148
1149/// Try to simplify VPSingleDefRecipe \p Def.
1151 VPlan *Plan = Def->getParent()->getPlan();
1152
1153 // Simplification of live-in IR values for SingleDef recipes using
1154 // InstSimplifyFolder.
1155 const DataLayout &DL =
1157 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1158 return Def->replaceAllUsesWith(V);
1159
1160 // Fold PredPHI LiveIn -> LiveIn.
1161 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1162 VPValue *Op = PredPHI->getOperand(0);
1163 if (Op->isLiveIn())
1164 PredPHI->replaceAllUsesWith(Op);
1165 }
1166
1167 VPBuilder Builder(Def);
1168 VPValue *A;
1169 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1170 Type *TruncTy = TypeInfo.inferScalarType(Def);
1171 Type *ATy = TypeInfo.inferScalarType(A);
1172 if (TruncTy == ATy) {
1173 Def->replaceAllUsesWith(A);
1174 } else {
1175 // Don't replace a scalarizing recipe with a widened cast.
1176 if (isa<VPReplicateRecipe>(Def))
1177 return;
1178 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1179
1180 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1181 ? Instruction::SExt
1182 : Instruction::ZExt;
1183 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1184 TruncTy);
1185 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1186 // UnderlyingExt has distinct return type, used to retain legacy cost.
1187 Ext->setUnderlyingValue(UnderlyingExt);
1188 }
1189 Def->replaceAllUsesWith(Ext);
1190 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1191 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1192 Def->replaceAllUsesWith(Trunc);
1193 }
1194 }
1195#ifndef NDEBUG
1196 // Verify that the cached type info for both A and its users is still
1197 // accurate by comparing it to freshly computed types.
1198 VPTypeAnalysis TypeInfo2(*Plan);
1199 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1200 for (VPUser *U : A->users()) {
1201 auto *R = cast<VPRecipeBase>(U);
1202 for (VPValue *VPV : R->definedValues())
1203 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1204 }
1205#endif
1206 }
1207
1208 // Simplify (X && Y) || (X && !Y) -> X.
1209 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1210 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1211 // recipes to be visited during simplification.
1212 VPValue *X, *Y, *Z;
1213 if (match(Def,
1216 Def->replaceAllUsesWith(X);
1217 Def->eraseFromParent();
1218 return;
1219 }
1220
1221 // x | 1 -> 1
1222 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1223 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1224
1225 // x | 0 -> x
1226 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1227 return Def->replaceAllUsesWith(X);
1228
1229 // x & 0 -> 0
1230 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1231 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1232
1233 // x && false -> false
1234 if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
1235 return Def->replaceAllUsesWith(Def->getOperand(1));
1236
1237 // (x && y) || (x && z) -> x && (y || z)
1240 // Simplify only if one of the operands has one use to avoid creating an
1241 // extra recipe.
1242 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1243 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1244 return Def->replaceAllUsesWith(
1245 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1246
1247 // x && !x -> 0
1249 return Def->replaceAllUsesWith(Plan->getFalse());
1250
1251 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1252 return Def->replaceAllUsesWith(X);
1253
1254 // select !c, x, y -> select c, y, x
1255 VPValue *C;
1256 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1257 Def->setOperand(0, C);
1258 Def->setOperand(1, Y);
1259 Def->setOperand(2, X);
1260 return;
1261 }
1262
1263 // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
1264 // tail folding it is likely that x is a header mask and can be simplified
1265 // further.
1267 m_VPValue(Z))) &&
1268 X->hasMoreThanOneUniqueUser())
1269 return Def->replaceAllUsesWith(
1270 Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
1271
1272 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1273 return Def->replaceAllUsesWith(A);
1274
1275 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1276 return Def->replaceAllUsesWith(A);
1277
1278 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1279 return Def->replaceAllUsesWith(
1280 Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0));
1281
1282 if (match(Def, m_Not(m_VPValue(A)))) {
1283 if (match(A, m_Not(m_VPValue(A))))
1284 return Def->replaceAllUsesWith(A);
1285
1286 // Try to fold Not into compares by adjusting the predicate in-place.
1287 CmpPredicate Pred;
1288 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1289 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1290 if (all_of(Cmp->users(),
1292 m_Not(m_Specific(Cmp)),
1293 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1294 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1295 for (VPUser *U : to_vector(Cmp->users())) {
1296 auto *R = cast<VPSingleDefRecipe>(U);
1297 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1298 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1299 R->setOperand(1, Y);
1300 R->setOperand(2, X);
1301 } else {
1302 // not (cmp pred) -> cmp inv_pred
1303 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1304 R->replaceAllUsesWith(Cmp);
1305 }
1306 }
1307 // If Cmp doesn't have a debug location, use the one from the negation,
1308 // to preserve the location.
1309 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1310 Cmp->setDebugLoc(Def->getDebugLoc());
1311 }
1312 }
1313 }
1314
1315 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1316 // any-of (fcmp uno %A, %B), ...
1317 if (match(Def, m_AnyOf())) {
1319 VPRecipeBase *UnpairedCmp = nullptr;
1320 for (VPValue *Op : Def->operands()) {
1321 VPValue *X;
1322 if (Op->getNumUsers() > 1 ||
1324 m_Deferred(X)))) {
1325 NewOps.push_back(Op);
1326 } else if (!UnpairedCmp) {
1327 UnpairedCmp = Op->getDefiningRecipe();
1328 } else {
1329 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1330 UnpairedCmp->getOperand(0), X));
1331 UnpairedCmp = nullptr;
1332 }
1333 }
1334
1335 if (UnpairedCmp)
1336 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1337
1338 if (NewOps.size() < Def->getNumOperands()) {
1339 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1340 return Def->replaceAllUsesWith(NewAnyOf);
1341 }
1342 }
1343
1344 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1345 // This is useful for fmax/fmin without fast-math flags, where we need to
1346 // check if any operand is NaN.
1348 m_Deferred(X)),
1350 m_Deferred(Y))))) {
1351 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1352 return Def->replaceAllUsesWith(NewCmp);
1353 }
1354
1355 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1356 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1357 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1358 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1359 TypeInfo.inferScalarType(Def))
1360 return Def->replaceAllUsesWith(Def->getOperand(1));
1361
1363 m_One()))) {
1364 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1365 if (TypeInfo.inferScalarType(X) != WideStepTy)
1366 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1367 Def->replaceAllUsesWith(X);
1368 return;
1369 }
1370
1371 // For i1 vp.merges produced by AnyOf reductions:
1372 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1374 m_VPValue(X), m_VPValue())) &&
1376 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1377 Def->setOperand(1, Def->getOperand(0));
1378 Def->setOperand(0, Y);
1379 return;
1380 }
1381
1382 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1383 if (Phi->getOperand(0) == Phi->getOperand(1))
1384 Phi->replaceAllUsesWith(Phi->getOperand(0));
1385 return;
1386 }
1387
1388 // Look through ExtractLastLane (BuildVector ....).
1389 if (match(Def, m_ExtractLastLane(m_BuildVector()))) {
1390 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1391 Def->replaceAllUsesWith(
1392 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1393 return;
1394 }
1395
1396 // Look through ExtractPenultimateElement (BuildVector ....).
1398 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1399 Def->replaceAllUsesWith(
1400 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1401 return;
1402 }
1403
1404 uint64_t Idx;
1406 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1407 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1408 return;
1409 }
1410
1411 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1412 Def->replaceAllUsesWith(
1413 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1414 return;
1415 }
1416
1417 // Look through broadcast of single-scalar when used as select conditions; in
1418 // that case the scalar condition can be used directly.
1419 if (match(Def,
1422 "broadcast operand must be single-scalar");
1423 Def->setOperand(0, C);
1424 return;
1425 }
1426
1427 if (auto *Phi = dyn_cast<VPPhi>(Def)) {
1428 if (Phi->getNumOperands() == 1)
1429 Phi->replaceAllUsesWith(Phi->getOperand(0));
1430 return;
1431 }
1432
1433 // Some simplifications can only be applied after unrolling. Perform them
1434 // below.
1435 if (!Plan->isUnrolled())
1436 return;
1437
1438 // Hoist an invariant increment Y of a phi X, by having X start at Y.
1439 if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && Y->isLiveIn() &&
1440 isa<VPPhi>(X)) {
1441 auto *Phi = cast<VPPhi>(X);
1442 if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
1443 Phi->getSingleUser() == Def) {
1444 Phi->setOperand(0, Y);
1445 Def->replaceAllUsesWith(Phi);
1446 return;
1447 }
1448 }
1449
1450 // VPVectorPointer for part 0 can be replaced by their start pointer.
1451 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Def)) {
1452 if (VecPtr->isFirstPart()) {
1453 VecPtr->replaceAllUsesWith(VecPtr->getOperand(0));
1454 return;
1455 }
1456 }
1457
1458 // VPScalarIVSteps for part 0 can be replaced by their start value, if only
1459 // the first lane is demanded.
1460 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1461 if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
1462 Steps->replaceAllUsesWith(Steps->getOperand(0));
1463 return;
1464 }
1465 }
1466 // Simplify redundant ReductionStartVector recipes after unrolling.
1467 VPValue *StartV;
1469 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1470 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1471 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1472 return PhiR && PhiR->isInLoop();
1473 });
1474 return;
1475 }
1476
1478 Def->replaceAllUsesWith(A);
1479 return;
1480 }
1481
1482 if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
1485 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1486 all_of(A->users(),
1487 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1488 return Def->replaceAllUsesWith(A);
1489 }
1490
1491 if (Plan->getUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1492 return Def->replaceAllUsesWith(A);
1493}
1494
1495void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
1496 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
1497 Plan.getEntry());
1498 VPTypeAnalysis TypeInfo(Plan);
1499 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
1500 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1501 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1502 simplifyRecipe(Def, TypeInfo);
1503 }
1504}
1505
1507 if (Plan.hasScalarVFOnly())
1508 return;
1509
1510 // Try to narrow wide and replicating recipes to single scalar recipes,
1511 // based on VPlan analysis. Only process blocks in the loop region for now,
1512 // without traversing into nested regions, as recipes in replicate regions
1513 // cannot be converted yet.
1516 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1518 VPReplicateRecipe>(&R))
1519 continue;
1520 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1521 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1522 continue;
1523
1524 auto *RepOrWidenR = cast<VPRecipeWithIRFlags>(&R);
1525 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1526 vputils::isSingleScalar(RepR->getOperand(1))) {
1527 auto *Clone = new VPReplicateRecipe(
1528 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1529 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1530 *RepR /*Metadata*/, RepR->getDebugLoc());
1531 Clone->insertBefore(RepOrWidenR);
1532 VPBuilder Builder(Clone);
1533 VPValue *ExtractOp = Clone->getOperand(0);
1534 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1535 ExtractOp =
1536 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1537 ExtractOp =
1538 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1539 Clone->setOperand(0, ExtractOp);
1540 RepR->eraseFromParent();
1541 continue;
1542 }
1543
1544 // Skip recipes that aren't single scalars.
1545 if (!vputils::isSingleScalar(RepOrWidenR))
1546 continue;
1547
1548 // Skip recipes for which conversion to single-scalar would introduce
1549 // additional broadcasts. No extra broadcasts are needed if either only
1550 // the scalars of the recipe are used, or at least one of the operands
1551 // would require a broadcast. In the latter case, the single-scalar may
1552 // need to be broadcasted, but another broadcast is removed.
1553 if (!all_of(RepOrWidenR->users(),
1554 [RepOrWidenR](const VPUser *U) {
1555 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1556 unsigned Opcode = VPI->getOpcode();
1557 if (Opcode == VPInstruction::ExtractLastLane ||
1558 Opcode == VPInstruction::ExtractLastPart ||
1559 Opcode == VPInstruction::ExtractPenultimateElement)
1560 return true;
1561 }
1562
1563 return U->usesScalars(RepOrWidenR);
1564 }) &&
1565 none_of(RepOrWidenR->operands(), [RepOrWidenR](VPValue *Op) {
1566 if (Op->getSingleUser() != RepOrWidenR)
1567 return false;
1568 // Non-constant live-ins require broadcasts, while constants do not
1569 // need explicit broadcasts.
1570 bool LiveInNeedsBroadcast =
1571 Op->isLiveIn() && !isa<Constant>(Op->getLiveInIRValue());
1572 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1573 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1574 }))
1575 continue;
1576
1577 auto *Clone = new VPReplicateRecipe(
1578 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1579 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1580 Clone->insertBefore(RepOrWidenR);
1581 RepOrWidenR->replaceAllUsesWith(Clone);
1582 if (isDeadRecipe(*RepOrWidenR))
1583 RepOrWidenR->eraseFromParent();
1584 }
1585 }
1586}
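// Illustrative sketch (for exposition; syntax approximate): a widened
// operation whose operands and result are uniform, e.g.
//
//   WIDEN vp<%d> = udiv vp<%a>, vp<%b>    ; %a, %b single-scalar
//
// is replaced by a single-scalar replicate recipe
//
//   CLONE vp<%d> = udiv vp<%a>, vp<%b>
//
// provided the change does not introduce extra broadcasts of the result or
// its operands.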
1587
1588/// Check if all of \p Blend's masks share a common logically and'ed value
1589/// and, if so, remove it from the masks.
1591 if (Blend->isNormalized())
1592 return;
1593 VPValue *CommonEdgeMask;
1594 if (!match(Blend->getMask(0),
1595 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1596 return;
1597 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1598 if (!match(Blend->getMask(I),
1599 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1600 return;
1601 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1602 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1603}
1604
1605/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1606/// to make sure the masks are simplified.
1607static void simplifyBlends(VPlan &Plan) {
1610 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1611 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1612 if (!Blend)
1613 continue;
1614
1615 removeCommonBlendMask(Blend);
1616
1617 // Try to remove redundant blend recipes.
1618 SmallPtrSet<VPValue *, 4> UniqueValues;
1619 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1620 UniqueValues.insert(Blend->getIncomingValue(0));
1621 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1622 if (!match(Blend->getMask(I), m_False()))
1623 UniqueValues.insert(Blend->getIncomingValue(I));
1624
1625 if (UniqueValues.size() == 1) {
1626 Blend->replaceAllUsesWith(*UniqueValues.begin());
1627 Blend->eraseFromParent();
1628 continue;
1629 }
1630
1631 if (Blend->isNormalized())
1632 continue;
1633
1634 // Normalize the blend so its first incoming value is used as the initial
1635 // value with the others blended into it.
1636
1637 unsigned StartIndex = 0;
1638 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1639 // If a value's mask is used only by the blend then it can be deadcoded.
1640 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1641 // that's used by multiple blends where it can be removed from them all.
1642 VPValue *Mask = Blend->getMask(I);
1643 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1644 StartIndex = I;
1645 break;
1646 }
1647 }
1648
1649 SmallVector<VPValue *, 4> OperandsWithMask;
1650 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1651
1652 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1653 if (I == StartIndex)
1654 continue;
1655 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1656 OperandsWithMask.push_back(Blend->getMask(I));
1657 }
1658
1659 auto *NewBlend =
1660 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1661 OperandsWithMask, Blend->getDebugLoc());
1662 NewBlend->insertBefore(&R);
1663
1664 VPValue *DeadMask = Blend->getMask(StartIndex);
1665 Blend->replaceAllUsesWith(NewBlend);
1666 Blend->eraseFromParent();
1667 recursivelyDeleteDeadRecipes(DeadMask);
1668
1669 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1670 VPValue *NewMask;
1671 if (NewBlend->getNumOperands() == 3 &&
1672 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1673 VPValue *Inc0 = NewBlend->getOperand(0);
1674 VPValue *Inc1 = NewBlend->getOperand(1);
1675 VPValue *OldMask = NewBlend->getOperand(2);
1676 NewBlend->setOperand(0, Inc1);
1677 NewBlend->setOperand(1, Inc0);
1678 NewBlend->setOperand(2, NewMask);
1679 if (OldMask->getNumUsers() == 0)
1680 cast<VPInstruction>(OldMask)->eraseFromParent();
1681 }
1682 }
1683 }
1684}
1685
1686 /// Optimize the width of vector induction variables in \p Plan based on a known
1687 /// constant trip count, \p BestVF and \p BestUF.
1688 static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
1689 ElementCount BestVF,
1690 unsigned BestUF) {
1691 // Only proceed if we have not completely removed the vector region.
1692 if (!Plan.getVectorLoopRegion())
1693 return false;
1694
1695 const APInt *TC;
1696 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
1697 return false;
1698
1699 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1700 // and UF. Returns at least 8.
1701 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1702 APInt AlignedTC =
1705 APInt MaxVal = AlignedTC - 1;
1706 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1707 };
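// For example, with a trip count of 1000 and VF * UF = 8, the aligned trip
// count is 1000, its maximum value 999 needs 10 active bits, and rounding up
// to a power of two yields a 16-bit induction variable.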
1708 unsigned NewBitWidth =
1709 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1710
1711 LLVMContext &Ctx = Plan.getContext();
1712 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1713
1714 bool MadeChange = false;
1715
1716 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1717 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1718 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1719
1720 // Currently we only handle canonical IVs, as it is trivial to replace their
1721 // start and stop values, and only perform the optimization when the IV has
1722 // a single user.
1723 if (!WideIV || !WideIV->isCanonical() ||
1724 WideIV->hasMoreThanOneUniqueUser() ||
1725 NewIVTy == WideIV->getScalarType())
1726 continue;
1727
1728 // Currently only handle cases where the single user is a header-mask
1729 // comparison with the backedge-taken-count.
1730 VPUser *SingleUser = WideIV->getSingleUser();
1731 if (!SingleUser ||
1732 !match(SingleUser, m_ICmp(m_Specific(WideIV),
1735 continue;
1736
1737 // Update IV operands and comparison bound to use new narrower type.
1738 auto *NewStart = Plan.getConstantInt(NewIVTy, 0);
1739 WideIV->setStartValue(NewStart);
1740 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
1741 WideIV->setStepValue(NewStep);
1742
1743 auto *NewBTC = new VPWidenCastRecipe(
1744 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1745 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1746 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
1747 Cmp->setOperand(1, NewBTC);
1748
1749 MadeChange = true;
1750 }
1751
1752 return MadeChange;
1753}
1754
1755/// Return true if \p Cond is known to be true for given \p BestVF and \p
1756/// BestUF.
1757 static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
1758 ElementCount BestVF, unsigned BestUF,
1759 ScalarEvolution &SE) {
1761 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1762 &SE](VPValue *C) {
1763 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE);
1764 });
1765
1766 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
1768 m_Specific(CanIV->getBackedgeValue()),
1769 m_Specific(&Plan.getVectorTripCount()))))
1770 return false;
1771
1772 // The compare checks CanIV + VFxUF == vector trip count. If the vector trip
1773 // count is not available as a SCEV, fall back to comparing against the
1774 // original trip count. This is stricter than necessary, as we will only
1775 // return true if the trip count == vector trip count.
1776 const SCEV *VectorTripCount =
1777 vputils::getSCEVExprForVPValue(&Plan.getVectorTripCount(), SE);
1778 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1779 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1780 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1781 "Trip count SCEV must be computable");
1782 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1783 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1784 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1785}
1786
1787/// Try to replace multiple active lane masks used for control flow with
1788/// a single, wide active lane mask instruction followed by multiple
1789/// extract subvector intrinsics. This applies to the active lane mask
1790/// instructions both in the loop and in the preheader.
1791/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1792 /// new extracts from the first active lane mask, which has its last
1793/// operand (multiplier) set to UF.
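/// For example, with VF = vscale x 4 and UF = 2, the two <vscale x 4 x i1>
/// masks are replaced by a single <vscale x 8 x i1> mask plus two
/// llvm.vector.extract calls, one extracting each half of the wide mask.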
1794 static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
1795 unsigned UF) {
1796 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1797 return false;
1798
1799 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1800 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1801 auto *Term = &ExitingVPBB->back();
1802
1803 using namespace llvm::VPlanPatternMatch;
1805 m_VPValue(), m_VPValue(), m_VPValue())))))
1806 return false;
1807
1808 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1809 LLVMContext &Ctx = Plan.getContext();
1810
1811 auto ExtractFromALM = [&](VPInstruction *ALM,
1812 SmallVectorImpl<VPValue *> &Extracts) {
1813 DebugLoc DL = ALM->getDebugLoc();
1814 for (unsigned Part = 0; Part < UF; ++Part) {
1815 SmallVector<VPValue *> Ops;
1816 Ops.append({ALM, Plan.getOrAddLiveIn(
1817 ConstantInt::get(IntegerType::getInt64Ty(Ctx),
1818 VF.getKnownMinValue() * Part))});
1819 auto *Ext =
1820 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1821 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
1822 Extracts[Part] = Ext;
1823 Ext->insertAfter(ALM);
1824 }
1825 };
1826
1827 // Create a list of each active lane mask phi, ordered by unroll part.
1828 SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr);
1829 for (VPRecipeBase &R : Header->phis()) {
1830 auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R);
1831 if (!Phi)
1832 continue;
1833 VPValue *Index = nullptr;
1834 match(Phi->getBackedgeValue(),
1836 assert(Index && "Expected index from ActiveLaneMask instruction");
1837
1838 uint64_t Part;
1839 if (match(Index,
1841 m_VPValue(), m_ConstantInt(Part))))
1842 Phis[Part] = Phi;
1843 else
1844 // Anything other than a CanonicalIVIncrementForPart is part 0
1845 Phis[0] = Phi;
1846 }
1847
1848 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1849 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
1850
1851 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
1852 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
1853
1854 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
1855 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
1856 "Expected incoming values of Phi to be ActiveLaneMasks");
1857
1858 // When using wide lane masks, the return type of the get.active.lane.mask
1859 // intrinsic is VF x UF elements wide; the multiplier (last operand) is UF.
1860 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
1861 EntryALM->setOperand(2, ALMMultiplier);
1862 LoopALM->setOperand(2, ALMMultiplier);
1863
1864 // Create UF x extract vectors and insert into preheader.
1865 SmallVector<VPValue *> EntryExtracts(UF);
1866 ExtractFromALM(EntryALM, EntryExtracts);
1867
1868 // Create UF x extract vectors and insert before the loop compare & branch,
1869 // updating the compare to use the first extract.
1870 SmallVector<VPValue *> LoopExtracts(UF);
1871 ExtractFromALM(LoopALM, LoopExtracts);
1872 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
1873 Not->setOperand(0, LoopExtracts[0]);
1874
1875 // Update the incoming values of active lane mask phis.
1876 for (unsigned Part = 0; Part < UF; ++Part) {
1877 Phis[Part]->setStartValue(EntryExtracts[Part]);
1878 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
1879 }
1880
1881 return true;
1882}
1883
1884/// Try to simplify the branch condition of \p Plan. This may restrict the
1885/// resulting plan to \p BestVF and \p BestUF.
1886 static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
1887 unsigned BestUF,
1888 PredicatedScalarEvolution &PSE) {
1889 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1890 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1891 auto *Term = &ExitingVPBB->back();
1892 VPValue *Cond;
1893 ScalarEvolution &SE = *PSE.getSE();
1894 if (match(Term, m_BranchOnCount()) ||
1896 m_VPValue(), m_VPValue(), m_VPValue()))))) {
1897 // Try to simplify the branch condition if VectorTC <= VF * UF when the
1898 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
1900 const SCEV *VectorTripCount =
1901 vputils::getSCEVExprForVPValue(&Plan.getVectorTripCount(), SE);
1901 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1902 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1903 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1904 "Trip count SCEV must be computable");
1905 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1906 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1907 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
1908 return false;
1909 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) {
1910 // For BranchOnCond, check if we can prove the condition to be true using VF
1911 // and UF.
1912 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE))
1913 return false;
1914 } else {
1915 return false;
1916 }
1917
1918 // The vector loop region only executes once. If possible, completely remove
1919 // the region, otherwise replace the terminator controlling the latch with
1920 // (BranchOnCond true).
1921 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
1922 // support for other non-canonical widen induction recipes (e.g.,
1923 // VPWidenPointerInductionRecipe).
1924 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1925 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
1926 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
1927 return R->isCanonical();
1928 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
1929 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
1930 })) {
1931 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
1932 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
1933 VPBuilder Builder(Plan.getVectorPreheader());
1934 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
1935 R->getScalarType());
1936 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
1937 HeaderR.eraseFromParent();
1938 continue;
1939 }
1940 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
1941 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
1942 HeaderR.eraseFromParent();
1943 }
1944
1945 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
1946 VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
1947 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
1948 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
1949
1950 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
1951 B->setParent(nullptr);
1952
1953 VPBlockUtils::connectBlocks(Preheader, Header);
1954 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
1956 } else {
1957 // The vector region contains header phis for which we cannot remove the
1958 // loop region yet.
1959 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
1960 {}, {}, Term->getDebugLoc());
1961 ExitingVPBB->appendRecipe(BOC);
1962 }
1963
1964 Term->eraseFromParent();
1965
1966 return true;
1967}
1968
1969/// From the definition of llvm.experimental.get.vector.length,
1970/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
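/// If SCEV can prove that the AVL never exceeds VF, the EVL computation can
/// therefore be replaced by the AVL itself, truncated or extended to i32 as
/// needed.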
1971 static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF,
1972 PredicatedScalarEvolution &PSE) {
1973 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1974 vp_depth_first_deep(Plan.getEntry()))) {
1975 for (VPRecipeBase &R : *VPBB) {
1976 VPValue *AVL;
1977 if (!match(&R, m_EVL(m_VPValue(AVL))))
1978 continue;
1979
1980 ScalarEvolution &SE = *PSE.getSE();
1981 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, SE);
1982 if (isa<SCEVCouldNotCompute>(AVLSCEV))
1983 continue;
1984 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
1985 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
1986 continue;
1987
1989 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
1990 R.getDebugLoc());
1991 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
1992 return true;
1993 }
1994 }
1995 return false;
1996}
1997
1999 unsigned BestUF,
2001 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2002 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2003
2004 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2005 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2006 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2007 MadeChange |= simplifyKnownEVL(Plan, BestVF, PSE);
2008
2009 if (MadeChange) {
2010 Plan.setVF(BestVF);
2011 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
2012 }
2013}
2014
2015/// Sink users of \p FOR after the recipe defining the previous value \p
2016/// Previous of the recurrence. \returns true if all users of \p FOR could be
2017/// re-arranged as needed or false if it is not possible.
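/// For example, if a user of the recurrence phi is currently placed before
/// \p Previous, it is moved to just after \p Previous, so that the splice
/// combining the incoming and previous values can be inserted after
/// \p Previous.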
2018static bool
2020 VPRecipeBase *Previous,
2021 VPDominatorTree &VPDT) {
2022 // Collect recipes that need sinking.
2025 Seen.insert(Previous);
2026 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
2027 // The previous value must not depend on the users of the recurrence phi; if
2028 // it did, FOR would not be a fixed-order recurrence.
2029 if (SinkCandidate == Previous)
2030 return false;
2031
2032 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
2033 !Seen.insert(SinkCandidate).second ||
2034 VPDT.properlyDominates(Previous, SinkCandidate))
2035 return true;
2036
2037 if (cannotHoistOrSinkRecipe(*SinkCandidate))
2038 return false;
2039
2040 WorkList.push_back(SinkCandidate);
2041 return true;
2042 };
2043
2044 // Recursively sink users of FOR after Previous.
2045 WorkList.push_back(FOR);
2046 for (unsigned I = 0; I != WorkList.size(); ++I) {
2047 VPRecipeBase *Current = WorkList[I];
2048 assert(Current->getNumDefinedValues() == 1 &&
2049 "only recipes with a single defined value expected");
2050
2051 for (VPUser *User : Current->getVPSingleValue()->users()) {
2052 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
2053 return false;
2054 }
2055 }
2056
2057 // Keep recipes to sink ordered by dominance so earlier instructions are
2058 // processed first.
2059 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2060 return VPDT.properlyDominates(A, B);
2061 });
2062
2063 for (VPRecipeBase *SinkCandidate : WorkList) {
2064 if (SinkCandidate == FOR)
2065 continue;
2066
2067 SinkCandidate->moveAfter(Previous);
2068 Previous = SinkCandidate;
2069 }
2070 return true;
2071}
2072
2073/// Try to hoist \p Previous and its operands before all users of \p FOR.
2075 VPRecipeBase *Previous,
2076 VPDominatorTree &VPDT) {
2077 if (cannotHoistOrSinkRecipe(*Previous))
2078 return false;
2079
2080 // Collect recipes that need hoisting.
2081 SmallVector<VPRecipeBase *> HoistCandidates;
2083 VPRecipeBase *HoistPoint = nullptr;
2084 // Find the closest hoist point by looking at all users of FOR and selecting
2085 // the recipe dominating all other users.
2086 for (VPUser *U : FOR->users()) {
2087 auto *R = cast<VPRecipeBase>(U);
2088 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
2089 HoistPoint = R;
2090 }
2091 assert(all_of(FOR->users(),
2092 [&VPDT, HoistPoint](VPUser *U) {
2093 auto *R = cast<VPRecipeBase>(U);
2094 return HoistPoint == R ||
2095 VPDT.properlyDominates(HoistPoint, R);
2096 }) &&
2097 "HoistPoint must dominate all users of FOR");
2098
2099 auto NeedsHoisting = [HoistPoint, &VPDT,
2100 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2101 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2102 if (!HoistCandidate)
2103 return nullptr;
2104 VPRegionBlock *EnclosingLoopRegion =
2105 HoistCandidate->getParent()->getEnclosingLoopRegion();
2106 assert((!HoistCandidate->getRegion() ||
2107 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2108 "CFG in VPlan should still be flat, without replicate regions");
2109 // Hoist candidate was already visited, no need to hoist.
2110 if (!Visited.insert(HoistCandidate).second)
2111 return nullptr;
2112
2113 // The candidate is defined outside the loop region or is a header phi; it
2114 // already dominates the users of FOR without hoisting.
2115 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
2116 return nullptr;
2117
2118 // If we reached a recipe that dominates HoistPoint, we don't need to
2119 // hoist the recipe.
2120 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
2121 return nullptr;
2122 return HoistCandidate;
2123 };
2124
2125 if (!NeedsHoisting(Previous->getVPSingleValue()))
2126 return true;
2127
2128 // Recursively try to hoist Previous and its operands before all users of FOR.
2129 HoistCandidates.push_back(Previous);
2130
2131 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2132 VPRecipeBase *Current = HoistCandidates[I];
2133 assert(Current->getNumDefinedValues() == 1 &&
2134 "only recipes with a single defined value expected");
2135 if (cannotHoistOrSinkRecipe(*Current))
2136 return false;
2137
2138 for (VPValue *Op : Current->operands()) {
2139 // If we reach FOR, it means the original Previous depends on some other
2140 // recurrence that in turn depends on FOR. If that is the case, we would
2141 // also need to hoist recipes involving the other FOR, which may break
2142 // dependencies.
2143 if (Op == FOR)
2144 return false;
2145
2146 if (auto *R = NeedsHoisting(Op)) {
2147 // Bail out if the recipe defines multiple values.
2148 // TODO: Hoisting such recipes requires additional handling.
2149 if (R->getNumDefinedValues() != 1)
2150 return false;
2151 HoistCandidates.push_back(R);
2152 }
2153 }
2154 }
2155
2156 // Order recipes to hoist by dominance so earlier instructions are processed
2157 // first.
2158 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2159 return VPDT.properlyDominates(A, B);
2160 });
2161
2162 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2163 HoistCandidate->moveBefore(*HoistPoint->getParent(),
2164 HoistPoint->getIterator());
2165 }
2166
2167 return true;
2168}
2169
2171 VPBuilder &LoopBuilder) {
2172 VPDominatorTree VPDT(Plan);
2173
2175 for (VPRecipeBase &R :
2178 RecurrencePhis.push_back(FOR);
2179
2180 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2182 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2183 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2184 // to terminate.
2185 while (auto *PrevPhi =
2187 assert(PrevPhi->getParent() == FOR->getParent());
2188 assert(SeenPhis.insert(PrevPhi).second);
2189 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2190 }
2191
2192 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2193 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2194 return false;
2195
2196 // Introduce a recipe to combine the incoming and previous values of a
2197 // fixed-order recurrence.
2198 VPBasicBlock *InsertBlock = Previous->getParent();
2199 if (isa<VPHeaderPHIRecipe>(Previous))
2200 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2201 else
2202 LoopBuilder.setInsertPoint(InsertBlock,
2203 std::next(Previous->getIterator()));
2204
2205 auto *RecurSplice =
2207 {FOR, FOR->getBackedgeValue()});
2208
2209 FOR->replaceAllUsesWith(RecurSplice);
2210 // Set the first operand of RecurSplice to FOR again, after replacing
2211 // all users.
2212 RecurSplice->setOperand(0, FOR);
2213
2214 // Check for users extracting at the penultimate active lane of the FOR.
2215 // If only a single lane is active in the current iteration, we need to
2216 // select the last element from the previous iteration (from the FOR phi
2217 // directly).
2218 for (VPUser *U : RecurSplice->users()) {
2220 m_Specific(RecurSplice))))
2221 continue;
2222
2224 VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
2225 Type *I64Ty = Type::getInt64Ty(Plan.getContext());
2226 VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 0));
2227 VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 1));
2228 VPValue *PenultimateIndex =
2229 B.createNaryOp(Instruction::Sub, {LastActiveLane, One});
2230 VPValue *PenultimateLastIter =
2231 B.createNaryOp(VPInstruction::ExtractLane,
2232 {PenultimateIndex, FOR->getBackedgeValue()});
2233 VPValue *LastPrevIter =
2234 B.createNaryOp(VPInstruction::ExtractLastLane, FOR);
2235
2236 VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero);
2237 VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter);
2238 cast<VPInstruction>(U)->replaceAllUsesWith(Sel);
2239 }
2240 }
2241 return true;
2242}
2243
2245 for (VPRecipeBase &R :
2247 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2248 if (!PhiR)
2249 continue;
2250 RecurKind RK = PhiR->getRecurrenceKind();
2251 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2253 continue;
2254
2255 for (VPUser *U : collectUsersRecursively(PhiR))
2256 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2257 RecWithFlags->dropPoisonGeneratingFlags();
2258 }
2259 }
2260}
2261
2262namespace {
2263struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2264 static bool isSentinel(const VPSingleDefRecipe *Def) {
2265 return Def == getEmptyKey() || Def == getTombstoneKey();
2266 }
2267
2268 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2269 /// return that source element type.
2270 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2271 // All VPInstructions that lower to GEPs must have the i8 source element
2272 // type (as they are PtrAdds), so we omit it.
2274 .Case<VPReplicateRecipe>([](auto *I) -> Type * {
2275 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2276 return GEP->getSourceElementType();
2277 return nullptr;
2278 })
2279 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2280 [](auto *I) { return I->getSourceElementType(); })
2281 .Default([](auto *) { return nullptr; });
2282 }
2283
2284 /// Returns true if recipe \p Def can be safely handled by CSE.
2285 static bool canHandle(const VPSingleDefRecipe *Def) {
2286 // We can extend the list of handled recipes in the future,
2287 // provided we account for the data embedded in them while checking for
2288 // equality or hashing.
2289 auto C = getOpcodeOrIntrinsicID(Def);
2290
2291 // The issue with (Insert|Extract)Value is that the index of the
2292 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2293 // VPlan.
2294 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2295 C->second == Instruction::ExtractValue)))
2296 return false;
2297
2298 // During CSE, we can only handle recipes that don't read from memory: if
2299 // they read from memory, there could be an intervening write to memory
2300 // before the next instance is CSE'd, leading to an incorrect result.
2301 return !Def->mayReadFromMemory();
2302 }
2303
2304 /// Hash the underlying data of \p Def.
2305 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2306 const VPlan *Plan = Def->getParent()->getPlan();
2307 VPTypeAnalysis TypeInfo(*Plan);
2308 hash_code Result = hash_combine(
2309 Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
2310 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2312 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2313 if (RFlags->hasPredicate())
2314 return hash_combine(Result, RFlags->getPredicate());
2315 return Result;
2316 }
2317
2318 /// Check equality of underlying data of \p L and \p R.
2319 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2320 if (isSentinel(L) || isSentinel(R))
2321 return L == R;
2322 if (L->getVPDefID() != R->getVPDefID() ||
2324 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2326 !equal(L->operands(), R->operands()))
2327 return false;
2329 "must have valid opcode info for both recipes");
2330 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2331 if (LFlags->hasPredicate() &&
2332 LFlags->getPredicate() !=
2333 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2334 return false;
2335 // Recipes in replicate regions implicitly depend on predicate. If either
2336 // recipe is in a replicate region, only consider them equal if both have
2337 // the same parent.
2338 const VPRegionBlock *RegionL = L->getRegion();
2339 const VPRegionBlock *RegionR = R->getRegion();
2340 if (((RegionL && RegionL->isReplicator()) ||
2341 (RegionR && RegionR->isReplicator())) &&
2342 L->getParent() != R->getParent())
2343 return false;
2344 const VPlan *Plan = L->getParent()->getPlan();
2345 VPTypeAnalysis TypeInfo(*Plan);
2346 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2347 }
2348};
2349} // end anonymous namespace
2350
2351/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2352/// Plan.
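/// For example, two VPWidenGEPRecipes with the same operands, source element
/// type, and result type are collapsed into one when the earlier recipe
/// dominates the later one; recipes that may read from memory are never CSE'd.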
2354 VPDominatorTree VPDT(Plan);
2356
2358 vp_depth_first_deep(Plan.getEntry()))) {
2359 for (VPRecipeBase &R : *VPBB) {
2360 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2361 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2362 continue;
2363 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2364 // V must dominate Def for a valid replacement.
2365 if (!VPDT.dominates(V->getParent(), VPBB))
2366 continue;
2367 // Only keep flags present on both V and Def.
2368 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2369 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2370 Def->replaceAllUsesWith(V);
2371 continue;
2372 }
2373 CSEMap[Def] = Def;
2374 }
2375 }
2376}
2377
2378/// Move loop-invariant recipes out of the vector loop region in \p Plan.
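/// For example, an address computation whose operands are all defined outside
/// the loop region is moved into the vector preheader so it is computed once.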
2379static void licm(VPlan &Plan) {
2380 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2381
2382 // Hoist any loop invariant recipes from the vector loop region to the
2383 // preheader. Perform a shallow traversal of the vector loop region, to
2384 // exclude recipes in replicate regions. Since the top-level blocks in the
2385 // vector loop region are guaranteed to execute if the vector pre-header is,
2386 // we don't need to check speculation safety.
2387 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2388 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2389 "Expected vector prehader's successor to be the vector loop region");
2391 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2392 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2394 continue;
2395 if (any_of(R.operands(), [](VPValue *Op) {
2396 return !Op->isDefinedOutsideLoopRegions();
2397 }))
2398 continue;
2399 R.moveBefore(*Preheader, Preheader->end());
2400 }
2401 }
2402}
2403
2405 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2406 if (Plan.hasScalarVFOnly())
2407 return;
2408 // Keep track of created truncates, so they can be re-used. Note that we
2409 // cannot use RAUW after creating a new truncate, as this could make
2410 // other uses have different types for their operands, making them invalidly
2411 // typed.
2413 VPTypeAnalysis TypeInfo(Plan);
2414 VPBasicBlock *PH = Plan.getVectorPreheader();
2417 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2420 &R))
2421 continue;
2422
2423 VPValue *ResultVPV = R.getVPSingleValue();
2424 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2425 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2426 if (!NewResSizeInBits)
2427 continue;
2428
2429 // If the value wasn't vectorized, we must maintain the original scalar
2430 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2431 // skip casts which do not need to be handled explicitly here, as
2432 // redundant casts will be removed during recipe simplification.
2434 continue;
2435
2436 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2437 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2438 assert(OldResTy->isIntegerTy() && "only integer types supported");
2439 (void)OldResSizeInBits;
2440
2441 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2442
2443 // Any wrapping introduced by shrinking this operation shouldn't be
2444 // considered undefined behavior. So, we can't unconditionally copy
2445 // arithmetic wrapping flags to VPW.
2446 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2447 VPW->dropPoisonGeneratingFlags();
2448
2449 if (OldResSizeInBits != NewResSizeInBits &&
2450 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2451 // Extend result to original width.
2452 auto *Ext =
2453 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
2454 Ext->insertAfter(&R);
2455 ResultVPV->replaceAllUsesWith(Ext);
2456 Ext->setOperand(0, ResultVPV);
2457 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2458 } else {
2459 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2460 "Only ICmps should not need extending the result.");
2461 }
2462
2463 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2465 continue;
2466
2467 // Shrink operands by introducing truncates as needed.
2468 unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
2469 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2470 auto *Op = R.getOperand(Idx);
2471 unsigned OpSizeInBits =
2473 if (OpSizeInBits == NewResSizeInBits)
2474 continue;
2475 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2476 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2477 if (!IterIsEmpty) {
2478 R.setOperand(Idx, ProcessedIter->second);
2479 continue;
2480 }
2481
2482 VPBuilder Builder;
2483 if (Op->isLiveIn())
2484 Builder.setInsertPoint(PH);
2485 else
2486 Builder.setInsertPoint(&R);
2487 VPWidenCastRecipe *NewOp =
2488 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2489 ProcessedIter->second = NewOp;
2490 R.setOperand(Idx, NewOp);
2491 }
2492
2493 }
2494 }
2495}
2496
2500 VPValue *Cond;
2501 // Skip blocks that are not terminated by BranchOnCond.
2502 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2503 continue;
2504
2505 assert(VPBB->getNumSuccessors() == 2 &&
2506 "Two successors expected for BranchOnCond");
2507 unsigned RemovedIdx;
2508 if (match(Cond, m_True()))
2509 RemovedIdx = 1;
2510 else if (match(Cond, m_False()))
2511 RemovedIdx = 0;
2512 else
2513 continue;
2514
2515 VPBasicBlock *RemovedSucc =
2516 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2517 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2518 "There must be a single edge between VPBB and its successor");
2519 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2520 // these recipes.
2521 for (VPRecipeBase &R : RemovedSucc->phis())
2522 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2523
2524 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2525 // automatically on VPlan destruction if it becomes unreachable.
2526 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2527 VPBB->back().eraseFromParent();
2528 }
2529}
2530
2550
2551// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2552// the loop terminator with a branch-on-cond recipe with the negated
2553// active-lane-mask as operand. Note that this turns the loop into an
2554 // uncountable one. Only the existing terminator is replaced; all other existing
2555// recipes/users remain unchanged, except for poison-generating flags being
2556// dropped from the canonical IV increment. Return the created
2557// VPActiveLaneMaskPHIRecipe.
2558//
2559// The function uses the following definitions:
2560//
2561// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
2562// calculate-trip-count-minus-VF (original TC) : original TC
2563// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
2564// CanonicalIVPhi : CanonicalIVIncrement
2565// %StartV is the canonical induction start value.
2566//
2567// The function adds the following recipes:
2568//
2569// vector.ph:
2570// %TripCount = calculate-trip-count-minus-VF (original TC)
2571// [if DataWithControlFlowWithoutRuntimeCheck]
2572// %EntryInc = canonical-iv-increment-for-part %StartV
2573// %EntryALM = active-lane-mask %EntryInc, %TripCount
2574//
2575// vector.body:
2576// ...
2577// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2578// ...
2579// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2580// %ALM = active-lane-mask %InLoopInc, TripCount
2581// %Negated = Not %ALM
2582// branch-on-cond %Negated
2583//
2586 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2587 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2588 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2589 VPValue *StartV = CanonicalIVPHI->getStartValue();
2590
2591 auto *CanonicalIVIncrement =
2592 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2593 // TODO: Check if dropping the flags is needed if
2594 // !DataAndControlFlowWithoutRuntimeCheck.
2595 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2596 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2597 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2598 // we have to take unrolling into account. Each part needs to start at
2599 // Part * VF
2600 auto *VecPreheader = Plan.getVectorPreheader();
2601 VPBuilder Builder(VecPreheader);
2602
2603 // Create the ActiveLaneMask instruction using the correct start values.
2604 VPValue *TC = Plan.getTripCount();
2605
2606 VPValue *TripCount, *IncrementValue;
2608 // When the loop is guarded by a runtime overflow check for the loop
2609 // induction variable increment by VF, we can increment the value before
2610 // the get.active.lane mask and use the unmodified tripcount.
2611 IncrementValue = CanonicalIVIncrement;
2612 TripCount = TC;
2613 } else {
2614 // When avoiding a runtime check, the active.lane.mask inside the loop
2615 // uses a modified trip count and the induction variable increment is
2616 // done after the active.lane.mask intrinsic is called.
2617 IncrementValue = CanonicalIVPHI;
2618 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2619 {TC}, DL);
2620 }
2621 auto *EntryIncrement = Builder.createOverflowingOp(
2622 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2623 "index.part.next");
2624
2625 // Create the active lane mask instruction in the VPlan preheader.
2626 VPValue *ALMMultiplier =
2627 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2628 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2629 {EntryIncrement, TC, ALMMultiplier}, DL,
2630 "active.lane.mask.entry");
2631
2632 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2633 // preheader ActiveLaneMask instruction.
2634 auto *LaneMaskPhi =
2636 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2637
2638 // Create the active lane mask for the next iteration of the loop before the
2639 // original terminator.
2640 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2641 Builder.setInsertPoint(OriginalTerminator);
2642 auto *InLoopIncrement =
2643 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2644 {IncrementValue}, {false, false}, DL);
2645 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2646 {InLoopIncrement, TripCount, ALMMultiplier},
2647 DL, "active.lane.mask.next");
2648 LaneMaskPhi->addOperand(ALM);
2649
2650 // Replace the original terminator with BranchOnCond. We have to invert the
2651 // mask here because a true condition means jumping to the exit block.
2652 auto *NotMask = Builder.createNot(ALM, DL);
2653 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2654 OriginalTerminator->eraseFromParent();
2655 return LaneMaskPhi;
2656}
2657
2658/// Collect the header mask with the pattern:
2659/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2660/// TODO: Introduce explicit recipe for header-mask instead of searching
2661/// for the header-mask pattern manually.
2663 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2664 SmallVector<VPValue *> WideCanonicalIVs;
2665 auto *FoundWidenCanonicalIVUser = find_if(
2667 assert(count_if(LoopRegion->getCanonicalIV()->users(),
2669 "Must have at most one VPWideCanonicalIVRecipe");
2670 if (FoundWidenCanonicalIVUser !=
2671 LoopRegion->getCanonicalIV()->users().end()) {
2672 auto *WideCanonicalIV =
2673 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2674 WideCanonicalIVs.push_back(WideCanonicalIV);
2675 }
2676
2677 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2678 // version of the canonical induction.
2679 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
2680 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2681 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2682 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2683 WideCanonicalIVs.push_back(WidenOriginalIV);
2684 }
2685
2686 // Walk users of wide canonical IVs and find the single compare of the form
2687 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2688 VPSingleDefRecipe *HeaderMask = nullptr;
2689 for (auto *Wide : WideCanonicalIVs) {
2690 for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
2691 auto *VPI = dyn_cast<VPInstruction>(U);
2692 if (!VPI || !vputils::isHeaderMask(VPI, Plan))
2693 continue;
2694
2695 assert(VPI->getOperand(0) == Wide &&
2696 "WidenCanonicalIV must be the first operand of the compare");
2697 assert(!HeaderMask && "Multiple header masks found?");
2698 HeaderMask = VPI;
2699 }
2700 }
2701 return HeaderMask;
2702}
2703
2705 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2708 UseActiveLaneMaskForControlFlow) &&
2709 "DataAndControlFlowWithoutRuntimeCheck implies "
2710 "UseActiveLaneMaskForControlFlow");
2711
2712 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2713 auto *FoundWidenCanonicalIVUser = find_if(
2715 assert(FoundWidenCanonicalIVUser &&
2716 "Must have widened canonical IV when tail folding!");
2717 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2718 auto *WideCanonicalIV =
2719 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2720 VPSingleDefRecipe *LaneMask;
2721 if (UseActiveLaneMaskForControlFlow) {
2724 } else {
2725 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2726 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2727 ConstantInt::get(LoopRegion->getCanonicalIVType(), 1));
2728 LaneMask =
2729 B.createNaryOp(VPInstruction::ActiveLaneMask,
2730 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2731 nullptr, "active.lane.mask");
2732 }
2733
2734 // Walk users of WideCanonicalIV and replace the header mask of the form
2735 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2736 // removing the old one to ensure there is always only a single header mask.
2737 HeaderMask->replaceAllUsesWith(LaneMask);
2738 HeaderMask->eraseFromParent();
2739}
2740
2741template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2742 Op0_t In;
2744
2745 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2746
2747 template <typename OpTy> bool match(OpTy *V) const {
2748 if (m_Specific(In).match(V)) {
2749 Out = nullptr;
2750 return true;
2751 }
2752 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
2753 }
2754};
2755
2756 /// Match a specific mask \p In, or a logical-and of \p In and another value.
2757 /// On a match, \p Out is set to the other and'ed value, or to nullptr if the
/// matched value is exactly \p In.
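/// For example, matching %in against (logical-and %in, %x) sets Out to %x,
/// while matching %in against %in itself sets Out to nullptr.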
2758template <typename Op0_t, typename Op1_t>
2759static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2760 Op1_t &Out) {
2761 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2762}
2763
2764/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2765/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2766/// recipe could be created.
2767/// \p HeaderMask Header Mask.
2768 /// \p CurRecipe Recipe to be transformed.
2769/// \p TypeInfo VPlan-based type analysis.
2770/// \p EVL The explicit vector length parameter of vector-predication
2771/// intrinsics.
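/// For example, a masked load whose mask is exactly the header mask becomes a
/// VPWidenLoadEVLRecipe with no mask, and a select on the header mask becomes
/// a vp.merge with an all-true mask and the EVL as its pivot.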
2773 VPRecipeBase &CurRecipe,
2774 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
2775 VPlan *Plan = CurRecipe.getParent()->getPlan();
2776 DebugLoc DL = CurRecipe.getDebugLoc();
2777 VPValue *Addr, *Mask, *EndPtr;
2778
2779 /// Adjust any end pointers so that they point to the end of the EVL lanes, not VF.
2780 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2781 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2782 EVLEndPtr->insertBefore(&CurRecipe);
2783 EVLEndPtr->setOperand(1, &EVL);
2784 return EVLEndPtr;
2785 };
2786
2787 if (match(&CurRecipe,
2788 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
2789 !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2790 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
2791 EVL, Mask);
2792
2793 if (match(&CurRecipe,
2794 m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
2795 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2796 cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2797 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
2798 AdjustEndPtr(EndPtr), EVL, Mask);
2799
2800 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(),
2801 m_RemoveMask(HeaderMask, Mask))) &&
2802 !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2803 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
2804 EVL, Mask);
2805
2806 if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(),
2807 m_RemoveMask(HeaderMask, Mask))) &&
2808 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2809 cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2810 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
2811 AdjustEndPtr(EndPtr), EVL, Mask);
2812
2813 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
2814 if (Rdx->isConditional() &&
2815 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
2816 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
2817
2818 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
2819 if (Interleave->getMask() &&
2820 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
2821 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
2822
2823 VPValue *LHS, *RHS;
2824 if (match(&CurRecipe,
2825 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
2826 return new VPWidenIntrinsicRecipe(
2827 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
2828 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2829
2830 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
2831 m_VPValue(RHS))))
2832 return new VPWidenIntrinsicRecipe(
2833 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
2834 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2835
2836 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
2837 Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
2838 VPValue *ZExt =
2839 VPBuilder(&CurRecipe).createScalarCast(Instruction::ZExt, &EVL, Ty, DL);
2840 return new VPInstruction(Instruction::Sub,
2841 {ZExt, Plan->getConstantInt(Ty, 1)}, {}, {}, DL);
2842 }
2843
2844 return nullptr;
2845}
2846
2847/// Replace recipes with their EVL variants.
2849 VPTypeAnalysis TypeInfo(Plan);
2850 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2851 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2852
2853 assert(all_of(Plan.getVF().users(),
2856 "User of VF that we can't transform to EVL.");
2857 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2859 });
2860
2861 assert(all_of(Plan.getVFxUF().users(),
2862 [&LoopRegion, &Plan](VPUser *U) {
2863 return match(U,
2864 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
2865 m_Specific(&Plan.getVFxUF()))) ||
2866 isa<VPWidenPointerInductionRecipe>(U);
2867 }) &&
2868 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
2869 "increment of the canonical induction.");
2870 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2871 // Only replace uses in VPWidenPointerInductionRecipe; the increment of the
2872 // canonical induction must not be updated.
2874 });
2875
2876 // Defer erasing recipes till the end so that we don't invalidate the
2877 // VPTypeAnalysis cache.
2879
2880 // Create a scalar phi to track the previous EVL if the plan contains a
2881 // fixed-order recurrence.
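// Each FirstOrderRecurrenceSplice using the recurrence is then rewritten
// below as an llvm.experimental.vp.splice call that takes %prev.evl and the
// current EVL as its vector-length operands.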
2882 bool ContainsFORs =
2884 if (ContainsFORs) {
2885 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
2886 VPValue *MaxEVL = &Plan.getVF();
2887 // Emit VPScalarCastRecipe in the preheader if VF is not a 32-bit integer.
2888 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
2889 MaxEVL = Builder.createScalarZExtOrTrunc(
2890 MaxEVL, Type::getInt32Ty(Plan.getContext()),
2891 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
2892
2893 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
2894 VPValue *PrevEVL = Builder.createScalarPhi(
2895 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
2896
2899 for (VPRecipeBase &R : *VPBB) {
2900 VPValue *V1, *V2;
2901 if (!match(&R,
2903 m_VPValue(V1), m_VPValue(V2))))
2904 continue;
2905 VPValue *Imm = Plan.getOrAddLiveIn(
2908 Intrinsic::experimental_vp_splice,
2909 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
2910 TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
2911 R.getDebugLoc());
2912 VPSplice->insertBefore(&R);
2913 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
2914 ToErase.push_back(&R);
2915 }
2916 }
2917 }
2918
2919 VPValue *HeaderMask = findHeaderMask(Plan);
2920 if (!HeaderMask)
2921 return;
2922
2923 // Replace header masks with a mask equivalent to predicating by EVL:
2924 //
2925 // icmp ule widen-canonical-iv backedge-taken-count
2926 // ->
2927 // icmp ult step-vector, EVL
2928 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
2929 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
2930 Type *EVLType = TypeInfo.inferScalarType(&EVL);
2931 VPValue *EVLMask = Builder.createICmp(
2933 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
2934 HeaderMask->replaceAllUsesWith(EVLMask);
2935 ToErase.push_back(HeaderMask->getDefiningRecipe());
2936
2937 // Try to optimize header mask recipes away to their EVL variants.
2938 // TODO: Split optimizeMaskToEVL out and move into
2939 // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
2940 // tryToBuildVPlanWithVPRecipes beforehand.
2941 for (VPUser *U : collectUsersRecursively(EVLMask)) {
2942 auto *CurRecipe = cast<VPRecipeBase>(U);
2943 VPRecipeBase *EVLRecipe =
2944 optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, EVL);
2945 if (!EVLRecipe)
2946 continue;
2947
2948 unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
2949 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
2950 "New recipe must define the same number of values as the "
2951 "original.");
2952 EVLRecipe->insertBefore(CurRecipe);
2954 EVLRecipe)) {
2955 for (unsigned I = 0; I < NumDefVal; ++I) {
2956 VPValue *CurVPV = CurRecipe->getVPValue(I);
2957 CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
2958 }
2959 }
2960 ToErase.push_back(CurRecipe);
2961 }
2962 // Remove dead EVL mask.
2963 if (EVLMask->getNumUsers() == 0)
2964 ToErase.push_back(EVLMask->getDefiningRecipe());
2965
2966 for (VPRecipeBase *R : reverse(ToErase)) {
2967 SmallVector<VPValue *> PossiblyDead(R->operands());
2968 R->eraseFromParent();
2969 for (VPValue *Op : PossiblyDead)
2971 }
2972}
2973
2974 /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
2975 /// replace all uses of VPCanonicalIVPHIRecipe, except the canonical IV
2976 /// increment, with a VPEVLBasedIVPHIRecipe. After this transformation, the
2977 /// VPCanonicalIVPHIRecipe is used only for counting loop iterations.
2978///
2979/// The function uses the following definitions:
2980/// %StartV is the canonical induction start value.
2981///
2982/// The function adds the following recipes:
2983///
2984/// vector.ph:
2985/// ...
2986///
2987/// vector.body:
2988/// ...
2989/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2990/// [ %NextEVLIV, %vector.body ]
2991/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2992/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
2993/// ...
2994/// %OpEVL = cast i32 %VPEVL to IVSize
2995/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2996/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2997/// ...
2998///
2999/// If MaxSafeElements is provided, the function adds the following recipes:
3000/// vector.ph:
3001/// ...
3002///
3003/// vector.body:
3004/// ...
3005/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
3006/// [ %NextEVLIV, %vector.body ]
3007/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3008/// %cmp = cmp ult %AVL, MaxSafeElements
3009/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3010/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3011/// ...
3012/// %OpEVL = cast i32 %VPEVL to IVSize
3013/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
3014/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3015/// ...
3016///
3018 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3019 if (Plan.hasScalarVFOnly())
3020 return;
3021 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3022 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3023
3024 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
3025 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3026 VPValue *StartV = CanonicalIVPHI->getStartValue();
3027
3028 // Create the ExplicitVectorLengthPhi recipe in the main loop.
3029 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
3030 EVLPhi->insertAfter(CanonicalIVPHI);
3031 VPBuilder Builder(Header, Header->getFirstNonPhi());
3032 // Create the AVL (application vector length), starting from TC -> 0 in steps
3033 // of EVL.
3034 VPPhi *AVLPhi = Builder.createScalarPhi(
3035 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3036 VPValue *AVL = AVLPhi;
3037
3038 if (MaxSafeElements) {
3039 // Cap the AVL at MaxSafeElements so the EVL never exceeds the maximum safe dependence distance.
3040 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3041 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3042 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3043 "safe_avl");
3044 }
3045 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3047
3048 auto *CanonicalIVIncrement =
3049 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
3050 Builder.setInsertPoint(CanonicalIVIncrement);
3051 VPValue *OpVPEVL = VPEVL;
3052
3053 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3054 OpVPEVL = Builder.createScalarZExtOrTrunc(
3055 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3056
3057 auto *NextEVLIV = Builder.createOverflowingOp(
3058 Instruction::Add, {OpVPEVL, EVLPhi},
3059 {CanonicalIVIncrement->hasNoUnsignedWrap(),
3060 CanonicalIVIncrement->hasNoSignedWrap()},
3061 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
3062 EVLPhi->addOperand(NextEVLIV);
3063
3064 VPValue *NextAVL = Builder.createOverflowingOp(
3065 Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},
3066 DebugLoc::getCompilerGenerated(), "avl.next");
3067 AVLPhi->addOperand(NextAVL);
3068
3069 transformRecipestoEVLRecipes(Plan, *VPEVL);
3070
3071 // Replace all uses of VPCanonicalIVPHIRecipe by
3072 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
3073 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
3074 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
3075 // TODO: support unroll factor > 1.
3076 Plan.setUF(1);
3077}
3078
3080 // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
3081 // There should be only one EVL PHI in the entire plan.
3082 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
3083
3086 for (VPRecipeBase &R : VPBB->phis())
3087 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
3088 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
3089 EVLPhi = PhiR;
3090 }
3091
3092 // Early return if no EVL PHI is found.
3093 if (!EVLPhi)
3094 return;
3095
3096 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
3097 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
3098 VPValue *AVL;
3099 [[maybe_unused]] bool FoundAVL =
3100 match(EVLIncrement,
3101 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
3102 assert(FoundAVL && "Didn't find AVL?");
3103
3104 // The AVL may be capped to a safe distance.
3105 VPValue *SafeAVL;
3106 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
3107 AVL = SafeAVL;
3108
3109 VPValue *AVLNext;
3110 [[maybe_unused]] bool FoundAVLNext =
3112 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3113 assert(FoundAVLNext && "Didn't find AVL backedge?");
3114
3115 // Convert EVLPhi to concrete recipe.
3116 auto *ScalarR =
3117 VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
3118 EVLPhi->getDebugLoc(), "evl.based.iv");
3119 EVLPhi->replaceAllUsesWith(ScalarR);
3120 EVLPhi->eraseFromParent();
3121
3122 // Replace CanonicalIVInc with EVL-PHI increment.
3123 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3124 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
3125 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
3126 m_Specific(&Plan.getVFxUF()))) &&
3127 "Unexpected canonical iv");
3128 Backedge->replaceAllUsesWith(EVLIncrement);
3129
3130 // Remove unused phi and increment.
3131 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
3132 CanonicalIVIncrement->eraseFromParent();
3133 CanonicalIV->eraseFromParent();
3134
3135 // Replace the use of VectorTripCount in the latch-exiting block.
3136 // Before: (branch-on-count EVLIVInc, VectorTripCount)
3137 // After: (branch-on-cond eq AVLNext, 0)
3138
3139 VPBasicBlock *LatchExiting =
3140 HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
3141 auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
3142 // Skip single-iteration loop region
3143 if (match(LatchExitingBr, m_BranchOnCond(m_True())))
3144 return;
3145 assert(LatchExitingBr &&
3146 match(LatchExitingBr,
3147 m_BranchOnCount(m_VPValue(EVLIncrement),
3148 m_Specific(&Plan.getVectorTripCount()))) &&
3149 "Unexpected terminator in EVL loop");
3150
3151 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3152 VPBuilder Builder(LatchExitingBr);
3153 VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
3154 Plan.getConstantInt(AVLTy, 0));
3155 Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
3156 LatchExitingBr->eraseFromParent();
3157}
3158
3160 VPlan &Plan, PredicatedScalarEvolution &PSE,
3161 const DenseMap<Value *, const SCEV *> &StridesMap) {
3162 // Replace VPValues for known constant strides guaranteed by predicated
3163 // scalar evolution.
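// For example, if PSE guarantees a symbolic stride %s to be 1, uses of %s in
// the loop region (and of its sext/zext) are replaced by the matching constant.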
3164 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3165 auto *R = cast<VPRecipeBase>(&U);
3166 return R->getRegion() ||
3167 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3168 };
3169 ValueToSCEVMapTy RewriteMap;
3170 for (const SCEV *Stride : StridesMap.values()) {
3171 using namespace SCEVPatternMatch;
3172 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3173 const APInt *StrideConst;
3174 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3175 // Only handle constant strides for now.
3176 continue;
3177
3178 auto *CI = Plan.getConstantInt(*StrideConst);
3179 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3180 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3181
3182 // The versioned value may not be used in the loop directly but through a
3183 // sext/zext. Add new live-ins in those cases.
3184 for (Value *U : StrideV->users()) {
3185 if (!isa<SExtInst, ZExtInst>(U))
3186 continue;
3187 VPValue *StrideVPV = Plan.getLiveIn(U);
3188 if (!StrideVPV)
3189 continue;
3190 unsigned BW = U->getType()->getScalarSizeInBits();
3191 APInt C =
3192 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3193 VPValue *CI = Plan.getConstantInt(C);
3194 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3195 }
3196 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3197 }
3198
3199 for (VPRecipeBase &R : *Plan.getEntry()) {
3200 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3201 if (!ExpSCEV)
3202 continue;
3203 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3204 auto *NewSCEV =
3205 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3206 if (NewSCEV != ScevExpr) {
3207 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3208 ExpSCEV->replaceAllUsesWith(NewExp);
3209 if (Plan.getTripCount() == ExpSCEV)
3210 Plan.resetTripCount(NewExp);
3211 }
3212 }
3213}
3214
3216 VPlan &Plan,
3217 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3218 // Collect recipes in the backward slice of `Root` that may generate a poison
3219 // value that is used after vectorization.
3220 SmallPtrSet<VPRecipeBase *, 16> Visited;
3221 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3222 SmallVector<VPRecipeBase *, 16> Worklist;
3223 Worklist.push_back(Root);
3224
3225 // Traverse the backward slice of Root through its use-def chain.
3226 while (!Worklist.empty()) {
3227 VPRecipeBase *CurRec = Worklist.pop_back_val();
3228
3229 if (!Visited.insert(CurRec).second)
3230 continue;
3231
3232 // Prune search if we find another recipe generating a widen memory
3233 // instruction. Widen memory instructions involved in address computation
3234 // will lead to gather/scatter instructions, which don't need to be
3235 // handled.
3236 if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,
3237 VPHeaderPHIRecipe>(CurRec))
3238 continue;
3239
3240 // This recipe contributes to the address computation of a widen
3241 // load/store. If the underlying instruction has poison-generating flags,
3242 // drop them directly.
3243 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3244 VPValue *A, *B;
3245 // Dropping disjoint from an OR may yield incorrect results, as some
3246 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3247 // for dependence analysis). Instead, replace it with an equivalent Add.
3248 // This is possible as all users of the disjoint OR only access lanes
3249 // where the operands are disjoint or poison otherwise.
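// For example (illustrative): an address contribution
//   %sum = or disjoint %base, %off
// is rewritten to
//   %sum = add %base, %off
// without nuw/nsw, instead of merely dropping the disjoint flag.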
3250 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3251 RecWithFlags->isDisjoint()) {
3252 VPBuilder Builder(RecWithFlags);
3253 VPInstruction *New = Builder.createOverflowingOp(
3254 Instruction::Add, {A, B}, {false, false},
3255 RecWithFlags->getDebugLoc());
3256 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3257 RecWithFlags->replaceAllUsesWith(New);
3258 RecWithFlags->eraseFromParent();
3259 CurRec = New;
3260 } else
3261 RecWithFlags->dropPoisonGeneratingFlags();
3262 } else {
3263 Instruction *Instr = dyn_cast_or_null<Instruction>(
3264 CurRec->getVPSingleValue()->getUnderlyingValue());
3265 (void)Instr;
3266 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3267 "found instruction with poison generating flags not covered by "
3268 "VPRecipeWithIRFlags");
3269 }
3270
3271 // Add new definitions to the worklist.
3272 for (VPValue *Operand : CurRec->operands())
3273 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3274 Worklist.push_back(OpDef);
3275 }
3276 });
3277
3278 // Traverse all the recipes in the VPlan and collect the poison-generating
3279 // recipes in the backward slice starting at the address of a VPWidenMemoryRecipe or
3280 // VPInterleaveRecipe.
3281 auto Iter = vp_depth_first_deep(Plan.getEntry());
3282 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3283 for (VPRecipeBase &Recipe : *VPBB) {
3284 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3285 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3286 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3287 if (AddrDef && WidenRec->isConsecutive() &&
3288 BlockNeedsPredication(UnderlyingInstr.getParent()))
3289 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3290 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3291 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3292 if (AddrDef) {
3293 // Check if any member of the interleave group needs predication.
3294 const InterleaveGroup<Instruction> *InterGroup =
3295 InterleaveRec->getInterleaveGroup();
3296 bool NeedPredication = false;
3297 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3298 I < NumMembers; ++I) {
3299 Instruction *Member = InterGroup->getMember(I);
3300 if (Member)
3301 NeedPredication |= BlockNeedsPredication(Member->getParent());
3302 }
3303
3304 if (NeedPredication)
3305 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3306 }
3307 }
3308 }
3309 }
3310}
3311
3313 VPlan &Plan,
3315 &InterleaveGroups,
3316 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3317 if (InterleaveGroups.empty())
3318 return;
3319
3320 // Interleave memory: for each Interleave Group we marked earlier as relevant
3321 // for this VPlan, replace the Recipes widening its memory instructions with a
3322 // single VPInterleaveRecipe at its insertion point.
3323 VPDominatorTree VPDT(Plan);
3324 for (const auto *IG : InterleaveGroups) {
3325 auto *Start =
3326 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3327 VPIRMetadata InterleaveMD(*Start);
3328 SmallVector<VPValue *, 4> StoredValues;
3329 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3330 StoredValues.push_back(StoreR->getStoredValue());
3331 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3332 Instruction *MemberI = IG->getMember(I);
3333 if (!MemberI)
3334 continue;
3335 VPWidenMemoryRecipe *MemoryR =
3336 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3337 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3338 StoredValues.push_back(StoreR->getStoredValue());
3339 InterleaveMD.intersect(*MemoryR);
3340 }
3341
3342 bool NeedsMaskForGaps =
3343 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3344 (!StoredValues.empty() && !IG->isFull());
3345
3346 Instruction *IRInsertPos = IG->getInsertPos();
3347 auto *InsertPos =
3348 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3349
3351 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3352 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3353 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3354
3355 // Get or create the start address for the interleave group.
3356 VPValue *Addr = Start->getAddr();
3357 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3358 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3359 // We cannot re-use the address of member zero because it does not
3360 // dominate the insert position. Instead, use the address of the insert
3361 // position and create a PtrAdd adjusting it to the address of member
3362 // zero.
3363 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3364 // InsertPos or sink loads above zero members to join it.
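// For example (illustrative): if the insert position is member 2 of an i32
// interleave group, the group's start address becomes
//   ptradd(InsertPosAddr, -8)
// i.e. the insert position's address minus 2 * sizeof(i32) bytes.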
3365 assert(IG->getIndex(IRInsertPos) != 0 &&
3366 "index of insert position shouldn't be zero");
3367 auto &DL = IRInsertPos->getDataLayout();
3368 APInt Offset(32,
3369 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3370 IG->getIndex(IRInsertPos),
3371 /*IsSigned=*/true);
3372 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3373 VPBuilder B(InsertPos);
3374 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3375 }
3376 // If the group is reverse, adjust the index to refer to the last vector
3377 // lane instead of the first. We adjust the index from the first vector
3378 // lane, rather than directly getting the pointer for lane VF - 1, because
3379 // the pointer operand of the interleaved access is supposed to be uniform.
3380 if (IG->isReverse()) {
3381 auto *ReversePtr = new VPVectorEndPointerRecipe(
3382 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3383 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3384 ReversePtr->insertBefore(InsertPos);
3385 Addr = ReversePtr;
3386 }
3387 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3388 InsertPos->getMask(), NeedsMaskForGaps,
3389 InterleaveMD, InsertPos->getDebugLoc());
3390 VPIG->insertBefore(InsertPos);
3391
3392 unsigned J = 0;
3393 for (unsigned i = 0; i < IG->getFactor(); ++i)
3394 if (Instruction *Member = IG->getMember(i)) {
3395 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3396 if (!Member->getType()->isVoidTy()) {
3397 VPValue *OriginalV = MemberR->getVPSingleValue();
3398 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3399 J++;
3400 }
3401 MemberR->eraseFromParent();
3402 }
3403 }
3404}
3405
3406/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3407/// value, phi and backedge value. In the following example:
3408///
3409/// vector.ph:
3410/// Successor(s): vector loop
3411///
3412/// <x1> vector loop: {
3413/// vector.body:
3414/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3415/// ...
3416/// EMIT branch-on-count ...
3417/// No successors
3418/// }
3419///
3420/// WIDEN-INDUCTION will get expanded to:
3421///
3422/// vector.ph:
3423/// ...
3424/// vp<%induction.start> = ...
3425/// vp<%induction.increment> = ...
3426///
3427/// Successor(s): vector loop
3428///
3429/// <x1> vector loop: {
3430/// vector.body:
3431/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3432/// ...
3433/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3434/// EMIT branch-on-count ...
3435/// No successors
3436/// }
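/// For example (illustrative), with %start = 0, %step = 2 and VF = 4:
/// vp<%induction.start> = <0, 2, 4, 6>, vp<%induction.increment> =
/// broadcast(2 * 4) = <8, 8, 8, 8>, and each vector iteration computes
/// vp<%vec.ind.next> = ir<%i> + <8, 8, 8, 8>.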
3437static void
3439 VPTypeAnalysis &TypeInfo) {
3440 VPlan *Plan = WidenIVR->getParent()->getPlan();
3441 VPValue *Start = WidenIVR->getStartValue();
3442 VPValue *Step = WidenIVR->getStepValue();
3443 VPValue *VF = WidenIVR->getVFValue();
3444 DebugLoc DL = WidenIVR->getDebugLoc();
3445
3446 // The value from the original loop to which we are mapping the new induction
3447 // variable.
3448 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3449
3450 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3451 Instruction::BinaryOps AddOp;
3452 Instruction::BinaryOps MulOp;
3453 VPIRFlags Flags = *WidenIVR;
3454 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3455 AddOp = Instruction::Add;
3456 MulOp = Instruction::Mul;
3457 } else {
3458 AddOp = ID.getInductionOpcode();
3459 MulOp = Instruction::FMul;
3460 }
3461
3462 // If the phi is truncated, truncate the start and step values.
3463 VPBuilder Builder(Plan->getVectorPreheader());
3464 Type *StepTy = TypeInfo.inferScalarType(Step);
3465 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3466 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3467 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3468 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3469 // Truncation doesn't preserve WrapFlags.
3470 Flags.dropPoisonGeneratingFlags();
3471 StepTy = Ty;
3472 }
3473
3474 // Construct the initial value of the vector IV in the vector loop preheader.
3475 Type *IVIntTy =
3476 IntegerType::get(Plan->getContext(), StepTy->getScalarSizeInBits());
3477 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3478 if (StepTy->isFloatingPointTy())
3479 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3480
3481 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3482 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3483
3484 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3485 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3486 DebugLoc::getUnknown(), "induction");
3487
3488 // Create the widened phi of the vector IV.
3489 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3490 WidenIVR->getDebugLoc(), "vec.ind");
3491 WidePHI->insertBefore(WidenIVR);
3492
3493 // Create the backedge value for the vector IV.
3494 VPValue *Inc;
3495 VPValue *Prev;
3496 // If unrolled, use the increment and prev value from the operands.
3497 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3498 Inc = SplatVF;
3499 Prev = WidenIVR->getLastUnrolledPartOperand();
3500 } else {
3501 if (VPRecipeBase *R = VF->getDefiningRecipe())
3502 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3503 // Multiply the vectorization factor by the step using integer or
3504 // floating-point arithmetic as appropriate.
3505 if (StepTy->isFloatingPointTy())
3506 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3507 DL);
3508 else
3509 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3510 TypeInfo.inferScalarType(VF), DL);
3511
3512 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3513 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3514 Prev = WidePHI;
3515 }
3516
3517 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3518 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3519 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3520 WidenIVR->getDebugLoc(), "vec.ind.next");
3521
3522 WidePHI->addOperand(Next);
3523
3524 WidenIVR->replaceAllUsesWith(WidePHI);
3525}
3526
3527/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3528/// initial value, phi and backedge value. In the following example:
3529///
3530/// <x1> vector loop: {
3531/// vector.body:
3532/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3533/// ...
3534/// EMIT branch-on-count ...
3535/// }
3536///
3537/// WIDEN-POINTER-INDUCTION will get expanded to:
3538///
3539/// <x1> vector loop: {
3540/// vector.body:
3541/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3542/// EMIT %mul = mul %stepvector, %step
3543/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3544/// ...
3545/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3546/// EMIT branch-on-count ...
3547/// }
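/// For example (illustrative), with VF = 4 the offsets are
/// %mul = <0, %step, 2 * %step, 3 * %step>, and %ptr.ind advances the scalar
/// phi by %vf * %step per vector iteration.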
3549 VPTypeAnalysis &TypeInfo) {
3550 VPlan *Plan = R->getParent()->getPlan();
3551 VPValue *Start = R->getStartValue();
3552 VPValue *Step = R->getStepValue();
3553 VPValue *VF = R->getVFValue();
3554
3555 assert(R->getInductionDescriptor().getKind() ==
3557 "Not a pointer induction according to InductionDescriptor!");
3558 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3559 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3560 "Recipe should have been replaced");
3561
3562 VPBuilder Builder(R);
3563 DebugLoc DL = R->getDebugLoc();
3564
3565 // Build a scalar pointer phi.
3566 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3567
3568 // Create actual address geps that use the pointer phi as base and a
3569 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3570 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3571 Type *StepTy = TypeInfo.inferScalarType(Step);
3572 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3573 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3574 VPValue *PtrAdd = Builder.createNaryOp(
3575 VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
3576 R->replaceAllUsesWith(PtrAdd);
3577
3578 // Create the backedge value for the scalar pointer phi.
3579 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3580 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3581 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3582 DL);
3583 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3584
3585 VPValue *InductionGEP =
3586 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3587 ScalarPtrPhi->addOperand(InductionGEP);
3588}
3589
3591 // Replace loop regions with explicit CFG.
3592 SmallVector<VPRegionBlock *> LoopRegions;
3593 for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
3594 vp_depth_first_deep(Plan.getEntry()))) {
3595 if (!R->isReplicator())
3596 LoopRegions.push_back(R);
3597 }
3598 for (VPRegionBlock *R : LoopRegions)
3599 R->dissolveToCFGLoop();
3600}
3601
3603 VPTypeAnalysis TypeInfo(Plan);
3604 SmallVector<VPRecipeBase *> ToRemove;
3605 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3606 vp_depth_first_deep(Plan.getEntry()))) {
3607 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3608 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3609 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3610 ToRemove.push_back(WidenIVR);
3611 continue;
3612 }
3613
3614 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3615 // If the recipe only generates scalars, scalarize it instead of
3616 // expanding it.
3617 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
3618 VPBuilder Builder(WidenIVR);
3619 VPValue *PtrAdd =
3620 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
3621 WidenIVR->replaceAllUsesWith(PtrAdd);
3622 ToRemove.push_back(WidenIVR);
3623 continue;
3624 }
3625 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3626 ToRemove.push_back(WidenIVR);
3627 continue;
3628 }
3629
3630 // Expand VPBlendRecipe into VPInstruction::Select.
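// For example (illustrative): a blend with incoming values %a (first), %b
// under mask %m2 and %c under mask %m3 becomes
//   select(%m3, %c, select(%m2, %b, %a)).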
3631 VPBuilder Builder(&R);
3632 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3633 VPValue *Select = Blend->getIncomingValue(0);
3634 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3635 Select = Builder.createSelect(Blend->getMask(I),
3636 Blend->getIncomingValue(I), Select,
3637 R.getDebugLoc(), "predphi");
3638 Blend->replaceAllUsesWith(Select);
3639 ToRemove.push_back(Blend);
3640 }
3641
3642 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3643 Expr->decompose();
3644 ToRemove.push_back(Expr);
3645 }
3646
3647 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
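// For example (illustrative): for a single mask operand <1, 1, 0, 0>,
// first-active-lane(not(mask)) = first-active-lane(<0, 0, 1, 1>) = 2, so the
// last active lane is 2 - 1 = 1.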
3648 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
3649 if (LastActiveL &&
3650 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
3651 // Create Not(Mask) for all operands.
3652 SmallVector<VPValue *> NotMasks;
3653 for (VPValue *Op : LastActiveL->operands()) {
3654 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
3655 NotMasks.push_back(NotMask);
3656 }
3657
3658 // Create FirstActiveLane on the inverted masks.
3659 VPValue *FirstInactiveLane = Builder.createNaryOp(
3660 VPInstruction::FirstActiveLane, NotMasks,
3661 LastActiveL->getDebugLoc(), "first.inactive.lane");
3662
3663 // Subtract 1 to get the last active lane.
3664 VPValue *One = Plan.getOrAddLiveIn(
3665 ConstantInt::get(Type::getInt64Ty(Plan.getContext()), 1));
3666 VPValue *LastLane = Builder.createNaryOp(
3667 Instruction::Sub, {FirstInactiveLane, One},
3668 LastActiveL->getDebugLoc(), "last.active.lane");
3669
3670 LastActiveL->replaceAllUsesWith(LastLane);
3671 ToRemove.push_back(LastActiveL);
3672 continue;
3673 }
3674
3675 VPValue *VectorStep;
3676 VPValue *ScalarStep;
3677 if (!match(&R, m_VPInstruction<VPInstruction::WideIVStep>(
3678 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
3679 continue;
3680
3681 // Expand WideIVStep.
3682 auto *VPI = cast<VPInstruction>(&R);
3683 Type *IVTy = TypeInfo.inferScalarType(VPI);
3684 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
3685 Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
3686 ? Instruction::UIToFP
3687 : Instruction::Trunc;
3688 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
3689 }
3690
3691 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
3692 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
3693 ScalarStep =
3694 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
3695 }
3696
3697 VPIRFlags Flags;
3698 if (IVTy->isFloatingPointTy())
3699 Flags = {VPI->getFastMathFlags()};
3700
3701 unsigned MulOpc =
3702 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
3703 VPInstruction *Mul = Builder.createNaryOp(
3704 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
3705 VectorStep = Mul;
3706 VPI->replaceAllUsesWith(VectorStep);
3707 ToRemove.push_back(VPI);
3708 }
3709 }
3710
3711 for (VPRecipeBase *R : ToRemove)
3712 R->eraseFromParent();
3713}
3714
3716 VPBasicBlock *EarlyExitVPBB,
3717 VPlan &Plan,
3718 VPBasicBlock *HeaderVPBB,
3719 VPBasicBlock *LatchVPBB) {
3720 VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
3721 if (!EarlyExitVPBB->getSinglePredecessor() &&
3722 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
3723 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
3724 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
3725 "unsupported early exit VPBB");
3726 // Early exit operand should always be last phi operand. If EarlyExitVPBB
3727 // has two predecessors and EarlyExitingVPBB is the first, swap the operands
3728 // of the phis.
3729 for (VPRecipeBase &R : EarlyExitVPBB->phis())
3730 cast<VPIRPhi>(&R)->swapOperands();
3731 }
3732
3733 VPBuilder Builder(LatchVPBB->getTerminator());
3734 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
3735 assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
3736 "Terminator must be be BranchOnCond");
3737 VPValue *CondOfEarlyExitingVPBB =
3738 EarlyExitingVPBB->getTerminator()->getOperand(0);
3739 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
3740 ? CondOfEarlyExitingVPBB
3741 : Builder.createNot(CondOfEarlyExitingVPBB);
3742
3743 // Split the middle block and have it conditionally branch to the early exit
3744 // block if CondToEarlyExit.
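// The resulting control flow is roughly (illustrative):
//   latch -> middle.split
//   middle.split -> vector.early.exit, if any-of(CondToEarlyExit)
//                -> original middle block, otherwise
//   vector.early.exit -> early exit block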
3745 VPValue *IsEarlyExitTaken =
3746 Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
3747 VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
3748 VPBasicBlock *VectorEarlyExitVPBB =
3749 Plan.createVPBasicBlock("vector.early.exit");
3750 VPBlockUtils::insertOnEdge(LatchVPBB, MiddleVPBB, NewMiddle);
3751 VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
3752 NewMiddle->swapSuccessors();
3753
3754 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
3755
3756 // Update the exit phis in the early exit block.
3757 VPBuilder MiddleBuilder(NewMiddle);
3758 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
3759 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
3760 auto *ExitIRI = cast<VPIRPhi>(&R);
3761 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
3762 // a single predecessor and 1 if it has two.
3763 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
3764 if (ExitIRI->getNumOperands() != 1) {
3765 // The first of two operands corresponds to the latch exit, via MiddleVPBB
3766 // predecessor. Extract its final lane.
3767 ExitIRI->extractLastLaneOfLastPartOfFirstOperand(MiddleBuilder);
3768 }
3769
3770 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
3771 if (!IncomingFromEarlyExit->isLiveIn()) {
3772 // Update the incoming value from the early exit.
3773 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
3774 VPInstruction::FirstActiveLane, {CondToEarlyExit},
3775 DebugLoc::getUnknown(), "first.active.lane");
3776 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
3777 VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
3778 DebugLoc::getUnknown(), "early.exit.value");
3779 ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
3780 }
3781 }
3782 MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
3783
3784 // Replace the condition controlling the non-early exit from the vector loop
3785 // with one exiting if either the original condition of the vector latch is
3786 // true or the early exit has been taken.
3787 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
3788 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
3789 "Unexpected terminator");
3790 auto *IsLatchExitTaken =
3791 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
3792 LatchExitingBranch->getOperand(1));
3793 auto *AnyExitTaken = Builder.createNaryOp(
3794 Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});
3795 Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
3796 LatchExitingBranch->eraseFromParent();
3797}
3798
3799/// This function tries to convert extended in-loop reductions to
3800/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
3801/// valid. The created recipe must be decomposed to its constituent
3802/// recipes before execution.
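/// For example (illustrative), reduce.add(zext(%a)) over i8 elements can be
/// bundled into a single VPExpressionRecipe and costed as an extended
/// reduction instead of a separate extend plus reduction.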
3803static VPExpressionRecipe *
3805 VFRange &Range) {
3806 Type *RedTy = Ctx.Types.inferScalarType(Red);
3807 VPValue *VecOp = Red->getVecOp();
3808
3809 // Clamp the range if using extended-reduction is profitable.
3810 auto IsExtendedRedValidAndClampRange =
3811 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
3813 [&](ElementCount VF) {
3814 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3816
3817 InstructionCost ExtRedCost;
3818 InstructionCost ExtCost =
3819 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
3820 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3821
3822 if (Red->isPartialReduction()) {
3825 // FIXME: Move partial reduction creation, costing and clamping
3826 // here from LoopVectorize.cpp.
3827 ExtRedCost = Ctx.TTI.getPartialReductionCost(
3828 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
3829 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind);
3830 } else {
3831 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
3832 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
3833 Red->getFastMathFlags(), CostKind);
3834 }
3835 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
3836 },
3837 Range);
3838 };
3839
3840 VPValue *A;
3841 // Match reduce(ext).
3842 if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
3843 IsExtendedRedValidAndClampRange(
3844 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
3845 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
3846 Ctx.Types.inferScalarType(A)))
3847 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
3848
3849 return nullptr;
3850}
3851
3852/// This function tries to convert multiply-accumulate in-loop reductions to
3853/// VPExpressionRecipe and clamp the \p Range if it is beneficial
3854/// and valid. The created VPExpressionRecipe must be decomposed to its
3855/// constituent recipes before execution. Patterns of the
3856/// VPExpressionRecipe:
3857/// reduce.add(mul(...)),
3858/// reduce.add(mul(ext(A), ext(B))),
3859/// reduce.add(ext(mul(ext(A), ext(B)))).
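/// For example (illustrative), reduce.add(mul(zext(%a), zext(%b))) over i8
/// operands may be bundled and costed via TTI::getMulAccReductionCost as a
/// single multiply-accumulate reduction when that is cheaper than the
/// separate extends, multiply and reduction.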
3860static VPExpressionRecipe *
3862 VPCostContext &Ctx, VFRange &Range) {
3863 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3864 if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
3865 return nullptr;
3866
3867 Type *RedTy = Ctx.Types.inferScalarType(Red);
3868
3869 // Clamp the range if using multiply-accumulate-reduction is profitable.
3870 auto IsMulAccValidAndClampRange =
3872 VPWidenCastRecipe *OuterExt) -> bool {
3874 [&](ElementCount VF) {
3876 Type *SrcTy =
3877 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
3878 InstructionCost MulAccCost;
3879
3880 if (Red->isPartialReduction()) {
3881 Type *SrcTy2 =
3882 Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
3883 // FIXME: Move partial reduction creation, costing and clamping
3884 // here from LoopVectorize.cpp.
3885 MulAccCost = Ctx.TTI.getPartialReductionCost(
3886 Opcode, SrcTy, SrcTy2, RedTy, VF,
3888 Ext0->getOpcode())
3891 Ext1->getOpcode())
3893 Mul->getOpcode(), CostKind);
3894 } else {
3895 // Only partial reductions support mixed extends at the moment.
3896 if (Ext0 && Ext1 && Ext0->getOpcode() != Ext1->getOpcode())
3897 return false;
3898
3899 bool IsZExt =
3900 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
3901 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3902 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
3903 SrcVecTy, CostKind);
3904 }
3905
3906 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
3907 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3908 InstructionCost ExtCost = 0;
3909 if (Ext0)
3910 ExtCost += Ext0->computeCost(VF, Ctx);
3911 if (Ext1)
3912 ExtCost += Ext1->computeCost(VF, Ctx);
3913 if (OuterExt)
3914 ExtCost += OuterExt->computeCost(VF, Ctx);
3915
3916 return MulAccCost.isValid() &&
3917 MulAccCost < ExtCost + MulCost + RedCost;
3918 },
3919 Range);
3920 };
3921
3922 VPValue *VecOp = Red->getVecOp();
3923 VPRecipeBase *Sub = nullptr;
3924 VPValue *A, *B;
3925 VPValue *Tmp = nullptr;
3926 // Sub reductions could have a sub between the add reduction and vec op.
3927 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
3928 Sub = VecOp->getDefiningRecipe();
3929 VecOp = Tmp;
3930 }
3931
3932 // If ValB is a constant and can be safely extended, truncate it to the same
3933 // type as ExtA's operand, then extend it to the same type as ExtA. This
3934 // creates two uniform extends that can more easily be matched by the rest of
3935 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
3936 // replaced with the new extend of the constant.
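// For example (illustrative): in reduce.add(mul(sext i8 %a to i32, i32 42)),
// the constant 42 is replaced by sext(trunc(42 to i8) to i32), assuming the
// truncation is provably lossless, so both multiply operands are extends.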
3937 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
3938 VPWidenCastRecipe *&ExtB,
3939 VPValue *&ValB, VPWidenRecipe *Mul) {
3940 if (!ExtA || ExtB || !ValB->isLiveIn())
3941 return;
3942 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
3943 Instruction::CastOps ExtOpc = ExtA->getOpcode();
3944 const APInt *Const;
3945 if (!match(ValB, m_APInt(Const)) ||
3947 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
3948 return;
3949 // The truncate ensures that the type of each extended operand is the
3950 // same, and it's been proven that the constant can be extended from
3951 // NarrowTy safely. Necessary since ExtA's extended operand would be
3952 // e.g. an i8, while the const will likely be an i32. This will be
3953 // elided by later optimisations.
3954 VPBuilder Builder(Mul);
3955 auto *Trunc =
3956 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
3957 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
3958 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
3959 Mul->setOperand(1, ExtB);
3960 };
3961
3962 // Try to match reduce.add(mul(...)).
3963 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
3966 auto *Mul = cast<VPWidenRecipe>(VecOp);
3967
3968 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
3969 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
3970
3971 // Match reduce.add/sub(mul(ext, ext)).
3972 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
3973 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
3974 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
3975 if (Sub)
3976 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
3977 cast<VPWidenRecipe>(Sub), Red);
3978 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
3979 }
3980 // TODO: Add an expression type for this variant with a negated mul
3981 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
3982 return new VPExpressionRecipe(Mul, Red);
3983 }
3984 // TODO: Add an expression type for negated versions of other expression
3985 // variants.
3986 if (Sub)
3987 return nullptr;
3988
3989 // Match reduce.add(ext(mul(A, B))).
3990 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
3991 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
3992 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
3995
3996 // reduce.add(ext(mul(ext, const)))
3997 // -> reduce.add(ext(mul(ext, ext(const))))
3998 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
3999
4000 // reduce.add(ext(mul(ext(A), ext(B))))
4001 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4002 // The inner extends must either have the same opcode as the outer extend or
4003 // be the same, in which case the multiply can never result in a negative
4004 // value and the outer extend can be folded away by doing wider
4005 // extends for the operands of the mul.
4006 if (Ext0 && Ext1 &&
4007 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4008 Ext0->getOpcode() == Ext1->getOpcode() &&
4009 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
4010 auto *NewExt0 = new VPWidenCastRecipe(
4011 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
4012 *Ext0, *Ext0, Ext0->getDebugLoc());
4013 NewExt0->insertBefore(Ext0);
4014
4015 VPWidenCastRecipe *NewExt1 = NewExt0;
4016 if (Ext0 != Ext1) {
4017 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4018 Ext->getResultType(), nullptr, *Ext1,
4019 *Ext1, Ext1->getDebugLoc());
4020 NewExt1->insertBefore(Ext1);
4021 }
4022 Mul->setOperand(0, NewExt0);
4023 Mul->setOperand(1, NewExt1);
4024 Red->setOperand(1, Mul);
4025 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4026 }
4027 }
4028 return nullptr;
4029}
4030
4031/// This function tries to create abstract recipes from the reduction recipe
4032/// for subsequent optimizations and cost estimation.
4034 VPCostContext &Ctx,
4035 VFRange &Range) {
4036 VPExpressionRecipe *AbstractR = nullptr;
4037 auto IP = std::next(Red->getIterator());
4038 auto *VPBB = Red->getParent();
4039 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4040 AbstractR = MulAcc;
4041 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4042 AbstractR = ExtRed;
4043 // Cannot create abstract inloop reduction recipes.
4044 if (!AbstractR)
4045 return;
4046
4047 AbstractR->insertBefore(*VPBB, IP);
4048 Red->replaceAllUsesWith(AbstractR);
4049}
4050
4061
4063 if (Plan.hasScalarVFOnly())
4064 return;
4065
4066#ifndef NDEBUG
4067 VPDominatorTree VPDT(Plan);
4068#endif
4069
4070 SmallVector<VPValue *> VPValues;
4073 append_range(VPValues, Plan.getLiveIns());
4074 for (VPRecipeBase &R : *Plan.getEntry())
4075 append_range(VPValues, R.definedValues());
4076
4077 auto *VectorPreheader = Plan.getVectorPreheader();
4078 for (VPValue *VPV : VPValues) {
4080 (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
4081 isa<Constant>(VPV->getLiveInIRValue())))
4082 continue;
4083
4084 // Add explicit broadcast at the insert point that dominates all users.
4085 VPBasicBlock *HoistBlock = VectorPreheader;
4086 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4087 for (VPUser *User : VPV->users()) {
4088 if (User->usesScalars(VPV))
4089 continue;
4090 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4091 HoistPoint = HoistBlock->begin();
4092 else
4093 assert(VPDT.dominates(VectorPreheader,
4094 cast<VPRecipeBase>(User)->getParent()) &&
4095 "All users must be in the vector preheader or dominated by it");
4096 }
4097
4098 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4099 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
4100 VPV->replaceUsesWithIf(Broadcast,
4101 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4102 return Broadcast != &U && !U.usesScalars(VPV);
4103 });
4104 }
4105}
4106
4108 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4109
4110 // Collect candidate loads with invariant addresses and noalias scopes
4111 // metadata and memory-writing recipes with noalias metadata.
4112 SmallVector<std::pair<VPReplicateRecipe *, MemoryLocation>> CandidateLoads;
4113 SmallVector<MemoryLocation> Stores;
4114 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4115 vp_depth_first_shallow(LoopRegion->getEntry()))) {
4116 for (VPRecipeBase &R : *VPBB) {
4117 // Only handle single-scalar replicated loads with invariant addresses.
4118 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4119 if (RepR->isPredicated() || !RepR->isSingleScalar() ||
4120 RepR->getOpcode() != Instruction::Load)
4121 continue;
4122
4123 VPValue *Addr = RepR->getOperand(0);
4124 if (Addr->isDefinedOutsideLoopRegions()) {
4126 if (!Loc.AATags.Scope)
4127 continue;
4128 CandidateLoads.push_back({RepR, Loc});
4129 }
4130 }
4131 if (R.mayWriteToMemory()) {
4132 auto Loc = vputils::getMemoryLocation(R);
4133 if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
4134 return;
4135 Stores.push_back(*Loc);
4136 }
4137 }
4138 }
4139
4140 VPBasicBlock *Preheader = Plan.getVectorPreheader();
4141 for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
4142 // Hoist the load to the preheader if it doesn't alias with any stores
4143 // according to the noalias metadata. Other loads should have been hoisted
4144 // by other passes.
4145 const AAMDNodes &LoadAA = LoadLoc.AATags;
4146 if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
4148 LoadAA.Scope, StoreLoc.AATags.NoAlias);
4149 })) {
4150 LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
4151 }
4152 }
4153}
4154
4155// Collect common metadata from a group of replicate recipes by intersecting
4156// metadata from all recipes in the group.
4157 static VPIRMetadata getCommonMetadata(ArrayRef<VPReplicateRecipe *> Recipes) {
4158 VPIRMetadata CommonMetadata = *Recipes.front();
4159 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4160 CommonMetadata.intersect(*Recipe);
4161 return CommonMetadata;
4162}
4163
4164template <unsigned Opcode>
4167 const Loop *L) {
4168 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4169 "Only Load and Store opcodes supported");
4170 constexpr bool IsLoad = (Opcode == Instruction::Load);
4171 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4172 VPTypeAnalysis TypeInfo(Plan);
4173
4174 // Group predicated operations by their address SCEV.
4176 for (VPBlockBase *Block : vp_depth_first_shallow(LoopRegion->getEntry())) {
4177 auto *VPBB = cast<VPBasicBlock>(Block);
4178 for (VPRecipeBase &R : *VPBB) {
4179 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
4180 if (!RepR || RepR->getOpcode() != Opcode || !RepR->isPredicated())
4181 continue;
4182
4183 // For loads, operand 0 is address; for stores, operand 1 is address.
4184 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
4185 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, SE, L);
4186 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
4187 RecipesByAddress[AddrSCEV].push_back(RepR);
4188 }
4189 }
4190
4191 // For each address, collect operations with the same or complementary masks.
4193 auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4194 return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
4195 };
4196 for (auto &[Addr, Recipes] : RecipesByAddress) {
4197 if (Recipes.size() < 2)
4198 continue;
4199
4200 // Collect groups with the same or complementary masks.
4201 for (VPReplicateRecipe *&RecipeI : Recipes) {
4202 if (!RecipeI)
4203 continue;
4204
4205 VPValue *MaskI = RecipeI->getMask();
4206 Type *TypeI = GetLoadStoreValueType(RecipeI);
4208 Group.push_back(RecipeI);
4209 RecipeI = nullptr;
4210
4211 // Find all operations with the same or complementary masks.
4212 bool HasComplementaryMask = false;
4213 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4214 if (!RecipeJ)
4215 continue;
4216
4217 VPValue *MaskJ = RecipeJ->getMask();
4218 Type *TypeJ = GetLoadStoreValueType(RecipeJ);
4219 if (TypeI == TypeJ) {
4220 // Check if any operation in the group has a complementary mask with
4221 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4222 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4223 match(MaskJ, m_Not(m_Specific(MaskI)));
4224 Group.push_back(RecipeJ);
4225 RecipeJ = nullptr;
4226 }
4227 }
4228
4229 if (HasComplementaryMask) {
4230 assert(Group.size() >= 2 && "must have at least 2 entries");
4231 AllGroups.push_back(std::move(Group));
4232 }
4233 }
4234 }
4235
4236 return AllGroups;
4237}
4238
4239// Find the recipe with minimum alignment in the group.
4240template <typename InstType>
4241static VPReplicateRecipe *
4243 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4244 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4245 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4246 });
4247}
4248
4250 const Loop *L) {
4251 auto Groups =
4253 if (Groups.empty())
4254 return;
4255
4256 VPDominatorTree VPDT(Plan);
4257
4258 // Process each group of loads.
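// For illustration, the rewrite for each group is roughly:
//   if (%m)  %x = load %p           %v = load %p   ; unpredicated
//   if (!%m) %y = load %p    ->     (uses of %x and %y now use %v)
// provided no aliasing store lies between the two loads.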
4259 for (auto &Group : Groups) {
4260 // Sort loads by dominance order, with earliest (most dominating) first.
4261 sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4262 return VPDT.properlyDominates(A, B);
4263 });
4264
4265 // Try to use the earliest (most dominating) load to replace all others.
4266 VPReplicateRecipe *EarliestLoad = Group[0];
4267 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4268 VPBasicBlock *LastBB = Group.back()->getParent();
4269
4270 // Check that the load doesn't alias with stores between first and last.
4271 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4272 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB,
4273 /*CheckReads=*/false))
4274 continue;
4275
4276 // Collect common metadata from all loads in the group.
4277 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4278
4279 // Find the load with minimum alignment to use.
4280 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4281
4282 // Create an unpredicated version of the earliest load with common
4283 // metadata.
4284 auto *UnpredicatedLoad = new VPReplicateRecipe(
4285 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4286 /*IsSingleScalar=*/false, /*Mask=*/nullptr, *EarliestLoad,
4287 CommonMetadata);
4288
4289 UnpredicatedLoad->insertBefore(EarliestLoad);
4290
4291 // Replace all loads in the group with the unpredicated load.
4292 for (VPReplicateRecipe *Load : Group) {
4293 Load->replaceAllUsesWith(UnpredicatedLoad);
4294 Load->eraseFromParent();
4295 }
4296 }
4297}
4298
4299static bool
4301 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
4302 if (!StoreLoc || !StoreLoc->AATags.Scope)
4303 return false;
4304
4305 // When sinking a group of stores, all members of the group alias each other.
4306 // Skip them during the alias checks.
4307 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4308 StoresToSink.end());
4309
4310 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4311 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4312 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB,
4313 /*CheckReads=*/true, &StoresToSinkSet);
4314}
4315
4317 const Loop *L) {
4318 auto Groups =
4320 if (Groups.empty())
4321 return;
4322
4323 VPDominatorTree VPDT(Plan);
4324
4325 for (auto &Group : Groups) {
4326 sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4327 return VPDT.properlyDominates(A, B);
4328 });
4329
4330 if (!canSinkStoreWithNoAliasCheck(Group))
4331 continue;
4332
4333 // Use the last (most dominated) store's location for the unconditional
4334 // store.
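// For illustration, the rewrite for each group is roughly:
//   if (%m)  store %a, %p           %v = select %m, %a, %b
//   if (!%m) store %b, %p    ->     store %v, %p   ; unconditional
// placed at the position of the last store in the group.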
4335 VPReplicateRecipe *LastStore = Group.back();
4336 VPBasicBlock *InsertBB = LastStore->getParent();
4337
4338 // Collect common alias metadata from all stores in the group.
4339 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4340
4341 // Build select chain for stored values.
4342 VPValue *SelectedValue = Group[0]->getOperand(0);
4343 VPBuilder Builder(InsertBB, LastStore->getIterator());
4344
4345 for (unsigned I = 1; I < Group.size(); ++I) {
4346 VPValue *Mask = Group[I]->getMask();
4347 VPValue *Value = Group[I]->getOperand(0);
4348 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4349 Group[I]->getDebugLoc());
4350 }
4351
4352 // Find the store with minimum alignment to use.
4353 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4354
4355 // Create unconditional store with selected value and common metadata.
4356 auto *UnpredicatedStore =
4357 new VPReplicateRecipe(StoreWithMinAlign->getUnderlyingInstr(),
4358 {SelectedValue, LastStore->getOperand(1)},
4359 /*IsSingleScalar=*/false,
4360 /*Mask=*/nullptr, *LastStore, CommonMetadata);
4361 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
4362
4363 // Remove all predicated stores from the group.
4364 for (VPReplicateRecipe *Store : Group)
4365 Store->eraseFromParent();
4366 }
4367}
4368
4370 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4372 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4373 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4374
4375 VPValue *TC = Plan.getTripCount();
4376 // Skip cases for which the trip count may be non-trivial to materialize.
4377 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
4378 // tail is required.
4379 if (!Plan.hasScalarTail() ||
4381 Plan.getScalarPreheader() ||
4382 !TC->isLiveIn())
4383 return;
4384
4385 // Materialize vector trip counts for constants early if it can simply
4386 // be computed as (Original TC / VF * UF) * VF * UF.
4387 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4388 // tail-folded loops.
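// For example (illustrative): a constant trip count of 17 with VF = 8 and
// UF = 2 gives VF * UF = 16 and a vector trip count of (17 / 16) * 16 = 16.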
4389 ScalarEvolution &SE = *PSE.getSE();
4390 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
4391 if (!isa<SCEVConstant>(TCScev))
4392 return;
4393 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
4394 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
4395 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
4396 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4397}
4398
4400 VPBasicBlock *VectorPH) {
4402 if (BTC->getNumUsers() == 0)
4403 return;
4404
4405 VPBuilder Builder(VectorPH, VectorPH->begin());
4406 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4407 auto *TCMO = Builder.createNaryOp(
4408 Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)},
4409 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
4410 BTC->replaceAllUsesWith(TCMO);
4411}
4412
4414 if (Plan.hasScalarVFOnly())
4415 return;
4416
4417 VPTypeAnalysis TypeInfo(Plan);
4418 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4419 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4421 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4422 vp_depth_first_shallow(LoopRegion->getEntry()));
4423 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
4424 // VPInstructions, excluding ones in replicate regions. Those are not
4425 // materialized explicitly yet. Those vector users are still handled in
4426 // VPReplicateRecipe::execute(), via shouldPack().
4427 // TODO: materialize build vectors for replicating recipes in replicating
4428 // regions.
4429 for (VPBasicBlock *VPBB :
4430 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
4431 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4433 continue;
4434 auto *DefR = cast<VPRecipeWithIRFlags>(&R);
4435 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
4436 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4437 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
4438 };
4439 if ((isa<VPReplicateRecipe>(DefR) &&
4440 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
4441 (isa<VPInstruction>(DefR) &&
4443 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
4444 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
4445 continue;
4446
4447 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
4448 unsigned Opcode = ScalarTy->isStructTy()
4451 auto *BuildVector = new VPInstruction(Opcode, {DefR});
4452 BuildVector->insertAfter(DefR);
4453
4454 DefR->replaceUsesWithIf(
4455 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
4456 VPUser &U, unsigned) {
4457 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
4458 });
4459 }
4460 }
4461
4462 // Create explicit VPInstructions to convert vectors to scalars. The current
4463 // implementation is conservative - it may miss some cases that may or may not
4464 // be vector values. TODO: introduce Unpacks speculatively - remove them later
4465 // if they are known to operate on scalar values.
4466 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
4467 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4470 continue;
4471 for (VPValue *Def : R.definedValues()) {
4472 // Skip recipes that are single-scalar or only have their first lane
4473 // used.
4474 // TODO: The Defs skipped here may or may not be vector values.
4475 // Introduce Unpacks, and remove them later, if they are guaranteed to
4476 // produce scalar values.
4478 continue;
4479
4480 // At the moment, we create unpacks only for scalar users outside
4481 // replicate regions. Recipes inside replicate regions still extract the
4482 // required lanes implicitly.
4483 // TODO: Remove once replicate regions are unrolled completely.
4484 auto IsCandidateUnpackUser = [Def](VPUser *U) {
4485 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4486 return U->usesScalars(Def) &&
4487 (!ParentRegion || !ParentRegion->isReplicator());
4488 };
4489 if (none_of(Def->users(), IsCandidateUnpackUser))
4490 continue;
4491
4492 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
4493 if (R.isPhi())
4494 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
4495 else
4496 Unpack->insertAfter(&R);
4497 Def->replaceUsesWithIf(Unpack,
4498 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
4499 return IsCandidateUnpackUser(&U);
4500 });
4501 }
4502 }
4503 }
4504}
4505
4507 VPBasicBlock *VectorPHVPBB,
4508 bool TailByMasking,
4509 bool RequiresScalarEpilogue) {
4510 VPValue &VectorTC = Plan.getVectorTripCount();
4511 assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");
4512 // There's nothing to do if there are no users of the vector trip count or its
4513 // IR value has already been set.
4514 if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())
4515 return;
4516
4517 VPValue *TC = Plan.getTripCount();
4518 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
4519 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
4520 VPValue *Step = &Plan.getVFxUF();
4521
4522 // If the tail is to be folded by masking, round the number of iterations N
4523 // up to a multiple of Step instead of rounding down. This is done by first
4524 // adding Step-1 and then rounding down. Note that it's ok if this addition
4525 // overflows: the vector induction variable will eventually wrap to zero given
4526 // that it starts at zero and its Step is a power of two; the loop will then
4527 // exit, with the last early-exit vector comparison also producing all-true.
4528 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
4529 // is accounted for in emitIterationCountCheck that adds an overflow check.
4530 if (TailByMasking) {
4531 TC = Builder.createNaryOp(
4532 Instruction::Add,
4533 {TC, Builder.createNaryOp(Instruction::Sub,
4534 {Step, Plan.getConstantInt(TCTy, 1)})},
4535 DebugLoc::getCompilerGenerated(), "n.rnd.up");
4536 }
4537
4538 // Now we need to generate the expression for the part of the loop that the
4539 // vectorized body will execute. This is equal to N - (N % Step) if scalar
4540 // iterations are not required for correctness, or N - Step, otherwise. Step
4541 // is equal to the vectorization factor (number of SIMD elements) times the
4542 // unroll factor (number of SIMD instructions).
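// For example (illustrative): N = 20 with VF = 8 and UF = 1 gives
// n.mod.vf = 4 and n.vec = 16; with tail folding N is first rounded up to
// 27, giving n.vec = 24, i.e. 3 masked vector iterations.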
4543 VPValue *R =
4544 Builder.createNaryOp(Instruction::URem, {TC, Step},
4545 DebugLoc::getCompilerGenerated(), "n.mod.vf");
4546
4547 // There are cases where we *must* run at least one iteration in the remainder
4548 // loop. See the cost model for when this can happen. If the step evenly
4549 // divides the trip count, we set the remainder to be equal to the step. If
4550 // the step does not evenly divide the trip count, no adjustment is necessary
4551 // since there will already be scalar iterations. Note that the minimum
4552 // iterations check ensures that N >= Step.
4553 if (RequiresScalarEpilogue) {
4554 assert(!TailByMasking &&
4555 "requiring scalar epilogue is not supported with fail folding");
4556 VPValue *IsZero =
4557 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0));
4558 R = Builder.createSelect(IsZero, Step, R);
4559 }
4560
4561 VPValue *Res = Builder.createNaryOp(
4562 Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
4563 VectorTC.replaceAllUsesWith(Res);
4564}
4565
4567 ElementCount VFEC) {
4568 VPBuilder Builder(VectorPH, VectorPH->begin());
4569 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4570 VPValue &VF = Plan.getVF();
4571 VPValue &VFxUF = Plan.getVFxUF();
4572 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
4573 // used.
4574 // TODO: Assert that they aren't used.
4575
4576 // If there are no users of the runtime VF, compute VFxUF by constant folding
4577 // the multiplication of VF and UF.
4578 if (VF.getNumUsers() == 0) {
4579 VPValue *RuntimeVFxUF =
4580 Builder.createElementCount(TCTy, VFEC * Plan.getUF());
4581 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
4582 return;
4583 }
4584
4585 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
4586 // vscale) * UF.
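// For example (illustrative): with VF = vscale x 4 and UF = 2, RuntimeVF is
// materialized as 4 * vscale and VFxUF as (4 * vscale) * 2.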
4587 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
4589 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
4590 VF.replaceUsesWithIf(
4591 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
4592 }
4593 VF.replaceAllUsesWith(RuntimeVF);
4594
4595 VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF());
4596 VPValue *MulByUF = Builder.createOverflowingOp(
4597 Instruction::Mul, {RuntimeVF, UF}, {true, false});
4598 VFxUF.replaceAllUsesWith(MulByUF);
4599}
4600
4603 const DataLayout &DL = SE.getDataLayout();
4604 SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/false);
4605
4606 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
4607 BasicBlock *EntryBB = Entry->getIRBasicBlock();
4608 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
4609 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
4610 if (isa<VPIRInstruction>(&R))
4611 continue;
4612 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
4613 if (!ExpSCEV)
4614 break;
4615 const SCEV *Expr = ExpSCEV->getSCEV();
4616 Value *Res =
4617 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
4618 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
4619 VPValue *Exp = Plan.getOrAddLiveIn(Res);
4620 ExpSCEV->replaceAllUsesWith(Exp);
4621 if (Plan.getTripCount() == ExpSCEV)
4622 Plan.resetTripCount(Exp);
4623 ExpSCEV->eraseFromParent();
4624 }
4626 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
4627 "after any VPIRInstructions");
4628 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
4629 // to the VPIRBasicBlock.
4630 auto EI = Entry->begin();
4631 for (Instruction &I : drop_end(*EntryBB)) {
4632 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
4633 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
4634 EI++;
4635 continue;
4636 }
4638 }
4639
4640 return ExpandedSCEVs;
4641}
4642
4643/// Returns true if \p OpV is a VPWidenLoadRecipe or VPInterleaveRecipe that can
4644/// be converted to a narrower recipe. \p OpV is used by a wide recipe that feeds
4645/// a store interleave group at index \p Idx; \p WideMember0 is the recipe
4646/// feeding the same interleave group at index 0. A VPWidenLoadRecipe can be
4647/// narrowed to an index-independent load if it feeds all wide ops at all
4648/// indices (\p OpV must be the operand at index \p OpIdx of both the current
4649/// recipe and \p WideMember0). A VPInterleaveRecipe can be narrowed to a wide
4650/// load if \p OpV is defined at index \p Idx of a load interleave group.
4651static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
4652 VPValue *OpV, unsigned Idx) {
4653 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
4654 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
4655 if (!Member0OpR)
4656 return Member0Op == OpV;
4657 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
4658 return !W->getMask() && Member0Op == OpV;
4659 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
4660 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
4661 return false;
4662}
4663
4664/// Returns true if \p IR is a full interleave group with factor and number of
4665/// members both equal to \p VF. The interleave group must also access the full
4666/// vector width \p VectorRegWidth.
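/// For example (illustrative): with VF = 4, a factor-4 group of four i32
/// members spans 4 * 32 = 128 bits and satisfies this check when the vector
/// register width is 128 bits.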
4668 ElementCount VF,
4669 VPTypeAnalysis &TypeInfo,
4670 TypeSize VectorRegWidth) {
4671 if (!InterleaveR || InterleaveR->getMask())
4672 return false;
4673
4674 Type *GroupElementTy = nullptr;
4675 if (InterleaveR->getStoredValues().empty()) {
4676 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
4677 if (!all_of(InterleaveR->definedValues(),
4678 [&TypeInfo, GroupElementTy](VPValue *Op) {
4679 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4680 }))
4681 return false;
4682 } else {
4683 GroupElementTy =
4684 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
4685 if (!all_of(InterleaveR->getStoredValues(),
4686 [&TypeInfo, GroupElementTy](VPValue *Op) {
4687 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4688 }))
4689 return false;
4690 }
4691
4692 unsigned VFMin = VF.getKnownMinValue();
4693 TypeSize GroupSize = TypeSize::get(
4694 GroupElementTy->getScalarSizeInBits() * VFMin, VF.isScalable());
4695 const auto *IG = InterleaveR->getInterleaveGroup();
4696 return IG->getFactor() == VFMin && IG->getNumMembers() == VFMin &&
4697 GroupSize == VectorRegWidth;
4698}
4699
4700/// Returns true if \p VPV is a narrow VPValue.
4701static bool isAlreadyNarrow(VPValue *VPV) {
4702 if (VPV->isLiveIn())
4703 return true;
4704 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
4705 return RepR && RepR->isSingleScalar();
4706}
4707
4708// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
4709// a narrow variant.
4710static VPValue *
4711 narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) {
4712 auto *R = V->getDefiningRecipe();
4713 if (!R || NarrowedOps.contains(V))
4714 return V;
4715
4716 if (isAlreadyNarrow(V))
4717 return V;
4718
4719 if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(R)) {
4720 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
4721 WideMember0->setOperand(
4722 Idx,
4723 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
4724 return V;
4725 }
4726
4727 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
4728 // Narrow interleave group to wide load, as transformed VPlan will only
4729 // process one original iteration.
4730 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
4731 auto *L = new VPWidenLoadRecipe(
4732 *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
4733 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
4734 L->insertBefore(LoadGroup);
4735 NarrowedOps.insert(L);
4736 return L;
4737 }
4738
4739 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
4740 assert(RepR->isSingleScalar() &&
4741 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
4742 "must be a single scalar load");
4743 NarrowedOps.insert(RepR);
4744 return RepR;
4745 }
4746
4747 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
4748 VPValue *PtrOp = WideLoad->getAddr();
4749 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
4750 PtrOp = VecPtr->getOperand(0);
4751 // Narrow wide load to uniform scalar load, as transformed VPlan will only
4752 // process one original iteration.
4753 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
4754 /*IsUniform*/ true,
4755 /*Mask*/ nullptr, {}, *WideLoad);
4756 N->insertBefore(WideLoad);
4757 NarrowedOps.insert(N);
4758 return N;
4759}
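// Sketch of the effect (illustrative): in a plan with VF 4, a full factor-4
// load interleave group reading a[4*i+0..3] is rewritten into one consecutive
// VPWidenLoadRecipe at the group's insert position, since after narrowing each
// wide operation covers the VF interleaved members of a single original
// iteration. A VPWidenLoadRecipe that feeds every member identically becomes a
// uniform single-scalar VPReplicateRecipe of the underlying load, and operands
// of matching VPWidenRecipes are narrowed recursively.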
4760
4761 void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
4762 TypeSize VectorRegWidth) {
4763 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
4764 if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0)
4765 return;
4766
4767 VPTypeAnalysis TypeInfo(Plan);
4768
4769 SmallVector<VPInterleaveRecipe *> StoreGroups;
4770 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
4772 continue;
4773
4776 continue;
4777
4778 // Bail out on recipes not supported at the moment:
4779 // * phi recipes other than the canonical induction
4780 // * recipes writing to memory except interleave groups
4781 // Only support plans with a canonical induction phi.
4782 if (R.isPhi())
4783 return;
4784
4785 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
4786 if (R.mayWriteToMemory() && !InterleaveR)
4787 return;
4788
4789 // Do not narrow interleave groups if there are VectorPointer recipes and
4790 // the plan was unrolled. The recipe implicitly uses VF from
4791 // VPTransformState.
4792 // TODO: Remove restriction once the VF for the VectorPointer offset is
4793 // modeled explicitly as operand.
4794 if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
4795 return;
4796
4797 // All other ops are allowed, but we reject uses that cannot be converted
4798 // when checking all allowed consumers (store interleave groups) below.
4799 if (!InterleaveR)
4800 continue;
4801
4802 // Bail out on non-consecutive interleave groups.
4803 if (!isConsecutiveInterleaveGroup(InterleaveR, VF, TypeInfo,
4804 VectorRegWidth))
4805 return;
4806
4807 // Skip read interleave groups.
4808 if (InterleaveR->getStoredValues().empty())
4809 continue;
4810
4811 // Narrow interleave groups, if all operands are already matching narrow
4812 // ops.
4813 auto *Member0 = InterleaveR->getStoredValues()[0];
4814 if (isAlreadyNarrow(Member0) &&
4815 all_of(InterleaveR->getStoredValues(),
4816 [Member0](VPValue *VPV) { return Member0 == VPV; })) {
4817 StoreGroups.push_back(InterleaveR);
4818 continue;
4819 }
4820
4821 // For now, we only support full interleave groups storing load interleave
4822 // groups.
4823 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
4824 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
4825 if (!DefR)
4826 return false;
4827 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
4828 return IR && IR->getInterleaveGroup()->isFull() &&
4829 IR->getVPValue(Op.index()) == Op.value();
4830 })) {
4831 StoreGroups.push_back(InterleaveR);
4832 continue;
4833 }
4834
4835 // Check if all values feeding InterleaveR are matching wide recipes, whose
4836 // operands can be narrowed.
4837 auto *WideMember0 =
4838 dyn_cast_or_null<VPWidenRecipe>(InterleaveR->getStoredValues()[0]);
4839 if (!WideMember0)
4840 return;
4841 for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
4842 auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
4843 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
4844 R->getNumOperands() > 2)
4845 return;
4846 if (any_of(enumerate(R->operands()),
4847 [WideMember0, Idx = I](const auto &P) {
4848 const auto &[OpIdx, OpV] = P;
4849 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
4850 }))
4851 return;
4852 }
4853 StoreGroups.push_back(InterleaveR);
4854 }
4855
4856 if (StoreGroups.empty())
4857 return;
4858
4859 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
4860 SmallPtrSet<VPValue *, 4> NarrowedOps;
4861 // Narrow operation tree rooted at store groups.
4862 for (auto *StoreGroup : StoreGroups) {
4863 VPValue *Res =
4864 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
4865 auto *SI =
4866 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
4867 auto *S = new VPWidenStoreRecipe(
4868 *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
4869 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
4870 S->insertBefore(StoreGroup);
4871 StoreGroup->eraseFromParent();
4872 }
4873
4874 // Adjust induction to reflect that the transformed plan only processes one
4875 // original iteration.
4876 auto *CanIV = VectorLoop->getCanonicalIV();
4877 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
4878 VPBuilder PHBuilder(Plan.getVectorPreheader());
4879
4880 VPValue *UF = Plan.getOrAddLiveIn(
4881 ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF()));
4882 if (VF.isScalable()) {
4883 VPValue *VScale = PHBuilder.createElementCount(
4884 CanIV->getScalarType(), ElementCount::getScalable(1));
4885 VPValue *VScaleUF = PHBuilder.createOverflowingOp(
4886 Instruction::Mul, {VScale, UF}, {true, false});
4887 Inc->setOperand(1, VScaleUF);
4888 Plan.getVF().replaceAllUsesWith(VScale);
4889 } else {
4890 Inc->setOperand(1, UF);
4891 Plan.getVF().replaceAllUsesWith(
4892 Plan.getConstantInt(CanIV->getScalarType(), 1));
4893 }
4894 removeDeadRecipes(Plan);
4895}
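// End-to-end sketch (hypothetical source loop, simplified IR): for
//   for (i) { b[4*i+0..3] = a[4*i+0..3]; }
// vectorized with VF 4, the factor-4 load and store groups are replaced by one
// consecutive wide load and one wide store per original iteration, e.g.
//   %v = load <4 x i32>, ptr %a.i        ; was: load interleave group
//   store <4 x i32> %v, ptr %b.i         ; was: store interleave group
// and the canonical IV increment shrinks from VF * UF to UF (scaled by vscale
// for scalable VFs), since each unrolled part now covers one original
// iteration.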
4896
4897/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
4898/// BranchOnCond recipe.
4899 void VPlanTransforms::addBranchWeightToMiddleTerminator(
4900 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
4901 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
4902 auto *MiddleTerm =
4903 dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator());
4904 // Only add branch metadata if there is a (conditional) terminator.
4905 if (!MiddleTerm)
4906 return;
4907
4908 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
4909 "must have a BranchOnCond");
4910 // Assume that `TripCount % VectorStep ` is equally distributed.
4911 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
4912 if (VF.isScalable() && VScaleForTuning.has_value())
4913 VectorStep *= *VScaleForTuning;
4914 assert(VectorStep > 0 && "trip count should not be zero");
4915 MDBuilder MDB(Plan.getContext());
4916 MDNode *BranchWeights =
4917 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
4918 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
4919}
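// Worked example (assumed numbers): with VF 4 and UF 2, VectorStep is 8 and
// the middle-block branch gets weights {1, 7}; under the uniform-remainder
// assumption only 1 in 8 trip counts leaves no scalar iterations to execute.
// For a scalable VF of 4 with UF 2 and a VScaleForTuning of 2, VectorStep is
// 16 and the weights become {1, 15}.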
4920
4921/// Compute and return the end value for \p WideIV, unless it is truncated. If
4922/// the induction recipe is not canonical, creates a VPDerivedIVRecipe to
4923/// compute the end value of the induction.
4924 static VPValue *tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV,
4925 VPBuilder &VectorPHBuilder,
4926 VPTypeAnalysis &TypeInfo,
4927 VPValue *VectorTC) {
4928 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
4929 // Truncated wide inductions resume from the last lane of their vector value
4930 // in the last vector iteration which is handled elsewhere.
4931 if (WideIntOrFp && WideIntOrFp->getTruncInst())
4932 return nullptr;
4933
4934 VPValue *Start = WideIV->getStartValue();
4935 VPValue *Step = WideIV->getStepValue();
4936 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
4937 VPValue *EndValue = VectorTC;
4938 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
4939 EndValue = VectorPHBuilder.createDerivedIV(
4940 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
4941 Start, VectorTC, Step);
4942 }
4943
4944 // EndValue is derived from the vector trip count (which has the same type as
4945 // the widest induction) and thus may be wider than the induction here.
4946 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
4947 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
4948 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
4949 ScalarTypeOfWideIV,
4950 WideIV->getDebugLoc());
4951 }
4952
4953 return EndValue;
4954}
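// Example (illustrative values): for the canonical induction (start 0, step 1)
// the end value is the vector trip count itself. For a non-canonical integer
// induction with start 7 and step 3, a VPDerivedIVRecipe computing
// 7 + 3 * vector-trip-count is created in the vector preheader, and the result
// is truncated if the induction is narrower than the trip-count type.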
4955
4956 void VPlanTransforms::addScalarResumePhis(
4957 VPlan &Plan, DenseMap<VPValue *, VPValue *> &IVEndValues) {
4958 VPTypeAnalysis TypeInfo(Plan);
4959 auto *ScalarPH = Plan.getScalarPreheader();
4960 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
4961 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
4962 VPBuilder VectorPHBuilder(
4963 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
4964 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
4965 for (VPRecipeBase &PhiR : Plan.getScalarPreheader()->phis()) {
4966 auto *ResumePhiR = cast<VPPhi>(&PhiR);
4967
4968 // TODO: Extract final value from induction recipe initially, optimize to
4969 // pre-computed end value together in optimizeInductionExitUsers.
4970 auto *VectorPhiR = cast<VPHeaderPHIRecipe>(ResumePhiR->getOperand(0));
4971 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
4972 if (VPValue *EndValue = tryToComputeEndValueForInduction(
4973 WideIVR, VectorPHBuilder, TypeInfo, &Plan.getVectorTripCount())) {
4974 IVEndValues[WideIVR] = EndValue;
4975 ResumePhiR->setOperand(0, EndValue);
4976 ResumePhiR->setName("bc.resume.val");
4977 continue;
4978 }
4979 // TODO: Also handle truncated inductions here. Computing end-values
4980 // separately should be done as VPlan-to-VPlan optimization, after
4981 // legalizing all resume values to use the last lane from the loop.
4982 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
4983 "should only skip truncated wide inductions");
4984 continue;
4985 }
4986
4987 // The backedge value provides the value to resume coming out of a loop,
4988 // which for FORs is a vector whose last element needs to be extracted. The
4989 // start value provides the value if the loop is bypassed.
4990 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
4991 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
4992 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
4993 "Cannot handle loops with uncountable early exits");
4994 if (IsFOR) {
4995 auto *ExtractPart = MiddleBuilder.createNaryOp(
4996 VPInstruction::ExtractLastPart, ResumeFromVectorLoop);
4997 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
4999 "vector.recur.extract");
5000 }
5001 ResumePhiR->setName(IsFOR ? "scalar.recur.init" : "bc.merge.rdx");
5002 ResumePhiR->setOperand(0, ResumeFromVectorLoop);
5003 }
5004}
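// Illustrative outcome (simplified): a resume phi fed by a widened induction is
// renamed "bc.resume.val" and receives the end value computed above (the
// vector trip count for the canonical IV); a phi fed by a first-order
// recurrence is renamed "scalar.recur.init" and receives the last lane of the
// last part of the recurrence's backedge value, extracted in the middle block;
// remaining resume phis (e.g. for reductions) are named "bc.merge.rdx".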
5005
5006 void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
5007 VFRange &Range) {
5008 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5009 auto *ScalarPHVPBB = Plan.getScalarPreheader();
5010 auto *MiddleVPBB = Plan.getMiddleBlock();
5011 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
5012 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5013
5014 auto IsScalableOne = [](ElementCount VF) -> bool {
5015 return VF == ElementCount::getScalable(1);
5016 };
5017
5018 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5019 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5020 if (!FOR)
5021 continue;
5022
5023 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5024 "Cannot handle loops with uncountable early exits");
5025
5026 // This is the second phase of vectorizing first-order recurrences, creating
5027 // extract for users outside the loop. An overview of the transformation is
5028 // described below. Suppose we have the following loop with some use after
5029 // the loop of the last a[i-1],
5030 //
5031 // for (int i = 0; i < n; ++i) {
5032 // t = a[i - 1];
5033 // b[i] = a[i] - t;
5034 // }
5035 // use t;
5036 //
5037 // There is a first-order recurrence on "a". For this loop, the shorthand
5038 // scalar IR looks like:
5039 //
5040 // scalar.ph:
5041 // s.init = a[-1]
5042 // br scalar.body
5043 //
5044 // scalar.body:
5045 // i = phi [0, scalar.ph], [i+1, scalar.body]
5046 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5047 // s2 = a[i]
5048 // b[i] = s2 - s1
5049 // br cond, scalar.body, exit.block
5050 //
5051 // exit.block:
5052 // use = lcssa.phi [s1, scalar.body]
5053 //
5054 // In this example, s1 is a recurrence because its value depends on the
5055 // previous iteration. In the first phase of vectorization, we created a
5056 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5057 // for users in the scalar preheader and exit block.
5058 //
5059 // vector.ph:
5060 // v_init = vector(..., ..., ..., a[-1])
5061 // br vector.body
5062 //
5063 // vector.body
5064 // i = phi [0, vector.ph], [i+4, vector.body]
5065 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5066 // v2 = a[i, i+1, i+2, i+3]
5067 // b[i] = v2 - v1
5068 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
5069 // b[i, i+1, i+2, i+3] = v2 - v1
5070 // br cond, vector.body, middle.block
5071 //
5072 // middle.block:
5073 // vector.recur.extract.for.phi = v2(2)
5074 // vector.recur.extract = v2(3)
5075 // br cond, scalar.ph, exit.block
5076 //
5077 // scalar.ph:
5078 // scalar.recur.init = phi [vector.recur.extract, middle.block],
5079 // [s.init, otherwise]
5080 // br scalar.body
5081 //
5082 // scalar.body:
5083 // i = phi [0, scalar.ph], [i+1, scalar.body]
5084 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5085 // s2 = a[i]
5086 // b[i] = s2 - s1
5087 // br cond, scalar.body, exit.block
5088 //
5089 // exit.block:
5090 // lo = lcssa.phi [s1, scalar.body],
5091 // [vector.recur.extract.for.phi, middle.block]
5092 //
5093 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
5094 // Extract the penultimate value of the recurrence and use it as operand for
5095 // the VPIRInstruction modeling the phi.
5096 for (VPRecipeBase &R : make_early_inc_range(
5097 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
5099 continue;
5100
5101 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5102 // penultimate value of the recurrence. Instead we rely on the existing
5103 // extract of the last element from the result of
5104 // VPInstruction::FirstOrderRecurrenceSplice.
5105 // TODO: Consider vscale_range info and UF.
5106 if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne,
5107 Range))
5108 return;
5109 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5110 VPInstruction::ExtractPenultimateElement, FOR->getBackedgeValue(), {},
5111 "vector.recur.extract.for.phi");
5112 cast<VPInstruction>(&R)->replaceAllUsesWith(PenultimateElement);
5113 }
5114 }
5115}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:80
static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, ScalarEvolution &SE)
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool sinkScalarOperands(VPlan &Plan)
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R)
Return true if we do not know how to (mechanically) hoist or sink R out of a loop region.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, ScalarEvolution &SE, const Loop *L)
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck)
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL)
Replace recipes with their EVL variants.
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink)
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static VPValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute and return the end value for WideIV, unless it is truncated.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
From the definition of llvm.experimental.get.vector.length, VPInstruction::ExplicitVectorLength(AVL) ...
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, ScalarEvolution &SE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, bool CheckReads, const SmallPtrSetImpl< VPRecipeBase * > *ExcludeRecipes=nullptr)
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static void recursivelyDeleteDeadRecipes(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, ScalarEvolution &SE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ElementCount VF, VPTypeAnalysis &TypeInfo, TypeSize VectorRegWidth)
Returns true if IR is a full interleave group with factor and number of members both equal to VF.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1513
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1489
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
iterator begin() const
Definition ArrayRef.h:130
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:136
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1541
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1078
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize get(ScalarTy Quantity, bool Scalable)
Definition TypeSize.h:340
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:88
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:97
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
op_range operands()
Definition User.h:292
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3605
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3966
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4041
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:3993
iterator end()
Definition VPlan.h:4003
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4001
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4054
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:216
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:578
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:550
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:623
const VPRecipeBase & back() const
Definition VPlan.h:4015
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2512
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2546
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2536
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2552
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2532
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:81
VPRegionBlock * getParent()
Definition VPlan.h:173
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:186
size_t getNumSuccessors() const
Definition VPlan.h:219
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition VPlan.h:322
size_t getNumPredecessors() const
Definition VPlan.h:220
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:204
VPlan * getPlan()
Definition VPlan.cpp:161
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:215
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:166
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:264
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:209
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:198
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:211
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:232
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:151
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:170
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:189
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3016
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags={}, const VPIRMetadata &Metadata={})
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3547
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:431
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:426
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:404
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:416
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3716
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition VPlan.h:3637
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3061
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2049
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2092
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2081
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4119
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:4143
Class to record and manage LLVM IR flags.
Definition VPlan.h:609
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \I , if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:982
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1036
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1127
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1071
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1066
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1063
@ CanonicalIVIncrementForPart
Definition VPlan.h:1056
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2654
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2646
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2675
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2728
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2686
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3203
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:387
VPRegionBlock * getRegion()
Definition VPlan.h:4271
VPBasicBlock * getParent()
Definition VPlan.h:408
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:479
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:2890
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:2779
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4154
const VPBlockBase * getEntry() const
Definition VPlan.h:4190
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4265
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4222
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4207
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4252
const VPBlockBase * getExiting() const
Definition VPlan.h:4202
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4215
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2935
bool isSingleScalar() const
Definition VPlan.h:2976
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3000
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3786
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:531
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:595
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:207
operand_range operands()
Definition VPlanValue.h:275
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:251
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:246
void addOperand(VPValue *Operand)
Definition VPlanValue.h:240
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:48
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1374
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:131
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition VPlanValue.h:183
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:85
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:193
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1377
unsigned getNumUsers() const
Definition VPlanValue.h:113
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition VPlanValue.h:178
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1381
user_range users()
Definition VPlanValue.h:134
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1911
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3679
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1552
Instruction::CastOps getOpcode() const
Definition VPlan.h:1588
A recipe for handling GEP instructions.
Definition VPlan.h:1848
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2116
PHINode * getPHINode() const
Definition VPlan.h:2158
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2144
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2161
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2192
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2268
A recipe for widening vector intrinsics.
Definition VPlan.h:1602
A common base class for widening memory operations.
Definition VPlan.h:3246
A recipe for widened phis.
Definition VPlan.h:2326
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1512
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition VPlan.h:4284
bool hasVF(ElementCount VF) const
Definition VPlan.h:4489
LLVMContext & getContext() const
Definition VPlan.h:4477
VPBasicBlock * getEntry()
Definition VPlan.h:4377
VPValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4468
bool hasScalableVF() const
Definition VPlan.h:4490
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4475
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4471
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4439
VPValue * getTrue()
Return a VPValue wrapping i1 true.
Definition VPlan.h:4546
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4460
unsigned getUF() const
Definition VPlan.h:4509
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4623
bool hasUF(unsigned UF) const
Definition VPlan.h:4507
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4429
VPValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4552
void setVF(ElementCount VF)
Definition VPlan.h:4483
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4522
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1011
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4453
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4402
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4601
VPValue * getFalse()
Return a VPValue wrapping i1 false.
Definition VPlan.h:4549
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4531
bool hasScalarVFOnly() const
Definition VPlan.h:4500
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4420
ArrayRef< VPValue * > getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4571
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4425
VPValue * getLiveIn(Value *V) const
Return the live-in VPValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4568
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4382
void setUF(unsigned UF)
Definition VPlan.h:4514
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4655
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2763
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ? R : false.
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match only the specific value V.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
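Expanding that snippet into a hedged, self-contained sketch (the helper name is made up for illustration):

  #include "llvm/IR/Intrinsics.h"
  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  // Returns the operand of a call to llvm.fabs, or nullptr if V is not one.
  static Value *matchFAbsOperand(Value *V) {
    Value *X;
    if (match(V, m_Intrinsic<Intrinsic::fabs>(m_Value(X))))
      return X;
    return nullptr;
  }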
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
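Putting several of these matchers together, a hedged sketch of recognizing a signed-max select (the helper name is an illustration, not code from this file):

  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  // Matches "select (icmp sgt X, Y), X, Y"; m_Deferred(X) requires the select's
  // true operand to be the same value that m_Value(X) bound in the compare.
  static bool isSMaxIdiom(Value *V) {
    Value *X, *Y;
    return match(V, m_Select(m_SpecificICmp(ICmpInst::ICMP_SGT,
                                            m_Value(X), m_Value(Y)),
                             m_Deferred(X), m_Deferred(Y)));
  }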
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignores it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Opcode, Op0_t, Op1_t > m_c_Binary(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
specific_intval< 1 > m_True()
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, without matching its operands, as the number of operands is not fixed.
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
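A hedged sketch of how these VPlan matchers are typically combined (Def is assumed to be a VPValue* in scope, and the simplification shown is illustrative rather than a transcription of an actual transform in this file):

  // If Def is "and X, true" in either operand order, the mask is redundant and
  // all users of Def can use X directly.
  using namespace llvm::VPlanPatternMatch;
  VPValue *X;
  if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_True())))
    Def->replaceAllUsesWith(X);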
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPIRFlags getFlagsFromIndDesc(const InductionDescriptor &ID)
Extracts and returns NoWrap and FastMath flags from the induction binop in ID.
Definition VPlanUtils.h:85
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
const SCEV * getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2032
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
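A minimal sketch of the range-based predicate helpers (any_of and none_of appear further down in this list; the values are arbitrary):

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  using namespace llvm;

  static void predicateExamples() {
    SmallVector<int, 4> Vals = {2, 4, 6};
    bool AllEven = all_of(Vals, [](int V) { return V % 2 == 0; });  // true
    bool AnyOdd  = any_of(Vals, [](int V) { return V % 2 != 0; });  // false
    bool NoneNeg = none_of(Vals, [](int V) { return V < 0; });      // true
    (void)AllEven; (void)AnyOdd; (void)NoneNeg;
  }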
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2484
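A minimal sketch of the common single-range form (the container contents are arbitrary):

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  // Iterate elements together with their zero-based index.
  static void printIndexed() {
    SmallVector<StringRef, 3> Names = {"a", "b", "c"};
    for (auto [Idx, Name] : enumerate(Names))
      errs() << Idx << ": " << Name << "\n"; // prints "0: a", "1: b", "2: c"
  }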
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
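The three casting primitives in practice, as a small sketch (the helper name is illustrative):

  #include "llvm/IR/Instructions.h"
  #include "llvm/Support/Casting.h"
  using namespace llvm;

  static Value *getAccessedPointer(Instruction *I) {
    if (auto *LI = dyn_cast<LoadInst>(I)) // nullptr when I is not a load
      return LI->getPointerOperand();
    if (isa<StoreInst>(I))                // pure type test, no value produced
      return cast<StoreInst>(I)->getPointerOperand(); // cast<> asserts on mismatch
    return nullptr;
  }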
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2148
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
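A hedged sketch of the usual pattern, erasing elements of an intrusive list while iterating it (the dead-code criterion is deliberately simplistic):

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  // The iterator is advanced before the loop body runs, so eraseFromParent()
  // does not invalidate the traversal.
  static void eraseTriviallyDead(BasicBlock &BB) {
    for (Instruction &I : make_early_inc_range(BB))
      if (I.use_empty() && !I.isTerminator() && !I.mayHaveSideEffects())
        I.eraseFromParent();
  }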
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
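For example (the value is arbitrary):

  #include "llvm/Support/MathExtras.h"
  // PowerOf2Ceil(5) == 8, PowerOf2Ceil(8) == 8, PowerOf2Ceil(1) == 1.
  uint64_t Padded = llvm::PowerOf2Ceil(5); // 8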
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:236
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:550
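A minimal sketch (the container and predicate are arbitrary):

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  // View only the elements that satisfy the predicate, without copying them.
  static void printEven() {
    SmallVector<int, 4> Vals = {1, 2, 3, 4};
    for (int V : make_filter_range(Vals, [](int E) { return E % 2 == 0; }))
      errs() << V << "\n"; // prints 2 and 4
  }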
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1718
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
RecurKind
These are the kinds of recurrences that we support.
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1966
DWARFExpression::Operation Op
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1973
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2120
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
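A small sketch (the field names are illustrative):

  #include "llvm/ADT/Hashing.h"
  using namespace llvm;

  // Derive a single hash_code from several fields, e.g. for deduplication keys.
  static hash_code hashTriple(unsigned Opcode, const void *Ptr, int Flags) {
    return hash_combine(Opcode, Ptr, Flags);
  }
  // For a sequence, hash_combine_range(Begin, End) hashes the whole range.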
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2100
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:761
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:784
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:787
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2368
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3378
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3337
A recipe for widening select instructions.
Definition VPlan.h:1801
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3462
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3419
static void hoistPredicatedLoads(VPlan &Plan, ScalarEvolution &SE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static void sinkPredicatedStores(VPlan &Plan, ScalarEvolution &SE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, TypeSize VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones that can be codegen'd.
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the original exit block for first-order recurrences.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replace all uses except the canonical IV...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.
static void updateScalarResumePhis(VPlan &Plan, DenseMap< VPValue *, VPValue * > &IVEndValues)
Update the resume phis in the scalar preheader after creating wide recipes for first-order recurrence...