1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SetVector.h"
28#include "llvm/ADT/TypeSwitch.h"
34#include "llvm/IR/Intrinsics.h"
35#include "llvm/IR/MDBuilder.h"
39
40using namespace llvm;
41using namespace VPlanPatternMatch;
42
44 VPlan &Plan,
46 GetIntOrFpInductionDescriptor,
47 const TargetLibraryInfo &TLI) {
48
50 Plan.getVectorLoopRegion());
52 // Skip blocks outside region
53 if (!VPBB->getParent())
54 break;
55 VPRecipeBase *Term = VPBB->getTerminator();
56 auto EndIter = Term ? Term->getIterator() : VPBB->end();
57 // Introduce each ingredient into VPlan.
58 for (VPRecipeBase &Ingredient :
59 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
60
61 VPValue *VPV = Ingredient.getVPSingleValue();
62 if (!VPV->getUnderlyingValue())
63 continue;
64
66
67 VPRecipeBase *NewRecipe = nullptr;
68 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
69 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
70 const auto *II = GetIntOrFpInductionDescriptor(Phi);
71 if (!II) {
72 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
73 for (VPValue *Op : PhiR->operands())
74 NewRecipe->addOperand(Op);
75 } else {
76 VPValue *Start = Plan.getOrAddLiveIn(II->getStartValue());
77 VPValue *Step =
79 NewRecipe = new VPWidenIntOrFpInductionRecipe(
80 Phi, Start, Step, &Plan.getVF(), *II, Ingredient.getDebugLoc());
81 }
82 } else {
83 assert(isa<VPInstruction>(&Ingredient) &&
84 "only VPInstructions expected here");
85 assert(!isa<PHINode>(Inst) && "phis should be handled above");
86 // Create VPWidenMemoryRecipe for loads and stores.
87 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
88 NewRecipe = new VPWidenLoadRecipe(
89 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
90 false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
91 Ingredient.getDebugLoc());
92 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
93 NewRecipe = new VPWidenStoreRecipe(
94 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
95 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
96 VPIRMetadata(*Store), Ingredient.getDebugLoc());
98 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
99 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
100 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
101 if (VectorID == Intrinsic::not_intrinsic)
102 return false;
103 NewRecipe = new VPWidenIntrinsicRecipe(
104 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
105 drop_end(Ingredient.operands()), CI->getType(),
106 CI->getDebugLoc());
107 } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
108 NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
109 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
110 NewRecipe = new VPWidenCastRecipe(
111 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
112 } else {
113 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
114 }
115 }
116
117 NewRecipe->insertBefore(&Ingredient);
118 if (NewRecipe->getNumDefinedValues() == 1)
119 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
120 else
121 assert(NewRecipe->getNumDefinedValues() == 0 &&
122 "Only recipes with zero or one defined values expected");
123 Ingredient.eraseFromParent();
124 }
125 }
126 return true;
127}
128
129/// Return true if we do not know how to (mechanically) hoist or sink \p R out
130/// of a loop region.
132 // Assumes (llvm.assume calls) don't alias anything or throw; as long as
133 // they're guaranteed to execute, they're safe to hoist.
135 return false;
136
137 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
138 // memory location is not modified in the vector loop.
139 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
140 return true;
141
142 // Allocas cannot be hoisted.
143 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
144 return RepR && RepR->getOpcode() == Instruction::Alloca;
145}
146
147static bool sinkScalarOperands(VPlan &Plan) {
148 auto Iter = vp_depth_first_deep(Plan.getEntry());
149 bool ScalarVFOnly = Plan.hasScalarVFOnly();
150 bool Changed = false;
151
153 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
154 VPBasicBlock *SinkTo, VPValue *Op) {
155 auto *Candidate =
156 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
157 if (!Candidate)
158 return;
159
160 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
161 // for now.
163 return;
164
165 if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
166 return;
167
168 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
169 if (!ScalarVFOnly && RepR->isSingleScalar())
170 return;
171
172 WorkList.insert({SinkTo, Candidate});
173 };
174
175 // First, collect the operands of all recipes in replicate blocks as seeds for
176 // sinking.
178 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
179 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
180 continue;
181 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
182 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
183 continue;
184 for (auto &Recipe : *VPBB)
185 for (VPValue *Op : Recipe.operands())
186 InsertIfValidSinkCandidate(VPBB, Op);
187 }
188
189 // Try to sink each replicate or scalar IV steps recipe in the worklist.
190 for (unsigned I = 0; I != WorkList.size(); ++I) {
191 VPBasicBlock *SinkTo;
192 VPSingleDefRecipe *SinkCandidate;
193 std::tie(SinkTo, SinkCandidate) = WorkList[I];
194
195 // All recipe users of SinkCandidate must be in the same block SinkTo or all
196 // users outside of SinkTo must only use the first lane of SinkCandidate. In
197 // the latter case, we need to duplicate SinkCandidate.
198 auto UsersOutsideSinkTo =
199 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
200 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
201 });
202 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
203 return !U->usesFirstLaneOnly(SinkCandidate);
204 }))
205 continue;
206 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
207
208 if (NeedsDuplicating) {
209 if (ScalarVFOnly)
210 continue;
211 VPSingleDefRecipe *Clone;
212 if (auto *SinkCandidateRepR =
213 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
214 // TODO: Handle converting to uniform recipes as separate transform,
215 // then cloning should be sufficient here.
216 Instruction *I = SinkCandidate->getUnderlyingInstr();
217 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
218 nullptr /*Mask*/, *SinkCandidateRepR);
219 // TODO: add ".cloned" suffix to name of Clone's VPValue.
220 } else {
221 Clone = SinkCandidate->clone();
222 }
223
224 Clone->insertBefore(SinkCandidate);
225 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
226 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
227 });
228 }
229 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
230 for (VPValue *Op : SinkCandidate->operands())
231 InsertIfValidSinkCandidate(SinkTo, Op);
232 Changed = true;
233 }
234 return Changed;
235}
236
237/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
238/// the mask.
240 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
241 if (!EntryBB || EntryBB->size() != 1 ||
242 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
243 return nullptr;
244
245 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
246}
247
248/// If \p R is a triangle region, return the 'then' block of the triangle.
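/// A triangle region has the following shape (illustrative sketch):
///
///       entry
///       /   \
///    then    |
///       \    |
///      exiting
///
/// i.e. one successor of the entry block (the 'then') has the entry's other
/// successor as its single successor.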
250 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
251 if (EntryBB->getNumSuccessors() != 2)
252 return nullptr;
253
254 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
255 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
256 if (!Succ0 || !Succ1)
257 return nullptr;
258
259 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
260 return nullptr;
261 if (Succ0->getSingleSuccessor() == Succ1)
262 return Succ0;
263 if (Succ1->getSingleSuccessor() == Succ0)
264 return Succ1;
265 return nullptr;
266}
267
268// Merge replicate regions in their successor region, if a replicate region
269// is connected to a successor replicate region with the same predicate by a
270// single, empty VPBasicBlock.
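// For example (illustrative): a pred.load region, followed by an empty
// VPBasicBlock, followed by a pred.store region guarded by the same mask can
// be merged, so that the load and store recipes end up in a single replicate
// region.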
272 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
273
274 // Collect replicate regions followed by an empty block, followed by another
275 // replicate region with matching masks to process front. This is to avoid
276 // iterator invalidation issues while merging regions.
279 vp_depth_first_deep(Plan.getEntry()))) {
280 if (!Region1->isReplicator())
281 continue;
282 auto *MiddleBasicBlock =
283 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
284 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
285 continue;
286
287 auto *Region2 =
288 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
289 if (!Region2 || !Region2->isReplicator())
290 continue;
291
292 VPValue *Mask1 = getPredicatedMask(Region1);
293 VPValue *Mask2 = getPredicatedMask(Region2);
294 if (!Mask1 || Mask1 != Mask2)
295 continue;
296
297 assert(Mask1 && Mask2 && "both regions must have conditions");
298 WorkList.push_back(Region1);
299 }
300
301 // Move recipes from Region1 to its successor region, if both are triangles.
302 for (VPRegionBlock *Region1 : WorkList) {
303 if (TransformedRegions.contains(Region1))
304 continue;
305 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
306 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
307
308 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
309 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
310 if (!Then1 || !Then2)
311 continue;
312
313 // Note: No fusion-preventing memory dependencies are expected in either
314 // region. Such dependencies should be rejected during earlier dependence
315 // checks, which guarantee accesses can be re-ordered for vectorization.
316 //
317 // Move recipes to the successor region.
318 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
319 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
320
321 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
322 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
323
324 // Move VPPredInstPHIRecipes from the merge block to the successor region's
325 // merge block. Update all users inside the successor region to use the
326 // original values.
327 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
328 VPValue *PredInst1 =
329 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
330 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
331 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
332 return cast<VPRecipeBase>(&U)->getParent() == Then2;
333 });
334
335 // Remove phi recipes that are unused after merging the regions.
336 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
337 Phi1ToMove.eraseFromParent();
338 continue;
339 }
340 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
341 }
342
343 // Remove the dead recipes in Region1's entry block.
344 for (VPRecipeBase &R :
345 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
346 R.eraseFromParent();
347
348 // Finally, remove the first region.
349 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
350 VPBlockUtils::disconnectBlocks(Pred, Region1);
351 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
352 }
353 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
354 TransformedRegions.insert(Region1);
355 }
356
357 return !TransformedRegions.empty();
358}
359
361 VPlan &Plan) {
362 Instruction *Instr = PredRecipe->getUnderlyingInstr();
363 // Build the triangular if-then region.
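  // The region built below has the following shape (illustrative sketch):
  //   pred.<opcode>.entry:    BranchOnMask <mask>
  //   pred.<opcode>.if:       unmasked replicate of the instruction
  //   pred.<opcode>.continue: optional VPPredInstPHI merging the result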
364 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
365 assert(Instr->getParent() && "Predicated instruction not in any basic block");
366 auto *BlockInMask = PredRecipe->getMask();
367 auto *MaskDef = BlockInMask->getDefiningRecipe();
368 auto *BOMRecipe = new VPBranchOnMaskRecipe(
369 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
370 auto *Entry =
371 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
372
373 // Replace predicated replicate recipe with a replicate recipe without a
374 // mask but in the replicate region.
375 auto *RecipeWithoutMask = new VPReplicateRecipe(
376 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
377 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe);
378 auto *Pred =
379 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
380
381 VPPredInstPHIRecipe *PHIRecipe = nullptr;
382 if (PredRecipe->getNumUsers() != 0) {
383 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
384 RecipeWithoutMask->getDebugLoc());
385 PredRecipe->replaceAllUsesWith(PHIRecipe);
386 PHIRecipe->setOperand(0, RecipeWithoutMask);
387 }
388 PredRecipe->eraseFromParent();
389 auto *Exiting =
390 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
392 Plan.createReplicateRegion(Entry, Exiting, RegionName);
393
394 // Note: first set Entry as region entry and then connect successors starting
395 // from it in order, to propagate the "parent" of each VPBasicBlock.
396 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
397 VPBlockUtils::connectBlocks(Pred, Exiting);
398
399 return Region;
400}
401
402static void addReplicateRegions(VPlan &Plan) {
405 vp_depth_first_deep(Plan.getEntry()))) {
406 for (VPRecipeBase &R : *VPBB)
407 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
408 if (RepR->isPredicated())
409 WorkList.push_back(RepR);
410 }
411 }
412
413 unsigned BBNum = 0;
414 for (VPReplicateRecipe *RepR : WorkList) {
415 VPBasicBlock *CurrentBlock = RepR->getParent();
416 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
417
418 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
419 SplitBlock->setName(
420 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
421 // Record predicated instructions for above packing optimizations.
423 Region->setParent(CurrentBlock->getParent());
425
426 VPRegionBlock *ParentRegion = Region->getParent();
427 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
428 ParentRegion->setExiting(SplitBlock);
429 }
430}
431
432/// Remove redundant VPBasicBlocks by merging them into their predecessor if
433/// the predecessor has a single successor.
437 vp_depth_first_deep(Plan.getEntry()))) {
438 // Don't fold the blocks in the skeleton of the Plan into their single
439 // predecessors for now.
440 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
441 if (!VPBB->getParent())
442 continue;
443 auto *PredVPBB =
444 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
445 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
446 isa<VPIRBasicBlock>(PredVPBB))
447 continue;
448 WorkList.push_back(VPBB);
449 }
450
451 for (VPBasicBlock *VPBB : WorkList) {
452 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
453 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
454 R.moveBefore(*PredVPBB, PredVPBB->end());
455 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
456 auto *ParentRegion = VPBB->getParent();
457 if (ParentRegion && ParentRegion->getExiting() == VPBB)
458 ParentRegion->setExiting(PredVPBB);
459 for (auto *Succ : to_vector(VPBB->successors())) {
461 VPBlockUtils::connectBlocks(PredVPBB, Succ);
462 }
463 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
464 }
465 return !WorkList.empty();
466}
467
469 // Convert masked VPReplicateRecipes to if-then region blocks.
471
472 bool ShouldSimplify = true;
473 while (ShouldSimplify) {
474 ShouldSimplify = sinkScalarOperands(Plan);
475 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
476 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
477 }
478}
479
480/// Remove redundant casts of inductions.
481///
482/// Such redundant casts are casts of induction variables that can be ignored,
483/// because we already proved that the casted phi is equal to the uncasted phi
484/// in the vectorized loop. There is no need to vectorize the cast - the same
485/// value can be used for both the phi and casts in the vector loop.
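/// For example (illustrative IR):
///   %iv   = phi i32 [ 0, %ph ], [ %iv.next, %loop ]
///   %cast = zext i32 %iv to i64
/// If the induction descriptor recorded %cast as a redundant cast of %iv, the
/// recipe modeling %cast is simply replaced by the widened induction itself.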
487 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
489 if (!IV || IV->getTruncInst())
490 continue;
491
492 // A sequence of IR Casts has potentially been recorded for IV, which
493 // *must be bypassed* when the IV is vectorized, because the vectorized IV
494 // will produce the desired casted value. This sequence forms a def-use
495 // chain and is provided in reverse order, ending with the cast that uses
496 // the IV phi. Search for the recipe of the last cast in the chain and
497 // replace it with the original IV. Note that only the final cast is
498 // expected to have users outside the cast-chain and the dead casts left
499 // over will be cleaned up later.
500 auto &Casts = IV->getInductionDescriptor().getCastInsts();
501 VPValue *FindMyCast = IV;
502 for (Instruction *IRCast : reverse(Casts)) {
503 VPSingleDefRecipe *FoundUserCast = nullptr;
504 for (auto *U : FindMyCast->users()) {
505 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
506 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
507 FoundUserCast = UserCast;
508 break;
509 }
510 }
511 FindMyCast = FoundUserCast;
512 }
513 FindMyCast->replaceAllUsesWith(IV);
514 }
515}
516
517/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
518/// recipe, if it exists.
520 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
521 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
522 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
523 for (VPUser *U : CanonicalIV->users()) {
525 if (WidenNewIV)
526 break;
527 }
528
529 if (!WidenNewIV)
530 return;
531
532 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
533 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
534 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
535
536 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
537 continue;
538
539 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
540 // everything WidenNewIV's users need. That is, WidenOriginalIV will
541 // generate a vector phi or all users of WidenNewIV demand the first lane
542 // only.
543 if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
544 vputils::onlyFirstLaneUsed(WidenNewIV)) {
545 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
546 WidenNewIV->eraseFromParent();
547 return;
548 }
549 }
550}
551
552/// Returns true if \p R is dead and can be removed.
553static bool isDeadRecipe(VPRecipeBase &R) {
554 // Do remove conditional assume instructions as their conditions may be
555 // flattened.
556 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
557 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
559 if (IsConditionalAssume)
560 return true;
561
562 if (R.mayHaveSideEffects())
563 return false;
564
565 // Recipe is dead if no user keeps the recipe alive.
566 return all_of(R.definedValues(),
567 [](VPValue *V) { return V->getNumUsers() == 0; });
568}
569
572 vp_post_order_deep(Plan.getEntry()))) {
573 // The recipes in the block are processed in reverse order, to catch chains
574 // of dead recipes.
575 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
576 if (isDeadRecipe(R)) {
577 R.eraseFromParent();
578 continue;
579 }
580
581 // Check if R is a dead VPPhi <-> update cycle and remove it.
582 auto *PhiR = dyn_cast<VPPhi>(&R);
583 if (!PhiR || PhiR->getNumOperands() != 2)
584 continue;
585 VPUser *PhiUser = PhiR->getSingleUser();
586 if (!PhiUser)
587 continue;
588 VPValue *Incoming = PhiR->getOperand(1);
589 if (PhiUser != Incoming->getDefiningRecipe() ||
590 Incoming->getNumUsers() != 1)
591 continue;
592 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
593 PhiR->eraseFromParent();
594 Incoming->getDefiningRecipe()->eraseFromParent();
595 }
596 }
597}
598
601 Instruction::BinaryOps InductionOpcode,
602 FPMathOperator *FPBinOp, Instruction *TruncI,
603 VPValue *StartV, VPValue *Step, DebugLoc DL,
604 VPBuilder &Builder) {
605 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
606 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
607 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
608 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
609 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
610
611 // Truncate base induction if needed.
612 VPTypeAnalysis TypeInfo(Plan);
613 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
614 if (TruncI) {
615 Type *TruncTy = TruncI->getType();
616 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
617 "Not truncating.");
618 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
619 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
620 ResultTy = TruncTy;
621 }
622
623 // Truncate step if needed.
624 Type *StepTy = TypeInfo.inferScalarType(Step);
625 if (ResultTy != StepTy) {
626 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
627 "Not truncating.");
628 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
629 auto *VecPreheader =
631 VPBuilder::InsertPointGuard Guard(Builder);
632 Builder.setInsertPoint(VecPreheader);
633 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
634 }
635 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
636 &Plan.getVF(), DL);
637}
638
641 for (unsigned I = 0; I != Users.size(); ++I) {
643 if (isa<VPHeaderPHIRecipe>(Cur))
644 continue;
645 for (VPValue *V : Cur->definedValues())
646 Users.insert_range(V->users());
647 }
648 return Users.takeVector();
649}
650
651/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
652/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
653/// VPWidenPointerInductionRecipe will generate vectors only. If some users
654/// require vectors while others require scalars, the scalar uses need to extract
655/// the scalars from the generated vectors (note that this is different from how
656/// int/fp inductions are handled). Legalize extract-from-ends using uniform
657/// VPReplicateRecipes of wide inductions to use regular VPReplicateRecipes, so
658/// the correct end value is available. Also optimize
659/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
660/// providing them scalar steps built on the canonical scalar IV and updating the
661/// original IV's users. This is an optional optimization to reduce the need for
662/// vector extracts.
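/// For example (illustrative VPlan notation): a pointer induction with only
/// scalar users, such as
///   WIDEN-POINTER-INDUCTION %ptr.iv = %start, %step
/// is replaced by
///   SCALAR-STEPS %steps = 0, %step
///   EMIT %next.gep = ptradd %start, %steps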
665 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
666 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
667 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
668 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
669 if (!PhiR)
670 continue;
671
672 // Try to narrow wide and replicating recipes to uniform recipes, based on
673 // VPlan analysis.
674 // TODO: Apply to all recipes in the future, to replace legacy uniformity
675 // analysis.
676 auto Users = collectUsersRecursively(PhiR);
677 for (VPUser *U : reverse(Users)) {
678 auto *Def = dyn_cast<VPSingleDefRecipe>(U);
679 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
680 // Skip recipes that shouldn't be narrowed.
681 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
682 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
683 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
684 continue;
685
686 // Skip recipes for which lanes other than the first may be used.
688 continue;
689
690 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
691 Def->operands(), /*IsUniform*/ true);
692 Clone->insertAfter(Def);
693 Def->replaceAllUsesWith(Clone);
694 }
695
696 // Replace wide pointer inductions which have only their scalars used by
697 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
698 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
699 if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
700 continue;
701
702 const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
703 VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0);
704 VPValue *StepV = PtrIV->getOperand(1);
706 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
707 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
708
709 VPValue *PtrAdd = Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
710 PtrIV->getDebugLoc(), "next.gep");
711
712 PtrIV->replaceAllUsesWith(PtrAdd);
713 continue;
714 }
715
716 // Replace widened induction with scalar steps for users that only use
717 // scalars.
718 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
719 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
720 return U->usesScalars(WideIV);
721 }))
722 continue;
723
724 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
726 Plan, ID.getKind(), ID.getInductionOpcode(),
727 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
728 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
729 WideIV->getDebugLoc(), Builder);
730
731 // Update scalar users of IV to use Step instead.
732 if (!HasOnlyVectorVFs)
733 WideIV->replaceAllUsesWith(Steps);
734 else
735 WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
736 return U.usesScalars(WideIV);
737 });
738 }
739}
740
741/// Check if \p VPV is an untruncated wide induction, either before or after the
742/// increment. If so return the header IV (before the increment), otherwise
743/// return null.
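/// For example (illustrative): given a wide induction %wide.iv and its
/// increment
///   %wide.iv.next = add %wide.iv, %step
/// both %wide.iv and %wide.iv.next are optimizable, and in both cases the
/// header phi %wide.iv is returned.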
745 ScalarEvolution &SE) {
746 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
747 if (WideIV) {
748 // VPV itself is a wide induction, separately compute the end value for exit
749 // users if it is not a truncated IV.
750 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
751 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
752 }
753
754 // Check if VPV is an optimizable induction increment.
755 VPRecipeBase *Def = VPV->getDefiningRecipe();
756 if (!Def || Def->getNumOperands() != 2)
757 return nullptr;
758 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
759 if (!WideIV)
760 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
761 if (!WideIV)
762 return nullptr;
763
764 auto IsWideIVInc = [&]() {
765 auto &ID = WideIV->getInductionDescriptor();
766
767 // Check if VPV increments the induction by the induction step.
768 VPValue *IVStep = WideIV->getStepValue();
769 switch (ID.getInductionOpcode()) {
770 case Instruction::Add:
771 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
772 case Instruction::FAdd:
774 m_Specific(IVStep)));
775 case Instruction::FSub:
776 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
777 m_Specific(IVStep)));
778 case Instruction::Sub: {
779 // IVStep will be the negated step of the subtraction. Check if Step == -1
780 // * IVStep.
781 VPValue *Step;
782 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
783 return false;
784 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, SE);
785 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, SE);
786 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
787 !isa<SCEVCouldNotCompute>(StepSCEV) &&
788 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
789 }
790 default:
791 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
792 match(VPV, m_GetElementPtr(m_Specific(WideIV),
793 m_Specific(WideIV->getStepValue())));
794 }
795 llvm_unreachable("should have been covered by switch above");
796 };
797 return IsWideIVInc() ? WideIV : nullptr;
798}
799
800/// Attempts to optimize the induction variable exit values for users in the
801/// early exit block.
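/// For example (illustrative): an exit value of the form
///   extract-lane (first-active-lane %mask), %wide.iv
/// is rewritten to canonical-iv + first-active-lane(%mask) (plus one if the
/// incoming value is the incremented IV), feeding a derived IV when %wide.iv
/// is not the canonical induction.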
803 VPTypeAnalysis &TypeInfo,
804 VPBlockBase *PredVPBB,
805 VPValue *Op,
806 ScalarEvolution &SE) {
807 VPValue *Incoming, *Mask;
810 return nullptr;
811
812 auto *WideIV = getOptimizableIVOf(Incoming, SE);
813 if (!WideIV)
814 return nullptr;
815
816 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
817 if (WideIntOrFp && WideIntOrFp->getTruncInst())
818 return nullptr;
819
820 // Calculate the final index.
821 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
822 auto *CanonicalIV = LoopRegion->getCanonicalIV();
823 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
824 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
825
826 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
827 VPValue *FirstActiveLane =
828 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
829 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
830 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
831 FirstActiveLaneType, DL);
832 VPValue *EndValue =
833 B.createNaryOp(Instruction::Add, {CanonicalIV, FirstActiveLane}, DL);
834
835 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if
836 // Incoming differs from it, the exit is using the incremented value and we
837 // need to add the step.
838 if (Incoming != WideIV) {
839 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
840 EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
841 }
842
843 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
844 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
845 VPValue *Start = WideIV->getStartValue();
846 VPValue *Step = WideIV->getStepValue();
847 EndValue = B.createDerivedIV(
848 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
849 Start, EndValue, Step);
850 }
851
852 return EndValue;
853}
854
855/// Attempts to optimize the induction variable exit values for users in the
856/// exit block coming from the latch in the original scalar loop.
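/// For example (illustrative): if the scalar loop exits with the
/// pre-incremented IV, the pre-computed end value is adjusted by subtracting
/// one step (or the equivalent pointer/FP adjustment).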
858 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
862 return nullptr;
863
864 auto *WideIV = getOptimizableIVOf(Incoming, SE);
865 if (!WideIV)
866 return nullptr;
867
868 VPValue *EndValue = EndValues.lookup(WideIV);
869 assert(EndValue && "end value must have been pre-computed");
870
871 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if
872 // Incoming differs from it, the exit is using the incremented value and we
873 // don't need to subtract the step.
874 if (Incoming != WideIV)
875 return EndValue;
876
877 // Otherwise, subtract the step from the EndValue.
878 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
879 VPValue *Step = WideIV->getStepValue();
880 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
881 if (ScalarTy->isIntegerTy())
882 return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
883 if (ScalarTy->isPointerTy()) {
884 Type *StepTy = TypeInfo.inferScalarType(Step);
885 auto *Zero = Plan.getConstantInt(StepTy, 0);
886 return B.createPtrAdd(EndValue,
887 B.createNaryOp(Instruction::Sub, {Zero, Step}),
888 DebugLoc::getUnknown(), "ind.escape");
889 }
890 if (ScalarTy->isFloatingPointTy()) {
891 const auto &ID = WideIV->getInductionDescriptor();
892 return B.createNaryOp(
893 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
894 ? Instruction::FSub
895 : Instruction::FAdd,
896 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
897 }
898 llvm_unreachable("all possible induction types must be handled");
899 return nullptr;
900}
901
903 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
904 ScalarEvolution &SE) {
905 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
906 VPTypeAnalysis TypeInfo(Plan);
907 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
908 for (VPRecipeBase &R : ExitVPBB->phis()) {
909 auto *ExitIRI = cast<VPIRPhi>(&R);
910
911 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
912 VPValue *Escape = nullptr;
913 if (PredVPBB == MiddleVPBB)
914 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
915 ExitIRI->getOperand(Idx),
916 EndValues, SE);
917 else
918 Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
919 ExitIRI->getOperand(Idx), SE);
920 if (Escape)
921 ExitIRI->setOperand(Idx, Escape);
922 }
923 }
924 }
925}
926
927/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
928/// them with already existing recipes expanding the same SCEV expression.
931
932 for (VPRecipeBase &R :
934 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
935 if (!ExpR)
936 continue;
937
938 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
939 if (Inserted)
940 continue;
941 ExpR->replaceAllUsesWith(V->second);
942 ExpR->eraseFromParent();
943 }
944}
945
947 SmallVector<VPValue *> WorkList;
949 WorkList.push_back(V);
950
951 while (!WorkList.empty()) {
952 VPValue *Cur = WorkList.pop_back_val();
953 if (!Seen.insert(Cur).second)
954 continue;
956 if (!R)
957 continue;
958 if (!isDeadRecipe(*R))
959 continue;
960 append_range(WorkList, R->operands());
961 R->eraseFromParent();
962 }
963}
964
965/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
966/// Returns an optional pair, where the first element indicates whether it is
967/// an intrinsic ID.
968static std::optional<std::pair<bool, unsigned>>
970 return TypeSwitch<const VPSingleDefRecipe *,
971 std::optional<std::pair<bool, unsigned>>>(R)
974 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
975 .Case<VPWidenIntrinsicRecipe>([](auto *I) {
976 return std::make_pair(true, I->getVectorIntrinsicID());
977 })
978 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
979 // For recipes that do not directly map to LLVM IR instructions,
980 // assign opcodes after the last VPInstruction opcode (which is also
981 // after the last IR Instruction opcode), based on the VPDefID.
982 return std::make_pair(false,
983 VPInstruction::OpsEnd + 1 + I->getVPDefID());
984 })
985 .Default([](auto *) { return std::nullopt; });
986}
987
988/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
989/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
990/// Operands are foldable live-ins.
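/// For example (illustrative): an add of constant live-ins i32 2 and i32 3
/// folds to the live-in i32 5, and an icmp of equal constant live-ins folds to
/// the live-in i1 true.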
992 ArrayRef<VPValue *> Operands,
993 const DataLayout &DL,
994 VPTypeAnalysis &TypeInfo) {
995 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
996 if (!OpcodeOrIID)
997 return nullptr;
998
1000 for (VPValue *Op : Operands) {
1001 if (!Op->isLiveIn() || !Op->getLiveInIRValue())
1002 return nullptr;
1003 Ops.push_back(Op->getLiveInIRValue());
1004 }
1005
1006 auto FoldToIRValue = [&]() -> Value * {
1007 InstSimplifyFolder Folder(DL);
1008 if (OpcodeOrIID->first) {
1009 if (R.getNumOperands() != 2)
1010 return nullptr;
1011 unsigned ID = OpcodeOrIID->second;
1012 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1013 TypeInfo.inferScalarType(&R));
1014 }
1015 unsigned Opcode = OpcodeOrIID->second;
1016 if (Instruction::isBinaryOp(Opcode))
1017 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1018 Ops[0], Ops[1]);
1019 if (Instruction::isCast(Opcode))
1020 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1021 TypeInfo.inferScalarType(R.getVPSingleValue()));
1022 switch (Opcode) {
1024 return Folder.FoldSelect(Ops[0], Ops[1],
1026 case VPInstruction::Not:
1027 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1029 case Instruction::Select:
1030 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1031 case Instruction::ICmp:
1032 case Instruction::FCmp:
1033 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1034 Ops[1]);
1035 case Instruction::GetElementPtr: {
1036 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1037 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1038 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1039 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1040 }
1043 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1044 Ops[0], Ops[1],
1045 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1046 // An extract of a live-in is an extract of a broadcast, so return the
1047 // broadcasted element.
1048 case Instruction::ExtractElement:
1049 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1050 return Ops[0];
1051 }
1052 return nullptr;
1053 };
1054
1055 if (Value *V = FoldToIRValue())
1056 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1057 return nullptr;
1058}
1059
1060/// Try to simplify VPSingleDefRecipe \p Def.
1062 VPlan *Plan = Def->getParent()->getPlan();
1063
1064 // Simplification of live-in IR values for SingleDef recipes using
1065 // InstSimplifyFolder.
1066 const DataLayout &DL =
1068 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1069 return Def->replaceAllUsesWith(V);
1070
1071 // Fold PredPHI LiveIn -> LiveIn.
1072 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1073 VPValue *Op = PredPHI->getOperand(0);
1074 if (Op->isLiveIn())
1075 PredPHI->replaceAllUsesWith(Op);
1076 }
1077
1078 VPBuilder Builder(Def);
1079 VPValue *A;
1080 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1081 Type *TruncTy = TypeInfo.inferScalarType(Def);
1082 Type *ATy = TypeInfo.inferScalarType(A);
1083 if (TruncTy == ATy) {
1084 Def->replaceAllUsesWith(A);
1085 } else {
1086 // Don't replace a scalarizing recipe with a widened cast.
1087 if (isa<VPReplicateRecipe>(Def))
1088 return;
1089 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1090
1091 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1092 ? Instruction::SExt
1093 : Instruction::ZExt;
1094 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1095 TruncTy);
1096 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1097 // UnderlyingExt has a distinct return type, used to retain the legacy cost.
1098 Ext->setUnderlyingValue(UnderlyingExt);
1099 }
1100 Def->replaceAllUsesWith(Ext);
1101 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1102 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1103 Def->replaceAllUsesWith(Trunc);
1104 }
1105 }
1106#ifndef NDEBUG
1107 // Verify that the cached type info for both A and its users is still
1108 // accurate by comparing it to freshly computed types.
1109 VPTypeAnalysis TypeInfo2(*Plan);
1110 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1111 for (VPUser *U : A->users()) {
1112 auto *R = cast<VPRecipeBase>(U);
1113 for (VPValue *VPV : R->definedValues())
1114 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1115 }
1116#endif
1117 }
1118
1119 // Simplify (X && Y) || (X && !Y) -> X.
1120 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1121 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1122 // recipes to be visited during simplification.
1123 VPValue *X, *Y, *Z;
1124 if (match(Def,
1127 Def->replaceAllUsesWith(X);
1128 Def->eraseFromParent();
1129 return;
1130 }
1131
1132 // x | 1 -> 1
1133 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1134 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1135
1136 // x | 0 -> x
1137 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1138 return Def->replaceAllUsesWith(X);
1139
1140 // x & 0 -> 0
1141 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1142 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1143
1144 // x && false -> false
1145 if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
1146 return Def->replaceAllUsesWith(Def->getOperand(1));
1147
1148 // (x && y) || (x && z) -> x && (y || z)
1151 // Simplify only if one of the operands has one use to avoid creating an
1152 // extra recipe.
1153 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1154 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1155 return Def->replaceAllUsesWith(
1156 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1157
1158 // x && !x -> 0
1160 return Def->replaceAllUsesWith(Plan->getFalse());
1161
1162 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1163 return Def->replaceAllUsesWith(X);
1164
1165 // select !c, x, y -> select c, y, x
1166 VPValue *C;
1167 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1168 Def->setOperand(0, C);
1169 Def->setOperand(1, Y);
1170 Def->setOperand(2, X);
1171 return;
1172 }
1173
1174 // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
1175 // tail folding it is likely that x is a header mask and can be simplified
1176 // further.
1178 m_VPValue(Z))) &&
1179 X->hasMoreThanOneUniqueUser())
1180 return Def->replaceAllUsesWith(
1181 Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
1182
1183 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1184 return Def->replaceAllUsesWith(A);
1185
1186 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1187 return Def->replaceAllUsesWith(
1188 Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0));
1189
1190 if (match(Def, m_Not(m_VPValue(A)))) {
1191 if (match(A, m_Not(m_VPValue(A))))
1192 return Def->replaceAllUsesWith(A);
1193
1194 // Try to fold Not into compares by adjusting the predicate in-place.
1195 CmpPredicate Pred;
1196 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1197 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1198 if (all_of(Cmp->users(),
1200 m_Not(m_Specific(Cmp)),
1201 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1202 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1203 for (VPUser *U : to_vector(Cmp->users())) {
1204 auto *R = cast<VPSingleDefRecipe>(U);
1205 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1206 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1207 R->setOperand(1, Y);
1208 R->setOperand(2, X);
1209 } else {
1210 // not (cmp pred) -> cmp inv_pred
1211 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1212 R->replaceAllUsesWith(Cmp);
1213 }
1214 }
1215 // If Cmp doesn't have a debug location, use the one from the negation,
1216 // to preserve the location.
1217 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1218 Cmp->setDebugLoc(Def->getDebugLoc());
1219 }
1220 }
1221 }
1222
1223 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1224 // This is useful for fmax/fmin without fast-math flags, where we need to
1225 // check if any operand is NaN.
1227 m_Deferred(X)),
1229 m_Deferred(Y))))) {
1230 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1231 return Def->replaceAllUsesWith(NewCmp);
1232 }
1233
1234 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1235 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1236 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1237 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1238 TypeInfo.inferScalarType(Def))
1239 return Def->replaceAllUsesWith(Def->getOperand(1));
1240
1242 m_One()))) {
1243 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1244 if (TypeInfo.inferScalarType(X) != WideStepTy)
1245 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1246 Def->replaceAllUsesWith(X);
1247 return;
1248 }
1249
1250 // For i1 vp.merges produced by AnyOf reductions:
1251 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1253 m_VPValue(X), m_VPValue())) &&
1255 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1256 Def->setOperand(1, Def->getOperand(0));
1257 Def->setOperand(0, Y);
1258 return;
1259 }
1260
1261 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1262 if (Phi->getOperand(0) == Phi->getOperand(1))
1263 Phi->replaceAllUsesWith(Phi->getOperand(0));
1264 return;
1265 }
1266
1267 // Look through ExtractLastElement (BuildVector ....).
1270 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1271 Def->replaceAllUsesWith(
1272 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1273 return;
1274 }
1275
1276 // Look through ExtractPenultimateElement (BuildVector ....).
1278 m_BuildVector()))) {
1279 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1280 Def->replaceAllUsesWith(
1281 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1282 return;
1283 }
1284
1285 uint64_t Idx;
1287 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1288 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1289 return;
1290 }
1291
1292 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1293 Def->replaceAllUsesWith(
1294 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1295 return;
1296 }
1297
1298 // Look through broadcasts of single scalars used as select conditions; in
1299 // that case the scalar condition can be used directly.
1300 if (match(Def,
1303 "broadcast operand must be single-scalar");
1304 Def->setOperand(0, C);
1305 return;
1306 }
1307
1308 if (auto *Phi = dyn_cast<VPPhi>(Def)) {
1309 if (Phi->getNumOperands() == 1)
1310 Phi->replaceAllUsesWith(Phi->getOperand(0));
1311 return;
1312 }
1313
1314 // Some simplifications can only be applied after unrolling. Perform them
1315 // below.
1316 if (!Plan->isUnrolled())
1317 return;
1318
1319 // Hoist an invariant increment Y of a phi X, by having X start at Y.
1320 if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && Y->isLiveIn() &&
1321 isa<VPPhi>(X)) {
1322 auto *Phi = cast<VPPhi>(X);
1323 if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
1324 Phi->getSingleUser() == Def) {
1325 Phi->setOperand(0, Y);
1326 Def->replaceAllUsesWith(Phi);
1327 return;
1328 }
1329 }
1330
1331 // VPVectorPointer for part 0 can be replaced by its start pointer.
1332 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Def)) {
1333 if (VecPtr->isFirstPart()) {
1334 VecPtr->replaceAllUsesWith(VecPtr->getOperand(0));
1335 return;
1336 }
1337 }
1338
1339 // VPScalarIVSteps for part 0 can be replaced by their start value, if only
1340 // the first lane is demanded.
1341 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1342 if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
1343 Steps->replaceAllUsesWith(Steps->getOperand(0));
1344 return;
1345 }
1346 }
1347 // Simplify redundant ReductionStartVector recipes after unrolling.
1348 VPValue *StartV;
1350 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1351 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1352 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1353 return PhiR && PhiR->isInLoop();
1354 });
1355 return;
1356 }
1357
1358 if (match(Def,
1361 Def->replaceAllUsesWith(A);
1362 return;
1363 }
1364
1369 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1370 all_of(A->users(),
1371 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1372 return Def->replaceAllUsesWith(A);
1373 }
1374
1375 if (Plan->getUF() == 1 &&
1377 return Def->replaceAllUsesWith(
1378 Builder.createNaryOp(VPInstruction::ExtractLastElement, {A}));
1379 }
1380}
1381
1384 Plan.getEntry());
1385 VPTypeAnalysis TypeInfo(Plan);
1387 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1388 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1389 simplifyRecipe(Def, TypeInfo);
1390 }
1391}
1392
1394 if (Plan.hasScalarVFOnly())
1395 return;
1396
1397 // Try to narrow wide and replicating recipes to single scalar recipes,
1398 // based on VPlan analysis. Only process blocks in the loop region for now,
1399 // without traversing into nested regions, as recipes in replicate regions
1400 // cannot be converted yet.
1403 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1405 continue;
1406 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1407 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1408 continue;
1409
1410 auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
1411 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1412 vputils::isSingleScalar(RepR->getOperand(1))) {
1413 auto *Clone = new VPReplicateRecipe(
1414 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1415 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/);
1416 Clone->insertBefore(RepOrWidenR);
1417 unsigned ExtractOpc =
1418 vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1))
1421 auto *Ext = new VPInstruction(ExtractOpc, {Clone->getOperand(0)});
1422 Ext->insertBefore(Clone);
1423 Clone->setOperand(0, Ext);
1424 RepR->eraseFromParent();
1425 continue;
1426 }
1427
1428 // Skip recipes that aren't single scalars or don't have only their
1429 // scalar results used. In the latter case, we would introduce extra
1430 // broadcasts.
1431 if (!vputils::isSingleScalar(RepOrWidenR) ||
1432 !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
1433 if (auto *Store = dyn_cast<VPWidenStoreRecipe>(U)) {
1434 // A VPWidenStore doesn't have users, and stores are always
1435 // profitable to widen: hence, permitting address and mask
1436 // operands as well as single-scalar stored values is an
1437 // important leaf condition. The assert must hold as we checked
1438 // the RepOrWidenR operand against vputils::isSingleScalar.
1439 assert(RepOrWidenR != Store->getStoredValue() ||
1440 vputils::isSingleScalar(Store->getStoredValue()));
1441 return true;
1442 }
1443
1444 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1445 unsigned Opcode = VPI->getOpcode();
1446 if (Opcode == VPInstruction::ExtractLastElement ||
1449 return true;
1450 }
1451
1452 return U->usesScalars(RepOrWidenR);
1453 }))
1454 continue;
1455
1456 auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
1457 RepOrWidenR->operands(),
1458 true /*IsSingleScalar*/);
1459 Clone->insertBefore(RepOrWidenR);
1460 RepOrWidenR->replaceAllUsesWith(Clone);
1461 if (isDeadRecipe(*RepOrWidenR))
1462 RepOrWidenR->eraseFromParent();
1463 }
1464 }
1465}
1466
1467/// Check whether all of \p Blend's masks share a common logically and'ed value
1468/// and, if so, remove it from the masks.
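/// For example (illustrative): masks (%m && %a), (%m && %b), (%m && %c) become
/// %a, %b, %c; the shared factor %m does not help distinguish between the
/// incoming values, so it can be dropped from each mask.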
1470 if (Blend->isNormalized())
1471 return;
1472 VPValue *CommonEdgeMask;
1473 if (!match(Blend->getMask(0),
1474 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1475 return;
1476 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1477 if (!match(Blend->getMask(I),
1478 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1479 return;
1480 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1481 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1482}
1483
1484/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1485/// to make sure the masks are simplified.
1486static void simplifyBlends(VPlan &Plan) {
1489 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1490 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1491 if (!Blend)
1492 continue;
1493
1494 removeCommonBlendMask(Blend);
1495
1496 // Try to remove redundant blend recipes.
1497 SmallPtrSet<VPValue *, 4> UniqueValues;
1498 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1499 UniqueValues.insert(Blend->getIncomingValue(0));
1500 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1501 if (!match(Blend->getMask(I), m_False()))
1502 UniqueValues.insert(Blend->getIncomingValue(I));
1503
1504 if (UniqueValues.size() == 1) {
1505 Blend->replaceAllUsesWith(*UniqueValues.begin());
1506 Blend->eraseFromParent();
1507 continue;
1508 }
1509
1510 if (Blend->isNormalized())
1511 continue;
1512
1513 // Normalize the blend so its first incoming value is used as the initial
1514 // value with the others blended into it.
1515
1516 unsigned StartIndex = 0;
1517 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1518 // If a value's mask is used only by the blend then it can be deadcoded.
1519 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1520 // that's used by multiple blends where it can be removed from them all.
1521 VPValue *Mask = Blend->getMask(I);
1522 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1523 StartIndex = I;
1524 break;
1525 }
1526 }
1527
1528 SmallVector<VPValue *, 4> OperandsWithMask;
1529 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1530
1531 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1532 if (I == StartIndex)
1533 continue;
1534 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1535 OperandsWithMask.push_back(Blend->getMask(I));
1536 }
1537
1538 auto *NewBlend =
1539 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1540 OperandsWithMask, Blend->getDebugLoc());
1541 NewBlend->insertBefore(&R);
1542
1543 VPValue *DeadMask = Blend->getMask(StartIndex);
1544 Blend->replaceAllUsesWith(NewBlend);
1545 Blend->eraseFromParent();
1547
1548 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1549 VPValue *NewMask;
1550 if (NewBlend->getNumOperands() == 3 &&
1551 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1552 VPValue *Inc0 = NewBlend->getOperand(0);
1553 VPValue *Inc1 = NewBlend->getOperand(1);
1554 VPValue *OldMask = NewBlend->getOperand(2);
1555 NewBlend->setOperand(0, Inc1);
1556 NewBlend->setOperand(1, Inc0);
1557 NewBlend->setOperand(2, NewMask);
1558 if (OldMask->getNumUsers() == 0)
1559 cast<VPInstruction>(OldMask)->eraseFromParent();
1560 }
1561 }
1562 }
1563}
1564
1565/// Optimize the width of vector induction variables in \p Plan based on a known
1566/// constant Trip Count, \p BestVF and \p BestUF.
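/// For example (illustrative): with a known trip count of 100 and VF x UF = 8,
/// the canonical wide IV and the truncated backedge-taken count fit in 8 bits,
/// so the IV can be narrowed to i8.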
1568 ElementCount BestVF,
1569 unsigned BestUF) {
1570 // Only proceed if we have not completely removed the vector region.
1571 if (!Plan.getVectorLoopRegion())
1572 return false;
1573
1574 const APInt *TC;
1575 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
1576 return false;
1577
1578 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1579 // and UF. Returns at least 8.
1580 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1581 APInt AlignedTC =
1584 APInt MaxVal = AlignedTC - 1;
1585 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1586 };
1587 unsigned NewBitWidth =
1588 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1589
1590 LLVMContext &Ctx = Plan.getContext();
1591 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1592
1593 bool MadeChange = false;
1594
1595 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1596 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1597 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1598
1599 // Currently only handle canonical IVs as it is trivial to replace the start
1600 // and stop values, and we currently only perform the optimization when the
1601 // IV has a single use.
1602 if (!WideIV || !WideIV->isCanonical() ||
1603 WideIV->hasMoreThanOneUniqueUser() ||
1604 NewIVTy == WideIV->getScalarType())
1605 continue;
1606
1607 // Currently only handle cases where the single user is a header-mask
1608 // comparison with the backedge-taken-count.
1609 VPUser *SingleUser = WideIV->getSingleUser();
1610 if (!SingleUser ||
1611 !match(SingleUser, m_ICmp(m_Specific(WideIV),
1614 continue;
1615
1616 // Update IV operands and comparison bound to use new narrower type.
1617 auto *NewStart = Plan.getConstantInt(NewIVTy, 0);
1618 WideIV->setStartValue(NewStart);
1619 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
1620 WideIV->setStepValue(NewStep);
1621
1622 auto *NewBTC = new VPWidenCastRecipe(
1623 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1624 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1625 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
1626 Cmp->setOperand(1, NewBTC);
1627
1628 MadeChange = true;
1629 }
1630
1631 return MadeChange;
1632}
1633
1634/// Return true if \p Cond is known to be true for given \p BestVF and \p
1635/// BestUF.
1637 ElementCount BestVF, unsigned BestUF,
1638 ScalarEvolution &SE) {
1640 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1641 &SE](VPValue *C) {
1642 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE);
1643 });
1644
1645 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
1647 m_Specific(CanIV->getBackedgeValue()),
1648 m_Specific(&Plan.getVectorTripCount()))))
1649 return false;
1650
1651 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1652 // count is not conveniently available as SCEV so far, so we compare directly
1653 // against the original trip count. This is stricter than necessary, as we
1654 // will only return true if the trip count == vector trip count.
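 // E.g. (illustrative): with an original trip count of 16, BestVF = 8 and
 // BestUF = 2, VF * UF equals the trip count, so the condition is known true.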
1655 const SCEV *VectorTripCount =
1657 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1658 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1659 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1660 "Trip count SCEV must be computable");
1661 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1662 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1663 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1664}
1665
1666/// Try to replace multiple active lane masks used for control flow with
1667/// a single, wide active lane mask instruction followed by multiple
1668/// extract subvector intrinsics. This applies to the active lane mask
1669/// instructions both in the loop and in the preheader.
1670/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1671/// new extracts from the first active lane mask, which has its last
1672/// operand (multiplier) set to UF.
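/// For example (an illustrative sketch, assuming VF = 4 and UF = 2):
///   %wide.alm = active-lane-mask %index, %tc, 2     ; 8-lane mask
///   %alm.0    = llvm.vector.extract %wide.alm, 0    ; lanes 0..3
///   %alm.1    = llvm.vector.extract %wide.alm, 4    ; lanes 4..7
/// The per-part active-lane-mask phis are then rewired to use %alm.0 and
/// %alm.1.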
1673static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
1674 unsigned UF) {
1675 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1676 return false;
1677
1678 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1679 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1680 auto *Term = &ExitingVPBB->back();
1681
1682 using namespace llvm::VPlanPatternMatch;
1684 m_VPValue(), m_VPValue(), m_VPValue())))))
1685 return false;
1686
1687 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1688 LLVMContext &Ctx = Plan.getContext();
1689
1690 auto ExtractFromALM = [&](VPInstruction *ALM,
1691 SmallVectorImpl<VPValue *> &Extracts) {
1692 DebugLoc DL = ALM->getDebugLoc();
1693 for (unsigned Part = 0; Part < UF; ++Part) {
1695 Ops.append({ALM, Plan.getOrAddLiveIn(
1696 ConstantInt::get(IntegerType::getInt64Ty(Ctx),
1697 VF.getKnownMinValue() * Part))});
1698 auto *Ext = new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1700 Extracts[Part] = Ext;
1701 Ext->insertAfter(ALM);
1702 }
1703 };
1704
1705 // Create a list of each active lane mask phi, ordered by unroll part.
1707 for (VPRecipeBase &R : Header->phis()) {
1709 if (!Phi)
1710 continue;
1711 VPValue *Index = nullptr;
1712 match(Phi->getBackedgeValue(),
1714 assert(Index && "Expected index from ActiveLaneMask instruction");
1715
1716 uint64_t Part;
1717 if (match(Index,
1719 m_VPValue(), m_ConstantInt(Part))))
1720 Phis[Part] = Phi;
1721 else
1722 // Anything other than a CanonicalIVIncrementForPart is part 0
1723 Phis[0] = Phi;
1724 }
1725
1726 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1727 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
1728
1729 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
1730 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
1731
1732 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
1733 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
1734 "Expected incoming values of Phi to be ActiveLaneMasks");
1735
1736 // When using wide lane masks, the get.active.lane.mask intrinsic returns a
1737 // mask of VF x UF elements; the multiplier UF is passed as its last operand.
1738 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
1739 EntryALM->setOperand(2, ALMMultiplier);
1740 LoopALM->setOperand(2, ALMMultiplier);
1741
1742 // Create UF x extract vectors and insert into preheader.
1743 SmallVector<VPValue *> EntryExtracts(UF);
1744 ExtractFromALM(EntryALM, EntryExtracts);
1745
1746 // Create UF x extract vectors and insert before the loop compare & branch,
1747 // updating the compare to use the first extract.
1748 SmallVector<VPValue *> LoopExtracts(UF);
1749 ExtractFromALM(LoopALM, LoopExtracts);
1750 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
1751 Not->setOperand(0, LoopExtracts[0]);
1752
1753 // Update the incoming values of active lane mask phis.
1754 for (unsigned Part = 0; Part < UF; ++Part) {
1755 Phis[Part]->setStartValue(EntryExtracts[Part]);
1756 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
1757 }
1758
1759 return true;
1760}
1761
1762/// Try to simplify the branch condition of \p Plan. This may restrict the
1763/// resulting plan to \p BestVF and \p BestUF.
1764static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
1765 unsigned BestUF,
1766 PredicatedScalarEvolution &PSE) {
1767 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1768 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1769 auto *Term = &ExitingVPBB->back();
1770 VPValue *Cond;
1771 ScalarEvolution &SE = *PSE.getSE();
1772 if (match(Term, m_BranchOnCount()) ||
1774 m_VPValue(), m_VPValue(), m_VPValue()))))) {
1775 // Try to simplify the branch condition if VectorTC <= VF * UF when the
1776 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
1777 const SCEV *VectorTripCount =
1779 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1780 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1781 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1782 "Trip count SCEV must be computable");
1783 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1784 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1785 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
1786 return false;
1787 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) {
1788 // For BranchOnCond, check if we can prove the condition to be true using VF
1789 // and UF.
1790 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE))
1791 return false;
1792 } else {
1793 return false;
1794 }
1795
1796 // The vector loop region only executes once. If possible, completely remove
1797 // the region, otherwise replace the terminator controlling the latch with
1798 // (BranchOnCond true).
1799 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
1800 // support for other non-canonical widen induction recipes (e.g.,
1801 // VPWidenPointerInductionRecipe).
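 // Illustrative sketch: a plan of the form
 //   vector.ph -> <x1> vector loop { vector.body } -> middle.block
 // whose body is known to execute exactly once is flattened to
 //   vector.ph -> vector.body -> middle.block
 // with header phis replaced by their start values (or by a step-vector for
 // canonical widened inductions).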
1802 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1803 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
1804 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
1805 return R->isCanonical();
1806 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
1807 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
1808 })) {
1809 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
1810 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
1811 VPBuilder Builder(Plan.getVectorPreheader());
1812 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
1813 R->getScalarType());
1814 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
1815 HeaderR.eraseFromParent();
1816 continue;
1817 }
1818 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
1819 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
1820 HeaderR.eraseFromParent();
1821 }
1822
1823 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
1824 VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
1825 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
1826 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
1827
1828 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
1829 B->setParent(nullptr);
1830
1831 VPBlockUtils::connectBlocks(Preheader, Header);
1832 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
1834 } else {
1835 // The vector region contains header phis for which we cannot remove the
1836 // loop region yet.
1837 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
1838 Term->getDebugLoc());
1839 ExitingVPBB->appendRecipe(BOC);
1840 }
1841
1842 Term->eraseFromParent();
1843
1844 return true;
1845}
1846
1847/// From the definition of llvm.experimental.get.vector.length,
1848/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
1852 vp_depth_first_deep(Plan.getEntry()))) {
1853 for (VPRecipeBase &R : *VPBB) {
1854 VPValue *AVL;
1855 if (!match(&R, m_EVL(m_VPValue(AVL))))
1856 continue;
1857
1858 ScalarEvolution &SE = *PSE.getSE();
1859 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, SE);
1860 if (isa<SCEVCouldNotCompute>(AVLSCEV))
1861 continue;
1862 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
1863 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
1864 continue;
1865
1867 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
1868 R.getDebugLoc());
1869 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
1870 return true;
1871 }
1872 }
1873 return false;
1874}
1875
1876void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
1877 unsigned BestUF,
1878 PredicatedScalarEvolution &PSE) {
1879 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
1880 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
1881
1882 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
1883 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
1884 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
1885 MadeChange |= simplifyKnownEVL(Plan, BestVF, PSE);
1886
1887 if (MadeChange) {
1888 Plan.setVF(BestVF);
1889 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
1890 }
1891}
1892
1893/// Sink users of \p FOR after the recipe defining the previous value \p
1894/// Previous of the recurrence. \returns true if all users of \p FOR could be
1895/// re-arranged as needed or false if it is not possible.
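/// For example (illustrative), given
///   %for  = FIRST-ORDER-RECURRENCE-PHI %init, %prev
///   %use  = add %for, 1
///   ...
///   %prev = load %ptr
/// the user %use is moved after %prev, so the splice combining %for and %prev
/// can later be inserted at a point where both values are available.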
1896static bool
1898 VPRecipeBase *Previous,
1899 VPDominatorTree &VPDT) {
1900 // Collect recipes that need sinking.
1903 Seen.insert(Previous);
1904 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
1905 // The previous value must not depend on the users of the recurrence phi; if
1906 // it does, FOR is not a fixed-order recurrence.
1907 if (SinkCandidate == Previous)
1908 return false;
1909
1910 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
1911 !Seen.insert(SinkCandidate).second ||
1912 VPDT.properlyDominates(Previous, SinkCandidate))
1913 return true;
1914
1915 if (cannotHoistOrSinkRecipe(*SinkCandidate))
1916 return false;
1917
1918 WorkList.push_back(SinkCandidate);
1919 return true;
1920 };
1921
1922 // Recursively sink users of FOR after Previous.
1923 WorkList.push_back(FOR);
1924 for (unsigned I = 0; I != WorkList.size(); ++I) {
1925 VPRecipeBase *Current = WorkList[I];
1926 assert(Current->getNumDefinedValues() == 1 &&
1927 "only recipes with a single defined value expected");
1928
1929 for (VPUser *User : Current->getVPSingleValue()->users()) {
1930 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
1931 return false;
1932 }
1933 }
1934
1935 // Keep recipes to sink ordered by dominance so earlier instructions are
1936 // processed first.
1937 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1938 return VPDT.properlyDominates(A, B);
1939 });
1940
1941 for (VPRecipeBase *SinkCandidate : WorkList) {
1942 if (SinkCandidate == FOR)
1943 continue;
1944
1945 SinkCandidate->moveAfter(Previous);
1946 Previous = SinkCandidate;
1947 }
1948 return true;
1949}
1950
1951/// Try to hoist \p Previous and its operands before all users of \p FOR.
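/// For example (illustrative), if the previous value %prev = add %x, %y is
/// defined after the first user of the recurrence phi, %prev (and, if needed,
/// the recipes defining %x and %y) is moved before that user instead of
/// sinking all users of the recurrence.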
1953 VPRecipeBase *Previous,
1954 VPDominatorTree &VPDT) {
1955 if (cannotHoistOrSinkRecipe(*Previous))
1956 return false;
1957
1958 // Collect recipes that need hoisting.
1959 SmallVector<VPRecipeBase *> HoistCandidates;
1961 VPRecipeBase *HoistPoint = nullptr;
1962 // Find the closest hoist point by looking at all users of FOR and selecting
1963 // the recipe dominating all other users.
1964 for (VPUser *U : FOR->users()) {
1965 auto *R = cast<VPRecipeBase>(U);
1966 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
1967 HoistPoint = R;
1968 }
1969 assert(all_of(FOR->users(),
1970 [&VPDT, HoistPoint](VPUser *U) {
1971 auto *R = cast<VPRecipeBase>(U);
1972 return HoistPoint == R ||
1973 VPDT.properlyDominates(HoistPoint, R);
1974 }) &&
1975 "HoistPoint must dominate all users of FOR");
1976
1977 auto NeedsHoisting = [HoistPoint, &VPDT,
1978 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
1979 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
1980 if (!HoistCandidate)
1981 return nullptr;
1982 VPRegionBlock *EnclosingLoopRegion =
1983 HoistCandidate->getParent()->getEnclosingLoopRegion();
1984 assert((!HoistCandidate->getRegion() ||
1985 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
1986 "CFG in VPlan should still be flat, without replicate regions");
1987 // Hoist candidate was already visited, no need to hoist.
1988 if (!Visited.insert(HoistCandidate).second)
1989 return nullptr;
1990
1991 // The candidate is defined outside the loop region or is a header phi; it
1992 // already dominates the users of FOR without hoisting.
1993 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
1994 return nullptr;
1995
1996 // If we reached a recipe that dominates HoistPoint, we don't need to
1997 // hoist the recipe.
1998 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
1999 return nullptr;
2000 return HoistCandidate;
2001 };
2002
2003 if (!NeedsHoisting(Previous->getVPSingleValue()))
2004 return true;
2005
2006 // Recursively try to hoist Previous and its operands before all users of FOR.
2007 HoistCandidates.push_back(Previous);
2008
2009 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2010 VPRecipeBase *Current = HoistCandidates[I];
2011 assert(Current->getNumDefinedValues() == 1 &&
2012 "only recipes with a single defined value expected");
2013 if (cannotHoistOrSinkRecipe(*Current))
2014 return false;
2015
2016 for (VPValue *Op : Current->operands()) {
2017 // If we reach FOR, it means the original Previous depends on some other
2018 // recurrence that in turn depends on FOR. If that is the case, we would
2019 // also need to hoist recipes involving the other FOR, which may break
2020 // dependencies.
2021 if (Op == FOR)
2022 return false;
2023
2024 if (auto *R = NeedsHoisting(Op))
2025 HoistCandidates.push_back(R);
2026 }
2027 }
2028
2029 // Order recipes to hoist by dominance so earlier instructions are processed
2030 // first.
2031 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2032 return VPDT.properlyDominates(A, B);
2033 });
2034
2035 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2036 HoistCandidate->moveBefore(*HoistPoint->getParent(),
2037 HoistPoint->getIterator());
2038 }
2039
2040 return true;
2041}
2042
2043bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
2044 VPBuilder &LoopBuilder) {
2045 VPDominatorTree VPDT(Plan);
2046
2048 for (VPRecipeBase &R :
2051 RecurrencePhis.push_back(FOR);
2052
2053 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2055 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2056 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2057 // to terminate.
2058 while (auto *PrevPhi =
2060 assert(PrevPhi->getParent() == FOR->getParent());
2061 assert(SeenPhis.insert(PrevPhi).second);
2062 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2063 }
2064
2065 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2066 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2067 return false;
2068
2069 // Introduce a recipe to combine the incoming and previous values of a
2070 // fixed-order recurrence.
2071 VPBasicBlock *InsertBlock = Previous->getParent();
2072 if (isa<VPHeaderPHIRecipe>(Previous))
2073 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2074 else
2075 LoopBuilder.setInsertPoint(InsertBlock,
2076 std::next(Previous->getIterator()));
2077
2078 auto *RecurSplice =
2080 {FOR, FOR->getBackedgeValue()});
2081
2082 FOR->replaceAllUsesWith(RecurSplice);
2083 // Set the first operand of RecurSplice to FOR again, after replacing
2084 // all users.
2085 RecurSplice->setOperand(0, FOR);
2086 }
2087 return true;
2088}
2089
2091 for (VPRecipeBase &R :
2093 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2094 if (!PhiR)
2095 continue;
2096 RecurKind RK = PhiR->getRecurrenceKind();
2097 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2099 continue;
2100
2101 for (VPUser *U : collectUsersRecursively(PhiR))
2102 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2103 RecWithFlags->dropPoisonGeneratingFlags();
2104 }
2105 }
2106}
2107
2108namespace {
2109struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2110 static bool isSentinel(const VPSingleDefRecipe *Def) {
2111 return Def == getEmptyKey() || Def == getTombstoneKey();
2112 }
2113
2114 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2115 /// return that source element type.
2116 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2117 // All VPInstructions that lower to GEPs must have the i8 source element
2118 // type (as they are PtrAdds), so we omit it.
2120 .Case<VPReplicateRecipe>([](auto *I) -> Type * {
2121 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2122 return GEP->getSourceElementType();
2123 return nullptr;
2124 })
2125 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2126 [](auto *I) { return I->getSourceElementType(); })
2127 .Default([](auto *) { return nullptr; });
2128 }
2129
2130 /// Returns true if recipe \p Def can be safely handled for CSE.
2131 static bool canHandle(const VPSingleDefRecipe *Def) {
2132 // We can extend the list of handled recipes in the future,
2133 // provided we account for the data embedded in them while checking for
2134 // equality or hashing.
2135 auto C = getOpcodeOrIntrinsicID(Def);
2136
2137 // The issue with (Insert|Extract)Value is that the index of the
2138 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2139 // VPlan.
2140 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2141 C->second == Instruction::ExtractValue)))
2142 return false;
2143
2144 // During CSE, we can only handle recipes that don't read from memory: if
2145 // they read from memory, there could be an intervening write to memory
2146 // before the next instance is CSE'd, leading to an incorrect result.
2147 return !Def->mayReadFromMemory();
2148 }
2149
2150 /// Hash the underlying data of \p Def.
2151 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2152 const VPlan *Plan = Def->getParent()->getPlan();
2153 VPTypeAnalysis TypeInfo(*Plan);
2154 hash_code Result = hash_combine(
2155 Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
2156 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2158 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2159 if (RFlags->hasPredicate())
2160 return hash_combine(Result, RFlags->getPredicate());
2161 return Result;
2162 }
2163
2164 /// Check equality of underlying data of \p L and \p R.
2165 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2166 if (isSentinel(L) || isSentinel(R))
2167 return L == R;
2168 if (L->getVPDefID() != R->getVPDefID() ||
2170 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2172 !equal(L->operands(), R->operands()))
2173 return false;
2175 "must have valid opcode info for both recipes");
2176 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2177 if (LFlags->hasPredicate() &&
2178 LFlags->getPredicate() !=
2179 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2180 return false;
2181 // Recipes in replicate regions implicitly depend on predicate. If either
2182 // recipe is in a replicate region, only consider them equal if both have
2183 // the same parent.
2184 const VPRegionBlock *RegionL = L->getRegion();
2185 const VPRegionBlock *RegionR = R->getRegion();
2186 if (((RegionL && RegionL->isReplicator()) ||
2187 (RegionR && RegionR->isReplicator())) &&
2188 L->getParent() != R->getParent())
2189 return false;
2190 const VPlan *Plan = L->getParent()->getPlan();
2191 VPTypeAnalysis TypeInfo(*Plan);
2192 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2193 }
2194};
2195} // end anonymous namespace
2196
2197/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2198/// Plan.
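/// For example (illustrative), two identical widened adds of the same operands
/// in the vector loop are collapsed into one; the surviving recipe keeps only
/// the IR flags common to both.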
2200 VPDominatorTree VPDT(Plan);
2202
2204 vp_depth_first_deep(Plan.getEntry()))) {
2205 for (VPRecipeBase &R : *VPBB) {
2206 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2207 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2208 continue;
2209 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2210 // V must dominate Def for a valid replacement.
2211 if (!VPDT.dominates(V->getParent(), VPBB))
2212 continue;
2213 // Only keep flags present on both V and Def.
2214 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2215 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2216 Def->replaceAllUsesWith(V);
2217 continue;
2218 }
2219 CSEMap[Def] = Def;
2220 }
2221 }
2222}
2223
2224/// Move loop-invariant recipes out of the vector loop region in \p Plan.
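/// For example (illustrative), a widened multiply whose operands are both
/// defined outside the loop region is moved from the loop body to the vector
/// preheader.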
2225static void licm(VPlan &Plan) {
2226 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2227
2228 // Hoist any loop-invariant recipes from the vector loop region to the
2229 // preheader. Perform a shallow traversal of the vector loop region, to
2230 // exclude recipes in replicate regions. Since the top-level blocks in the
2231 // vector loop region are guaranteed to execute if the vector pre-header is,
2232 // we don't need to check speculation safety.
2233 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2234 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2235 "Expected vector prehader's successor to be the vector loop region");
2237 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2238 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2240 continue;
2241 if (any_of(R.operands(), [](VPValue *Op) {
2242 return !Op->isDefinedOutsideLoopRegions();
2243 }))
2244 continue;
2245 R.moveBefore(*Preheader, Preheader->end());
2246 }
2247 }
2248}
2249
2251 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2252 if (Plan.hasScalarVFOnly())
2253 return;
2254 // Keep track of created truncates, so they can be re-used. Note that we
2255 // cannot use RAUW after creating a new truncate, as this could make
2256 // other uses have different types for their operands, making them invalidly
2257 // typed.
2259 VPTypeAnalysis TypeInfo(Plan);
2260 VPBasicBlock *PH = Plan.getVectorPreheader();
2263 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2266 &R))
2267 continue;
2268
2269 VPValue *ResultVPV = R.getVPSingleValue();
2270 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2271 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2272 if (!NewResSizeInBits)
2273 continue;
2274
2275 // If the value wasn't vectorized, we must maintain the original scalar
2276 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2277 // skip casts which do not need to be handled explicitly here, as
2278 // redundant casts will be removed during recipe simplification.
2280 continue;
2281
2282 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2283 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2284 assert(OldResTy->isIntegerTy() && "only integer types supported");
2285 (void)OldResSizeInBits;
2286
2287 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2288
2289 // Any wrapping introduced by shrinking this operation shouldn't be
2290 // considered undefined behavior. So, we can't unconditionally copy
2291 // arithmetic wrapping flags to VPW.
2292 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2293 VPW->dropPoisonGeneratingFlags();
2294
2295 if (OldResSizeInBits != NewResSizeInBits &&
2296 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2297 // Extend result to original width.
2298 auto *Ext =
2299 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
2300 Ext->insertAfter(&R);
2301 ResultVPV->replaceAllUsesWith(Ext);
2302 Ext->setOperand(0, ResultVPV);
2303 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2304 } else {
2305 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2306 "Only ICmps should not need extending the result.");
2307 }
2308
2309 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2311 continue;
2312
2313 // Shrink operands by introducing truncates as needed.
2314 unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
2315 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2316 auto *Op = R.getOperand(Idx);
2317 unsigned OpSizeInBits =
2319 if (OpSizeInBits == NewResSizeInBits)
2320 continue;
2321 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2322 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2323 if (!IterIsEmpty) {
2324 R.setOperand(Idx, ProcessedIter->second);
2325 continue;
2326 }
2327
2328 VPBuilder Builder;
2329 if (Op->isLiveIn())
2330 Builder.setInsertPoint(PH);
2331 else
2332 Builder.setInsertPoint(&R);
2333 VPWidenCastRecipe *NewOp =
2334 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2335 ProcessedIter->second = NewOp;
2336 R.setOperand(Idx, NewOp);
2337 }
2338
2339 }
2340 }
2341}
2342
2346 VPValue *Cond;
2347 // Skip blocks that are not terminated by BranchOnCond.
2348 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2349 continue;
2350
2351 assert(VPBB->getNumSuccessors() == 2 &&
2352 "Two successors expected for BranchOnCond");
2353 unsigned RemovedIdx;
2354 if (match(Cond, m_True()))
2355 RemovedIdx = 1;
2356 else if (match(Cond, m_False()))
2357 RemovedIdx = 0;
2358 else
2359 continue;
2360
2361 VPBasicBlock *RemovedSucc =
2362 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2363 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2364 "There must be a single edge between VPBB and its successor");
2365 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2366 // those recipes.
2367 for (VPRecipeBase &R : RemovedSucc->phis())
2368 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2369
2370 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2371 // automatically on VPlan destruction if it becomes unreachable.
2372 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2373 VPBB->back().eraseFromParent();
2374 }
2375}
2376
2395
2396// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2397// the loop terminator with a branch-on-cond recipe with the negated
2398// active-lane-mask as operand. Note that this turns the loop into an
2399// uncountable one. Only the existing terminator is replaced; all other existing
2400// recipes/users remain unchanged, except for poison-generating flags being
2401// dropped from the canonical IV increment. Return the created
2402// VPActiveLaneMaskPHIRecipe.
2403//
2404// The function uses the following definitions:
2405//
2406// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
2407// calculate-trip-count-minus-VF (original TC) : original TC
2408// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
2409// CanonicalIVPhi : CanonicalIVIncrement
2410// %StartV is the canonical induction start value.
2411//
2412// The function adds the following recipes:
2413//
2414// vector.ph:
2415// %TripCount = calculate-trip-count-minus-VF (original TC)
2416// [if DataWithControlFlowWithoutRuntimeCheck]
2417// %EntryInc = canonical-iv-increment-for-part %StartV
2418// %EntryALM = active-lane-mask %EntryInc, %TripCount
2419//
2420// vector.body:
2421// ...
2422// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2423// ...
2424// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2425// %ALM = active-lane-mask %InLoopInc, TripCount
2426// %Negated = Not %ALM
2427// branch-on-cond %Negated
2428//
2431 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2432 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2433 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2434 VPValue *StartV = CanonicalIVPHI->getStartValue();
2435
2436 auto *CanonicalIVIncrement =
2437 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2438 // TODO: Check if dropping the flags is needed if
2439 // !DataAndControlFlowWithoutRuntimeCheck.
2440 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2441 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2442 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2443 // we have to take unrolling into account. Each part needs to start at
2444 // Part * VF
2445 auto *VecPreheader = Plan.getVectorPreheader();
2446 VPBuilder Builder(VecPreheader);
2447
2448 // Create the ActiveLaneMask instruction using the correct start values.
2449 VPValue *TC = Plan.getTripCount();
2450
2451 VPValue *TripCount, *IncrementValue;
2453 // When the loop is guarded by a runtime overflow check for the loop
2454 // induction variable increment by VF, we can increment the value before
2455 // the get.active.lane mask and use the unmodified tripcount.
2456 IncrementValue = CanonicalIVIncrement;
2457 TripCount = TC;
2458 } else {
2459 // When avoiding a runtime check, the active.lane.mask inside the loop
2460 // uses a modified trip count and the induction variable increment is
2461 // done after the active.lane.mask intrinsic is called.
2462 IncrementValue = CanonicalIVPHI;
2463 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2464 {TC}, DL);
2465 }
2466 auto *EntryIncrement = Builder.createOverflowingOp(
2467 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2468 "index.part.next");
2469
2470 // Create the active lane mask instruction in the VPlan preheader.
2471 VPValue *ALMMultiplier =
2472 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2473 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2474 {EntryIncrement, TC, ALMMultiplier}, DL,
2475 "active.lane.mask.entry");
2476
2477 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2478 // preheader ActiveLaneMask instruction.
2479 auto *LaneMaskPhi =
2481 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2482
2483 // Create the active lane mask for the next iteration of the loop before the
2484 // original terminator.
2485 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2486 Builder.setInsertPoint(OriginalTerminator);
2487 auto *InLoopIncrement =
2488 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2489 {IncrementValue}, {false, false}, DL);
2490 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2491 {InLoopIncrement, TripCount, ALMMultiplier},
2492 DL, "active.lane.mask.next");
2493 LaneMaskPhi->addOperand(ALM);
2494
2495 // Replace the original terminator with BranchOnCond. We have to invert the
2496 // mask here because a true condition means jumping to the exit block.
2497 auto *NotMask = Builder.createNot(ALM, DL);
2498 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2499 OriginalTerminator->eraseFromParent();
2500 return LaneMaskPhi;
2501}
2502
2503/// Collect the header mask with the pattern:
2504/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2505/// TODO: Introduce explicit recipe for header-mask instead of searching
2506/// for the header-mask pattern manually.
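/// In VPlan notation, the header mask roughly looks like (illustrative):
///   vp<%wide.iv> = WIDEN-CANONICAL-INDUCTION vp<%canonical.iv>
///   vp<%mask>    = icmp ule vp<%wide.iv>, vp<%backedge.taken.count>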
2507static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) {
2508 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2509 SmallVector<VPValue *> WideCanonicalIVs;
2510 auto *FoundWidenCanonicalIVUser = find_if(
2512 assert(count_if(LoopRegion->getCanonicalIV()->users(),
2514 "Must have at most one VPWideCanonicalIVRecipe");
2515 if (FoundWidenCanonicalIVUser !=
2516 LoopRegion->getCanonicalIV()->users().end()) {
2517 auto *WideCanonicalIV =
2518 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2519 WideCanonicalIVs.push_back(WideCanonicalIV);
2520 }
2521
2522 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2523 // version of the canonical induction.
2524 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
2525 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2526 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2527 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2528 WideCanonicalIVs.push_back(WidenOriginalIV);
2529 }
2530
2531 // Walk users of wide canonical IVs and find the single compare of the form
2532 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2533 VPSingleDefRecipe *HeaderMask = nullptr;
2534 for (auto *Wide : WideCanonicalIVs) {
2535 for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
2536 auto *VPI = dyn_cast<VPInstruction>(U);
2537 if (!VPI || !vputils::isHeaderMask(VPI, Plan))
2538 continue;
2539
2540 assert(VPI->getOperand(0) == Wide &&
2541 "WidenCanonicalIV must be the first operand of the compare");
2542 assert(!HeaderMask && "Multiple header masks found?");
2543 HeaderMask = VPI;
2544 }
2545 }
2546 return HeaderMask;
2547}
2548
2550 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2553 UseActiveLaneMaskForControlFlow) &&
2554 "DataAndControlFlowWithoutRuntimeCheck implies "
2555 "UseActiveLaneMaskForControlFlow");
2556
2557 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2558 auto *FoundWidenCanonicalIVUser = find_if(
2560 assert(FoundWidenCanonicalIVUser &&
2561 "Must have widened canonical IV when tail folding!");
2562 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2563 auto *WideCanonicalIV =
2564 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2565 VPSingleDefRecipe *LaneMask;
2566 if (UseActiveLaneMaskForControlFlow) {
2569 } else {
2570 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2571 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2572 ConstantInt::get(LoopRegion->getCanonicalIVType(), 1));
2573 LaneMask =
2574 B.createNaryOp(VPInstruction::ActiveLaneMask,
2575 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2576 nullptr, "active.lane.mask");
2577 }
2578
2579 // Walk users of WideCanonicalIV and replace the header mask of the form
2580 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2581 // removing the old one to ensure there is always only a single header mask.
2582 HeaderMask->replaceAllUsesWith(LaneMask);
2583 HeaderMask->eraseFromParent();
2584}
2585
2586template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2587 Op0_t In;
2589
2590 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2591
2592 template <typename OpTy> bool match(OpTy *V) const {
2593 if (m_Specific(In).match(V)) {
2594 Out = nullptr;
2595 return true;
2596 }
2598 return true;
2599 return false;
2600 }
2601};
2602
2603/// Match the specific mask \p In, or a logical-and of \p In and another mask;
2604/// on a match, \p Out is set to the other mask, or to nullptr if the value is \p In.
2605template <typename Op0_t, typename Op1_t>
2606static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2607 Op1_t &Out) {
2608 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2609}
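// For example (illustrative): matching the value %header.mask itself binds
// Out to nullptr, while matching (logical-and %header.mask, %m) binds Out
// to %m.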
2610
2611/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2612/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2613/// recipe could be created.
2614/// \p HeaderMask Header Mask.
2615/// \p CurRecipe Recipe to be transformed.
2616/// \p TypeInfo VPlan-based type analysis.
2617/// \p EVL The explicit vector length parameter of vector-predication
2618/// intrinsics.
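/// For example (an illustrative sketch), a widened load masked only by the
/// header mask is rewritten to an EVL-based load with the mask dropped:
///   WIDEN ir<%l> = load vp<%addr>, vp<%header.mask>
///   ->
///   WIDEN ir<%l> = vp.load vp<%addr>, vp<%evl>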
2619static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
2620 VPRecipeBase &CurRecipe,
2621 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
2622 VPlan *Plan = CurRecipe.getParent()->getPlan();
2623 VPValue *Addr, *Mask, *EndPtr;
2624
2625 /// Adjust any end pointers so that they point to the end of EVL lanes, not VF lanes.
2626 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2627 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2628 EVLEndPtr->insertBefore(&CurRecipe);
2629 EVLEndPtr->setOperand(1, &EVL);
2630 return EVLEndPtr;
2631 };
2632
2633 if (match(&CurRecipe,
2634 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
2635 !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2636 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
2637 EVL, Mask);
2638
2639 if (match(&CurRecipe,
2640 m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
2641 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2642 cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2643 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
2644 AdjustEndPtr(EndPtr), EVL, Mask);
2645
2646 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(),
2647 m_RemoveMask(HeaderMask, Mask))) &&
2648 !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2649 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
2650 EVL, Mask);
2651
2652 if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(),
2653 m_RemoveMask(HeaderMask, Mask))) &&
2654 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2655 cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2656 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
2657 AdjustEndPtr(EndPtr), EVL, Mask);
2658
2659 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
2660 if (Rdx->isConditional() &&
2661 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
2662 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
2663
2664 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
2665 if (Interleave->getMask() &&
2666 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
2667 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
2668
2669 VPValue *LHS, *RHS;
2670 if (match(&CurRecipe,
2671 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
2672 return new VPWidenIntrinsicRecipe(
2673 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
2674 TypeInfo.inferScalarType(LHS), CurRecipe.getDebugLoc());
2675
2676 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
2677 m_VPValue(RHS))))
2678 return new VPWidenIntrinsicRecipe(
2679 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
2680 TypeInfo.inferScalarType(LHS), CurRecipe.getDebugLoc());
2681
2682 return nullptr;
2683}
2684
2685/// Replace recipes with their EVL variants.
2687 VPTypeAnalysis TypeInfo(Plan);
2688 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2689 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2690
2691 assert(all_of(Plan.getVF().users(),
2694 "User of VF that we can't transform to EVL.");
2695 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2697 });
2698
2699 assert(all_of(Plan.getVFxUF().users(),
2700 [&LoopRegion, &Plan](VPUser *U) {
2701 return match(U,
2702 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
2703 m_Specific(&Plan.getVFxUF()))) ||
2704 isa<VPWidenPointerInductionRecipe>(U);
2705 }) &&
2706 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
2707 "increment of the canonical induction.");
2708 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2709 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
2710 // canonical induction must not be updated.
2712 });
2713
2714 // Defer erasing recipes till the end so that we don't invalidate the
2715 // VPTypeAnalysis cache.
2717
2718 // Create a scalar phi to track the previous EVL if the plan contains a
2719 // fixed-order recurrence.
2720 bool ContainsFORs =
2722 if (ContainsFORs) {
2723 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
2724 VPValue *MaxEVL = &Plan.getVF();
2725 // Emit VPScalarCastRecipe in preheader if VF is not a 32-bit integer.
2726 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
2727 MaxEVL = Builder.createScalarZExtOrTrunc(
2728 MaxEVL, Type::getInt32Ty(Plan.getContext()),
2729 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
2730
2731 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
2732 VPValue *PrevEVL = Builder.createScalarPhi(
2733 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
2734
2737 for (VPRecipeBase &R : *VPBB) {
2738 VPValue *V1, *V2;
2739 if (!match(&R,
2741 m_VPValue(V1), m_VPValue(V2))))
2742 continue;
2743 VPValue *Imm = Plan.getOrAddLiveIn(
2746 Intrinsic::experimental_vp_splice,
2747 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
2748 TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc());
2749 VPSplice->insertBefore(&R);
2750 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
2751 ToErase.push_back(&R);
2752 }
2753 }
2754 }
2755
2756 VPValue *HeaderMask = findHeaderMask(Plan);
2757 if (!HeaderMask)
2758 return;
2759
2760 // Replace header masks with a mask equivalent to predicating by EVL:
2761 //
2762 // icmp ule widen-canonical-iv backedge-taken-count
2763 // ->
2764 // icmp ult step-vector, EVL
2765 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
2766 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
2767 Type *EVLType = TypeInfo.inferScalarType(&EVL);
2768 VPValue *EVLMask = Builder.createICmp(
2770 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
2771 HeaderMask->replaceAllUsesWith(EVLMask);
2772 ToErase.push_back(HeaderMask->getDefiningRecipe());
2773
2774 // Try to optimize header mask recipes away to their EVL variants.
2775 // TODO: Split optimizeMaskToEVL out and move into
2776 // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
2777 // tryToBuildVPlanWithVPRecipes beforehand.
2778 for (VPUser *U : collectUsersRecursively(EVLMask)) {
2779 auto *CurRecipe = cast<VPRecipeBase>(U);
2780 VPRecipeBase *EVLRecipe =
2781 optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, EVL);
2782 if (!EVLRecipe)
2783 continue;
2784
2785 unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
2786 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
2787 "New recipe must define the same number of values as the "
2788 "original.");
2789 EVLRecipe->insertBefore(CurRecipe);
2791 EVLRecipe)) {
2792 for (unsigned I = 0; I < NumDefVal; ++I) {
2793 VPValue *CurVPV = CurRecipe->getVPValue(I);
2794 CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
2795 }
2796 }
2797 ToErase.push_back(CurRecipe);
2798 }
2799 // Remove dead EVL mask.
2800 if (EVLMask->getNumUsers() == 0)
2801 ToErase.push_back(EVLMask->getDefiningRecipe());
2802
2803 for (VPRecipeBase *R : reverse(ToErase)) {
2804 SmallVector<VPValue *> PossiblyDead(R->operands());
2805 R->eraseFromParent();
2806 for (VPValue *Op : PossiblyDead)
2808 }
2809}
2810
2811/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and replace all
2812/// uses of VPCanonicalIVPHIRecipe, except the canonical IV increment, with a
2813/// VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe is only used for counting
2814/// loop iterations after this transformation.
2815///
2816/// The function uses the following definitions:
2817/// %StartV is the canonical induction start value.
2818///
2819/// The function adds the following recipes:
2820///
2821/// vector.ph:
2822/// ...
2823///
2824/// vector.body:
2825/// ...
2826/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2827/// [ %NextEVLIV, %vector.body ]
2828/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2829/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
2830/// ...
2831/// %OpEVL = cast i32 %VPEVL to IVSize
2832/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2833/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2834/// ...
2835///
2836/// If MaxSafeElements is provided, the function adds the following recipes:
2837/// vector.ph:
2838/// ...
2839///
2840/// vector.body:
2841/// ...
2842/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2843/// [ %NextEVLIV, %vector.body ]
2844/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2845/// %cmp = cmp ult %AVL, MaxSafeElements
2846/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
2847/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
2848/// ...
2849/// %OpEVL = cast i32 %VPEVL to IVSize
2850/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2851/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2852/// ...
2853///
2855 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
2856 if (Plan.hasScalarVFOnly())
2857 return;
2858 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2859 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2860
2861 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
2862 auto *CanIVTy = LoopRegion->getCanonicalIVType();
2863 VPValue *StartV = CanonicalIVPHI->getStartValue();
2864
2865 // Create the ExplicitVectorLengthPhi recipe in the main loop.
2866 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
2867 EVLPhi->insertAfter(CanonicalIVPHI);
2868 VPBuilder Builder(Header, Header->getFirstNonPhi());
2869 // Create the AVL (application vector length), starting from TC -> 0 in steps
2870 // of EVL.
2871 VPPhi *AVLPhi = Builder.createScalarPhi(
2872 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
2873 VPValue *AVL = AVLPhi;
2874
2875 if (MaxSafeElements) {
2876 // Clamp the AVL to MaxSafeElements to respect the maximum safe dependence distance.
2877 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
2878 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
2879 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
2880 "safe_avl");
2881 }
2882 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
2884
2885 auto *CanonicalIVIncrement =
2886 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2887 Builder.setInsertPoint(CanonicalIVIncrement);
2888 VPValue *OpVPEVL = VPEVL;
2889
2890 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
2891 OpVPEVL = Builder.createScalarZExtOrTrunc(
2892 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
2893
2894 auto *NextEVLIV = Builder.createOverflowingOp(
2895 Instruction::Add, {OpVPEVL, EVLPhi},
2896 {CanonicalIVIncrement->hasNoUnsignedWrap(),
2897 CanonicalIVIncrement->hasNoSignedWrap()},
2898 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
2899 EVLPhi->addOperand(NextEVLIV);
2900
2901 VPValue *NextAVL = Builder.createOverflowingOp(
2902 Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},
2903 DebugLoc::getCompilerGenerated(), "avl.next");
2904 AVLPhi->addOperand(NextAVL);
2905
2906 transformRecipestoEVLRecipes(Plan, *VPEVL);
2907
2908 // Replace all uses of VPCanonicalIVPHIRecipe by
2909 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
2910 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
2911 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
2912 // TODO: support unroll factor > 1.
2913 Plan.setUF(1);
2914}
2915
2917 // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
2918 // There should be only one EVL PHI in the entire plan.
2919 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
2920
2923 for (VPRecipeBase &R : VPBB->phis())
2924 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
2925 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
2926 EVLPhi = PhiR;
2927 }
2928
2929 // Early return if no EVL PHI is found.
2930 if (!EVLPhi)
2931 return;
2932
2933 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
2934 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
2935 VPValue *AVL;
2936 [[maybe_unused]] bool FoundAVL =
2937 match(EVLIncrement,
2938 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
2939 assert(FoundAVL && "Didn't find AVL?");
2940
2941 // The AVL may be capped to a safe distance.
2942 VPValue *SafeAVL;
2943 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
2944 AVL = SafeAVL;
2945
2946 VPValue *AVLNext;
2947 [[maybe_unused]] bool FoundAVLNext =
2949 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
2950 assert(FoundAVLNext && "Didn't find AVL backedge?");
2951
2952 // Convert EVLPhi to concrete recipe.
2953 auto *ScalarR =
2954 VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
2955 EVLPhi->getDebugLoc(), "evl.based.iv");
2956 EVLPhi->replaceAllUsesWith(ScalarR);
2957 EVLPhi->eraseFromParent();
2958
2959 // Replace CanonicalIVInc with EVL-PHI increment.
2960 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
2961 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
2962 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
2963 m_Specific(&Plan.getVFxUF()))) &&
2964 "Unexpected canonical iv");
2965 Backedge->replaceAllUsesWith(EVLIncrement);
2966
2967 // Remove unused phi and increment.
2968 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
2969 CanonicalIVIncrement->eraseFromParent();
2970 CanonicalIV->eraseFromParent();
2971
2972 // Replace the use of VectorTripCount in the latch-exiting block.
2973 // Before: (branch-on-count EVLIVInc, VectorTripCount)
2974 // After: (branch-on-cond eq AVLNext, 0)
2975
2976 VPBasicBlock *LatchExiting =
2977 HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
2978 auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
2979 // Skip single-iteration loop region
2980 if (match(LatchExitingBr, m_BranchOnCond(m_True())))
2981 return;
2982 assert(LatchExitingBr &&
2983 match(LatchExitingBr,
2984 m_BranchOnCount(m_VPValue(EVLIncrement),
2985 m_Specific(&Plan.getVectorTripCount()))) &&
2986 "Unexpected terminator in EVL loop");
2987
2988 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
2989 VPBuilder Builder(LatchExitingBr);
2990 VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
2991 Plan.getConstantInt(AVLTy, 0));
2992 Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
2993 LatchExitingBr->eraseFromParent();
2994}
2995
2997 VPlan &Plan, PredicatedScalarEvolution &PSE,
2998 const DenseMap<Value *, const SCEV *> &StridesMap) {
2999 // Replace VPValues for known constant strides guaranteed by predicated
3000 // scalar evolution.
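 // E.g. (illustrative): if PSE guarantees that %stride is 1, uses of the
 // %stride live-in inside the vector loop region or its preheader (and of its
 // sext/zext forms) are replaced by the constant 1, and SCEV expansions are
 // rewritten to use the constant.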
3001 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3002 auto *R = cast<VPRecipeBase>(&U);
3003 return R->getRegion() ||
3004 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3005 };
3006 ValueToSCEVMapTy RewriteMap;
3007 for (const SCEV *Stride : StridesMap.values()) {
3008 using namespace SCEVPatternMatch;
3009 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3010 const APInt *StrideConst;
3011 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3012 // Only handle constant strides for now.
3013 continue;
3014
3015 auto *CI = Plan.getConstantInt(*StrideConst);
3016 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3017 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3018
3019 // The versioned value may not be used in the loop directly but through a
3020 // sext/zext. Add new live-ins in those cases.
3021 for (Value *U : StrideV->users()) {
3023 continue;
3024 VPValue *StrideVPV = Plan.getLiveIn(U);
3025 if (!StrideVPV)
3026 continue;
3027 unsigned BW = U->getType()->getScalarSizeInBits();
3028 APInt C =
3029 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3030 VPValue *CI = Plan.getConstantInt(C);
3031 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3032 }
3033 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3034 }
3035
3036 for (VPRecipeBase &R : *Plan.getEntry()) {
3037 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3038 if (!ExpSCEV)
3039 continue;
3040 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3041 auto *NewSCEV =
3042 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3043 if (NewSCEV != ScevExpr) {
3044 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3045 ExpSCEV->replaceAllUsesWith(NewExp);
3046 if (Plan.getTripCount() == ExpSCEV)
3047 Plan.resetTripCount(NewExp);
3048 }
3049 }
3050}
3051
3052void VPlanTransforms::dropPoisonGeneratingRecipes(
3053 VPlan &Plan,
3054 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3055 // Collect recipes in the backward slice of `Root` that may generate a poison
3056 // value that is used after vectorization.
3058 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3060 Worklist.push_back(Root);
3061
3062 // Traverse the backward slice of Root through its use-def chain.
3063 while (!Worklist.empty()) {
3064 VPRecipeBase *CurRec = Worklist.pop_back_val();
3065
3066 if (!Visited.insert(CurRec).second)
3067 continue;
3068
3069 // Prune search if we find another recipe generating a widen memory
3070 // instruction. Widen memory instructions involved in address computation
3071 // will lead to gather/scatter instructions, which don't need to be
3072 // handled.
3074 VPHeaderPHIRecipe>(CurRec))
3075 continue;
3076
3077 // This recipe contributes to the address computation of a widen
3078 // load/store. If the underlying instruction has poison-generating flags,
3079 // drop them directly.
3080 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3081 VPValue *A, *B;
3082 // Dropping disjoint from an OR may yield incorrect results, as some
3083 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3084 // for dependence analysis). Instead, replace it with an equivalent Add.
3085 // This is possible as all users of the disjoint OR only access lanes
3086 // where the operands are disjoint or poison otherwise.
3087 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3088 RecWithFlags->isDisjoint()) {
3089 VPBuilder Builder(RecWithFlags);
3090 VPInstruction *New = Builder.createOverflowingOp(
3091 Instruction::Add, {A, B}, {false, false},
3092 RecWithFlags->getDebugLoc());
3093 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3094 RecWithFlags->replaceAllUsesWith(New);
3095 RecWithFlags->eraseFromParent();
3096 CurRec = New;
3097 } else
3098 RecWithFlags->dropPoisonGeneratingFlags();
3099 } else {
3102 (void)Instr;
3103 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3104 "found instruction with poison generating flags not covered by "
3105 "VPRecipeWithIRFlags");
3106 }
3107
3108 // Add new definitions to the worklist.
3109 for (VPValue *Operand : CurRec->operands())
3110 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3111 Worklist.push_back(OpDef);
3112 }
3113 });
3114
3115 // Traverse all the recipes in the VPlan and collect the poison-generating
3116 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3117 // VPInterleaveRecipe.
3118 auto Iter = vp_depth_first_deep(Plan.getEntry());
3120 for (VPRecipeBase &Recipe : *VPBB) {
3121 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3122 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3123 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3124 if (AddrDef && WidenRec->isConsecutive() &&
3125 BlockNeedsPredication(UnderlyingInstr.getParent()))
3126 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3127 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3128 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3129 if (AddrDef) {
3130 // Check if any member of the interleave group needs predication.
3131 const InterleaveGroup<Instruction> *InterGroup =
3132 InterleaveRec->getInterleaveGroup();
3133 bool NeedPredication = false;
3134 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3135 I < NumMembers; ++I) {
3136 Instruction *Member = InterGroup->getMember(I);
3137 if (Member)
3138 NeedPredication |= BlockNeedsPredication(Member->getParent());
3139 }
3140
3141 if (NeedPredication)
3142 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3143 }
3144 }
3145 }
3146 }
3147}
3148
3150 VPlan &Plan,
3152 &InterleaveGroups,
3153 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3154 if (InterleaveGroups.empty())
3155 return;
3156
3157 // Interleave memory: for each Interleave Group we marked earlier as relevant
3158 // for this VPlan, replace the Recipes widening its memory instructions with a
3159 // single VPInterleaveRecipe at its insertion point.
3160 VPDominatorTree VPDT(Plan);
3161 for (const auto *IG : InterleaveGroups) {
3162 auto *Start =
3163 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3164 VPIRMetadata InterleaveMD(*Start);
3165 SmallVector<VPValue *, 4> StoredValues;
3166 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3167 StoredValues.push_back(StoreR->getStoredValue());
3168 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3169 Instruction *MemberI = IG->getMember(I);
3170 if (!MemberI)
3171 continue;
3172 VPWidenMemoryRecipe *MemoryR =
3173 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3174 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3175 StoredValues.push_back(StoreR->getStoredValue());
3176 InterleaveMD.intersect(*MemoryR);
3177 }
3178
3179 bool NeedsMaskForGaps =
3180 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3181 (!StoredValues.empty() && !IG->isFull());
3182
3183 Instruction *IRInsertPos = IG->getInsertPos();
3184 auto *InsertPos =
3185 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3186
3188 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3189 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3190 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3191
3192 // Get or create the start address for the interleave group.
3193 VPValue *Addr = Start->getAddr();
3194 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3195 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3196 // We cannot re-use the address of member zero because it does not
3197 // dominate the insert position. Instead, use the address of the insert
3198 // position and create a PtrAdd adjusting it to the address of member
3199 // zero.
3200 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3201 // InsertPos or sink loads above zero members to join it.
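// Worked example (illustrative): for a group of i32 members where the insert
// position is the member at index 2, the byte offset is 4 * 2 = 8, so the
// address of member zero is computed as
// ptradd(address-of-insert-position, -8).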
3202 assert(IG->getIndex(IRInsertPos) != 0 &&
3203 "index of insert position shouldn't be zero");
3204 auto &DL = IRInsertPos->getDataLayout();
3205 APInt Offset(32,
3206 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3207 IG->getIndex(IRInsertPos),
3208 /*IsSigned=*/true);
3209 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3210 VPBuilder B(InsertPos);
3211 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3212 }
3213 // If the group is reverse, adjust the index to refer to the last vector
3214 // lane instead of the first. We adjust the index from the first vector
3215 // lane, rather than directly getting the pointer for lane VF - 1, because
3216 // the pointer operand of the interleaved access is supposed to be uniform.
3217 if (IG->isReverse()) {
3218 auto *ReversePtr = new VPVectorEndPointerRecipe(
3219 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3220 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3221 ReversePtr->insertBefore(InsertPos);
3222 Addr = ReversePtr;
3223 }
3224 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3225 InsertPos->getMask(), NeedsMaskForGaps,
3226 InterleaveMD, InsertPos->getDebugLoc());
3227 VPIG->insertBefore(InsertPos);
3228
3229 unsigned J = 0;
3230 for (unsigned i = 0; i < IG->getFactor(); ++i)
3231 if (Instruction *Member = IG->getMember(i)) {
3232 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3233 if (!Member->getType()->isVoidTy()) {
3234 VPValue *OriginalV = MemberR->getVPSingleValue();
3235 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3236 J++;
3237 }
3238 MemberR->eraseFromParent();
3239 }
3240 }
3241}
3242
3243/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3244/// value, phi and backedge value. In the following example:
3245///
3246/// vector.ph:
3247/// Successor(s): vector loop
3248///
3249/// <x1> vector loop: {
3250/// vector.body:
3251/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3252/// ...
3253/// EMIT branch-on-count ...
3254/// No successors
3255/// }
3256///
3257/// WIDEN-INDUCTION will get expanded to:
3258///
3259/// vector.ph:
3260/// ...
3261/// vp<%induction.start> = ...
3262/// vp<%induction.increment> = ...
3263///
3264/// Successor(s): vector loop
3265///
3266/// <x1> vector loop: {
3267/// vector.body:
3268/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3269/// ...
3270/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3271/// EMIT branch-on-count ...
3272/// No successors
3273/// }
3274static void
3276 VPTypeAnalysis &TypeInfo) {
3277 VPlan *Plan = WidenIVR->getParent()->getPlan();
3278 VPValue *Start = WidenIVR->getStartValue();
3279 VPValue *Step = WidenIVR->getStepValue();
3280 VPValue *VF = WidenIVR->getVFValue();
3281 DebugLoc DL = WidenIVR->getDebugLoc();
3282
3283 // The value from the original loop to which we are mapping the new induction
3284 // variable.
3285 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3286
3287 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3288 Instruction::BinaryOps AddOp;
3289 Instruction::BinaryOps MulOp;
3290 // FIXME: The newly created binary instructions should contain nsw/nuw
3291 // flags, which can be found from the original scalar operations.
3292 VPIRFlags Flags;
3293 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3294 AddOp = Instruction::Add;
3295 MulOp = Instruction::Mul;
3296 } else {
3297 AddOp = ID.getInductionOpcode();
3298 MulOp = Instruction::FMul;
3299 Flags = ID.getInductionBinOp()->getFastMathFlags();
3300 }
3301
3302 // If the phi is truncated, truncate the start and step values.
3303 VPBuilder Builder(Plan->getVectorPreheader());
3304 Type *StepTy = TypeInfo.inferScalarType(Step);
3305 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3306 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3307 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3308 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3309 StepTy = Ty;
3310 }
3311
3312 // Construct the initial value of the vector IV in the vector loop preheader.
3313 Type *IVIntTy =
3315 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3316 if (StepTy->isFloatingPointTy())
3317 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3318
3319 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3320 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3321
3322 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3323 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3324 DebugLoc::getUnknown(), "induction");
3325
3326 // Create the widened phi of the vector IV.
3327 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3328 WidenIVR->getDebugLoc(), "vec.ind");
3329 WidePHI->insertBefore(WidenIVR);
3330
3331 // Create the backedge value for the vector IV.
3332 VPValue *Inc;
3333 VPValue *Prev;
3334 // If unrolled, use the increment and prev value from the operands.
3335 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3336 Inc = SplatVF;
3337 Prev = WidenIVR->getLastUnrolledPartOperand();
3338 } else {
3339 if (VPRecipeBase *R = VF->getDefiningRecipe())
3340 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3341 // Multiply the vectorization factor by the step using integer or
3342 // floating-point arithmetic as appropriate.
3343 if (StepTy->isFloatingPointTy())
3344 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3345 DL);
3346 else
3347 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3348 TypeInfo.inferScalarType(VF), DL);
3349
3350 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3351 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3352 Prev = WidePHI;
3353 }
3354 
3355 auto *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3356 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3357 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3358 WidenIVR->getDebugLoc(), "vec.ind.next");
3359
3360 WidePHI->addOperand(Next);
3361
3362 WidenIVR->replaceAllUsesWith(WidePHI);
3363}
3364
3365/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3366/// initial value, phi and backedge value. In the following example:
3367///
3368/// <x1> vector loop: {
3369/// vector.body:
3370/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3371/// ...
3372/// EMIT branch-on-count ...
3373/// }
3374///
3375/// WIDEN-POINTER-INDUCTION will get expanded to:
3376///
3377/// <x1> vector loop: {
3378/// vector.body:
3379/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3380/// EMIT %mul = mul %stepvector, %step
3381/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3382/// ...
3383/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3384/// EMIT branch-on-count ...
3385/// }
3387 VPTypeAnalysis &TypeInfo) {
3388 VPlan *Plan = R->getParent()->getPlan();
3389 VPValue *Start = R->getStartValue();
3390 VPValue *Step = R->getStepValue();
3391 VPValue *VF = R->getVFValue();
3392
3393 assert(R->getInductionDescriptor().getKind() ==
3395 "Not a pointer induction according to InductionDescriptor!");
3396 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3397 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3398 "Recipe should have been replaced");
3399
3400 VPBuilder Builder(R);
3401 DebugLoc DL = R->getDebugLoc();
3402
3403 // Build a scalar pointer phi.
3404 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3405
3406 // Create actual address geps that use the pointer phi as base and a
3407 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3408 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3409 Type *StepTy = TypeInfo.inferScalarType(Step);
3410 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3411 Offset = Builder.createNaryOp(Instruction::Mul, {Offset, Step});
3412 VPValue *PtrAdd = Builder.createNaryOp(
3413 VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
3414 R->replaceAllUsesWith(PtrAdd);
3415
3416 // Create the backedge value for the scalar pointer phi.
3417 auto *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3418 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3419 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3420 DL);
3421 VPValue *Inc = Builder.createNaryOp(Instruction::Mul, {Step, VF});
3422
3423 VPValue *InductionGEP =
3424 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3425 ScalarPtrPhi->addOperand(InductionGEP);
3426}
3427
3429 // Replace loop regions with explicit CFG.
3430 SmallVector<VPRegionBlock *> LoopRegions;
3432 vp_depth_first_deep(Plan.getEntry()))) {
3433 if (!R->isReplicator())
3434 LoopRegions.push_back(R);
3435 }
3436 for (VPRegionBlock *R : LoopRegions)
3437 R->dissolveToCFGLoop();
3438}
3439
3441 VPTypeAnalysis TypeInfo(Plan);
3444 vp_depth_first_deep(Plan.getEntry()))) {
3445 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3446 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3447 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3448 ToRemove.push_back(WidenIVR);
3449 continue;
3450 }
3451
3452 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3453 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3454 ToRemove.push_back(WidenIVR);
3455 continue;
3456 }
3457
3458 // Expand VPBlendRecipe into VPInstruction::Select.
3459 VPBuilder Builder(&R);
3460 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3461 VPValue *Select = Blend->getIncomingValue(0);
3462 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3463 Select = Builder.createSelect(Blend->getMask(I),
3464 Blend->getIncomingValue(I), Select,
3465 R.getDebugLoc(), "predphi");
3466 Blend->replaceAllUsesWith(Select);
3467 ToRemove.push_back(Blend);
3468 }
3469
3470 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3471 Expr->decompose();
3472 ToRemove.push_back(Expr);
3473 }
3474
3475 VPValue *VectorStep;
3476 VPValue *ScalarStep;
3478 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
3479 continue;
3480
3481 // Expand WideIVStep.
3482 auto *VPI = cast<VPInstruction>(&R);
3483 Type *IVTy = TypeInfo.inferScalarType(VPI);
3484 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
3486 ? Instruction::UIToFP
3487 : Instruction::Trunc;
3488 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
3489 }
3490
3491 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
3492 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
3493 ScalarStep =
3494 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
3495 }
3496
3497 VPIRFlags Flags;
3498 if (IVTy->isFloatingPointTy())
3499 Flags = {VPI->getFastMathFlags()};
3500
3501 unsigned MulOpc =
3502 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
3503 VPInstruction *Mul = Builder.createNaryOp(
3504 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
3505 VectorStep = Mul;
3506 VPI->replaceAllUsesWith(VectorStep);
3507 ToRemove.push_back(VPI);
3508 }
3509 }
3510
3511 for (VPRecipeBase *R : ToRemove)
3512 R->eraseFromParent();
3513}
3514
3516 VPBasicBlock *EarlyExitVPBB,
3517 VPlan &Plan,
3518 VPBasicBlock *HeaderVPBB,
3519 VPBasicBlock *LatchVPBB) {
3520 VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
3521 if (!EarlyExitVPBB->getSinglePredecessor() &&
3522 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
3523 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
3524 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
3525 "unsupported early exit VPBB");
3526 // Early exit operand should always be last phi operand. If EarlyExitVPBB
3527 // has two predecessors and EarlyExitingVPBB is the first, swap the operands
3528 // of the phis.
3529 for (VPRecipeBase &R : EarlyExitVPBB->phis())
3530 cast<VPIRPhi>(&R)->swapOperands();
3531 }
3532
3533 VPBuilder Builder(LatchVPBB->getTerminator());
3534 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
3535 assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
3536 "Terminator must be BranchOnCond");
3537 VPValue *CondOfEarlyExitingVPBB =
3538 EarlyExitingVPBB->getTerminator()->getOperand(0);
3539 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
3540 ? CondOfEarlyExitingVPBB
3541 : Builder.createNot(CondOfEarlyExitingVPBB);
3542
3543 // Split the middle block and have it conditionally branch to the early exit
3544 // block if CondToEarlyExit.
3545 VPValue *IsEarlyExitTaken =
3546 Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
3547 VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
3548 VPBasicBlock *VectorEarlyExitVPBB =
3549 Plan.createVPBasicBlock("vector.early.exit");
3550 VPBlockUtils::insertOnEdge(LatchVPBB, MiddleVPBB, NewMiddle);
3551 VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
3552 NewMiddle->swapSuccessors();
3553
3554 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
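// The resulting CFG is (sketch): latch -> middle.split; middle.split ->
// vector.early.exit when any lane took the early exit (see the branch created
// below), otherwise middle.split -> original middle block; and
// vector.early.exit -> early exit block.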
3555
3556 // Update the exit phis in the early exit block.
3557 VPBuilder MiddleBuilder(NewMiddle);
3558 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
3559 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
3560 auto *ExitIRI = cast<VPIRPhi>(&R);
3561 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
3562 // a single predecessor and 1 if it has two.
3563 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
3564 if (ExitIRI->getNumOperands() != 1) {
3565 // The first of two operands corresponds to the latch exit, via MiddleVPBB
3566 // predecessor. Extract its last lane.
3567 ExitIRI->extractLastLaneOfFirstOperand(MiddleBuilder);
3568 }
3569
3570 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
3571 if (!IncomingFromEarlyExit->isLiveIn()) {
3572 // Update the incoming value from the early exit.
3573 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
3574 VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,
3575 "first.active.lane");
3576 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
3577 VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
3578 nullptr, "early.exit.value");
3579 ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
3580 }
3581 }
3582 MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
3583
3584 // Replace the condition controlling the non-early exit from the vector loop
3585 // with one exiting if either the original condition of the vector latch is
3586 // true or the early exit has been taken.
3587 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
3588 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
3589 "Unexpected terminator");
3590 auto *IsLatchExitTaken =
3591 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
3592 LatchExitingBranch->getOperand(1));
3593 auto *AnyExitTaken = Builder.createNaryOp(
3594 Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});
3595 Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
3596 LatchExitingBranch->eraseFromParent();
3597}
3598
3599 /// This function tries to convert extended in-loop reductions to
3600/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
3601/// valid. The created recipe must be decomposed to its constituent
3602/// recipes before execution.
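/// For example (illustrative): reduce.add(zext(i8 %a) to i32) may be bundled
/// into a single extended-reduction expression when the target reports a lower
/// cost for the combined operation than for the separate extend and reduction.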
3603static VPExpressionRecipe *
3605 VFRange &Range) {
3606 Type *RedTy = Ctx.Types.inferScalarType(Red);
3607 VPValue *VecOp = Red->getVecOp();
3608
3609 // Clamp the range if using extended-reduction is profitable.
3610 auto IsExtendedRedValidAndClampRange =
3611 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
3613 [&](ElementCount VF) {
3614 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3616
3617 InstructionCost ExtRedCost;
3618 InstructionCost ExtCost =
3619 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
3620 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3621
3625 // FIXME: Move partial reduction creation, costing and clamping
3626 // here from LoopVectorize.cpp.
3627 ExtRedCost = Ctx.TTI.getPartialReductionCost(
3628 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
3629 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind);
3630 } else {
3631 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
3632 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
3633 Red->getFastMathFlags(), CostKind);
3634 }
3635 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
3636 },
3637 Range);
3638 };
3639
3640 VPValue *A;
3641 // Match reduce(ext).
3642 if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
3643 IsExtendedRedValidAndClampRange(
3644 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
3645 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
3646 Ctx.Types.inferScalarType(A)))
3647 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
3648
3649 return nullptr;
3650}
3651
3652 /// This function tries to convert multiply-accumulate in-loop reductions to
3653 /// VPExpressionRecipe and clamp the \p Range if it is beneficial
3654/// and valid. The created VPExpressionRecipe must be decomposed to its
3655/// constituent recipes before execution. Patterns of the
3656/// VPExpressionRecipe:
3657/// reduce.add(mul(...)),
3658/// reduce.add(mul(ext(A), ext(B))),
3659/// reduce.add(ext(mul(ext(A), ext(B)))).
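/// For example (illustrative): the dot-product style pattern
///   reduce.add(mul(sext(i16 %a) to i32, sext(i16 %b) to i32))
/// may be bundled into a single multiply-accumulate expression (or a partial
/// reduction) when the target reports a lower cost for it than for the
/// separate extends, multiply and reduction.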
3660static VPExpressionRecipe *
3662 VPCostContext &Ctx, VFRange &Range) {
3663 bool IsPartialReduction = isa<VPPartialReductionRecipe>(Red);
3664
3665 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3666 if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
3667 return nullptr;
3668
3669 Type *RedTy = Ctx.Types.inferScalarType(Red);
3670
3671 // Clamp the range if using multiply-accumulate-reduction is profitable.
3672 auto IsMulAccValidAndClampRange =
3674 VPWidenCastRecipe *OuterExt) -> bool {
3676 [&](ElementCount VF) {
3678 Type *SrcTy =
3679 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
3680 InstructionCost MulAccCost;
3681
3682 if (IsPartialReduction) {
3683 Type *SrcTy2 =
3684 Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
3685 // FIXME: Move partial reduction creation, costing and clamping
3686 // here from LoopVectorize.cpp.
3687 MulAccCost = Ctx.TTI.getPartialReductionCost(
3688 Opcode, SrcTy, SrcTy2, RedTy, VF,
3690 Ext0->getOpcode())
3693 Ext1->getOpcode())
3695 Mul->getOpcode(), CostKind);
3696 } else {
3697 // Only partial reductions support mixed extends at the moment.
3698 if (Ext0 && Ext1 && Ext0->getOpcode() != Ext1->getOpcode())
3699 return false;
3700
3701 bool IsZExt =
3702 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
3703 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3704 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
3705 SrcVecTy, CostKind);
3706 }
3707
3708 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
3709 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3710 InstructionCost ExtCost = 0;
3711 if (Ext0)
3712 ExtCost += Ext0->computeCost(VF, Ctx);
3713 if (Ext1)
3714 ExtCost += Ext1->computeCost(VF, Ctx);
3715 if (OuterExt)
3716 ExtCost += OuterExt->computeCost(VF, Ctx);
3717
3718 return MulAccCost.isValid() &&
3719 MulAccCost < ExtCost + MulCost + RedCost;
3720 },
3721 Range);
3722 };
3723
3724 VPValue *VecOp = Red->getVecOp();
3725 VPRecipeBase *Sub = nullptr;
3726 VPValue *A, *B;
3727 VPValue *Tmp = nullptr;
3728 // Sub reductions could have a sub between the add reduction and vec op.
3729 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
3730 Sub = VecOp->getDefiningRecipe();
3731 VecOp = Tmp;
3732 }
3733
3734 // If ValB is a constant and can be safely extended, truncate it to the same
3735 // type as ExtA's operand, then extend it to the same type as ExtA. This
3736 // creates two uniform extends that can more easily be matched by the rest of
3737 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
3738 // replaced with the new extend of the constant.
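// For example (illustrative types and constant): with ExtA being
// zext(i8 -> i32) and ValB the live-in constant i32 42, ValB is rewritten as
// zext(trunc(42 to i8) to i32); this is safe because 42 is representable in
// i8, and both multiply operands then become matching zexts from i8.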
3739 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
3740 VPWidenCastRecipe *&ExtB,
3741 VPValue *&ValB, VPWidenRecipe *Mul) {
3742 if (!ExtA || ExtB || !ValB->isLiveIn())
3743 return;
3744 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
3745 Instruction::CastOps ExtOpc = ExtA->getOpcode();
3746 const APInt *Const;
3747 if (!match(ValB, m_APInt(Const)) ||
3749 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
3750 return;
3751 // The truncate ensures that the type of each extended operand is the
3752 // same, and it's been proven that the constant can be extended from
3753 // NarrowTy safely. Necessary since ExtA's extended operand would be
3754 // e.g. an i8, while the const will likely be an i32. This will be
3755 // elided by later optimisations.
3756 VPBuilder Builder(Mul);
3757 auto *Trunc =
3758 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
3759 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
3760 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
3761 Mul->setOperand(1, ExtB);
3762 };
3763
3764 // Try to match reduce.add(mul(...)).
3765 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
3768 auto *Mul = cast<VPWidenRecipe>(VecOp);
3769
3770 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
3771 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
3772
3773 // Match reduce.add/sub(mul(ext, ext)).
3774 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
3775 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
3776 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
3777 if (Sub)
3778 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
3779 cast<VPWidenRecipe>(Sub), Red);
3780 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
3781 }
3782 // TODO: Add an expression type for this variant with a negated mul
3783 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
3784 return new VPExpressionRecipe(Mul, Red);
3785 }
3786 // TODO: Add an expression type for negated versions of other expression
3787 // variants.
3788 if (Sub)
3789 return nullptr;
3790
3791 // Match reduce.add(ext(mul(A, B))).
3792 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
3793 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
3794 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
3797
3798 // reduce.add(ext(mul(ext, const)))
3799 // -> reduce.add(ext(mul(ext, ext(const))))
3800 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
3801
3802 // reduce.add(ext(mul(ext(A), ext(B))))
3803 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
3804 // The inner extends must either have the same opcode as the outer extend,
3805 // or be the same recipe (Ext0 == Ext1), in which case the multiply can
3806 // never result in a negative value and the outer extend can be folded away
3807 // by doing wider extends for the operands of the mul.
3808 if (Ext0 && Ext1 &&
3809 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
3810 Ext0->getOpcode() == Ext1->getOpcode() &&
3811 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
3812 auto *NewExt0 = new VPWidenCastRecipe(
3813 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), *Ext0,
3814 *Ext0, Ext0->getDebugLoc());
3815 NewExt0->insertBefore(Ext0);
3816
3817 VPWidenCastRecipe *NewExt1 = NewExt0;
3818 if (Ext0 != Ext1) {
3819 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
3820 Ext->getResultType(), *Ext1, *Ext1,
3821 Ext1->getDebugLoc());
3822 NewExt1->insertBefore(Ext1);
3823 }
3824 Mul->setOperand(0, NewExt0);
3825 Mul->setOperand(1, NewExt1);
3826 Red->setOperand(1, Mul);
3827 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
3828 }
3829 }
3830 return nullptr;
3831}
3832
3833/// This function tries to create abstract recipes from the reduction recipe for
3834 /// subsequent optimizations and cost estimation.
3836 VPCostContext &Ctx,
3837 VFRange &Range) {
3838 VPExpressionRecipe *AbstractR = nullptr;
3839 auto IP = std::next(Red->getIterator());
3840 auto *VPBB = Red->getParent();
3841 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
3842 AbstractR = MulAcc;
3843 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
3844 AbstractR = ExtRed;
3845 // Bail out if no abstract in-loop reduction recipe could be created.
3846 if (!AbstractR)
3847 return;
3848
3849 AbstractR->insertBefore(*VPBB, IP);
3850 Red->replaceAllUsesWith(AbstractR);
3851}
3852
3863
3865 if (Plan.hasScalarVFOnly())
3866 return;
3867
3868#ifndef NDEBUG
3869 VPDominatorTree VPDT(Plan);
3870#endif
3871
3872 SmallVector<VPValue *> VPValues;
3875 append_range(VPValues, Plan.getLiveIns());
3876 for (VPRecipeBase &R : *Plan.getEntry())
3877 append_range(VPValues, R.definedValues());
3878
3879 auto *VectorPreheader = Plan.getVectorPreheader();
3880 for (VPValue *VPV : VPValues) {
3882 (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
3883 isa<Constant>(VPV->getLiveInIRValue())))
3884 continue;
3885
3886 // Add explicit broadcast at the insert point that dominates all users.
3887 VPBasicBlock *HoistBlock = VectorPreheader;
3888 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
3889 for (VPUser *User : VPV->users()) {
3890 if (User->usesScalars(VPV))
3891 continue;
3892 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
3893 HoistPoint = HoistBlock->begin();
3894 else
3895 assert(VPDT.dominates(VectorPreheader,
3896 cast<VPRecipeBase>(User)->getParent()) &&
3897 "All users must be in the vector preheader or dominated by it");
3898 }
3899
3900 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
3901 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
3902 VPV->replaceUsesWithIf(Broadcast,
3903 [VPV, Broadcast](VPUser &U, unsigned Idx) {
3904 return Broadcast != &U && !U.usesScalars(VPV);
3905 });
3906 }
3907}
3908
3910 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
3912 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
3913 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
3914
3915 VPValue *TC = Plan.getTripCount();
3916 // Skip cases for which the trip count may be non-trivial to materialize.
3917 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
3918 // tail is required.
3919 if (!Plan.hasScalarTail() ||
3921 Plan.getScalarPreheader() ||
3922 !TC->isLiveIn())
3923 return;
3924
3925 // Materialize vector trip counts for constants early if it can simply
3926 // be computed as (Original TC / (VF * UF)) * (VF * UF).
3927 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
3928 // tail-folded loops.
3929 ScalarEvolution &SE = *PSE.getSE();
3930 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
3931 if (!isa<SCEVConstant>(TCScev))
3932 return;
3933 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
3934 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
3935 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
3936 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
3937}
3938
3940 VPBasicBlock *VectorPH) {
3942 if (BTC->getNumUsers() == 0)
3943 return;
3944
3945 VPBuilder Builder(VectorPH, VectorPH->begin());
3946 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
3947 auto *TCMO = Builder.createNaryOp(
3948 Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)},
3949 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
3950 BTC->replaceAllUsesWith(TCMO);
3951}
3952
3954 if (Plan.hasScalarVFOnly())
3955 return;
3956
3957 VPTypeAnalysis TypeInfo(Plan);
3958 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3959 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
3961 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
3962 vp_depth_first_shallow(LoopRegion->getEntry()));
3963 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
3964 // VPInstructions, excluding ones in replicate regions. Those are not
3965 // materialized explicitly yet. Those vector users are still handled in
3966 // VPReplicateRegion::execute(), via shouldPack().
3967 // TODO: materialize build vectors for replicating recipes in replicating
3968 // regions.
3969 for (VPBasicBlock *VPBB :
3970 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
3971 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3973 continue;
3974 auto *DefR = cast<VPRecipeWithIRFlags>(&R);
3975 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
3976 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
3977 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
3978 };
3979 if ((isa<VPReplicateRecipe>(DefR) &&
3980 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
3981 (isa<VPInstruction>(DefR) &&
3983 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
3984 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
3985 continue;
3986
3987 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
3988 unsigned Opcode = ScalarTy->isStructTy()
3991 auto *BuildVector = new VPInstruction(Opcode, {DefR});
3992 BuildVector->insertAfter(DefR);
3993
3994 DefR->replaceUsesWithIf(
3995 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
3996 VPUser &U, unsigned) {
3997 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
3998 });
3999 }
4000 }
4001
4002 // Create explicit VPInstructions to convert vectors to scalars. The current
4003 // implementation is conservative - it may miss some cases that may or may not
4004 // be vector values. TODO: introduce Unpacks speculatively - remove them later
4005 // if they are known to operate on scalar values.
4006 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
4007 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4010 continue;
4011 for (VPValue *Def : R.definedValues()) {
4012 // Skip recipes that are single-scalar or only have their first lane
4013 // used.
4014 // TODO: The Defs skipped here may or may not be vector values.
4015 // Introduce Unpacks, and remove them later, if they are guaranteed to
4016 // produce scalar values.
4018 continue;
4019
4020 // At the moment, we create unpacks only for scalar users outside
4021 // replicate regions. Recipes inside replicate regions still extract the
4022 // required lanes implicitly.
4023 // TODO: Remove once replicate regions are unrolled completely.
4024 auto IsCandidateUnpackUser = [Def](VPUser *U) {
4025 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4026 return U->usesScalars(Def) &&
4027 (!ParentRegion || !ParentRegion->isReplicator());
4028 };
4029 if (none_of(Def->users(), IsCandidateUnpackUser))
4030 continue;
4031
4032 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
4033 if (R.isPhi())
4034 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
4035 else
4036 Unpack->insertAfter(&R);
4037 Def->replaceUsesWithIf(Unpack,
4038 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
4039 return IsCandidateUnpackUser(&U);
4040 });
4041 }
4042 }
4043 }
4044}
4045
4047 VPBasicBlock *VectorPHVPBB,
4048 bool TailByMasking,
4049 bool RequiresScalarEpilogue) {
4050 VPValue &VectorTC = Plan.getVectorTripCount();
4051 assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");
4052 // There's nothing to do if there are no users of the vector trip count or its
4053 // IR value has already been set.
4054 if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())
4055 return;
4056
4057 VPValue *TC = Plan.getTripCount();
4058 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
4059 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
4060 VPValue *Step = &Plan.getVFxUF();
4061
4062 // If the tail is to be folded by masking, round the number of iterations N
4063 // up to a multiple of Step instead of rounding down. This is done by first
4064 // adding Step-1 and then rounding down. Note that it's ok if this addition
4065 // overflows: the vector induction variable will eventually wrap to zero given
4066 // that it starts at zero and its Step is a power of two; the loop will then
4067 // exit, with the last early-exit vector comparison also producing all-true.
4068 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
4069 // is accounted for in emitIterationCountCheck that adds an overflow check.
4070 if (TailByMasking) {
4071 TC = Builder.createNaryOp(
4072 Instruction::Add,
4073 {TC, Builder.createNaryOp(Instruction::Sub,
4074 {Step, Plan.getConstantInt(TCTy, 1)})},
4075 DebugLoc::getCompilerGenerated(), "n.rnd.up");
4076 }
4077
4078 // Now we need to generate the expression for the part of the loop that the
4079 // vectorized body will execute. This is equal to N - (N % Step) if scalar
4080 // iterations are not required for correctness, or N - Step, otherwise. Step
4081 // is equal to the vectorization factor (number of SIMD elements) times the
4082 // unroll factor (number of SIMD instructions).
4083 VPValue *R =
4084 Builder.createNaryOp(Instruction::URem, {TC, Step},
4085 DebugLoc::getCompilerGenerated(), "n.mod.vf");
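// Worked example for the tail-folded case (illustrative): with an original
// trip count of 10 and Step = VF * UF = 8, n.rnd.up = 10 + 7 = 17 and
// n.mod.vf = 17 urem 8 = 1, so the vector trip count computed below is
// 17 - 1 = 16, i.e. two masked vector iterations cover all 10 elements.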
4086
4087 // There are cases where we *must* run at least one iteration in the remainder
4088 // loop. See the cost model for when this can happen. If the step evenly
4089 // divides the trip count, we set the remainder to be equal to the step. If
4090 // the step does not evenly divide the trip count, no adjustment is necessary
4091 // since there will already be scalar iterations. Note that the minimum
4092 // iterations check ensures that N >= Step.
4093 if (RequiresScalarEpilogue) {
4094 assert(!TailByMasking &&
4095 "requiring scalar epilogue is not supported with tail folding");
4096 VPValue *IsZero =
4097 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0));
4098 R = Builder.createSelect(IsZero, Step, R);
4099 }
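// Worked example (illustrative): with a trip count of 16, Step = 8 and a
// required scalar epilogue, n.mod.vf is 0, so the select above bumps the
// remainder to 8 and the vector trip count becomes 16 - 8 = 8, leaving 8
// iterations for the scalar epilogue loop.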
4100
4101 VPValue *Res = Builder.createNaryOp(
4102 Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
4103 VectorTC.replaceAllUsesWith(Res);
4104}
4105
4107 ElementCount VFEC) {
4108 VPBuilder Builder(VectorPH, VectorPH->begin());
4109 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4110 VPValue &VF = Plan.getVF();
4111 VPValue &VFxUF = Plan.getVFxUF();
4112 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
4113 // used.
4114 // TODO: Assert that they aren't used.
4115
4116 // If there are no users of the runtime VF, compute VFxUF by constant folding
4117 // the multiplication of VF and UF.
4118 if (VF.getNumUsers() == 0) {
4119 VPValue *RuntimeVFxUF =
4120 Builder.createElementCount(TCTy, VFEC * Plan.getUF());
4121 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
4122 return;
4123 }
4124
4125 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
4126 // vscale) * UF.
4127 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
4129 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
4131 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
4132 }
4133 VF.replaceAllUsesWith(RuntimeVF);
4134
4135 VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF());
4136 VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
4137 VFxUF.replaceAllUsesWith(MulByUF);
4138}
4139
4142 const DataLayout &DL = SE.getDataLayout();
4143 SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/false);
4144
4145 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
4146 BasicBlock *EntryBB = Entry->getIRBasicBlock();
4147 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
4148 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
4150 continue;
4151 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
4152 if (!ExpSCEV)
4153 break;
4154 const SCEV *Expr = ExpSCEV->getSCEV();
4155 Value *Res =
4156 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
4157 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
4158 VPValue *Exp = Plan.getOrAddLiveIn(Res);
4159 ExpSCEV->replaceAllUsesWith(Exp);
4160 if (Plan.getTripCount() == ExpSCEV)
4161 Plan.resetTripCount(Exp);
4162 ExpSCEV->eraseFromParent();
4163 }
4165 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
4166 "after any VPIRInstructions");
4167 // Add IR instructions that are in the entry basic block, but not yet modeled
4168 // in the VPIRBasicBlock, to the VPIRBasicBlock.
4169 auto EI = Entry->begin();
4170 for (Instruction &I : drop_end(*EntryBB)) {
4171 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
4172 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
4173 EI++;
4174 continue;
4175 }
4177 }
4178
4179 return ExpandedSCEVs;
4180}
4181
4182 /// Returns true if \p OpV is a VPWidenLoadRecipe or VPInterleaveRecipe that
4183 /// can be converted to a narrower recipe. \p OpV is used by a wide recipe that
4184 /// feeds a store interleave group at index \p Idx; \p WideMember0 is the recipe
4185 /// feeding the same interleave group at index 0. A VPWidenLoadRecipe can be
4186 /// narrowed to an index-independent load if it feeds all wide ops at all
4187 /// indices (\p OpV must be the operand at index \p OpIdx for both the recipe at
4188 /// index \p Idx and \p WideMember0). A VPInterleaveRecipe can be narrowed to a
4189 /// wide load, if \p OpV is defined at \p Idx of a load interleave group.
4190static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
4191 VPValue *OpV, unsigned Idx) {
4192 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
4193 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
4194 if (!Member0OpR)
4195 return Member0Op == OpV;
4196 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
4197 return !W->getMask() && Member0Op == OpV;
4198 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
4199 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
4200 return false;
4201}
4202
4203/// Returns true if \p IR is a full interleave group with factor and number of
4204/// members both equal to \p VF. The interleave group must also access the full
4205/// vector width \p VectorRegWidth.
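/// For example (illustrative): with VF = 4 and i32 members, each member's
/// vector is 4 x 32 = 128 bits, which qualifies on a target with 128-bit
/// vector registers, provided the group has factor 4 and 4 members.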
4207 ElementCount VF,
4208 VPTypeAnalysis &TypeInfo,
4209 TypeSize VectorRegWidth) {
4210 if (!InterleaveR || InterleaveR->getMask())
4211 return false;
4212
4213 Type *GroupElementTy = nullptr;
4214 if (InterleaveR->getStoredValues().empty()) {
4215 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
4216 if (!all_of(InterleaveR->definedValues(),
4217 [&TypeInfo, GroupElementTy](VPValue *Op) {
4218 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4219 }))
4220 return false;
4221 } else {
4222 GroupElementTy =
4223 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
4224 if (!all_of(InterleaveR->getStoredValues(),
4225 [&TypeInfo, GroupElementTy](VPValue *Op) {
4226 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4227 }))
4228 return false;
4229 }
4230
4231 unsigned VFMin = VF.getKnownMinValue();
4232 TypeSize GroupSize = TypeSize::get(
4233 GroupElementTy->getScalarSizeInBits() * VFMin, VF.isScalable());
4234 const auto *IG = InterleaveR->getInterleaveGroup();
4235 return IG->getFactor() == VFMin && IG->getNumMembers() == VFMin &&
4236 GroupSize == VectorRegWidth;
4237}
4238
4239 /// Returns true if \p VPV is a narrow VPValue.
4240static bool isAlreadyNarrow(VPValue *VPV) {
4241 if (VPV->isLiveIn())
4242 return true;
4243 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
4244 return RepR && RepR->isSingleScalar();
4245}
4246
4247// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
4248// a narrow variant.
4249static VPValue *
4251 auto *R = V->getDefiningRecipe();
4252 if (!R || NarrowedOps.contains(V))
4253 return V;
4254
4255 if (isAlreadyNarrow(V))
4256 return V;
4257
4258 if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(R)) {
4259 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
4260 WideMember0->setOperand(
4261 Idx,
4262 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
4263 return V;
4264 }
4265
4266 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
4267 // Narrow interleave group to wide load, as transformed VPlan will only
4268 // process one original iteration.
4269 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
4270 auto *L = new VPWidenLoadRecipe(
4271 *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
4272 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
4273 L->insertBefore(LoadGroup);
4274 NarrowedOps.insert(L);
4275 return L;
4276 }
4277
4278 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
4279 assert(RepR->isSingleScalar() &&
4280 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
4281 "must be a single scalar load");
4282 NarrowedOps.insert(RepR);
4283 return RepR;
4284 }
4285
4286 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
4287 VPValue *PtrOp = WideLoad->getAddr();
4288 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
4289 PtrOp = VecPtr->getOperand(0);
4290 // Narrow wide load to uniform scalar load, as transformed VPlan will only
4291 // process one original iteration.
4292 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
4293 /*IsUniform*/ true,
4294 /*Mask*/ nullptr, *WideLoad);
4295 N->insertBefore(WideLoad);
4296 NarrowedOps.insert(N);
4297 return N;
4298}
4299
4301 TypeSize VectorRegWidth) {
4302 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
4303 if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0)
4304 return;
4305
4306 VPTypeAnalysis TypeInfo(Plan);
4307
4309 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
4311 continue;
4312
4315 continue;
4316
4317 // Bail out on recipes not supported at the moment:
4318 // * phi recipes other than the canonical induction
4319 // * recipes writing to memory except interleave groups
4320 // Only support plans with a canonical induction phi.
4321 if (R.isPhi())
4322 return;
4323
4324 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
4325 if (R.mayWriteToMemory() && !InterleaveR)
4326 return;
4327
4328 // Do not narrow interleave groups if there are VectorPointer recipes and
4329 // the plan was unrolled. The recipe implicitly uses VF from
4330 // VPTransformState.
4331 // TODO: Remove restriction once the VF for the VectorPointer offset is
4332 // modeled explicitly as operand.
4333 if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
4334 return;
4335
4336 // All other ops are allowed, but we reject uses that cannot be converted
4337 // when checking all allowed consumers (store interleave groups) below.
4338 if (!InterleaveR)
4339 continue;
4340
4341 // Bail out on non-consecutive interleave groups.
4342 if (!isConsecutiveInterleaveGroup(InterleaveR, VF, TypeInfo,
4343 VectorRegWidth))
4344 return;
4345
4346 // Skip read interleave groups.
4347 if (InterleaveR->getStoredValues().empty())
4348 continue;
4349
4350 // Narrow interleave groups, if all operands are already matching narrow
4351 // ops.
4352 auto *Member0 = InterleaveR->getStoredValues()[0];
4353 if (isAlreadyNarrow(Member0) &&
4354 all_of(InterleaveR->getStoredValues(),
4355 [Member0](VPValue *VPV) { return Member0 == VPV; })) {
4356 StoreGroups.push_back(InterleaveR);
4357 continue;
4358 }
4359
4360 // For now, we only support full interleave groups storing load interleave
4361 // groups.
4362 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
4363 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
4364 if (!DefR)
4365 return false;
4366 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
4367 return IR && IR->getInterleaveGroup()->isFull() &&
4368 IR->getVPValue(Op.index()) == Op.value();
4369 })) {
4370 StoreGroups.push_back(InterleaveR);
4371 continue;
4372 }
4373
4374 // Check if all values feeding InterleaveR are matching wide recipes whose
4375 // operands can be narrowed.
4376 auto *WideMember0 =
4377 dyn_cast_or_null<VPWidenRecipe>(InterleaveR->getStoredValues()[0]);
4378 if (!WideMember0)
4379 return;
4380 for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
4382 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
4383 R->getNumOperands() > 2)
4384 return;
4385 if (any_of(enumerate(R->operands()),
4386 [WideMember0, Idx = I](const auto &P) {
4387 const auto &[OpIdx, OpV] = P;
4388 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
4389 }))
4390 return;
4391 }
4392 StoreGroups.push_back(InterleaveR);
4393 }
4394
4395 if (StoreGroups.empty())
4396 return;
4397
4398 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
4399 SmallPtrSet<VPValue *, 4> NarrowedOps;
4400 // Narrow operation tree rooted at store groups.
4401 for (auto *StoreGroup : StoreGroups) {
4402 VPValue *Res =
4403 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
4404 auto *SI =
4405 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
4406 auto *S = new VPWidenStoreRecipe(
4407 *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
4408 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
4409 S->insertBefore(StoreGroup);
4410 StoreGroup->eraseFromParent();
4411 }
4412
4413 // Adjust induction to reflect that the transformed plan only processes one
4414 // original iteration.
4415 auto *CanIV = VectorLoop->getCanonicalIV();
4416 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
4417 VPBuilder PHBuilder(Plan.getVectorPreheader());
4418
4419 VPValue *UF = Plan.getOrAddLiveIn(
4420 ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF()));
4421 if (VF.isScalable()) {
4422 VPValue *VScale = PHBuilder.createElementCount(
4424 VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
4425 Inc->setOperand(1, VScaleUF);
4426 Plan.getVF().replaceAllUsesWith(VScale);
4427 } else {
4428 Inc->setOperand(1, UF);
4430 Plan.getConstantInt(CanIV->getScalarType(), 1));
4431 }
4432 removeDeadRecipes(Plan);
4433}
4434
4435/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
4436/// BranchOnCond recipe.
4438 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
4439 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
4440 auto *MiddleTerm =
4442 // Only add branch metadata if there is a (conditional) terminator.
4443 if (!MiddleTerm)
4444 return;
4445
4446 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
4447 "must have a BranchOnCond");
4448 // Assume that `TripCount % VectorStep` is equally distributed.
4449 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
4450 if (VF.isScalable() && VScaleForTuning.has_value())
4451 VectorStep *= *VScaleForTuning;
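// For example (illustrative): with a fixed VF of 4 and UF = 2 the vector step
// is 8, giving branch weights {1, 7}, i.e. the remainder is assumed to be
// zero in roughly 1 out of 8 cases.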
4452 assert(VectorStep > 0 && "trip count should not be zero");
4453 MDBuilder MDB(Plan.getContext());
4454 MDNode *BranchWeights =
4455 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
4456 MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
4457}
4458
4459/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
4460/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
4461/// the end value of the induction.
4463 VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
4464 VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
4465 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
4466 // Truncated wide inductions resume from the last lane of their vector value
4467 // in the last vector iteration which is handled elsewhere.
4468 if (WideIntOrFp && WideIntOrFp->getTruncInst())
4469 return nullptr;
4470
4471 VPValue *Start = WideIV->getStartValue();
4472 VPValue *Step = WideIV->getStepValue();
4474 VPValue *EndValue = VectorTC;
4475 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
4476 EndValue = VectorPHBuilder.createDerivedIV(
4477 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
4478 Start, VectorTC, Step);
4479 }
4480
4481 // EndValue is derived from the vector trip count (which has the same type as
4482 // the widest induction) and thus may be wider than the induction here.
4483 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
4484 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
4485 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
4486 ScalarTypeOfWideIV,
4487 WideIV->getDebugLoc());
4488 }
4489
4490 auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi(
4491 {EndValue, Start}, WideIV->getDebugLoc(), "bc.resume.val");
4492 return ResumePhiRecipe;
4493}
4494
4496 VPlan &Plan, VPRecipeBuilder &Builder,
4497 DenseMap<VPValue *, VPValue *> &IVEndValues) {
4498 VPTypeAnalysis TypeInfo(Plan);
4499 auto *ScalarPH = Plan.getScalarPreheader();
4500 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
4501 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
4502 VPBuilder VectorPHBuilder(
4503 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
4504 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
4505 VPBuilder ScalarPHBuilder(ScalarPH);
4506 for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) {
4507 auto *ScalarPhiIRI = cast<VPIRPhi>(&ScalarPhiR);
4508
4509 // TODO: Extract final value from induction recipe initially, optimize to
4510 // pre-computed end value together in optimizeInductionExitUsers.
4511 auto *VectorPhiR =
4512 cast<VPHeaderPHIRecipe>(Builder.getRecipe(&ScalarPhiIRI->getIRPhi()));
4513 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
4515 WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
4516 &Plan.getVectorTripCount())) {
4517 assert(isa<VPPhi>(ResumePhi) && "Expected a phi");
4518 IVEndValues[WideIVR] = ResumePhi->getOperand(0);
4519 ScalarPhiIRI->addOperand(ResumePhi);
4520 continue;
4521 }
4522 // TODO: Also handle truncated inductions here. Computing end-values
4523 // separately should be done as VPlan-to-VPlan optimization, after
4524 // legalizing all resume values to use the last lane from the loop.
4525 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
4526 "should only skip truncated wide inductions");
4527 continue;
4528 }
4529
4530 // The backedge value provides the value to resume coming out of a loop,
4531 // which for FORs is a vector whose last element needs to be extracted. The
4532 // start value provides the value if the loop is bypassed.
4533 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
4534 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
4535 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
4536 "Cannot handle loops with uncountable early exits");
4537 if (IsFOR)
4538 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
4539 VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {},
4540 "vector.recur.extract");
4541 StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
4542 auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
4543 {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
4544 ScalarPhiIRI->addOperand(ResumePhiR);
4545 }
4546}
4547
4549 VFRange &Range) {
4550 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
4551 auto *ScalarPHVPBB = Plan.getScalarPreheader();
4552 auto *MiddleVPBB = Plan.getMiddleBlock();
4553 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
4554 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
4555
4556 auto IsScalableOne = [](ElementCount VF) -> bool {
4557 return VF == ElementCount::getScalable(1);
4558 };
4559
4560 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
4561 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
4562 if (!FOR)
4563 continue;
4564
4565 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
4566 "Cannot handle loops with uncountable early exits");
4567
4568 // This is the second phase of vectorizing first-order recurrences, creating
4569 // extracts for users outside the loop. An overview of the transformation is
4570 // described below. Suppose we have the following loop with some use after
4571 // the loop of the last a[i-1],
4572 //
4573 // for (int i = 0; i < n; ++i) {
4574 // t = a[i - 1];
4575 // b[i] = a[i] - t;
4576 // }
4577 // use t;
4578 //
4579 // There is a first-order recurrence on "a". For this loop, the shorthand
4580 // scalar IR looks like:
4581 //
4582 // scalar.ph:
4583 // s.init = a[-1]
4584 // br scalar.body
4585 //
4586 // scalar.body:
4587 // i = phi [0, scalar.ph], [i+1, scalar.body]
4588 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
4589 // s2 = a[i]
4590 // b[i] = s2 - s1
4591 // br cond, scalar.body, exit.block
4592 //
4593 // exit.block:
4594 // use = lcssa.phi [s1, scalar.body]
4595 //
4596 // In this example, s1 is a recurrence because its value depends on the
4597 // previous iteration. In the first phase of vectorization, we created a
4598 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
4599 // for users in the scalar preheader and exit block.
4600 //
4601 // vector.ph:
4602 // v_init = vector(..., ..., ..., a[-1])
4603 // br vector.body
4604 //
4605 // vector.body
4606 // i = phi [0, vector.ph], [i+4, vector.body]
4607 // v1 = phi [v_init, vector.ph], [v2, vector.body]
4608 // v2 = a[i, i+1, i+2, i+3]
4609 // b[i] = v2 - v1
4610 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
4611 // b[i, i+1, i+2, i+3] = v2 - v1
4612 // br cond, vector.body, middle.block
4613 //
4614 // middle.block:
4615 // vector.recur.extract.for.phi = v2(2)
4616 // vector.recur.extract = v2(3)
4617 // br cond, scalar.ph, exit.block
4618 //
4619 // scalar.ph:
4620 // scalar.recur.init = phi [vector.recur.extract, middle.block],
4621 // [s.init, otherwise]
4622 // br scalar.body
4623 //
4624 // scalar.body:
4625 // i = phi [0, scalar.ph], [i+1, scalar.body]
4626 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
4627 // s2 = a[i]
4628 // b[i] = s2 - s1
4629 // br cond, scalar.body, exit.block
4630 //
4631 // exit.block:
4632 // lo = lcssa.phi [s1, scalar.body],
4633 // [vector.recur.extract.for.phi, middle.block]
4634 //
4635 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
4636 // Extract the penultimate value of the recurrence and use it as operand for
4637 // the VPIRInstruction modeling the phi.
4638 for (VPUser *U : FOR->users()) {
4639 using namespace llvm::VPlanPatternMatch;
4640 if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
4641 continue;
4642 // For VF vscale x 1, if vscale = 1, we are unable to extract the
4643 // penultimate value of the recurrence. Instead we rely on the existing
4644 // extract of the last element from the result of
4645 // VPInstruction::FirstOrderRecurrenceSplice.
4646 // TODO: Consider vscale_range info and UF.
4648 Range))
4649 return;
4650 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
4651 VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
4652 {}, "vector.recur.extract.for.phi");
4653 cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement);
4654 }
4655 }
4656}
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck)
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL)
Replace recipes with their EVL variants.
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static VPValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
From the definition of llvm.experimental.get.vector.length, VPInstruction::ExplicitVectorLength(AVL) ...
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, ScalarEvolution &SE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static void recursivelyDeleteDeadRecipes(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, ScalarEvolution &SE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ElementCount VF, VPTypeAnalysis &TypeInfo, TypeSize VectorRegWidth)
Returns true if IR is a full interleave group with factor and number of members both equal to VF.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1513
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1489
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:163
static DebugLoc getUnknown()
Definition DebugLoc.h:162
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:248
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1541
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1078
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:103
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:58
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:101
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:149
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Provides information about what library functions are available for the current target.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize get(ScalarTy Quantity, bool Scalable)
Definition TypeSize.h:340
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:88
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:97
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
op_range operands()
Definition User.h:292
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3536
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3823
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:3898
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:3850
iterator end()
Definition VPlan.h:3860
iterator begin()
Recipe iterator methods.
Definition VPlan.h:3858
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:3911
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:216
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:578
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:550
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:623
const VPRecipeBase & back() const
Definition VPlan.h:3872
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2411
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2445
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2435
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2451
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2431
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:80
VPRegionBlock * getParent()
Definition VPlan.h:172
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:186
size_t getNumSuccessors() const
Definition VPlan.h:218
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition VPlan.h:321
size_t getNumPredecessors() const
Definition VPlan.h:219
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:203
VPlan * getPlan()
Definition VPlan.cpp:161
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:214
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:166
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:263
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:208
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:197
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:189
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:210
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:129
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:148
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:167
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:2955
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags={}, const VPIRMetadata &Metadata={})
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3479
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:432
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:427
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:405
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:417
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3644
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition VPlan.h:3567
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3000
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:1986
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2034
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2023
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:3976
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:4000
Class to record and manage LLVM IR flags.
Definition VPlan.h:596
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \I , if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:938
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetada object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:981
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1065
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1016
@ FirstOrderRecurrenceSplice
Definition VPlan.h:987
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1011
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1008
@ CanonicalIVIncrementForPart
Definition VPlan.h:1001
@ CalculateTripCountMinusVF
Definition VPlan.h:999
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2552
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2544
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2573
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2625
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2584
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3141
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:386
VPRegionBlock * getRegion()
Definition VPlan.h:4128
VPBasicBlock * getParent()
Definition VPlan.h:407
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:478
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:2833
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition VPlan.h:2674
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4011
const VPBlockBase * getEntry() const
Definition VPlan.h:4047
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4122
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4079
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4064
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4109
const VPBlockBase * getExiting() const
Definition VPlan.h:4059
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4072
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2877
bool isSingleScalar() const
Definition VPlan.h:2922
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:2946
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3713
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:517
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:582
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:207
operand_range operands()
Definition VPlanValue.h:275
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:251
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:246
void addOperand(VPValue *Operand)
Definition VPlanValue.h:240
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:48
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:131
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition VPlanValue.h:183
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:85
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:193
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1377
unsigned getNumUsers() const
Definition VPlanValue.h:113
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition VPlanValue.h:178
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1381
user_range users()
Definition VPlanValue.h:134
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1850
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3608
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1493
Instruction::CastOps getOpcode() const
Definition VPlan.h:1544
A recipe for handling GEP instructions.
Definition VPlan.h:1778
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2051
PHINode * getPHINode() const
Definition VPlan.h:2093
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2079
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2096
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2126
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2207
A recipe for widening vector intrinsics.
Definition VPlan.h:1551
A common base class for widening memory operations.
Definition VPlan.h:3183
A recipe for widened phis.
Definition VPlan.h:2262
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1450
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition VPlan.h:4141
bool hasVF(ElementCount VF) const
Definition VPlan.h:4346
LLVMContext & getContext() const
Definition VPlan.h:4334
VPBasicBlock * getEntry()
Definition VPlan.h:4234
VPValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4325
bool hasScalableVF() const
Definition VPlan.h:4347
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4332
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4328
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4296
VPValue * getTrue()
Return a VPValue wrapping i1 true.
Definition VPlan.h:4403
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4317
unsigned getUF() const
Definition VPlan.h:4366
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4480
bool hasUF(unsigned UF) const
Definition VPlan.h:4364
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4286
VPValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4409
void setVF(ElementCount VF)
Definition VPlan.h:4340
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4379
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1011
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4310
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4259
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4458
VPValue * getFalse()
Return a VPValue wrapping i1 false.
Definition VPlan.h:4406
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4388
bool hasScalarVFOnly() const
Definition VPlan.h:4357
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4277
ArrayRef< VPValue * > getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4428
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4282
VPValue * getLiveIn(Value *V) const
Return the live-in VPValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4425
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4239
void setUF(unsigned UF)
Definition VPlan.h:4371
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4512
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2763
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
AllRecipe_commutative_match< Opcode, Op0_t, Op1_t > m_c_Binary(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLanePerPart, Op0_t > m_ExtractLastLanePerPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExtractLastElement, Op0_t > m_ExtractLastElement(const Op0_t &Op0)
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
specific_intval< 1 > m_True()
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector is matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
const SCEV * getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:477
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:236
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1622
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:550
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1718
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
RecurKind
These are the kinds of recurrences that we support.
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1954
DWARFExpression::Operation Op
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1961
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1758
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
Definition STLExtras.h:2108
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2088
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2304
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3314
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3274
A recipe for widening select instructions.
Definition VPlan.h:1736
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3396
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3354
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void addScalarResumePhis(VPlan &Plan, VPRecipeBuilder &Builder, DenseMap< VPValue *, VPValue * > &IVEndValues)
Create resume phis in the scalar preheader for first-order recurrences, reductions and inductions,...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, TypeSize VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first order reductions in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB by.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.