1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
44
45using namespace llvm;
46using namespace VPlanPatternMatch;
47
48bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
49 VPlan &Plan,
50 function_ref<const InductionDescriptor *(PHINode *)>
51 GetIntOrFpInductionDescriptor,
52 const TargetLibraryInfo &TLI) {
53
55 Plan.getVectorLoopRegion());
57 // Skip blocks outside region
58 if (!VPBB->getParent())
59 break;
60 VPRecipeBase *Term = VPBB->getTerminator();
61 auto EndIter = Term ? Term->getIterator() : VPBB->end();
62 // Introduce each ingredient into VPlan.
63 for (VPRecipeBase &Ingredient :
64 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
65
66 VPValue *VPV = Ingredient.getVPSingleValue();
67 if (!VPV->getUnderlyingValue())
68 continue;
69
70 Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
71
72 VPRecipeBase *NewRecipe = nullptr;
73 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
74 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
75 const auto *II = GetIntOrFpInductionDescriptor(Phi);
76 if (!II) {
77 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
78 for (VPValue *Op : PhiR->operands())
79 NewRecipe->addOperand(Op);
80 } else {
81 VPValue *Start = Plan.getOrAddLiveIn(II->getStartValue());
82 VPValue *Step =
84 // It is always safe to copy over the NoWrap and FastMath flags. In
85 // particular, when folding tail by masking, the masked-off lanes are
86 // never used, so it is safe.
88 NewRecipe = new VPWidenIntOrFpInductionRecipe(
89 Phi, Start, Step, &Plan.getVF(), *II, Flags,
90 Ingredient.getDebugLoc());
91 }
92 } else {
93 auto *VPI = cast<VPInstruction>(&Ingredient);
94 assert(!isa<PHINode>(Inst) && "phis should be handled above");
95 // Create VPWidenMemoryRecipe for loads and stores.
96 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
97 NewRecipe = new VPWidenLoadRecipe(
98 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
99 false /*Consecutive*/, false /*Reverse*/, *VPI,
100 Ingredient.getDebugLoc());
101 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
102 NewRecipe = new VPWidenStoreRecipe(
103 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
104 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
105 Ingredient.getDebugLoc());
106 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
107 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
108 Ingredient.getDebugLoc());
109 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
110 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
111 if (VectorID == Intrinsic::not_intrinsic)
112 return false;
113 NewRecipe = new VPWidenIntrinsicRecipe(
114 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
115 drop_end(Ingredient.operands()), CI->getType(), VPIRFlags(*CI),
116 *VPI, CI->getDebugLoc());
117 } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
118 NewRecipe = new VPWidenSelectRecipe(SI, Ingredient.operands(), *VPI,
119 *VPI, Ingredient.getDebugLoc());
120 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
121 NewRecipe = new VPWidenCastRecipe(
122 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
123 VPIRFlags(*CI), VPIRMetadata(*CI));
124 } else {
125 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
126 *VPI, Ingredient.getDebugLoc());
127 }
128 }
129
130 NewRecipe->insertBefore(&Ingredient);
131 if (NewRecipe->getNumDefinedValues() == 1)
132 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
133 else
134 assert(NewRecipe->getNumDefinedValues() == 0 &&
135 "Only recpies with zero or one defined values expected");
136 Ingredient.eraseFromParent();
137 }
138 }
139 return true;
140}
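// Illustrative example (recipe names are schematic, not taken from a test): a
// plain VPInstruction wrapping an IR load, e.g.
//   EMIT ir<%v> = load ir<%addr>
// is replaced above by a widened memory recipe
//   WIDEN ir<%v> = load ir<%addr>
// and a call is only converted if it maps to a vector intrinsic; otherwise the
// whole conversion bails out by returning false.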
141
142/// Return true if we do not know how to (mechanically) hoist or sink \p R out
143/// of a loop region.
144static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) {
145 // Assumes don't alias anything or throw; as long as they're guaranteed to
146 // execute, they're safe to hoist.
148 return false;
149
150 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
151 // memory location is not modified in the vector loop.
152 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
153 return true;
154
155 // Allocas cannot be hoisted.
156 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
157 return RepR && RepR->getOpcode() == Instruction::Alloca;
158}
159
160static bool sinkScalarOperands(VPlan &Plan) {
161 auto Iter = vp_depth_first_deep(Plan.getEntry());
162 bool ScalarVFOnly = Plan.hasScalarVFOnly();
163 bool Changed = false;
164
165 SmallSetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>, 8> WorkList;
166 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
167 VPBasicBlock *SinkTo, VPValue *Op) {
168 auto *Candidate =
169 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
170 if (!Candidate)
171 return;
172
173 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
174 // for now.
176 return;
177
178 if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
179 return;
180
181 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
182 if (!ScalarVFOnly && RepR->isSingleScalar())
183 return;
184
185 WorkList.insert({SinkTo, Candidate});
186 };
187
188 // First, collect the operands of all recipes in replicate blocks as seeds for
189 // sinking.
191 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
192 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
193 continue;
194 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
195 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
196 continue;
197 for (auto &Recipe : *VPBB)
198 for (VPValue *Op : Recipe.operands())
199 InsertIfValidSinkCandidate(VPBB, Op);
200 }
201
202 // Try to sink each replicate or scalar IV steps recipe in the worklist.
203 for (unsigned I = 0; I != WorkList.size(); ++I) {
204 VPBasicBlock *SinkTo;
205 VPSingleDefRecipe *SinkCandidate;
206 std::tie(SinkTo, SinkCandidate) = WorkList[I];
207
208 // All recipe users of SinkCandidate must be in the same block SinkTo or all
209 // users outside of SinkTo must only use the first lane of SinkCandidate. In
210 // the latter case, we need to duplicate SinkCandidate.
211 auto UsersOutsideSinkTo =
212 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
213 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
214 });
215 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
216 return !U->usesFirstLaneOnly(SinkCandidate);
217 }))
218 continue;
219 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
220
221 if (NeedsDuplicating) {
222 if (ScalarVFOnly)
223 continue;
224 VPSingleDefRecipe *Clone;
225 if (auto *SinkCandidateRepR =
226 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
227 // TODO: Handle converting to uniform recipes as separate transform,
228 // then cloning should be sufficient here.
229 Instruction *I = SinkCandidate->getUnderlyingInstr();
230 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
231 nullptr /*Mask*/, *SinkCandidateRepR,
232 *SinkCandidateRepR);
233 // TODO: add ".cloned" suffix to name of Clone's VPValue.
234 } else {
235 Clone = SinkCandidate->clone();
236 }
237
238 Clone->insertBefore(SinkCandidate);
239 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
240 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
241 });
242 }
243 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
244 for (VPValue *Op : SinkCandidate->operands())
245 InsertIfValidSinkCandidate(SinkTo, Op);
246 Changed = true;
247 }
248 return Changed;
249}
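// Illustrative example (names are schematic): a VPReplicateRecipe computing
// ir<%addr> in the loop body whose users all sit in the 'then' block of a
// predicated replicate region is moved into that block, so it only runs for
// active lanes. If ir<%addr> additionally had users outside the region that
// only demand its first lane, a single-scalar clone is left at the original
// position for them, while the moved recipe serves the in-region users.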
250
251/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
252/// the mask.
254 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
255 if (!EntryBB || EntryBB->size() != 1 ||
256 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
257 return nullptr;
258
259 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
260}
261
262/// If \p R is a triangle region, return the 'then' block of the triangle.
264 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
265 if (EntryBB->getNumSuccessors() != 2)
266 return nullptr;
267
268 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
269 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
270 if (!Succ0 || !Succ1)
271 return nullptr;
272
273 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
274 return nullptr;
275 if (Succ0->getSingleSuccessor() == Succ1)
276 return Succ0;
277 if (Succ1->getSingleSuccessor() == Succ0)
278 return Succ1;
279 return nullptr;
280}
281
282// Merge replicate regions in their successor region, if a replicate region
283// is connected to a successor replicate region with the same predicate by a
284// single, empty VPBasicBlock.
286 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
287
288 // Collect replicate regions followed by an empty block, followed by another
289 // replicate region with matching masks, so they can be processed up front.
290 // This is to avoid iterator invalidation issues while merging regions.
293 vp_depth_first_deep(Plan.getEntry()))) {
294 if (!Region1->isReplicator())
295 continue;
296 auto *MiddleBasicBlock =
297 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
298 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
299 continue;
300
301 auto *Region2 =
302 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
303 if (!Region2 || !Region2->isReplicator())
304 continue;
305
306 VPValue *Mask1 = getPredicatedMask(Region1);
307 VPValue *Mask2 = getPredicatedMask(Region2);
308 if (!Mask1 || Mask1 != Mask2)
309 continue;
310
311 assert(Mask1 && Mask2 && "both regions must have conditions");
312 WorkList.push_back(Region1);
313 }
314
315 // Move recipes from Region1 to its successor region, if both are triangles.
316 for (VPRegionBlock *Region1 : WorkList) {
317 if (TransformedRegions.contains(Region1))
318 continue;
319 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
320 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
321
322 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
323 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
324 if (!Then1 || !Then2)
325 continue;
326
327 // Note: No fusion-preventing memory dependencies are expected in either
328 // region. Such dependencies should be rejected during earlier dependence
329 // checks, which guarantee accesses can be re-ordered for vectorization.
330 //
331 // Move recipes to the successor region.
332 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
333 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
334
335 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
336 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
337
338 // Move VPPredInstPHIRecipes from the merge block to the successor region's
339 // merge block. Update all users inside the successor region to use the
340 // original values.
341 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
342 VPValue *PredInst1 =
343 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
344 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
345 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
346 return cast<VPRecipeBase>(&U)->getParent() == Then2;
347 });
348
349 // Remove phi recipes that are unused after merging the regions.
350 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
351 Phi1ToMove.eraseFromParent();
352 continue;
353 }
354 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
355 }
356
357 // Remove the dead recipes in Region1's entry block.
358 for (VPRecipeBase &R :
359 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
360 R.eraseFromParent();
361
362 // Finally, remove the first region.
363 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
364 VPBlockUtils::disconnectBlocks(Pred, Region1);
365 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
366 }
367 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
368 TransformedRegions.insert(Region1);
369 }
370
371 return !TransformedRegions.empty();
372}
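// Illustrative example (block names are schematic): two adjacent triangle
// regions guarded by the same mask,
//   pred.load  { entry -> if.0 -> continue.0 }  ->  empty block  ->
//   pred.store { entry -> if.1 -> continue.1 }
// are fused by moving the recipes of if.0 into if.1 and the VPPredInstPHIs of
// continue.0 into continue.1; the now-empty first region is then disconnected
// and left for cleanup.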
373
375 VPlan &Plan) {
376 Instruction *Instr = PredRecipe->getUnderlyingInstr();
377 // Build the triangular if-then region.
378 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
379 assert(Instr->getParent() && "Predicated instruction not in any basic block");
380 auto *BlockInMask = PredRecipe->getMask();
381 auto *MaskDef = BlockInMask->getDefiningRecipe();
382 auto *BOMRecipe = new VPBranchOnMaskRecipe(
383 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
384 auto *Entry =
385 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
386
387 // Replace predicated replicate recipe with a replicate recipe without a
388 // mask but in the replicate region.
389 auto *RecipeWithoutMask = new VPReplicateRecipe(
390 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
391 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
392 PredRecipe->getDebugLoc());
393 auto *Pred =
394 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
395
396 VPPredInstPHIRecipe *PHIRecipe = nullptr;
397 if (PredRecipe->getNumUsers() != 0) {
398 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
399 RecipeWithoutMask->getDebugLoc());
400 PredRecipe->replaceAllUsesWith(PHIRecipe);
401 PHIRecipe->setOperand(0, RecipeWithoutMask);
402 }
403 PredRecipe->eraseFromParent();
404 auto *Exiting =
405 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
407 Plan.createReplicateRegion(Entry, Exiting, RegionName);
408
409 // Note: first set Entry as region entry and then connect successors starting
410 // from it in order, to propagate the "parent" of each VPBasicBlock.
411 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
412 VPBlockUtils::connectBlocks(Pred, Exiting);
413
414 return Region;
415}
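// For a predicated instruction %i with mask %m, the region built above has
// roughly the following shape (suffixes match the Twines used above):
//   pred.<opcode>.entry:    branch-on-mask %m
//   pred.<opcode>.if:       single-scalar replicate of %i without the mask
//   pred.<opcode>.continue: VPPredInstPHI merging %i's value for masked-off
//                           lanes, created only if %i has users.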
416
417static void addReplicateRegions(VPlan &Plan) {
420 vp_depth_first_deep(Plan.getEntry()))) {
421 for (VPRecipeBase &R : *VPBB)
422 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
423 if (RepR->isPredicated())
424 WorkList.push_back(RepR);
425 }
426 }
427
428 unsigned BBNum = 0;
429 for (VPReplicateRecipe *RepR : WorkList) {
430 VPBasicBlock *CurrentBlock = RepR->getParent();
431 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
432
433 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
434 SplitBlock->setName(
435 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
436 // Record predicated instructions for above packing optimizations.
438 Region->setParent(CurrentBlock->getParent());
440
441 VPRegionBlock *ParentRegion = Region->getParent();
442 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
443 ParentRegion->setExiting(SplitBlock);
444 }
445}
446
447/// Remove redundant VPBasicBlocks by merging them into their predecessor if
448/// the predecessor has a single successor.
452 vp_depth_first_deep(Plan.getEntry()))) {
453 // Don't fold the blocks in the skeleton of the Plan into their single
454 // predecessors for now.
455 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
456 if (!VPBB->getParent())
457 continue;
458 auto *PredVPBB =
459 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
460 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
461 isa<VPIRBasicBlock>(PredVPBB))
462 continue;
463 WorkList.push_back(VPBB);
464 }
465
466 for (VPBasicBlock *VPBB : WorkList) {
467 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
468 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
469 R.moveBefore(*PredVPBB, PredVPBB->end());
470 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
471 auto *ParentRegion = VPBB->getParent();
472 if (ParentRegion && ParentRegion->getExiting() == VPBB)
473 ParentRegion->setExiting(PredVPBB);
474 for (auto *Succ : to_vector(VPBB->successors())) {
476 VPBlockUtils::connectBlocks(PredVPBB, Succ);
477 }
478 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
479 }
480 return !WorkList.empty();
481}
482
484 // Convert masked VPReplicateRecipes to if-then region blocks.
486
487 bool ShouldSimplify = true;
488 while (ShouldSimplify) {
489 ShouldSimplify = sinkScalarOperands(Plan);
490 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
491 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
492 }
493}
494
495/// Remove redundant casts of inductions.
496///
497/// Such redundant casts are casts of induction variables that can be ignored,
498/// because we already proved that the casted phi is equal to the uncasted phi
499/// in the vectorized loop. There is no need to vectorize the cast - the same
500/// value can be used for both the phi and casts in the vector loop.
502 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
504 if (!IV || IV->getTruncInst())
505 continue;
506
507 // A sequence of IR Casts has potentially been recorded for IV, which
508 // *must be bypassed* when the IV is vectorized, because the vectorized IV
509 // will produce the desired casted value. This sequence forms a def-use
510 // chain and is provided in reverse order, ending with the cast that uses
511 // the IV phi. Search for the recipe of the last cast in the chain and
512 // replace it with the original IV. Note that only the final cast is
513 // expected to have users outside the cast-chain and the dead casts left
514 // over will be cleaned up later.
515 auto &Casts = IV->getInductionDescriptor().getCastInsts();
516 VPValue *FindMyCast = IV;
517 for (Instruction *IRCast : reverse(Casts)) {
518 VPSingleDefRecipe *FoundUserCast = nullptr;
519 for (auto *U : FindMyCast->users()) {
520 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
521 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
522 FoundUserCast = UserCast;
523 break;
524 }
525 }
526 FindMyCast = FoundUserCast;
527 }
528 FindMyCast->replaceAllUsesWith(IV);
529 }
530}
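// Illustrative example (the cast chain is hypothetical): if the induction
// descriptor of %iv recorded the chain
//   %t = trunc %iv to i32 ; %e = sext %t to i64
// then the recipe of %e, the last cast reached by walking users starting from
// %iv, is replaced by %iv itself; the remaining dead casts are removed later.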
531
532/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
533/// recipe, if it exists.
535 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
536 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
537 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
538 for (VPUser *U : CanonicalIV->users()) {
540 if (WidenNewIV)
541 break;
542 }
543
544 if (!WidenNewIV)
545 return;
546
547 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
548 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
549 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
550
551 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
552 continue;
553
554 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
555 // everything WidenNewIV's users need. That is, WidenOriginalIV will
556 // generate a vector phi or all users of WidenNewIV demand the first lane
557 // only.
558 if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
559 vputils::onlyFirstLaneUsed(WidenNewIV)) {
560 // We are replacing a wide canonical iv with a suitable wide induction.
561 // This is used to compute the header mask, hence all lanes will be used
562 // and we need to drop wrap flags that only apply to lanes guaranteed to
563 // execute in the original scalar loop.
564 WidenOriginalIV->dropPoisonGeneratingFlags();
565 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
566 WidenNewIV->eraseFromParent();
567 return;
568 }
569 }
570}
571
572/// Returns true if \p R is dead and can be removed.
573static bool isDeadRecipe(VPRecipeBase &R) {
574 // Do remove conditional assume instructions as their conditions may be
575 // flattened.
576 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
577 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
579 if (IsConditionalAssume)
580 return true;
581
582 if (R.mayHaveSideEffects())
583 return false;
584
585 // Recipe is dead if no user keeps the recipe alive.
586 return all_of(R.definedValues(),
587 [](VPValue *V) { return V->getNumUsers() == 0; });
588}
589
592 vp_post_order_deep(Plan.getEntry()))) {
593 // The recipes in the block are processed in reverse order, to catch chains
594 // of dead recipes.
595 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
596 if (isDeadRecipe(R)) {
597 R.eraseFromParent();
598 continue;
599 }
600
601 // Check if R is a dead VPPhi <-> update cycle and remove it.
602 auto *PhiR = dyn_cast<VPPhi>(&R);
603 if (!PhiR || PhiR->getNumOperands() != 2)
604 continue;
605 VPUser *PhiUser = PhiR->getSingleUser();
606 if (!PhiUser)
607 continue;
608 VPValue *Incoming = PhiR->getOperand(1);
609 if (PhiUser != Incoming->getDefiningRecipe() ||
610 Incoming->getNumUsers() != 1)
611 continue;
612 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
613 PhiR->eraseFromParent();
614 Incoming->getDefiningRecipe()->eraseFromParent();
615 }
616 }
617}
618
621 Instruction::BinaryOps InductionOpcode,
622 FPMathOperator *FPBinOp, Instruction *TruncI,
623 VPValue *StartV, VPValue *Step, DebugLoc DL,
624 VPBuilder &Builder) {
625 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
626 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
627 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
628 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
629 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
630
631 // Truncate base induction if needed.
632 VPTypeAnalysis TypeInfo(Plan);
633 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
634 if (TruncI) {
635 Type *TruncTy = TruncI->getType();
636 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
637 "Not truncating.");
638 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
639 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
640 ResultTy = TruncTy;
641 }
642
643 // Truncate step if needed.
644 Type *StepTy = TypeInfo.inferScalarType(Step);
645 if (ResultTy != StepTy) {
646 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
647 "Not truncating.");
648 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
649 auto *VecPreheader =
651 VPBuilder::InsertPointGuard Guard(Builder);
652 Builder.setInsertPoint(VecPreheader);
653 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
654 }
655 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
656 &Plan.getVF(), DL);
657}
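// The scalar-steps recipe built here materializes, per lane, the scalar values
//   offset.idx + 0 * Step, offset.idx + 1 * Step, ..., offset.idx + (VF-1) * Step
// (using the FP induction binop for floating-point inductions), which is what
// scalar users of a widened induction consume after the optimization below.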
658
661 for (unsigned I = 0; I != Users.size(); ++I) {
663 if (isa<VPHeaderPHIRecipe>(Cur))
664 continue;
665 for (VPValue *V : Cur->definedValues())
666 Users.insert_range(V->users());
667 }
668 return Users.takeVector();
669}
670
671/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
672/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
673/// VPWidenPointerInductionRecipe will generate vectors only. If some users
674 require vectors while others require scalars, the scalar uses need to extract
675/// the scalars from the generated vectors (Note that this is different to how
676/// int/fp inductions are handled). Legalize extract-from-ends using uniform
677/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
678/// the correct end value is available. Also optimize
679/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
680/// providing them scalar steps built on the canonical scalar IV and update the
681 original IV's users. This is an optional optimization to reduce the need for
682/// vector extracts.
685 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
686 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
687 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
688 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
689 if (!PhiR)
690 continue;
691
692 // Try to narrow wide and replicating recipes to uniform recipes, based on
693 // VPlan analysis.
694 // TODO: Apply to all recipes in the future, to replace legacy uniformity
695 // analysis.
696 auto Users = collectUsersRecursively(PhiR);
697 for (VPUser *U : reverse(Users)) {
698 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
699 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
700 // Skip recipes that shouldn't be narrowed.
701 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
702 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
703 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
704 continue;
705
706 // Skip recipes that may have other lanes than their first used.
708 continue;
709
710 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
711 Def->operands(), /*IsUniform*/ true,
712 /*Mask*/ nullptr, /*Flags*/ *Def);
713 Clone->insertAfter(Def);
714 Def->replaceAllUsesWith(Clone);
715 }
716
717 // Replace wide pointer inductions which have only their scalars used by
718 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
719 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
720 if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
721 continue;
722
723 const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
724 VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0);
725 VPValue *StepV = PtrIV->getOperand(1);
727 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
728 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
729
730 VPValue *PtrAdd = Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
731 PtrIV->getDebugLoc(), "next.gep");
732
733 PtrIV->replaceAllUsesWith(PtrAdd);
734 continue;
735 }
736
737 // Replace widened induction with scalar steps for users that only use
738 // scalars.
739 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
740 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
741 return U->usesScalars(WideIV);
742 }))
743 continue;
744
745 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
747 Plan, ID.getKind(), ID.getInductionOpcode(),
748 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
749 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
750 WideIV->getDebugLoc(), Builder);
751
752 // Update scalar users of IV to use Step instead.
753 if (!HasOnlyVectorVFs)
754 WideIV->replaceAllUsesWith(Steps);
755 else
756 WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
757 return U.usesScalars(WideIV);
758 });
759 }
760}
761
762/// Check if \p VPV is an untruncated wide induction, either before or after the
763/// increment. If so return the header IV (before the increment), otherwise
764/// return null.
766 ScalarEvolution &SE) {
767 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
768 if (WideIV) {
769 // VPV itself is a wide induction, separately compute the end value for exit
770 // users if it is not a truncated IV.
771 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
772 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
773 }
774
775 // Check if VPV is an optimizable induction increment.
776 VPRecipeBase *Def = VPV->getDefiningRecipe();
777 if (!Def || Def->getNumOperands() != 2)
778 return nullptr;
779 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
780 if (!WideIV)
781 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
782 if (!WideIV)
783 return nullptr;
784
785 auto IsWideIVInc = [&]() {
786 auto &ID = WideIV->getInductionDescriptor();
787
788 // Check if VPV increments the induction by the induction step.
789 VPValue *IVStep = WideIV->getStepValue();
790 switch (ID.getInductionOpcode()) {
791 case Instruction::Add:
792 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
793 case Instruction::FAdd:
795 m_Specific(IVStep)));
796 case Instruction::FSub:
797 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
798 m_Specific(IVStep)));
799 case Instruction::Sub: {
800 // IVStep will be the negated step of the subtraction. Check if Step == -1
801 // * IVStep.
802 VPValue *Step;
803 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
804 return false;
805 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, SE);
806 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, SE);
807 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
808 !isa<SCEVCouldNotCompute>(StepSCEV) &&
809 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
810 }
811 default:
812 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
813 match(VPV, m_GetElementPtr(m_Specific(WideIV),
814 m_Specific(WideIV->getStepValue())));
815 }
816 llvm_unreachable("should have been covered by switch above");
817 };
818 return IsWideIVInc() ? WideIV : nullptr;
819}
820
821/// Attempts to optimize the induction variable exit values for users in the
822/// early exit block.
824 VPTypeAnalysis &TypeInfo,
825 VPBlockBase *PredVPBB,
826 VPValue *Op,
827 ScalarEvolution &SE) {
828 VPValue *Incoming, *Mask;
831 return nullptr;
832
833 auto *WideIV = getOptimizableIVOf(Incoming, SE);
834 if (!WideIV)
835 return nullptr;
836
837 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
838 if (WideIntOrFp && WideIntOrFp->getTruncInst())
839 return nullptr;
840
841 // Calculate the final index.
842 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
843 auto *CanonicalIV = LoopRegion->getCanonicalIV();
844 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
845 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
846
847 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
848 VPValue *FirstActiveLane =
849 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
850 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
851 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
852 FirstActiveLaneType, DL);
853 VPValue *EndValue =
854 B.createNaryOp(Instruction::Add, {CanonicalIV, FirstActiveLane}, DL);
855
856 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
857 // changed it means the exit is using the incremented value, so we need to
858 // add the step.
859 if (Incoming != WideIV) {
860 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
861 EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
862 }
863
864 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
865 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
866 VPValue *Start = WideIV->getStartValue();
867 VPValue *Step = WideIV->getStepValue();
868 EndValue = B.createDerivedIV(
869 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
870 Start, EndValue, Step);
871 }
872
873 return EndValue;
874}
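// Worked sketch (values are schematic): for an early exit taken under mask %m
// in a vector iteration where the canonical IV has value %civ, a wide IV with
// start %s and step %st gets the exit value
//   %lane = first-active-lane(%m)
//   %end  = %civ + %lane               ; plus 1 if the incremented IV escaped
//   %res  = derived-iv(%s, %end, %st)  ; i.e. %s + %end * %st for integers
// instead of extracting lane %lane from the widened IV vector.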
875
876/// Attempts to optimize the induction variable exit values for users in the
877/// exit block coming from the latch in the original scalar loop.
879 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
883 return nullptr;
884
885 auto *WideIV = getOptimizableIVOf(Incoming, SE);
886 if (!WideIV)
887 return nullptr;
888
889 VPValue *EndValue = EndValues.lookup(WideIV);
890 assert(EndValue && "end value must have been pre-computed");
891
892 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
893 // changed it means the exit is using the incremented value, so we don't
894 // need to subtract the step.
895 if (Incoming != WideIV)
896 return EndValue;
897
898 // Otherwise, subtract the step from the EndValue.
899 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
900 VPValue *Step = WideIV->getStepValue();
901 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
902 if (ScalarTy->isIntegerTy())
903 return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
904 if (ScalarTy->isPointerTy()) {
905 Type *StepTy = TypeInfo.inferScalarType(Step);
906 auto *Zero = Plan.getConstantInt(StepTy, 0);
907 return B.createPtrAdd(EndValue,
908 B.createNaryOp(Instruction::Sub, {Zero, Step}),
909 DebugLoc::getUnknown(), "ind.escape");
910 }
911 if (ScalarTy->isFloatingPointTy()) {
912 const auto &ID = WideIV->getInductionDescriptor();
913 return B.createNaryOp(
914 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
915 ? Instruction::FSub
916 : Instruction::FAdd,
917 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
918 }
919 llvm_unreachable("all possible induction types must be handled");
920 return nullptr;
921}
922
924 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
925 ScalarEvolution &SE) {
926 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
927 VPTypeAnalysis TypeInfo(Plan);
928 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
929 for (VPRecipeBase &R : ExitVPBB->phis()) {
930 auto *ExitIRI = cast<VPIRPhi>(&R);
931
932 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
933 VPValue *Escape = nullptr;
934 if (PredVPBB == MiddleVPBB)
935 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
936 ExitIRI->getOperand(Idx),
937 EndValues, SE);
938 else
939 Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
940 ExitIRI->getOperand(Idx), SE);
941 if (Escape)
942 ExitIRI->setOperand(Idx, Escape);
943 }
944 }
945 }
946}
947
948/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
949/// them with already existing recipes expanding the same SCEV expression.
952
953 for (VPRecipeBase &R :
955 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
956 if (!ExpR)
957 continue;
958
959 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
960 if (Inserted)
961 continue;
962 ExpR->replaceAllUsesWith(V->second);
963 ExpR->eraseFromParent();
964 }
965}
966
968 SmallVector<VPValue *> WorkList;
970 WorkList.push_back(V);
971
972 while (!WorkList.empty()) {
973 VPValue *Cur = WorkList.pop_back_val();
974 if (!Seen.insert(Cur).second)
975 continue;
977 if (!R)
978 continue;
979 if (!isDeadRecipe(*R))
980 continue;
981 append_range(WorkList, R->operands());
982 R->eraseFromParent();
983 }
984}
985
986/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
987/// Returns an optional pair, where the first element indicates whether it is
988/// an intrinsic ID.
989static std::optional<std::pair<bool, unsigned>>
991 return TypeSwitch<const VPSingleDefRecipe *,
992 std::optional<std::pair<bool, unsigned>>>(R)
995 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
996 .Case<VPWidenIntrinsicRecipe>([](auto *I) {
997 return std::make_pair(true, I->getVectorIntrinsicID());
998 })
999 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
1000 // For recipes that do not directly map to LLVM IR instructions,
1001 // assign opcodes after the last VPInstruction opcode (which is also
1002 // after the last IR Instruction opcode), based on the VPDefID.
1003 return std::make_pair(false,
1004 VPInstruction::OpsEnd + 1 + I->getVPDefID());
1005 })
1006 .Default([](auto *) { return std::nullopt; });
1007}
1008
1009/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1010/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1011/// Operands are foldable live-ins.
1013 ArrayRef<VPValue *> Operands,
1014 const DataLayout &DL,
1015 VPTypeAnalysis &TypeInfo) {
1016 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1017 if (!OpcodeOrIID)
1018 return nullptr;
1019
1021 for (VPValue *Op : Operands) {
1022 if (!Op->isLiveIn() || !Op->getLiveInIRValue())
1023 return nullptr;
1024 Ops.push_back(Op->getLiveInIRValue());
1025 }
1026
1027 auto FoldToIRValue = [&]() -> Value * {
1028 InstSimplifyFolder Folder(DL);
1029 if (OpcodeOrIID->first) {
1030 if (R.getNumOperands() != 2)
1031 return nullptr;
1032 unsigned ID = OpcodeOrIID->second;
1033 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1034 TypeInfo.inferScalarType(&R));
1035 }
1036 unsigned Opcode = OpcodeOrIID->second;
1037 if (Instruction::isBinaryOp(Opcode))
1038 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1039 Ops[0], Ops[1]);
1040 if (Instruction::isCast(Opcode))
1041 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1042 TypeInfo.inferScalarType(R.getVPSingleValue()));
1043 switch (Opcode) {
1045 return Folder.FoldSelect(Ops[0], Ops[1],
1047 case VPInstruction::Not:
1048 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1050 case Instruction::Select:
1051 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1052 case Instruction::ICmp:
1053 case Instruction::FCmp:
1054 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1055 Ops[1]);
1056 case Instruction::GetElementPtr: {
1057 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1058 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1059 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1060 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1061 }
1064 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1065 Ops[0], Ops[1],
1066 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1067 // An extract of a live-in is an extract of a broadcast, so return the
1068 // broadcasted element.
1069 case Instruction::ExtractElement:
1070 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1071 return Ops[0];
1072 }
1073 return nullptr;
1074 };
1075
1076 if (Value *V = FoldToIRValue())
1077 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1078 return nullptr;
1079}
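// Example: a VPInstruction computing add(live-in i64 2, live-in i64 3) has all
// operands available as IR constants, so the folder above produces the IR
// constant i64 5 and the recipe's users are rewired to a live-in wrapping it.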
1080
1081/// Try to simplify VPSingleDefRecipe \p Def.
1083 VPlan *Plan = Def->getParent()->getPlan();
1084
1085 // Simplification of live-in IR values for SingleDef recipes using
1086 // InstSimplifyFolder.
1087 const DataLayout &DL =
1089 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1090 return Def->replaceAllUsesWith(V);
1091
1092 // Fold PredPHI LiveIn -> LiveIn.
1093 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1094 VPValue *Op = PredPHI->getOperand(0);
1095 if (Op->isLiveIn())
1096 PredPHI->replaceAllUsesWith(Op);
1097 }
1098
1099 VPBuilder Builder(Def);
1100 VPValue *A;
1101 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1102 Type *TruncTy = TypeInfo.inferScalarType(Def);
1103 Type *ATy = TypeInfo.inferScalarType(A);
1104 if (TruncTy == ATy) {
1105 Def->replaceAllUsesWith(A);
1106 } else {
1107 // Don't replace a scalarizing recipe with a widened cast.
1108 if (isa<VPReplicateRecipe>(Def))
1109 return;
1110 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1111
1112 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1113 ? Instruction::SExt
1114 : Instruction::ZExt;
1115 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1116 TruncTy);
1117 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1118 // UnderlyingExt has distinct return type, used to retain legacy cost.
1119 Ext->setUnderlyingValue(UnderlyingExt);
1120 }
1121 Def->replaceAllUsesWith(Ext);
1122 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1123 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1124 Def->replaceAllUsesWith(Trunc);
1125 }
1126 }
1127#ifndef NDEBUG
1128 // Verify that the cached type info for both A and its users is still
1129 // accurate by comparing it to freshly computed types.
1130 VPTypeAnalysis TypeInfo2(*Plan);
1131 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1132 for (VPUser *U : A->users()) {
1133 auto *R = cast<VPRecipeBase>(U);
1134 for (VPValue *VPV : R->definedValues())
1135 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1136 }
1137#endif
1138 }
1139
1140 // Simplify (X && Y) || (X && !Y) -> X.
1141 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1142 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1143 // recipes to be visited during simplification.
1144 VPValue *X, *Y, *Z;
1145 if (match(Def,
1148 Def->replaceAllUsesWith(X);
1149 Def->eraseFromParent();
1150 return;
1151 }
1152
1153 // x | 1 -> 1
1154 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1155 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1156
1157 // x | 0 -> x
1158 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1159 return Def->replaceAllUsesWith(X);
1160
1161 // x & 0 -> 0
1162 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1163 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1164
1165 // x && false -> false
1166 if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
1167 return Def->replaceAllUsesWith(Def->getOperand(1));
1168
1169 // (x && y) || (x && z) -> x && (y || z)
1172 // Simplify only if one of the operands has one use to avoid creating an
1173 // extra recipe.
1174 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1175 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1176 return Def->replaceAllUsesWith(
1177 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1178
1179 // x && !x -> 0
1181 return Def->replaceAllUsesWith(Plan->getFalse());
1182
1183 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1184 return Def->replaceAllUsesWith(X);
1185
1186 // select !c, x, y -> select c, y, x
1187 VPValue *C;
1188 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1189 Def->setOperand(0, C);
1190 Def->setOperand(1, Y);
1191 Def->setOperand(2, X);
1192 return;
1193 }
1194
1195 // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
1196 // tail folding it is likely that x is a header mask and can be simplified
1197 // further.
1199 m_VPValue(Z))) &&
1200 X->hasMoreThanOneUniqueUser())
1201 return Def->replaceAllUsesWith(
1202 Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
1203
1204 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1205 return Def->replaceAllUsesWith(A);
1206
1207 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1208 return Def->replaceAllUsesWith(
1209 Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0));
1210
1211 if (match(Def, m_Not(m_VPValue(A)))) {
1212 if (match(A, m_Not(m_VPValue(A))))
1213 return Def->replaceAllUsesWith(A);
1214
1215 // Try to fold Not into compares by adjusting the predicate in-place.
1216 CmpPredicate Pred;
1217 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1218 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1219 if (all_of(Cmp->users(),
1221 m_Not(m_Specific(Cmp)),
1222 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1223 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1224 for (VPUser *U : to_vector(Cmp->users())) {
1225 auto *R = cast<VPSingleDefRecipe>(U);
1226 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1227 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1228 R->setOperand(1, Y);
1229 R->setOperand(2, X);
1230 } else {
1231 // not (cmp pred) -> cmp inv_pred
1232 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1233 R->replaceAllUsesWith(Cmp);
1234 }
1235 }
1236 // If Cmp doesn't have a debug location, use the one from the negation,
1237 // to preserve the location.
1238 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1239 Cmp->setDebugLoc(Def->getDebugLoc());
1240 }
1241 }
1242 }
1243
1244 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1245 // This is useful for fmax/fmin without fast-math flags, where we need to
1246 // check if any operand is NaN.
1248 m_Deferred(X)),
1250 m_Deferred(Y))))) {
1251 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1252 return Def->replaceAllUsesWith(NewCmp);
1253 }
1254
1255 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1256 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1257 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1258 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1259 TypeInfo.inferScalarType(Def))
1260 return Def->replaceAllUsesWith(Def->getOperand(1));
1261
1263 m_One()))) {
1264 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1265 if (TypeInfo.inferScalarType(X) != WideStepTy)
1266 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1267 Def->replaceAllUsesWith(X);
1268 return;
1269 }
1270
1271 // For i1 vp.merges produced by AnyOf reductions:
1272 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1274 m_VPValue(X), m_VPValue())) &&
1276 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1277 Def->setOperand(1, Def->getOperand(0));
1278 Def->setOperand(0, Y);
1279 return;
1280 }
1281
1282 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1283 if (Phi->getOperand(0) == Phi->getOperand(1))
1284 Phi->replaceAllUsesWith(Phi->getOperand(0));
1285 return;
1286 }
1287
1288 // Look through ExtractLastElement (BuildVector ....).
1291 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1292 Def->replaceAllUsesWith(
1293 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1294 return;
1295 }
1296
1297 // Look through ExtractPenultimateElement (BuildVector ....).
1299 m_BuildVector()))) {
1300 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1301 Def->replaceAllUsesWith(
1302 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1303 return;
1304 }
1305
1306 uint64_t Idx;
1308 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1309 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1310 return;
1311 }
1312
1313 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1314 Def->replaceAllUsesWith(
1315 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1316 return;
1317 }
1318
1319 // Look through broadcast of single-scalar when used as select conditions; in
1320 // that case the scalar condition can be used directly.
1321 if (match(Def,
1324 "broadcast operand must be single-scalar");
1325 Def->setOperand(0, C);
1326 return;
1327 }
1328
1329 if (auto *Phi = dyn_cast<VPPhi>(Def)) {
1330 if (Phi->getNumOperands() == 1)
1331 Phi->replaceAllUsesWith(Phi->getOperand(0));
1332 return;
1333 }
1334
1335 // Some simplifications can only be applied after unrolling. Perform them
1336 // below.
1337 if (!Plan->isUnrolled())
1338 return;
1339
1340 // Hoist an invariant increment Y of a phi X, by having X start at Y.
1341 if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && Y->isLiveIn() &&
1342 isa<VPPhi>(X)) {
1343 auto *Phi = cast<VPPhi>(X);
1344 if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
1345 Phi->getSingleUser() == Def) {
1346 Phi->setOperand(0, Y);
1347 Def->replaceAllUsesWith(Phi);
1348 return;
1349 }
1350 }
1351
1352 // VPVectorPointer for part 0 can be replaced by their start pointer.
1353 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Def)) {
1354 if (VecPtr->isFirstPart()) {
1355 VecPtr->replaceAllUsesWith(VecPtr->getOperand(0));
1356 return;
1357 }
1358 }
1359
1360 // VPScalarIVSteps for part 0 can be replaced by their start value, if only
1361 // the first lane is demanded.
1362 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1363 if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
1364 Steps->replaceAllUsesWith(Steps->getOperand(0));
1365 return;
1366 }
1367 }
1368 // Simplify redundant ReductionStartVector recipes after unrolling.
1369 VPValue *StartV;
1371 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1372 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1373 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1374 return PhiR && PhiR->isInLoop();
1375 });
1376 return;
1377 }
1378
1379 if (match(Def,
1382 Def->replaceAllUsesWith(A);
1383 return;
1384 }
1385
1390 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1391 all_of(A->users(),
1392 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1393 return Def->replaceAllUsesWith(A);
1394 }
1395
1396 if (Plan->getUF() == 1 &&
1398 return Def->replaceAllUsesWith(
1399 Builder.createNaryOp(VPInstruction::ExtractLastElement, {A}));
1400 }
1401}
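// Example of one of the folds above: in a header-mask style expression
//   or(logical-and(%x, %y), logical-and(%x, not(%y)))
// both or operands are guarded by %x and together cover both cases of %y, so
// the whole expression simplifies to %x.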
1402
1405 Plan.getEntry());
1406 VPTypeAnalysis TypeInfo(Plan);
1408 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1409 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1410 simplifyRecipe(Def, TypeInfo);
1411 }
1412}
1413
1415 if (Plan.hasScalarVFOnly())
1416 return;
1417
1418 // Try to narrow wide and replicating recipes to single scalar recipes,
1419 // based on VPlan analysis. Only process blocks in the loop region for now,
1420 // without traversing into nested regions, as recipes in replicate regions
1421 // cannot be converted yet.
1424 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1426 continue;
1427 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1428 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1429 continue;
1430
1431 auto *RepOrWidenR = cast<VPRecipeWithIRFlags>(&R);
1432 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1433 vputils::isSingleScalar(RepR->getOperand(1))) {
1434 auto *Clone = new VPReplicateRecipe(
1435 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1436 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1437 *RepR /*Metadata*/, RepR->getDebugLoc());
1438 Clone->insertBefore(RepOrWidenR);
1439 unsigned ExtractOpc =
1440 vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1))
1443 auto *Ext = new VPInstruction(ExtractOpc, {Clone->getOperand(0)});
1444 Ext->insertBefore(Clone);
1445 Clone->setOperand(0, Ext);
1446 RepR->eraseFromParent();
1447 continue;
1448 }
1449
1450 // Skip recipes that aren't single scalars or don't have only their
1451 // scalar results used. In the latter case, we would introduce extra
1452 // broadcasts.
1453 if (!vputils::isSingleScalar(RepOrWidenR) ||
1454 !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
1455 if (auto *Store = dyn_cast<VPWidenStoreRecipe>(U)) {
1456 // VPWidenStore doesn't have users, and stores are always
1457 // profitable to widen: hence, permitting address and mask
1458 // operands, and single-scalar stored values is an important leaf
1459 // condition. The assert must hold as we checked the RepOrWidenR
1460 // operand against vputils::isSingleScalar.
1461 assert(RepOrWidenR != Store->getStoredValue() ||
1462 vputils::isSingleScalar(Store->getStoredValue()));
1463 return true;
1464 }
1465
1466 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1467 unsigned Opcode = VPI->getOpcode();
1468 if (Opcode == VPInstruction::ExtractLastElement ||
1471 return true;
1472 }
1473
1474 return U->usesScalars(RepOrWidenR);
1475 }))
1476 continue;
1477
1478 auto *Clone = new VPReplicateRecipe(
1479 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1480 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1481 Clone->insertBefore(RepOrWidenR);
1482 RepOrWidenR->replaceAllUsesWith(Clone);
1483 if (isDeadRecipe(*RepOrWidenR))
1484 RepOrWidenR->eraseFromParent();
1485 }
1486 }
1487}
1488
1489/// Check whether all of \p Blend's masks share a common value that is
1490/// logically and'ed in, and if so remove it from the masks.
1492 if (Blend->isNormalized())
1493 return;
1494 VPValue *CommonEdgeMask;
1495 if (!match(Blend->getMask(0),
1496 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1497 return;
1498 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1499 if (!match(Blend->getMask(I),
1500 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1501 return;
1502 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1503 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1504}
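// Example: if a blend's masks are (C && M0), (C && M1) and (C && M2), the
// common operand C is stripped above, leaving M0, M1 and M2 as the masks for
// the normalization below.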
1505
1506/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1507/// to make sure the masks are simplified.
1508static void simplifyBlends(VPlan &Plan) {
1511 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1512 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1513 if (!Blend)
1514 continue;
1515
1516 removeCommonBlendMask(Blend);
1517
1518 // Try to remove redundant blend recipes.
1519 SmallPtrSet<VPValue *, 4> UniqueValues;
1520 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1521 UniqueValues.insert(Blend->getIncomingValue(0));
1522 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1523 if (!match(Blend->getMask(I), m_False()))
1524 UniqueValues.insert(Blend->getIncomingValue(I));
1525
1526 if (UniqueValues.size() == 1) {
1527 Blend->replaceAllUsesWith(*UniqueValues.begin());
1528 Blend->eraseFromParent();
1529 continue;
1530 }
1531
1532 if (Blend->isNormalized())
1533 continue;
1534
1535 // Normalize the blend so its first incoming value is used as the initial
1536 // value with the others blended into it.
1537
1538 unsigned StartIndex = 0;
1539 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1540 // If a value's mask is used only by the blend then it can be dead-coded.
1541 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1542 // that's used by multiple blends where it can be removed from them all.
1543 VPValue *Mask = Blend->getMask(I);
1544 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1545 StartIndex = I;
1546 break;
1547 }
1548 }
1549
1550 SmallVector<VPValue *, 4> OperandsWithMask;
1551 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1552
1553 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1554 if (I == StartIndex)
1555 continue;
1556 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1557 OperandsWithMask.push_back(Blend->getMask(I));
1558 }
1559
1560 auto *NewBlend =
1561 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1562 OperandsWithMask, Blend->getDebugLoc());
1563 NewBlend->insertBefore(&R);
1564
1565 VPValue *DeadMask = Blend->getMask(StartIndex);
1566 Blend->replaceAllUsesWith(NewBlend);
1567 Blend->eraseFromParent();
1569
1570 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1571 VPValue *NewMask;
1572 if (NewBlend->getNumOperands() == 3 &&
1573 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1574 VPValue *Inc0 = NewBlend->getOperand(0);
1575 VPValue *Inc1 = NewBlend->getOperand(1);
1576 VPValue *OldMask = NewBlend->getOperand(2);
1577 NewBlend->setOperand(0, Inc1);
1578 NewBlend->setOperand(1, Inc0);
1579 NewBlend->setOperand(2, NewMask);
1580 if (OldMask->getNumUsers() == 0)
1581 cast<VPInstruction>(OldMask)->eraseFromParent();
1582 }
1583 }
1584 }
1585}
1586
1587/// Optimize the width of vector induction variables in \p Plan based on a known
1588/// constant trip count, \p BestVF and \p BestUF.
1590 ElementCount BestVF,
1591 unsigned BestUF) {
1592 // Only proceed if we have not completely removed the vector region.
1593 if (!Plan.getVectorLoopRegion())
1594 return false;
1595
1596 const APInt *TC;
1597 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
1598 return false;
1599
1600 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1601 // and UF. Returns at least 8.
1602 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1603 APInt AlignedTC =
1606 APInt MaxVal = AlignedTC - 1;
1607 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1608 };
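  // Worked example (assuming the trip count gets rounded up to a multiple of
  // VF * UF): TC = 1000 with VF * UF = 8 yields AlignedTC = 1000 and
  // MaxVal = 999, which has 10 active bits; the next power of two is 16, so
  // the IV is shrunk to i16.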
1609 unsigned NewBitWidth =
1610 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1611
1612 LLVMContext &Ctx = Plan.getContext();
1613 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1614
1615 bool MadeChange = false;
1616
1617 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1618 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1619 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1620
1621 // Currently only handle canonical IVs as it is trivial to replace the start
1622 // and stop values, and we currently only perform the optimization when the
1623 // IV has a single use.
1624 if (!WideIV || !WideIV->isCanonical() ||
1625 WideIV->hasMoreThanOneUniqueUser() ||
1626 NewIVTy == WideIV->getScalarType())
1627 continue;
1628
1629 // Currently only handle cases where the single user is a header-mask
1630 // comparison with the backedge-taken-count.
1631 VPUser *SingleUser = WideIV->getSingleUser();
1632 if (!SingleUser ||
1633 !match(SingleUser, m_ICmp(m_Specific(WideIV),
1636 continue;
1637
1638 // Update IV operands and comparison bound to use new narrower type.
1639 auto *NewStart = Plan.getConstantInt(NewIVTy, 0);
1640 WideIV->setStartValue(NewStart);
1641 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
1642 WideIV->setStepValue(NewStep);
1643
1644 auto *NewBTC = new VPWidenCastRecipe(
1645 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1646 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1647 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
1648 Cmp->setOperand(1, NewBTC);
1649
1650 MadeChange = true;
1651 }
1652
1653 return MadeChange;
1654}
1655
1656/// Return true if \p Cond is known to be true for the given \p BestVF and \p
1657/// BestUF.
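/// This handles a disjunction of conditions (known true if any operand is
/// known true) and the latch compare of the canonical IV's backedge value
/// against the vector trip count, proven via SCEV when trip count == VF * UF.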
1659 ElementCount BestVF, unsigned BestUF,
1660 ScalarEvolution &SE) {
1662 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1663 &SE](VPValue *C) {
1664 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE);
1665 });
1666
1667 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
1669 m_Specific(CanIV->getBackedgeValue()),
1670 m_Specific(&Plan.getVectorTripCount()))))
1671 return false;
1672
1673  // The compare checks CanIV + VFxUF == vector trip count. If the vector trip
1674  // count is not yet available as a SCEV, fall back to comparing against the
1675  // original trip count; this is stricter than necessary, as we will then only
1676  // return true if the trip count == vector trip count.
1677 const SCEV *VectorTripCount =
1679 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1680 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1681 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1682 "Trip count SCEV must be computable");
1683 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1684 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1685 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1686}
1687
1688/// Try to replace multiple active lane masks used for control flow with
1689/// a single, wide active lane mask instruction followed by multiple
1690/// extract subvector intrinsics. This applies to the active lane mask
1691/// instructions both in the loop and in the preheader.
1692/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1693/// new extracts from the first active lane mask, which has its last
1694/// operand (multiplier) set to UF.
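/// For example (sketch), with UF = 2 the two per-part masks of VF lanes are
/// replaced by one mask of 2 * VF lanes, which is split back into parts with
/// llvm.vector.extract at element offsets 0 and VF.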
1696 unsigned UF) {
1697 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1698 return false;
1699
1700 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1701 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1702 auto *Term = &ExitingVPBB->back();
1703
1704 using namespace llvm::VPlanPatternMatch;
1706 m_VPValue(), m_VPValue(), m_VPValue())))))
1707 return false;
1708
1709 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1710 LLVMContext &Ctx = Plan.getContext();
1711
1712 auto ExtractFromALM = [&](VPInstruction *ALM,
1713 SmallVectorImpl<VPValue *> &Extracts) {
1714 DebugLoc DL = ALM->getDebugLoc();
1715 for (unsigned Part = 0; Part < UF; ++Part) {
1717 Ops.append({ALM, Plan.getOrAddLiveIn(
1718 ConstantInt::get(IntegerType::getInt64Ty(Ctx),
1719 VF.getKnownMinValue() * Part))});
1720 auto *Ext =
1721 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1722 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
1723 Extracts[Part] = Ext;
1724 Ext->insertAfter(ALM);
1725 }
1726 };
1727
1728 // Create a list of each active lane mask phi, ordered by unroll part.
1730 for (VPRecipeBase &R : Header->phis()) {
1732 if (!Phi)
1733 continue;
1734 VPValue *Index = nullptr;
1735 match(Phi->getBackedgeValue(),
1737 assert(Index && "Expected index from ActiveLaneMask instruction");
1738
1739 uint64_t Part;
1740 if (match(Index,
1742 m_VPValue(), m_ConstantInt(Part))))
1743 Phis[Part] = Phi;
1744 else
1745 // Anything other than a CanonicalIVIncrementForPart is part 0
1746 Phis[0] = Phi;
1747 }
1748
1749 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1750 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
1751
1752 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
1753 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
1754
1755 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
1756 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
1757 "Expected incoming values of Phi to be ActiveLaneMasks");
1758
1759  // When using wide lane masks, the get.active.lane.mask intrinsic returns a
1760  // mask of VF * UF lanes; the multiplier (its last operand) is set to UF.
1761 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
1762 EntryALM->setOperand(2, ALMMultiplier);
1763 LoopALM->setOperand(2, ALMMultiplier);
1764
1765 // Create UF x extract vectors and insert into preheader.
1766 SmallVector<VPValue *> EntryExtracts(UF);
1767 ExtractFromALM(EntryALM, EntryExtracts);
1768
1769 // Create UF x extract vectors and insert before the loop compare & branch,
1770 // updating the compare to use the first extract.
1771 SmallVector<VPValue *> LoopExtracts(UF);
1772 ExtractFromALM(LoopALM, LoopExtracts);
1773 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
1774 Not->setOperand(0, LoopExtracts[0]);
1775
1776 // Update the incoming values of active lane mask phis.
1777 for (unsigned Part = 0; Part < UF; ++Part) {
1778 Phis[Part]->setStartValue(EntryExtracts[Part]);
1779 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
1780 }
1781
1782 return true;
1783}
1784
1785/// Try to simplify the branch condition of \p Plan. This may restrict the
1786/// resulting plan to \p BestVF and \p BestUF.
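/// If the latch condition is provably true for \p BestVF and \p BestUF (i.e.
/// the vector loop body executes exactly once), the loop region is dissolved
/// when its header phis allow it, or its terminator is replaced by
/// BranchOnCond(true) otherwise.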
1788 unsigned BestUF,
1790 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1791 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1792 auto *Term = &ExitingVPBB->back();
1793 VPValue *Cond;
1794 ScalarEvolution &SE = *PSE.getSE();
1795 if (match(Term, m_BranchOnCount()) ||
1797 m_VPValue(), m_VPValue(), m_VPValue()))))) {
1798 // Try to simplify the branch condition if VectorTC <= VF * UF when the
1799 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
1800 const SCEV *VectorTripCount =
1802 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1803 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1804 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1805 "Trip count SCEV must be computable");
1806 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1807 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1808 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
1809 return false;
1810 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) {
1811 // For BranchOnCond, check if we can prove the condition to be true using VF
1812 // and UF.
1813 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE))
1814 return false;
1815 } else {
1816 return false;
1817 }
1818
1819 // The vector loop region only executes once. If possible, completely remove
1820 // the region, otherwise replace the terminator controlling the latch with
1821 // (BranchOnCond true).
1822 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
1823 // support for other non-canonical widen induction recipes (e.g.,
1824 // VPWidenPointerInductionRecipe).
1825 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1826 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
1827 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
1828 return R->isCanonical();
1829 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
1830 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
1831 })) {
1832 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
1833 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
1834 VPBuilder Builder(Plan.getVectorPreheader());
1835 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
1836 R->getScalarType());
1837 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
1838 HeaderR.eraseFromParent();
1839 continue;
1840 }
1841 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
1842 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
1843 HeaderR.eraseFromParent();
1844 }
1845
1846 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
1847 VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
1848 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
1849 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
1850
1851 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
1852 B->setParent(nullptr);
1853
1854 VPBlockUtils::connectBlocks(Preheader, Header);
1855 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
1857 } else {
1858 // The vector region contains header phis for which we cannot remove the
1859 // loop region yet.
1860 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
1861 {}, {}, Term->getDebugLoc());
1862 ExitingVPBB->appendRecipe(BOC);
1863 }
1864
1865 Term->eraseFromParent();
1866
1867 return true;
1868}
1869
1870/// From the definition of llvm.experimental.get.vector.length,
1871/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
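/// If SCEV can prove %AVL <= VF for such a recipe, the EVL computation is
/// redundant and its result is replaced by %AVL converted (zext or trunc) to
/// i32.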
1875 vp_depth_first_deep(Plan.getEntry()))) {
1876 for (VPRecipeBase &R : *VPBB) {
1877 VPValue *AVL;
1878 if (!match(&R, m_EVL(m_VPValue(AVL))))
1879 continue;
1880
1881 ScalarEvolution &SE = *PSE.getSE();
1882 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, SE);
1883 if (isa<SCEVCouldNotCompute>(AVLSCEV))
1884 continue;
1885 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
1886 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
1887 continue;
1888
1890 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
1891 R.getDebugLoc());
1892 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
1893 return true;
1894 }
1895 }
1896 return false;
1897}
1898
1900 unsigned BestUF,
1902 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
1903 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
1904
1905 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
1906 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
1907 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
1908 MadeChange |= simplifyKnownEVL(Plan, BestVF, PSE);
1909
1910 if (MadeChange) {
1911 Plan.setVF(BestVF);
1912 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
1913 }
1914}
1915
1916/// Sink users of \p FOR after the recipe defining the previous value \p
1917/// Previous of the recurrence. \returns true if all users of \p FOR could be
1918/// re-arranged as needed or false if it is not possible.
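/// For example (sketch):
///   %for  = FIRST-ORDER-RECURRENCE-PHI [ %init ], [ %prev ]
///   %use  = add %for, 1
///   ...
///   %prev = ...
/// %use is moved after %prev, so that the FirstOrderRecurrenceSplice created
/// right after %prev (see adjustFixedOrderRecurrences) dominates all users of
/// %for.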
1919static bool
1921 VPRecipeBase *Previous,
1922 VPDominatorTree &VPDT) {
1923 // Collect recipes that need sinking.
1926 Seen.insert(Previous);
1927 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
1928    // The previous value must not depend on the users of the recurrence phi;
1929    // if it did, FOR would not be a fixed-order recurrence.
1930 if (SinkCandidate == Previous)
1931 return false;
1932
1933 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
1934 !Seen.insert(SinkCandidate).second ||
1935 VPDT.properlyDominates(Previous, SinkCandidate))
1936 return true;
1937
1938 if (cannotHoistOrSinkRecipe(*SinkCandidate))
1939 return false;
1940
1941 WorkList.push_back(SinkCandidate);
1942 return true;
1943 };
1944
1945 // Recursively sink users of FOR after Previous.
1946 WorkList.push_back(FOR);
1947 for (unsigned I = 0; I != WorkList.size(); ++I) {
1948 VPRecipeBase *Current = WorkList[I];
1949 assert(Current->getNumDefinedValues() == 1 &&
1950 "only recipes with a single defined value expected");
1951
1952 for (VPUser *User : Current->getVPSingleValue()->users()) {
1953 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
1954 return false;
1955 }
1956 }
1957
1958 // Keep recipes to sink ordered by dominance so earlier instructions are
1959 // processed first.
1960 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1961 return VPDT.properlyDominates(A, B);
1962 });
1963
1964 for (VPRecipeBase *SinkCandidate : WorkList) {
1965 if (SinkCandidate == FOR)
1966 continue;
1967
1968 SinkCandidate->moveAfter(Previous);
1969 Previous = SinkCandidate;
1970 }
1971 return true;
1972}
1973
1974/// Try to hoist \p Previous and its operands before all users of \p FOR.
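/// This is the fallback when sinking the users of \p FOR past \p Previous is
/// not possible; instead \p Previous (and any operands defined inside the loop
/// region) is moved before the earliest user of \p FOR.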
1976 VPRecipeBase *Previous,
1977 VPDominatorTree &VPDT) {
1978 if (cannotHoistOrSinkRecipe(*Previous))
1979 return false;
1980
1981 // Collect recipes that need hoisting.
1982 SmallVector<VPRecipeBase *> HoistCandidates;
1984 VPRecipeBase *HoistPoint = nullptr;
1985 // Find the closest hoist point by looking at all users of FOR and selecting
1986 // the recipe dominating all other users.
1987 for (VPUser *U : FOR->users()) {
1988 auto *R = cast<VPRecipeBase>(U);
1989 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
1990 HoistPoint = R;
1991 }
1992 assert(all_of(FOR->users(),
1993 [&VPDT, HoistPoint](VPUser *U) {
1994 auto *R = cast<VPRecipeBase>(U);
1995 return HoistPoint == R ||
1996 VPDT.properlyDominates(HoistPoint, R);
1997 }) &&
1998 "HoistPoint must dominate all users of FOR");
1999
2000 auto NeedsHoisting = [HoistPoint, &VPDT,
2001 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2002 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2003 if (!HoistCandidate)
2004 return nullptr;
2005 VPRegionBlock *EnclosingLoopRegion =
2006 HoistCandidate->getParent()->getEnclosingLoopRegion();
2007 assert((!HoistCandidate->getRegion() ||
2008 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2009 "CFG in VPlan should still be flat, without replicate regions");
2010 // Hoist candidate was already visited, no need to hoist.
2011 if (!Visited.insert(HoistCandidate).second)
2012 return nullptr;
2013
2014    // The candidate is outside the loop region or is a header phi; it dominates
2015    // the users of FOR without hoisting.
2016 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
2017 return nullptr;
2018
2019 // If we reached a recipe that dominates HoistPoint, we don't need to
2020 // hoist the recipe.
2021 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
2022 return nullptr;
2023 return HoistCandidate;
2024 };
2025
2026 if (!NeedsHoisting(Previous->getVPSingleValue()))
2027 return true;
2028
2029 // Recursively try to hoist Previous and its operands before all users of FOR.
2030 HoistCandidates.push_back(Previous);
2031
2032 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2033 VPRecipeBase *Current = HoistCandidates[I];
2034 assert(Current->getNumDefinedValues() == 1 &&
2035 "only recipes with a single defined value expected");
2036 if (cannotHoistOrSinkRecipe(*Current))
2037 return false;
2038
2039 for (VPValue *Op : Current->operands()) {
2040 // If we reach FOR, it means the original Previous depends on some other
2041 // recurrence that in turn depends on FOR. If that is the case, we would
2042 // also need to hoist recipes involving the other FOR, which may break
2043 // dependencies.
2044 if (Op == FOR)
2045 return false;
2046
2047 if (auto *R = NeedsHoisting(Op))
2048 HoistCandidates.push_back(R);
2049 }
2050 }
2051
2052 // Order recipes to hoist by dominance so earlier instructions are processed
2053 // first.
2054 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2055 return VPDT.properlyDominates(A, B);
2056 });
2057
2058 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2059 HoistCandidate->moveBefore(*HoistPoint->getParent(),
2060 HoistPoint->getIterator());
2061 }
2062
2063 return true;
2064}
2065
2067 VPBuilder &LoopBuilder) {
2068 VPDominatorTree VPDT(Plan);
2069
2071 for (VPRecipeBase &R :
2074 RecurrencePhis.push_back(FOR);
2075
2076 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2078 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2079 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2080 // to terminate.
2081 while (auto *PrevPhi =
2083 assert(PrevPhi->getParent() == FOR->getParent());
2084 assert(SeenPhis.insert(PrevPhi).second);
2085 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2086 }
2087
2088 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2089 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2090 return false;
2091
2092 // Introduce a recipe to combine the incoming and previous values of a
2093 // fixed-order recurrence.
2094 VPBasicBlock *InsertBlock = Previous->getParent();
2095 if (isa<VPHeaderPHIRecipe>(Previous))
2096 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2097 else
2098 LoopBuilder.setInsertPoint(InsertBlock,
2099 std::next(Previous->getIterator()));
2100
2101 auto *RecurSplice =
2103 {FOR, FOR->getBackedgeValue()});
2104
2105 FOR->replaceAllUsesWith(RecurSplice);
2106 // Set the first operand of RecurSplice to FOR again, after replacing
2107 // all users.
2108 RecurSplice->setOperand(0, FOR);
2109 }
2110 return true;
2111}
2112
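// The transform below walks users of certain reduction phis (e.g. integer
// add, mul or sub reductions) and drops their poison-generating flags, since
// those flags may no longer hold once the reduction is vectorized.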
2114 for (VPRecipeBase &R :
2116 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2117 if (!PhiR)
2118 continue;
2119 RecurKind RK = PhiR->getRecurrenceKind();
2120 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2122 continue;
2123
2124 for (VPUser *U : collectUsersRecursively(PhiR))
2125 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2126 RecWithFlags->dropPoisonGeneratingFlags();
2127 }
2128 }
2129}
2130
2131namespace {
2132struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2133 static bool isSentinel(const VPSingleDefRecipe *Def) {
2134 return Def == getEmptyKey() || Def == getTombstoneKey();
2135 }
2136
2137 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2138 /// return that source element type.
2139 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2140 // All VPInstructions that lower to GEPs must have the i8 source element
2141 // type (as they are PtrAdds), so we omit it.
2143 .Case<VPReplicateRecipe>([](auto *I) -> Type * {
2144 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2145 return GEP->getSourceElementType();
2146 return nullptr;
2147 })
2148 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2149 [](auto *I) { return I->getSourceElementType(); })
2150 .Default([](auto *) { return nullptr; });
2151 }
2152
2153  /// Returns true if recipe \p Def can be safely handled for CSE.
2154 static bool canHandle(const VPSingleDefRecipe *Def) {
2155 // We can extend the list of handled recipes in the future,
2156 // provided we account for the data embedded in them while checking for
2157 // equality or hashing.
2158 auto C = getOpcodeOrIntrinsicID(Def);
2159
2160 // The issue with (Insert|Extract)Value is that the index of the
2161 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2162 // VPlan.
2163 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2164 C->second == Instruction::ExtractValue)))
2165 return false;
2166
2167 // During CSE, we can only handle recipes that don't read from memory: if
2168 // they read from memory, there could be an intervening write to memory
2169 // before the next instance is CSE'd, leading to an incorrect result.
2170 return !Def->mayReadFromMemory();
2171 }
2172
2173 /// Hash the underlying data of \p Def.
2174 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2175 const VPlan *Plan = Def->getParent()->getPlan();
2176 VPTypeAnalysis TypeInfo(*Plan);
2177 hash_code Result = hash_combine(
2178 Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
2179 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2181 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2182 if (RFlags->hasPredicate())
2183 return hash_combine(Result, RFlags->getPredicate());
2184 return Result;
2185 }
2186
2187 /// Check equality of underlying data of \p L and \p R.
2188 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2189 if (isSentinel(L) || isSentinel(R))
2190 return L == R;
2191 if (L->getVPDefID() != R->getVPDefID() ||
2193 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2195 !equal(L->operands(), R->operands()))
2196 return false;
2198 "must have valid opcode info for both recipes");
2199 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2200 if (LFlags->hasPredicate() &&
2201 LFlags->getPredicate() !=
2202 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2203 return false;
2204 // Recipes in replicate regions implicitly depend on predicate. If either
2205 // recipe is in a replicate region, only consider them equal if both have
2206 // the same parent.
2207 const VPRegionBlock *RegionL = L->getRegion();
2208 const VPRegionBlock *RegionR = R->getRegion();
2209 if (((RegionL && RegionL->isReplicator()) ||
2210 (RegionR && RegionR->isReplicator())) &&
2211 L->getParent() != R->getParent())
2212 return false;
2213 const VPlan *Plan = L->getParent()->getPlan();
2214 VPTypeAnalysis TypeInfo(*Plan);
2215 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2216 }
2217};
2218} // end anonymous namespace
2219
2220/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2221/// Plan.
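/// Candidate recipes are keyed by their VPDef ID, opcode or intrinsic ID, GEP
/// source element type, inferred scalar type and operands (see
/// VPCSEDenseMapInfo above); a recipe is only replaced by an equivalent one
/// defined in a dominating block.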
2223 VPDominatorTree VPDT(Plan);
2225
2227 vp_depth_first_deep(Plan.getEntry()))) {
2228 for (VPRecipeBase &R : *VPBB) {
2229 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2230 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2231 continue;
2232 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2233 // V must dominate Def for a valid replacement.
2234 if (!VPDT.dominates(V->getParent(), VPBB))
2235 continue;
2236 // Only keep flags present on both V and Def.
2237 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2238 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2239 Def->replaceAllUsesWith(V);
2240 continue;
2241 }
2242 CSEMap[Def] = Def;
2243 }
2244 }
2245}
2246
2247/// Move loop-invariant recipes out of the vector loop region in \p Plan.
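/// Any recipe in the top-level blocks of the region whose operands are all
/// defined outside loop regions (and which is otherwise safe to move) is
/// hoisted to the vector preheader.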
2248static void licm(VPlan &Plan) {
2249 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2250
2251  // Hoist any loop-invariant recipes from the vector loop region to the
2252  // preheader. Perform a shallow traversal of the vector loop region, to
2253 // exclude recipes in replicate regions. Since the top-level blocks in the
2254 // vector loop region are guaranteed to execute if the vector pre-header is,
2255 // we don't need to check speculation safety.
2256 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2257 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2258         "Expected vector preheader's successor to be the vector loop region");
2260 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2261 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2263 continue;
2264 if (any_of(R.operands(), [](VPValue *Op) {
2265 return !Op->isDefinedOutsideLoopRegions();
2266 }))
2267 continue;
2268 R.moveBefore(*Preheader, Preheader->end());
2269 }
2270 }
2271}
2272
2274 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2275 if (Plan.hasScalarVFOnly())
2276 return;
2277 // Keep track of created truncates, so they can be re-used. Note that we
2278  // cannot use RAUW after creating a new truncate, as this could make
2279 // other uses have different types for their operands, making them invalidly
2280 // typed.
2282 VPTypeAnalysis TypeInfo(Plan);
2283 VPBasicBlock *PH = Plan.getVectorPreheader();
2286 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2289 &R))
2290 continue;
2291
2292 VPValue *ResultVPV = R.getVPSingleValue();
2293 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2294 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2295 if (!NewResSizeInBits)
2296 continue;
2297
2298      // If the value wasn't vectorized, we must maintain the original scalar
2299      // type. Skip those here. Also skip casts which do not need to be handled
2300      // explicitly here, as redundant casts will be removed during recipe
2301      // simplification.
2303 continue;
2304
2305 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2306 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2307 assert(OldResTy->isIntegerTy() && "only integer types supported");
2308 (void)OldResSizeInBits;
2309
2310 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2311
2312 // Any wrapping introduced by shrinking this operation shouldn't be
2313 // considered undefined behavior. So, we can't unconditionally copy
2314 // arithmetic wrapping flags to VPW.
2315 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2316 VPW->dropPoisonGeneratingFlags();
2317
2318 if (OldResSizeInBits != NewResSizeInBits &&
2319 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2320 // Extend result to original width.
2321 auto *Ext =
2322 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
2323 Ext->insertAfter(&R);
2324 ResultVPV->replaceAllUsesWith(Ext);
2325 Ext->setOperand(0, ResultVPV);
2326 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2327 } else {
2328 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2329 "Only ICmps should not need extending the result.");
2330 }
2331
2332 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2334 continue;
2335
2336 // Shrink operands by introducing truncates as needed.
2337 unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
2338 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2339 auto *Op = R.getOperand(Idx);
2340 unsigned OpSizeInBits =
2342 if (OpSizeInBits == NewResSizeInBits)
2343 continue;
2344 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2345 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2346 if (!IterIsEmpty) {
2347 R.setOperand(Idx, ProcessedIter->second);
2348 continue;
2349 }
2350
2351 VPBuilder Builder;
2352 if (Op->isLiveIn())
2353 Builder.setInsertPoint(PH);
2354 else
2355 Builder.setInsertPoint(&R);
2356 VPWidenCastRecipe *NewOp =
2357 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2358 ProcessedIter->second = NewOp;
2359 R.setOperand(Idx, NewOp);
2360 }
2361
2362 }
2363 }
2364}
2365
2369 VPValue *Cond;
2370 // Skip blocks that are not terminated by BranchOnCond.
2371 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2372 continue;
2373
2374 assert(VPBB->getNumSuccessors() == 2 &&
2375 "Two successors expected for BranchOnCond");
2376 unsigned RemovedIdx;
2377 if (match(Cond, m_True()))
2378 RemovedIdx = 1;
2379 else if (match(Cond, m_False()))
2380 RemovedIdx = 0;
2381 else
2382 continue;
2383
2384 VPBasicBlock *RemovedSucc =
2385 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2386 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2387 "There must be a single edge between VPBB and its successor");
2388    // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2389 // these recipes.
2390 for (VPRecipeBase &R : RemovedSucc->phis())
2391 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2392
2393 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2394 // automatically on VPlan destruction if it becomes unreachable.
2395 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2396 VPBB->back().eraseFromParent();
2397 }
2398}
2399
2419
2420// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2421// the loop terminator with a branch-on-cond recipe with the negated
2422// active-lane-mask as operand. Note that this turns the loop into an
2423// uncountable one. Only the existing terminator is replaced, all other existing
2424// recipes/users remain unchanged, except for poison-generating flags being
2425// dropped from the canonical IV increment. Return the created
2426// VPActiveLaneMaskPHIRecipe.
2427//
2428// The function uses the following definitions:
2429//
2430// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
2431// calculate-trip-count-minus-VF (original TC) : original TC
2432// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
2433// CanonicalIVPhi : CanonicalIVIncrement
2434// %StartV is the canonical induction start value.
2435//
2436// The function adds the following recipes:
2437//
2438// vector.ph:
2439// %TripCount = calculate-trip-count-minus-VF (original TC)
2440// [if DataWithControlFlowWithoutRuntimeCheck]
2441// %EntryInc = canonical-iv-increment-for-part %StartV
2442// %EntryALM = active-lane-mask %EntryInc, %TripCount
2443//
2444// vector.body:
2445// ...
2446// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2447// ...
2448// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2449// %ALM = active-lane-mask %InLoopInc, TripCount
2450// %Negated = Not %ALM
2451// branch-on-cond %Negated
2452//
2455 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2456 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2457 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2458 VPValue *StartV = CanonicalIVPHI->getStartValue();
2459
2460 auto *CanonicalIVIncrement =
2461 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2462 // TODO: Check if dropping the flags is needed if
2463 // !DataAndControlFlowWithoutRuntimeCheck.
2464 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2465 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2466 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2467 // we have to take unrolling into account. Each part needs to start at
2468 // Part * VF
2469 auto *VecPreheader = Plan.getVectorPreheader();
2470 VPBuilder Builder(VecPreheader);
2471
2472 // Create the ActiveLaneMask instruction using the correct start values.
2473 VPValue *TC = Plan.getTripCount();
2474
2475 VPValue *TripCount, *IncrementValue;
2477 // When the loop is guarded by a runtime overflow check for the loop
2478 // induction variable increment by VF, we can increment the value before
2479    // the get.active.lane.mask intrinsic and use the unmodified trip count.
2480 IncrementValue = CanonicalIVIncrement;
2481 TripCount = TC;
2482 } else {
2483 // When avoiding a runtime check, the active.lane.mask inside the loop
2484 // uses a modified trip count and the induction variable increment is
2485 // done after the active.lane.mask intrinsic is called.
2486 IncrementValue = CanonicalIVPHI;
2487 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2488 {TC}, DL);
2489 }
2490 auto *EntryIncrement = Builder.createOverflowingOp(
2491 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2492 "index.part.next");
2493
2494 // Create the active lane mask instruction in the VPlan preheader.
2495 VPValue *ALMMultiplier =
2496 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2497 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2498 {EntryIncrement, TC, ALMMultiplier}, DL,
2499 "active.lane.mask.entry");
2500
2501 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2502 // preheader ActiveLaneMask instruction.
2503 auto *LaneMaskPhi =
2505 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2506
2507 // Create the active lane mask for the next iteration of the loop before the
2508 // original terminator.
2509 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2510 Builder.setInsertPoint(OriginalTerminator);
2511 auto *InLoopIncrement =
2512 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2513 {IncrementValue}, {false, false}, DL);
2514 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2515 {InLoopIncrement, TripCount, ALMMultiplier},
2516 DL, "active.lane.mask.next");
2517 LaneMaskPhi->addOperand(ALM);
2518
2519 // Replace the original terminator with BranchOnCond. We have to invert the
2520 // mask here because a true condition means jumping to the exit block.
2521 auto *NotMask = Builder.createNot(ALM, DL);
2522 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2523 OriginalTerminator->eraseFromParent();
2524 return LaneMaskPhi;
2525}
2526
2527/// Collect the header mask with the pattern:
2528/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2529/// TODO: Introduce explicit recipe for header-mask instead of searching
2530/// for the header-mask pattern manually.
2532 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2533 SmallVector<VPValue *> WideCanonicalIVs;
2534 auto *FoundWidenCanonicalIVUser = find_if(
2536 assert(count_if(LoopRegion->getCanonicalIV()->users(),
2538 "Must have at most one VPWideCanonicalIVRecipe");
2539 if (FoundWidenCanonicalIVUser !=
2540 LoopRegion->getCanonicalIV()->users().end()) {
2541 auto *WideCanonicalIV =
2542 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2543 WideCanonicalIVs.push_back(WideCanonicalIV);
2544 }
2545
2546 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2547 // version of the canonical induction.
2548 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
2549 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2550 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2551 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2552 WideCanonicalIVs.push_back(WidenOriginalIV);
2553 }
2554
2555 // Walk users of wide canonical IVs and find the single compare of the form
2556 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2557 VPSingleDefRecipe *HeaderMask = nullptr;
2558 for (auto *Wide : WideCanonicalIVs) {
2559 for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
2560 auto *VPI = dyn_cast<VPInstruction>(U);
2561 if (!VPI || !vputils::isHeaderMask(VPI, Plan))
2562 continue;
2563
2564 assert(VPI->getOperand(0) == Wide &&
2565 "WidenCanonicalIV must be the first operand of the compare");
2566 assert(!HeaderMask && "Multiple header masks found?");
2567 HeaderMask = VPI;
2568 }
2569 }
2570 return HeaderMask;
2571}
2572
2574 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2577 UseActiveLaneMaskForControlFlow) &&
2578 "DataAndControlFlowWithoutRuntimeCheck implies "
2579 "UseActiveLaneMaskForControlFlow");
2580
2581 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2582 auto *FoundWidenCanonicalIVUser = find_if(
2584 assert(FoundWidenCanonicalIVUser &&
2585 "Must have widened canonical IV when tail folding!");
2586 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2587 auto *WideCanonicalIV =
2588 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2589 VPSingleDefRecipe *LaneMask;
2590 if (UseActiveLaneMaskForControlFlow) {
2593 } else {
2594 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2595 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2596 ConstantInt::get(LoopRegion->getCanonicalIVType(), 1));
2597 LaneMask =
2598 B.createNaryOp(VPInstruction::ActiveLaneMask,
2599 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2600 nullptr, "active.lane.mask");
2601 }
2602
2603 // Walk users of WideCanonicalIV and replace the header mask of the form
2604 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2605 // removing the old one to ensure there is always only a single header mask.
2606 HeaderMask->replaceAllUsesWith(LaneMask);
2607 HeaderMask->eraseFromParent();
2608}
2609
2610template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2611 Op0_t In;
2612  Op1_t &Out;
2613
2614 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2615
2616 template <typename OpTy> bool match(OpTy *V) const {
2617 if (m_Specific(In).match(V)) {
2618 Out = nullptr;
2619 return true;
2620 }
2621 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
2622 }
2623};
2624
2625/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
2626/// On a match, \p Out is set to the remaining part, or to nullptr if \p In matched directly.
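/// For example, m_RemoveMask(HeaderMask, Mask) matches both HeaderMask itself
/// (setting Mask to nullptr) and (logical-and HeaderMask, %m) (setting Mask to
/// %m).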
2627template <typename Op0_t, typename Op1_t>
2628static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2629 Op1_t &Out) {
2630 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2631}
2632
2633/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2634/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2635/// recipe could be created.
2636/// \p HeaderMask Header Mask.
2637/// \p CurRecipe   Recipe to be transformed.
2638/// \p TypeInfo VPlan-based type analysis.
2639/// \p EVL The explicit vector length parameter of vector-predication
2640/// intrinsics.
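/// For example, a widened load whose mask is the header mask (possibly
/// combined with another mask via logical-and) is rewritten to a
/// VPWidenLoadEVLRecipe predicated by \p EVL and the remaining mask, if any.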
2642 VPRecipeBase &CurRecipe,
2643 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
2644 VPlan *Plan = CurRecipe.getParent()->getPlan();
2645 VPValue *Addr, *Mask, *EndPtr;
2646
2647  /// Adjust any end pointers so that they point to the end of EVL lanes rather than VF.
2648 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2649 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2650 EVLEndPtr->insertBefore(&CurRecipe);
2651 EVLEndPtr->setOperand(1, &EVL);
2652 return EVLEndPtr;
2653 };
2654
2655 if (match(&CurRecipe,
2656 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
2657 !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2658 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
2659 EVL, Mask);
2660
2661 if (match(&CurRecipe,
2662 m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
2663 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2664 cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2665 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
2666 AdjustEndPtr(EndPtr), EVL, Mask);
2667
2668 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(),
2669 m_RemoveMask(HeaderMask, Mask))) &&
2670 !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2671 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
2672 EVL, Mask);
2673
2674 if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(),
2675 m_RemoveMask(HeaderMask, Mask))) &&
2676 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2677 cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2678 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
2679 AdjustEndPtr(EndPtr), EVL, Mask);
2680
2681 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
2682 if (Rdx->isConditional() &&
2683 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
2684 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
2685
2686 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
2687 if (Interleave->getMask() &&
2688 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
2689 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
2690
2691 VPValue *LHS, *RHS;
2692 if (match(&CurRecipe,
2693 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
2694 return new VPWidenIntrinsicRecipe(
2695 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
2696 TypeInfo.inferScalarType(LHS), {}, {}, CurRecipe.getDebugLoc());
2697
2698 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
2699 m_VPValue(RHS))))
2700 return new VPWidenIntrinsicRecipe(
2701 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
2702 TypeInfo.inferScalarType(LHS), {}, {}, CurRecipe.getDebugLoc());
2703
2704 return nullptr;
2705}
2706
2707/// Replace recipes with their EVL variants.
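/// Eligible uses of VF and VFxUF are rewired to \p EVL,
/// FirstOrderRecurrenceSplice recipes become experimental.vp.splice intrinsics
/// driven by a phi tracking the previous EVL, the header mask is replaced by
/// (icmp ult step-vector, EVL), and masked recipes are converted to their EVL
/// variants via optimizeMaskToEVL.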
2709 VPTypeAnalysis TypeInfo(Plan);
2710 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2711 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2712
2713 assert(all_of(Plan.getVF().users(),
2716 "User of VF that we can't transform to EVL.");
2717 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2719 });
2720
2721 assert(all_of(Plan.getVFxUF().users(),
2722 [&LoopRegion, &Plan](VPUser *U) {
2723 return match(U,
2724 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
2725 m_Specific(&Plan.getVFxUF()))) ||
2726 isa<VPWidenPointerInductionRecipe>(U);
2727 }) &&
2728 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
2729 "increment of the canonical induction.");
2730 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2731 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
2732 // canonical induction must not be updated.
2734 });
2735
2736 // Defer erasing recipes till the end so that we don't invalidate the
2737 // VPTypeAnalysis cache.
2739
2740  // Create a scalar phi to track the previous EVL if the plan contains a
2741  // fixed-order recurrence.
2742 bool ContainsFORs =
2744 if (ContainsFORs) {
2745 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
2746 VPValue *MaxEVL = &Plan.getVF();
2747    // Emit VPScalarCastRecipe in preheader if VF is not a 32-bit integer.
2748 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
2749 MaxEVL = Builder.createScalarZExtOrTrunc(
2750 MaxEVL, Type::getInt32Ty(Plan.getContext()),
2751 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
2752
2753 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
2754 VPValue *PrevEVL = Builder.createScalarPhi(
2755 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
2756
2759 for (VPRecipeBase &R : *VPBB) {
2760 VPValue *V1, *V2;
2761 if (!match(&R,
2763 m_VPValue(V1), m_VPValue(V2))))
2764 continue;
2765 VPValue *Imm = Plan.getOrAddLiveIn(
2768 Intrinsic::experimental_vp_splice,
2769 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
2770 TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
2771 R.getDebugLoc());
2772 VPSplice->insertBefore(&R);
2773 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
2774 ToErase.push_back(&R);
2775 }
2776 }
2777 }
2778
2779 VPValue *HeaderMask = findHeaderMask(Plan);
2780 if (!HeaderMask)
2781 return;
2782
2783 // Replace header masks with a mask equivalent to predicating by EVL:
2784 //
2785 // icmp ule widen-canonical-iv backedge-taken-count
2786 // ->
2787 // icmp ult step-vector, EVL
2788 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
2789 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
2790 Type *EVLType = TypeInfo.inferScalarType(&EVL);
2791 VPValue *EVLMask = Builder.createICmp(
2793 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
2794 HeaderMask->replaceAllUsesWith(EVLMask);
2795 ToErase.push_back(HeaderMask->getDefiningRecipe());
2796
2797 // Try to optimize header mask recipes away to their EVL variants.
2798 // TODO: Split optimizeMaskToEVL out and move into
2799 // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
2800 // tryToBuildVPlanWithVPRecipes beforehand.
2801 for (VPUser *U : collectUsersRecursively(EVLMask)) {
2802 auto *CurRecipe = cast<VPRecipeBase>(U);
2803 VPRecipeBase *EVLRecipe =
2804 optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, EVL);
2805 if (!EVLRecipe)
2806 continue;
2807
2808 unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
2809 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
2810 "New recipe must define the same number of values as the "
2811 "original.");
2812 EVLRecipe->insertBefore(CurRecipe);
2814 EVLRecipe)) {
2815 for (unsigned I = 0; I < NumDefVal; ++I) {
2816 VPValue *CurVPV = CurRecipe->getVPValue(I);
2817 CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
2818 }
2819 }
2820 ToErase.push_back(CurRecipe);
2821 }
2822 // Remove dead EVL mask.
2823 if (EVLMask->getNumUsers() == 0)
2824 ToErase.push_back(EVLMask->getDefiningRecipe());
2825
2826 for (VPRecipeBase *R : reverse(ToErase)) {
2827 SmallVector<VPValue *> PossiblyDead(R->operands());
2828 R->eraseFromParent();
2829 for (VPValue *Op : PossiblyDead)
2831 }
2832}
2833
2834/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
2835/// replace all uses of VPCanonicalIVPHIRecipe, except the canonical IV
2836/// increment, with a VPEVLBasedIVPHIRecipe. The VPCanonicalIVPHIRecipe
2837/// is used only for counting loop iterations after this transformation.
2838///
2839/// The function uses the following definitions:
2840/// %StartV is the canonical induction start value.
2841///
2842/// The function adds the following recipes:
2843///
2844/// vector.ph:
2845/// ...
2846///
2847/// vector.body:
2848/// ...
2849/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2850/// [ %NextEVLIV, %vector.body ]
2851/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2852/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
2853/// ...
2854/// %OpEVL = cast i32 %VPEVL to IVSize
2855/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2856/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2857/// ...
2858///
2859/// If MaxSafeElements is provided, the function adds the following recipes:
2860/// vector.ph:
2861/// ...
2862///
2863/// vector.body:
2864/// ...
2865/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2866/// [ %NextEVLIV, %vector.body ]
2867/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2868/// %cmp = cmp ult %AVL, MaxSafeElements
2869/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
2870/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
2871/// ...
2872/// %OpEVL = cast i32 %VPEVL to IVSize
2873/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2874/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2875/// ...
2876///
2878 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
2879 if (Plan.hasScalarVFOnly())
2880 return;
2881 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2882 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2883
2884 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
2885 auto *CanIVTy = LoopRegion->getCanonicalIVType();
2886 VPValue *StartV = CanonicalIVPHI->getStartValue();
2887
2888 // Create the ExplicitVectorLengthPhi recipe in the main loop.
2889 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
2890 EVLPhi->insertAfter(CanonicalIVPHI);
2891 VPBuilder Builder(Header, Header->getFirstNonPhi());
2892 // Create the AVL (application vector length), starting from TC -> 0 in steps
2893 // of EVL.
2894 VPPhi *AVLPhi = Builder.createScalarPhi(
2895 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
2896 VPValue *AVL = AVLPhi;
2897
2898 if (MaxSafeElements) {
2899 // Support for MaxSafeDist for correct loop emission.
2900 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
2901 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
2902 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
2903 "safe_avl");
2904 }
2905 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
2907
2908 auto *CanonicalIVIncrement =
2909 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2910 Builder.setInsertPoint(CanonicalIVIncrement);
2911 VPValue *OpVPEVL = VPEVL;
2912
2913 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
2914 OpVPEVL = Builder.createScalarZExtOrTrunc(
2915 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
2916
2917 auto *NextEVLIV = Builder.createOverflowingOp(
2918 Instruction::Add, {OpVPEVL, EVLPhi},
2919 {CanonicalIVIncrement->hasNoUnsignedWrap(),
2920 CanonicalIVIncrement->hasNoSignedWrap()},
2921 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
2922 EVLPhi->addOperand(NextEVLIV);
2923
2924 VPValue *NextAVL = Builder.createOverflowingOp(
2925 Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},
2926 DebugLoc::getCompilerGenerated(), "avl.next");
2927 AVLPhi->addOperand(NextAVL);
2928
2929 transformRecipestoEVLRecipes(Plan, *VPEVL);
2930
2931 // Replace all uses of VPCanonicalIVPHIRecipe by
2932 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
2933 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
2934 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
2935 // TODO: support unroll factor > 1.
2936 Plan.setUF(1);
2937}
2938
2940 // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
2941 // There should be only one EVL PHI in the entire plan.
2942 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
2943
2946 for (VPRecipeBase &R : VPBB->phis())
2947 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
2948 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
2949 EVLPhi = PhiR;
2950 }
2951
2952 // Early return if no EVL PHI is found.
2953 if (!EVLPhi)
2954 return;
2955
2956 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
2957 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
2958 VPValue *AVL;
2959 [[maybe_unused]] bool FoundAVL =
2960 match(EVLIncrement,
2961 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
2962 assert(FoundAVL && "Didn't find AVL?");
2963
2964 // The AVL may be capped to a safe distance.
2965 VPValue *SafeAVL;
2966 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
2967 AVL = SafeAVL;
2968
2969 VPValue *AVLNext;
2970 [[maybe_unused]] bool FoundAVLNext =
2972 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
2973 assert(FoundAVLNext && "Didn't find AVL backedge?");
2974
2975 // Convert EVLPhi to concrete recipe.
2976 auto *ScalarR =
2977 VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
2978 EVLPhi->getDebugLoc(), "evl.based.iv");
2979 EVLPhi->replaceAllUsesWith(ScalarR);
2980 EVLPhi->eraseFromParent();
2981
2982 // Replace CanonicalIVInc with EVL-PHI increment.
2983 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
2984 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
2985 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
2986 m_Specific(&Plan.getVFxUF()))) &&
2987 "Unexpected canonical iv");
2988 Backedge->replaceAllUsesWith(EVLIncrement);
2989
2990 // Remove unused phi and increment.
2991 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
2992 CanonicalIVIncrement->eraseFromParent();
2993 CanonicalIV->eraseFromParent();
2994
2995 // Replace the use of VectorTripCount in the latch-exiting block.
2996 // Before: (branch-on-count EVLIVInc, VectorTripCount)
2997 // After: (branch-on-cond eq AVLNext, 0)
2998
2999 VPBasicBlock *LatchExiting =
3000 HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
3001 auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
3002 // Skip single-iteration loop region
3003 if (match(LatchExitingBr, m_BranchOnCond(m_True())))
3004 return;
3005 assert(LatchExitingBr &&
3006 match(LatchExitingBr,
3007 m_BranchOnCount(m_VPValue(EVLIncrement),
3008 m_Specific(&Plan.getVectorTripCount()))) &&
3009 "Unexpected terminator in EVL loop");
3010
3011 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3012 VPBuilder Builder(LatchExitingBr);
3013 VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
3014 Plan.getConstantInt(AVLTy, 0));
3015 Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
3016 LatchExitingBr->eraseFromParent();
3017}
3018
3020 VPlan &Plan, PredicatedScalarEvolution &PSE,
3021 const DenseMap<Value *, const SCEV *> &StridesMap) {
3022  // Replace VPValues for known constant strides guaranteed by predicated scalar
3023 // evolution.
3024 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3025 auto *R = cast<VPRecipeBase>(&U);
3026 return R->getRegion() ||
3027 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3028 };
3029 ValueToSCEVMapTy RewriteMap;
3030 for (const SCEV *Stride : StridesMap.values()) {
3031 using namespace SCEVPatternMatch;
3032 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3033 const APInt *StrideConst;
3034 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3035 // Only handle constant strides for now.
3036 continue;
3037
3038 auto *CI = Plan.getConstantInt(*StrideConst);
3039 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3040 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3041
3042 // The versioned value may not be used in the loop directly but through a
3043 // sext/zext. Add new live-ins in those cases.
3044 for (Value *U : StrideV->users()) {
3046 continue;
3047 VPValue *StrideVPV = Plan.getLiveIn(U);
3048 if (!StrideVPV)
3049 continue;
3050 unsigned BW = U->getType()->getScalarSizeInBits();
3051 APInt C =
3052 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3053 VPValue *CI = Plan.getConstantInt(C);
3054 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3055 }
3056 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3057 }
3058
3059 for (VPRecipeBase &R : *Plan.getEntry()) {
3060 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3061 if (!ExpSCEV)
3062 continue;
3063 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3064 auto *NewSCEV =
3065 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3066 if (NewSCEV != ScevExpr) {
3067 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3068 ExpSCEV->replaceAllUsesWith(NewExp);
3069 if (Plan.getTripCount() == ExpSCEV)
3070 Plan.resetTripCount(NewExp);
3071 }
3072 }
3073}
3074
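// The transform below walks the backward slice of the address of each
// consecutive widened memory recipe (and of interleave groups with predicated
// members) in blocks that need predication, dropping poison-generating flags
// from recipes that may otherwise produce a poison value used after
// vectorization.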
3076 VPlan &Plan,
3077 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3078 // Collect recipes in the backward slice of `Root` that may generate a poison
3079 // value that is used after vectorization.
3081 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3083 Worklist.push_back(Root);
3084
3085 // Traverse the backward slice of Root through its use-def chain.
3086 while (!Worklist.empty()) {
3087 VPRecipeBase *CurRec = Worklist.pop_back_val();
3088
3089 if (!Visited.insert(CurRec).second)
3090 continue;
3091
3092 // Prune search if we find another recipe generating a widen memory
3093 // instruction. Widen memory instructions involved in address computation
3094 // will lead to gather/scatter instructions, which don't need to be
3095 // handled.
3097 VPHeaderPHIRecipe>(CurRec))
3098 continue;
3099
3100 // This recipe contributes to the address computation of a widen
3101 // load/store. If the underlying instruction has poison-generating flags,
3102 // drop them directly.
3103 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3104 VPValue *A, *B;
3105 // Dropping disjoint from an OR may yield incorrect results, as some
3106 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3107 // for dependence analysis). Instead, replace it with an equivalent Add.
3108 // This is possible as all users of the disjoint OR only access lanes
3109 // where the operands are disjoint or poison otherwise.
3110 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3111 RecWithFlags->isDisjoint()) {
3112 VPBuilder Builder(RecWithFlags);
3113 VPInstruction *New = Builder.createOverflowingOp(
3114 Instruction::Add, {A, B}, {false, false},
3115 RecWithFlags->getDebugLoc());
3116 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3117 RecWithFlags->replaceAllUsesWith(New);
3118 RecWithFlags->eraseFromParent();
3119 CurRec = New;
3120 } else
3121 RecWithFlags->dropPoisonGeneratingFlags();
3122 } else {
3125 (void)Instr;
3126 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3127 "found instruction with poison generating flags not covered by "
3128 "VPRecipeWithIRFlags");
3129 }
3130
3131 // Add new definitions to the worklist.
3132 for (VPValue *Operand : CurRec->operands())
3133 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3134 Worklist.push_back(OpDef);
3135 }
3136 });
3137
3138 // Traverse all the recipes in the VPlan and collect the poison-generating
3139  // recipes in the backward slice starting at the address of a
3140  // VPWidenMemoryRecipe or VPInterleaveRecipe.
3141 auto Iter = vp_depth_first_deep(Plan.getEntry());
3143 for (VPRecipeBase &Recipe : *VPBB) {
3144 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3145 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3146 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3147 if (AddrDef && WidenRec->isConsecutive() &&
3148 BlockNeedsPredication(UnderlyingInstr.getParent()))
3149 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3150 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3151 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3152 if (AddrDef) {
3153 // Check if any member of the interleave group needs predication.
3154 const InterleaveGroup<Instruction> *InterGroup =
3155 InterleaveRec->getInterleaveGroup();
3156 bool NeedPredication = false;
3157 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3158 I < NumMembers; ++I) {
3159 Instruction *Member = InterGroup->getMember(I);
3160 if (Member)
3161 NeedPredication |= BlockNeedsPredication(Member->getParent());
3162 }
3163
3164 if (NeedPredication)
3165 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3166 }
3167 }
3168 }
3169 }
3170}
3171
3173 VPlan &Plan,
3175 &InterleaveGroups,
3176 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3177 if (InterleaveGroups.empty())
3178 return;
3179
3180 // Interleave memory: for each Interleave Group we marked earlier as relevant
3181 // for this VPlan, replace the Recipes widening its memory instructions with a
3182 // single VPInterleaveRecipe at its insertion point.
3183 VPDominatorTree VPDT(Plan);
3184 for (const auto *IG : InterleaveGroups) {
3185 auto *Start =
3186 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3187 VPIRMetadata InterleaveMD(*Start);
3188 SmallVector<VPValue *, 4> StoredValues;
3189 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3190 StoredValues.push_back(StoreR->getStoredValue());
3191 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3192 Instruction *MemberI = IG->getMember(I);
3193 if (!MemberI)
3194 continue;
3195 VPWidenMemoryRecipe *MemoryR =
3196 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3197 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3198 StoredValues.push_back(StoreR->getStoredValue());
3199 InterleaveMD.intersect(*MemoryR);
3200 }
3201
3202 bool NeedsMaskForGaps =
3203 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3204 (!StoredValues.empty() && !IG->isFull());
3205
3206 Instruction *IRInsertPos = IG->getInsertPos();
3207 auto *InsertPos =
3208 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3209
3210 GEPNoWrapFlags NW = GEPNoWrapFlags::none();
3211 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3212 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3213 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3214
3215 // Get or create the start address for the interleave group.
3216 VPValue *Addr = Start->getAddr();
3217 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3218 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3219 // We cannot re-use the address of member zero because it does not
3220 // dominate the insert position. Instead, use the address of the insert
3221 // position and create a PtrAdd adjusting it to the address of member
3222 // zero.
3223 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3224 // InsertPos or sink loads above zero members to join it.
3225 assert(IG->getIndex(IRInsertPos) != 0 &&
3226 "index of insert position shouldn't be zero");
3227 auto &DL = IRInsertPos->getDataLayout();
3228 APInt Offset(32,
3229 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3230 IG->getIndex(IRInsertPos),
3231 /*IsSigned=*/true);
3232 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3233 VPBuilder B(InsertPos);
3234 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3235 }
3236 // If the group is reverse, adjust the index to refer to the last vector
3237 // lane instead of the first. We adjust the index from the first vector
3238 // lane, rather than directly getting the pointer for lane VF - 1, because
3239 // the pointer operand of the interleaved access is supposed to be uniform.
3240 if (IG->isReverse()) {
3241 auto *ReversePtr = new VPVectorEndPointerRecipe(
3242 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3243 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3244 ReversePtr->insertBefore(InsertPos);
3245 Addr = ReversePtr;
3246 }
3247 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3248 InsertPos->getMask(), NeedsMaskForGaps,
3249 InterleaveMD, InsertPos->getDebugLoc());
3250 VPIG->insertBefore(InsertPos);
3251
3252 unsigned J = 0;
3253 for (unsigned i = 0; i < IG->getFactor(); ++i)
3254 if (Instruction *Member = IG->getMember(i)) {
3255 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3256 if (!Member->getType()->isVoidTy()) {
3257 VPValue *OriginalV = MemberR->getVPSingleValue();
3258 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3259 J++;
3260 }
3261 MemberR->eraseFromParent();
3262 }
3263 }
3264}
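// Worked example for the address adjustment above (hypothetical values): for
// a factor-2 interleave group of i32 accesses whose insert position is the
// member at index 1, the offset is DL.getTypeAllocSize(i32) * 1 = 4 bytes, so
// member 0's address is recomputed as
//   vp<%addr> = ptradd (insert position address), -4
// using the no-wrap flags gathered from the insert position's GEP.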
3265
3266/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3267/// value, phi and backedge value. In the following example:
3268///
3269/// vector.ph:
3270/// Successor(s): vector loop
3271///
3272/// <x1> vector loop: {
3273/// vector.body:
3274/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3275/// ...
3276/// EMIT branch-on-count ...
3277/// No successors
3278/// }
3279///
3280/// WIDEN-INDUCTION will get expanded to:
3281///
3282/// vector.ph:
3283/// ...
3284/// vp<%induction.start> = ...
3285/// vp<%induction.increment> = ...
3286///
3287/// Successor(s): vector loop
3288///
3289/// <x1> vector loop: {
3290/// vector.body:
3291/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3292/// ...
3293/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3294/// EMIT branch-on-count ...
3295/// No successors
3296/// }
3297static void
3298 expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
3299 VPTypeAnalysis &TypeInfo) {
3300 VPlan *Plan = WidenIVR->getParent()->getPlan();
3301 VPValue *Start = WidenIVR->getStartValue();
3302 VPValue *Step = WidenIVR->getStepValue();
3303 VPValue *VF = WidenIVR->getVFValue();
3304 DebugLoc DL = WidenIVR->getDebugLoc();
3305
3306 // The value from the original loop to which we are mapping the new induction
3307 // variable.
3308 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3309
3310 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3311 Instruction::BinaryOps AddOp;
3312 Instruction::BinaryOps MulOp;
3313 VPIRFlags Flags = *WidenIVR;
3314 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3315 AddOp = Instruction::Add;
3316 MulOp = Instruction::Mul;
3317 } else {
3318 AddOp = ID.getInductionOpcode();
3319 MulOp = Instruction::FMul;
3320 }
3321
3322 // If the phi is truncated, truncate the start and step values.
3323 VPBuilder Builder(Plan->getVectorPreheader());
3324 Type *StepTy = TypeInfo.inferScalarType(Step);
3325 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3326 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3327 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3328 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3329 // Truncation doesn't preserve WrapFlags.
3330 Flags.dropPoisonGeneratingFlags();
3331 StepTy = Ty;
3332 }
3333
3334 // Construct the initial value of the vector IV in the vector loop preheader.
3335 Type *IVIntTy =
3335 IntegerType::get(Plan->getContext(),
3336 StepTy->getScalarSizeInBits());
3337 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3338 if (StepTy->isFloatingPointTy())
3339 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3340
3341 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3342 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3343
3344 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3345 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3346 DebugLoc::getUnknown(), "induction");
3347
3348 // Create the widened phi of the vector IV.
3349 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3350 WidenIVR->getDebugLoc(), "vec.ind");
3351 WidePHI->insertBefore(WidenIVR);
3352
3353 // Create the backedge value for the vector IV.
3354 VPValue *Inc;
3355 VPValue *Prev;
3356 // If unrolled, use the increment and prev value from the operands.
3357 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3358 Inc = SplatVF;
3359 Prev = WidenIVR->getLastUnrolledPartOperand();
3360 } else {
3361 if (VPRecipeBase *R = VF->getDefiningRecipe())
3362 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3363 // Multiply the vectorization factor by the step using integer or
3364 // floating-point arithmetic as appropriate.
3365 if (StepTy->isFloatingPointTy())
3366 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3367 DL);
3368 else
3369 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3370 TypeInfo.inferScalarType(VF), DL);
3371
3372 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3373 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3374 Prev = WidePHI;
3375 }
3376
3377 auto *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3378 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3379 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3380 WidenIVR->getDebugLoc(), "vec.ind.next");
3381
3382 WidePHI->addOperand(Next);
3383
3384 WidenIVR->replaceAllUsesWith(WidePHI);
3385}
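// Concrete instance of the expansion above (hypothetical values): for an
// integer induction with start 0, step 3 and VF 4, the preheader computes
//   vp<%induction.start> = <0, 1, 2, 3> * broadcast(3) + broadcast(0)
//                        = <0, 3, 6, 9>
//   vp<%induction.increment> = broadcast(3 * VF) = <12, 12, 12, 12>
// and the loop body updates the vector IV with
//   vp<%vec.ind.next> = add vec.ind, <12, 12, 12, 12>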
3386
3387/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3388/// initial value, phi and backedge value. In the following example:
3389///
3390/// <x1> vector loop: {
3391/// vector.body:
3392/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3393/// ...
3394/// EMIT branch-on-count ...
3395/// }
3396///
3397/// WIDEN-POINTER-INDUCTION will get expanded to:
3398///
3399/// <x1> vector loop: {
3400/// vector.body:
3401/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3402/// EMIT %mul = mul %stepvector, %step
3403/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3404/// ...
3405/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3406/// EMIT branch-on-count ...
3407/// }
3408 static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
3409 VPTypeAnalysis &TypeInfo) {
3410 VPlan *Plan = R->getParent()->getPlan();
3411 VPValue *Start = R->getStartValue();
3412 VPValue *Step = R->getStepValue();
3413 VPValue *VF = R->getVFValue();
3414
3415 assert(R->getInductionDescriptor().getKind() ==
3417 "Not a pointer induction according to InductionDescriptor!");
3418 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3419 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3420 "Recipe should have been replaced");
3421
3422 VPBuilder Builder(R);
3423 DebugLoc DL = R->getDebugLoc();
3424
3425 // Build a scalar pointer phi.
3426 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3427
3428 // Create actual address geps that use the pointer phi as base and a
3429 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3430 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3431 Type *StepTy = TypeInfo.inferScalarType(Step);
3432 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3433 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3434 VPValue *PtrAdd = Builder.createNaryOp(
3435 VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
3436 R->replaceAllUsesWith(PtrAdd);
3437
3438 // Create the backedge value for the scalar pointer phi.
3439 auto *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3440 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3441 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3442 DL);
3443 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3444
3445 VPValue *InductionGEP =
3446 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3447 ScalarPtrPhi->addOperand(InductionGEP);
3448}
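// Concrete instance of the expansion above (hypothetical values, VF = 4):
//   EMIT %mul = mul <0, 1, 2, 3>, %step
//   EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
// while the scalar phi advances by one whole vector iteration:
//   EMIT %ptr.ind = ptradd %pointer.phi, (%step * 4)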
3449
3450 void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
3451 // Replace loop regions with explicit CFG.
3452 SmallVector<VPRegionBlock *> LoopRegions;
3453 for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
3454 vp_depth_first_deep(Plan.getEntry()))) {
3455 if (!R->isReplicator())
3456 LoopRegions.push_back(R);
3457 }
3458 for (VPRegionBlock *R : LoopRegions)
3459 R->dissolveToCFGLoop();
3460}
3461
3462 void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
3463 VPTypeAnalysis TypeInfo(Plan);
3464 SmallVector<VPRecipeBase *> ToRemove;
3465 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3466 vp_depth_first_deep(Plan.getEntry()))) {
3467 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3468 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3469 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3470 ToRemove.push_back(WidenIVR);
3471 continue;
3472 }
3473
3474 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3475 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3476 ToRemove.push_back(WidenIVR);
3477 continue;
3478 }
3479
3480 // Expand VPBlendRecipe into VPInstruction::Select.
3481 VPBuilder Builder(&R);
3482 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3483 VPValue *Select = Blend->getIncomingValue(0);
3484 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3485 Select = Builder.createSelect(Blend->getMask(I),
3486 Blend->getIncomingValue(I), Select,
3487 R.getDebugLoc(), "predphi");
3488 Blend->replaceAllUsesWith(Select);
3489 ToRemove.push_back(Blend);
3490 }
3491
3492 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3493 Expr->decompose();
3494 ToRemove.push_back(Expr);
3495 }
3496
3497 VPValue *VectorStep;
3498 VPValue *ScalarStep;
3499 if (!match(&R, m_VPInstruction<VPInstruction::WideIVStep>(
3500 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
3501 continue;
3502
3503 // Expand WideIVStep.
3504 auto *VPI = cast<VPInstruction>(&R);
3505 Type *IVTy = TypeInfo.inferScalarType(VPI);
3506 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
3507 Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
3508 ? Instruction::UIToFP
3509 : Instruction::Trunc;
3510 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
3511 }
3512
3513 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
3514 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
3515 ScalarStep =
3516 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
3517 }
3518
3519 VPIRFlags Flags;
3520 if (IVTy->isFloatingPointTy())
3521 Flags = {VPI->getFastMathFlags()};
3522
3523 unsigned MulOpc =
3524 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
3525 VPInstruction *Mul = Builder.createNaryOp(
3526 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
3527 VectorStep = Mul;
3528 VPI->replaceAllUsesWith(VectorStep);
3529 ToRemove.push_back(VPI);
3530 }
3531 }
3532
3533 for (VPRecipeBase *R : ToRemove)
3534 R->eraseFromParent();
3535}
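// For example, a VPBlendRecipe with three incoming values (sketch, operand
// order as processed above):
//   BLEND ir<%phi> = ir<%a> ir<%b>/vp<%m1> ir<%c>/vp<%m2>
// is lowered to the select chain
//   EMIT vp<%sel> = select vp<%m1>, ir<%b>, ir<%a>
//   EMIT vp<%predphi> = select vp<%m2>, ir<%c>, vp<%sel>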
3536
3537 static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
3538 VPBasicBlock *EarlyExitVPBB,
3539 VPlan &Plan,
3540 VPBasicBlock *HeaderVPBB,
3541 VPBasicBlock *LatchVPBB) {
3542 VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
3543 if (!EarlyExitVPBB->getSinglePredecessor() &&
3544 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
3545 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
3546 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
3547 "unsupported early exit VPBB");
3548 // Early exit operand should always be last phi operand. If EarlyExitVPBB
3549 // has two predecessors and EarlyExitingVPBB is the first, swap the operands
3550 // of the phis.
3551 for (VPRecipeBase &R : EarlyExitVPBB->phis())
3552 cast<VPIRPhi>(&R)->swapOperands();
3553 }
3554
3555 VPBuilder Builder(LatchVPBB->getTerminator());
3556 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
3557 assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
3558 "Terminator must be be BranchOnCond");
3559 VPValue *CondOfEarlyExitingVPBB =
3560 EarlyExitingVPBB->getTerminator()->getOperand(0);
3561 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
3562 ? CondOfEarlyExitingVPBB
3563 : Builder.createNot(CondOfEarlyExitingVPBB);
3564
3565 // Split the middle block and have it conditionally branch to the early exit
3566 // block if CondToEarlyExit.
3567 VPValue *IsEarlyExitTaken =
3568 Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
3569 VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
3570 VPBasicBlock *VectorEarlyExitVPBB =
3571 Plan.createVPBasicBlock("vector.early.exit");
3572 VPBlockUtils::insertOnEdge(LatchVPBB, MiddleVPBB, NewMiddle);
3573 VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
3574 NewMiddle->swapSuccessors();
3575
3576 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
3577
3578 // Update the exit phis in the early exit block.
3579 VPBuilder MiddleBuilder(NewMiddle);
3580 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
3581 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
3582 auto *ExitIRI = cast<VPIRPhi>(&R);
3583 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
3584 // a single predecessor and 1 if it has two.
3585 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
3586 if (ExitIRI->getNumOperands() != 1) {
3587 // The first of two operands corresponds to the latch exit, via MiddleVPBB
3588 // predecessor. Extract its last lane.
3589 ExitIRI->extractLastLaneOfFirstOperand(MiddleBuilder);
3590 }
3591
3592 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
3593 if (!IncomingFromEarlyExit->isLiveIn()) {
3594 // Update the incoming value from the early exit.
3595 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
3596 VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,
3597 "first.active.lane");
3598 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
3599 VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
3600 nullptr, "early.exit.value");
3601 ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
3602 }
3603 }
3604 MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
3605
3606 // Replace the condition controlling the non-early exit from the vector loop
3607 // with one exiting if either the original condition of the vector latch is
3608 // true or the early exit has been taken.
3609 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
3610 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
3611 "Unexpected terminator");
3612 auto *IsLatchExitTaken =
3613 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
3614 LatchExitingBranch->getOperand(1));
3615 auto *AnyExitTaken = Builder.createNaryOp(
3616 Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});
3617 Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
3618 LatchExitingBranch->eraseFromParent();
3619}
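// Resulting control flow (sketch; value names are illustrative): the latch
// now exits on
//   vp<%any.exit> = or (any-of vp<%cond.to.early.exit>),
//                      (icmp eq canonical-iv-next, vector-trip-count)
// into middle.split, which branches to vector.early.exit (where
// first-active-lane/extract-lane feed the early-exit phis) or falls through
// to the original middle block for the countable exit.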
3620
3621 /// This function tries to convert extended in-loop reductions to
3622/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
3623/// valid. The created recipe must be decomposed to its constituent
3624/// recipes before execution.
3625static VPExpressionRecipe *
3626 tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
3627 VFRange &Range) {
3628 Type *RedTy = Ctx.Types.inferScalarType(Red);
3629 VPValue *VecOp = Red->getVecOp();
3630
3631 // Clamp the range if using extended-reduction is profitable.
3632 auto IsExtendedRedValidAndClampRange =
3633 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
3634 return LoopVectorizationPlanner::getDecisionAndClampRange(
3635 [&](ElementCount VF) {
3636 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3637 TTI::TargetCostKind CostKind = Ctx.CostKind;
3638
3639 InstructionCost ExtRedCost;
3640 InstructionCost ExtCost =
3641 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
3642 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3643
3644
3645 if (isa<VPPartialReductionRecipe>(Red)) {
3646 auto ExtKind = TTI::getPartialReductionExtendKind(ExtOpc);
3647 // FIXME: Move partial reduction creation, costing and clamping
3648 // here from LoopVectorize.cpp.
3649 ExtRedCost = Ctx.TTI.getPartialReductionCost(
3650 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
3651 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind);
3652 } else {
3653 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
3654 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
3655 Red->getFastMathFlags(), CostKind);
3656 }
3657 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
3658 },
3659 Range);
3660 };
3661
3662 VPValue *A;
3663 // Match reduce(ext).
3664 if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
3665 IsExtendedRedValidAndClampRange(
3666 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
3667 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
3668 Ctx.Types.inferScalarType(A)))
3669 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
3670
3671 return nullptr;
3672}
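// E.g. (sketch): reduce.add(zext <VF x i8> %a to <VF x i32>) is only bundled
// into a VPExpressionRecipe when, for every VF in the clamped range,
//   cost(extended add reduction, i8 -> i32) < cost(zext) + cost(add reduction)
// according to the target cost hooks queried above.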
3673
3674 /// This function tries to convert extended in-loop reductions to
3675/// VPExpressionRecipe and clamp the \p Range if it is beneficial
3676/// and valid. The created VPExpressionRecipe must be decomposed to its
3677/// constituent recipes before execution. Patterns of the
3678/// VPExpressionRecipe:
3679/// reduce.add(mul(...)),
3680/// reduce.add(mul(ext(A), ext(B))),
3681/// reduce.add(ext(mul(ext(A), ext(B)))).
3682static VPExpressionRecipe *
3683 tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
3684 VPCostContext &Ctx, VFRange &Range) {
3685 bool IsPartialReduction = isa<VPPartialReductionRecipe>(Red);
3686
3687 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3688 if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
3689 return nullptr;
3690
3691 Type *RedTy = Ctx.Types.inferScalarType(Red);
3692
3693 // Clamp the range if using multiply-accumulate-reduction is profitable.
3694 auto IsMulAccValidAndClampRange =
3695 [&](VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
3696 VPWidenCastRecipe *OuterExt) -> bool {
3697 return LoopVectorizationPlanner::getDecisionAndClampRange(
3698 [&](ElementCount VF) {
3699 TTI::TargetCostKind CostKind = Ctx.CostKind;
3700 Type *SrcTy =
3701 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
3702 InstructionCost MulAccCost;
3703
3704 if (IsPartialReduction) {
3705 Type *SrcTy2 =
3706 Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
3707 // FIXME: Move partial reduction creation, costing and clamping
3708 // here from LoopVectorize.cpp.
3709 MulAccCost = Ctx.TTI.getPartialReductionCost(
3710 Opcode, SrcTy, SrcTy2, RedTy, VF,
3711 Ext0 ? TTI::getPartialReductionExtendKind(
3712 Ext0->getOpcode())
3713 : TTI::PR_None,
3714 Ext1 ? TTI::getPartialReductionExtendKind(
3715 Ext1->getOpcode())
3716 : TTI::PR_None,
3717 Mul->getOpcode(), CostKind);
3718 } else {
3719 // Only partial reductions support mixed extends at the moment.
3720 if (Ext0 && Ext1 && Ext0->getOpcode() != Ext1->getOpcode())
3721 return false;
3722
3723 bool IsZExt =
3724 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
3725 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3726 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
3727 SrcVecTy, CostKind);
3728 }
3729
3730 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
3731 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3732 InstructionCost ExtCost = 0;
3733 if (Ext0)
3734 ExtCost += Ext0->computeCost(VF, Ctx);
3735 if (Ext1)
3736 ExtCost += Ext1->computeCost(VF, Ctx);
3737 if (OuterExt)
3738 ExtCost += OuterExt->computeCost(VF, Ctx);
3739
3740 return MulAccCost.isValid() &&
3741 MulAccCost < ExtCost + MulCost + RedCost;
3742 },
3743 Range);
3744 };
3745
3746 VPValue *VecOp = Red->getVecOp();
3747 VPRecipeBase *Sub = nullptr;
3748 VPValue *A, *B;
3749 VPValue *Tmp = nullptr;
3750 // Sub reductions could have a sub between the add reduction and vec op.
3751 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
3752 Sub = VecOp->getDefiningRecipe();
3753 VecOp = Tmp;
3754 }
3755
3756 // If ValB is a constant and can be safely extended, truncate it to the same
3757 // type as ExtA's operand, then extend it to the same type as ExtA. This
3758 // creates two uniform extends that can more easily be matched by the rest of
3759 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
3760 // replaced with the new extend of the constant.
3761 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
3762 VPWidenCastRecipe *&ExtB,
3763 VPValue *&ValB, VPWidenRecipe *Mul) {
3764 if (!ExtA || ExtB || !ValB->isLiveIn())
3765 return;
3766 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
3767 Instruction::CastOps ExtOpc = ExtA->getOpcode();
3768 const APInt *Const;
3769 if (!match(ValB, m_APInt(Const)) ||
3771 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
3772 return;
3773 // The truncate ensures that the type of each extended operand is the
3774 // same, and it's been proven that the constant can be extended from
3775 // NarrowTy safely. Necessary since ExtA's extended operand would be
3776 // e.g. an i8, while the const will likely be an i32. This will be
3777 // elided by later optimisations.
3778 VPBuilder Builder(Mul);
3779 auto *Trunc =
3780 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
3781 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
3782 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
3783 Mul->setOperand(1, ExtB);
3784 };
3785
3786 // Try to match reduce.add(mul(...)).
3787 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
3788 auto *RecipeA = dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
3789 auto *RecipeB = dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
3790 auto *Mul = cast<VPWidenRecipe>(VecOp);
3791
3792 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
3793 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
3794
3795 // Match reduce.add/sub(mul(ext, ext)).
3796 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
3797 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
3798 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
3799 if (Sub)
3800 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
3801 cast<VPWidenRecipe>(Sub), Red);
3802 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
3803 }
3804 // TODO: Add an expression type for this variant with a negated mul
3805 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
3806 return new VPExpressionRecipe(Mul, Red);
3807 }
3808 // TODO: Add an expression type for negated versions of other expression
3809 // variants.
3810 if (Sub)
3811 return nullptr;
3812
3813 // Match reduce.add(ext(mul(A, B))).
3814 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
3815 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
3816 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
3817 auto *Ext0 = dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
3818 auto *Ext1 = dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
3819
3820 // reduce.add(ext(mul(ext, const)))
3821 // -> reduce.add(ext(mul(ext, ext(const))))
3822 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
3823
3824 // reduce.add(ext(mul(ext(A), ext(B))))
3825 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
3826 // The inner extends must either have the same opcode as the outer extend or
3827 // be the same, in which case the multiply can never result in a negative
3828 // value and the outer extend can be folded away by doing wider
3829 // extends for the operands of the mul.
3830 if (Ext0 && Ext1 &&
3831 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
3832 Ext0->getOpcode() == Ext1->getOpcode() &&
3833 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
3834 auto *NewExt0 = new VPWidenCastRecipe(
3835 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
3836 *Ext0, *Ext0, Ext0->getDebugLoc());
3837 NewExt0->insertBefore(Ext0);
3838
3839 VPWidenCastRecipe *NewExt1 = NewExt0;
3840 if (Ext0 != Ext1) {
3841 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
3842 Ext->getResultType(), nullptr, *Ext1,
3843 *Ext1, Ext1->getDebugLoc());
3844 NewExt1->insertBefore(Ext1);
3845 }
3846 Mul->setOperand(0, NewExt0);
3847 Mul->setOperand(1, NewExt1);
3848 Red->setOperand(1, Mul);
3849 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
3850 }
3851 }
3852 return nullptr;
3853}
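// E.g. (sketch): reduce.add(zext(mul(zext(%a to i16), zext(%b to i16)))) is
// rebuilt above as reduce.add(mul(zext(%a to i32), zext(%b to i32))) by
// introducing the wider extends, so the whole chain can be costed and later
// lowered as a single multiply-accumulate expression.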
3854
3855/// This function tries to create abstract recipes from the reduction recipe for
3856 /// the following optimizations and cost estimation.
3857 static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
3858 VPCostContext &Ctx,
3859 VFRange &Range) {
3860 VPExpressionRecipe *AbstractR = nullptr;
3861 auto IP = std::next(Red->getIterator());
3862 auto *VPBB = Red->getParent();
3863 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
3864 AbstractR = MulAcc;
3865 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
3866 AbstractR = ExtRed;
3867 // Cannot create abstract inloop reduction recipes.
3868 if (!AbstractR)
3869 return;
3870
3871 AbstractR->insertBefore(*VPBB, IP);
3872 Red->replaceAllUsesWith(AbstractR);
3873}
3874
3885
3886 void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
3887 if (Plan.hasScalarVFOnly())
3888 return;
3889
3890#ifndef NDEBUG
3891 VPDominatorTree VPDT(Plan);
3892#endif
3893
3894 SmallVector<VPValue *> VPValues;
3897 append_range(VPValues, Plan.getLiveIns());
3898 for (VPRecipeBase &R : *Plan.getEntry())
3899 append_range(VPValues, R.definedValues());
3900
3901 auto *VectorPreheader = Plan.getVectorPreheader();
3902 for (VPValue *VPV : VPValues) {
3903 if (all_of(VPV->users(), [VPV](VPUser *U) { return U->usesScalars(VPV); }) ||
3904 (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
3905 isa<Constant>(VPV->getLiveInIRValue())))
3906 continue;
3907
3908 // Add explicit broadcast at the insert point that dominates all users.
3909 VPBasicBlock *HoistBlock = VectorPreheader;
3910 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
3911 for (VPUser *User : VPV->users()) {
3912 if (User->usesScalars(VPV))
3913 continue;
3914 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
3915 HoistPoint = HoistBlock->begin();
3916 else
3917 assert(VPDT.dominates(VectorPreheader,
3918 cast<VPRecipeBase>(User)->getParent()) &&
3919 "All users must be in the vector preheader or dominated by it");
3920 }
3921
3922 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
3923 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
3924 VPV->replaceUsesWithIf(Broadcast,
3925 [VPV, Broadcast](VPUser &U, unsigned Idx) {
3926 return Broadcast != &U && !U.usesScalars(VPV);
3927 });
3928 }
3929}
3930
3931 void VPlanTransforms::hoistInvariantLoads(VPlan &Plan) {
3932 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3933
3934 // Collect candidate loads with invariant addresses and noalias scopes
3935 // metadata and memory-writing recipes with noalias metadata.
3936 SmallVector<std::pair<VPReplicateRecipe *, MemoryLocation>> CandidateLoads;
3937 SmallVector<MemoryLocation> Stores;
3938 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3939 vp_depth_first_shallow(LoopRegion->getEntry()))) {
3940 for (VPRecipeBase &R : *VPBB) {
3941 // Only handle single-scalar replicated loads with invariant addresses.
3942 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
3943 if (RepR->isPredicated() || !RepR->isSingleScalar() ||
3944 RepR->getOpcode() != Instruction::Load)
3945 continue;
3946
3947 VPValue *Addr = RepR->getOperand(0);
3948 if (Addr->isDefinedOutsideLoopRegions()) {
3950 if (!Loc.AATags.Scope)
3951 continue;
3952 CandidateLoads.push_back({RepR, Loc});
3953 }
3954 }
3955 if (R.mayWriteToMemory()) {
3957 if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
3958 return;
3959 Stores.push_back(*Loc);
3960 }
3961 }
3962 }
3963
3964 VPBasicBlock *Preheader = Plan.getVectorPreheader();
3965 for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
3966 // Hoist the load to the preheader if it doesn't alias with any stores
3967 // according to the noalias metadata. Other loads should have been hoisted
3968 // by other passes.
3969 const AAMDNodes &LoadAA = LoadLoc.AATags;
3970 if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
3972 LoadAA.Scope, StoreLoc.AATags.NoAlias);
3973 })) {
3974 LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
3975 }
3976 }
3977}
3978
3979 void VPlanTransforms::materializeConstantVectorTripCount(
3980 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
3981 PredicatedScalarEvolution &PSE) {
3982 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
3983 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
3984
3985 VPValue *TC = Plan.getTripCount();
3986 // Skip cases for which the trip count may be non-trivial to materialize.
3987 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
3988 // tail is required.
3989 if (!Plan.hasScalarTail() ||
3990 Plan.getMiddleBlock()->getSingleSuccessor() ==
3991 Plan.getScalarPreheader() ||
3992 !TC->isLiveIn())
3993 return;
3994
3995 // Materialize vector trip counts for constants early if it can simply
3996 // be computed as (Original TC / VF * UF) * VF * UF.
3997 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
3998 // tail-folded loops.
3999 ScalarEvolution &SE = *PSE.getSE();
4000 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
4001 if (!isa<SCEVConstant>(TCScev))
4002 return;
4003 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
4004 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
4005 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
4006 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4007}
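// Worked example (hypothetical): with a constant trip count of 100, VF = 8
// and UF = 2, VFxUF = 16 and the vector trip count folds to
// (100 udiv 16) * 16 = 96, which is recorded directly on the vector
// trip-count VPValue.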
4008
4009 void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
4010 VPBasicBlock *VectorPH) {
4011 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
4012 if (BTC->getNumUsers() == 0)
4013 return;
4014
4015 VPBuilder Builder(VectorPH, VectorPH->begin());
4016 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4017 auto *TCMO = Builder.createNaryOp(
4018 Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)},
4019 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
4020 BTC->replaceAllUsesWith(TCMO);
4021}
4022
4023 void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
4024 if (Plan.hasScalarVFOnly())
4025 return;
4026
4027 VPTypeAnalysis TypeInfo(Plan);
4028 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4029 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4030 vp_depth_first_shallow(Plan.getEntry()));
4031 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4032 vp_depth_first_shallow(LoopRegion->getEntry()));
4033 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
4034 // VPInstructions, excluding ones in replicate regions. Those are not
4035 // materialized explicitly yet. Those vector users are still handled in
4036 // VPReplicateRegion::execute(), via shouldPack().
4037 // TODO: materialize build vectors for replicating recipes in replicating
4038 // regions.
4039 for (VPBasicBlock *VPBB :
4040 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
4041 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4042 if (!isa<VPReplicateRecipe, VPInstruction>(&R))
4043 continue;
4044 auto *DefR = cast<VPRecipeWithIRFlags>(&R);
4045 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
4046 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4047 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
4048 };
4049 if ((isa<VPReplicateRecipe>(DefR) &&
4050 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
4051 (isa<VPInstruction>(DefR) &&
4053 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
4054 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
4055 continue;
4056
4057 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
4058 unsigned Opcode = ScalarTy->isStructTy()
4061 auto *BuildVector = new VPInstruction(Opcode, {DefR});
4062 BuildVector->insertAfter(DefR);
4063
4064 DefR->replaceUsesWithIf(
4065 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
4066 VPUser &U, unsigned) {
4067 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
4068 });
4069 }
4070 }
4071
4072 // Create explicit VPInstructions to convert vectors to scalars. The current
4073 // implementation is conservative - it may miss some cases that may or may not
4074 // be vector values. TODO: introduce Unpacks speculatively - remove them later
4075 // if they are known to operate on scalar values.
4076 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
4077 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4080 continue;
4081 for (VPValue *Def : R.definedValues()) {
4082 // Skip recipes that are single-scalar or only have their first lane
4083 // used.
4084 // TODO: The Defs skipped here may or may not be vector values.
4085 // Introduce Unpacks, and remove them later, if they are guaranteed to
4086 // produce scalar values.
4087 if (vputils::isSingleScalar(Def) || vputils::onlyFirstLaneUsed(Def))
4088 continue;
4089
4090 // At the moment, we create unpacks only for scalar users outside
4091 // replicate regions. Recipes inside replicate regions still extract the
4092 // required lanes implicitly.
4093 // TODO: Remove once replicate regions are unrolled completely.
4094 auto IsCandidateUnpackUser = [Def](VPUser *U) {
4095 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4096 return U->usesScalars(Def) &&
4097 (!ParentRegion || !ParentRegion->isReplicator());
4098 };
4099 if (none_of(Def->users(), IsCandidateUnpackUser))
4100 continue;
4101
4102 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
4103 if (R.isPhi())
4104 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
4105 else
4106 Unpack->insertAfter(&R);
4107 Def->replaceUsesWithIf(Unpack,
4108 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
4109 return IsCandidateUnpackUser(&U);
4110 });
4111 }
4112 }
4113 }
4114}
4115
4117 VPBasicBlock *VectorPHVPBB,
4118 bool TailByMasking,
4119 bool RequiresScalarEpilogue) {
4120 VPValue &VectorTC = Plan.getVectorTripCount();
4121 assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");
4122 // There's nothing to do if there are no users of the vector trip count or its
4123 // IR value has already been set.
4124 if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())
4125 return;
4126
4127 VPValue *TC = Plan.getTripCount();
4128 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
4129 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
4130 VPValue *Step = &Plan.getVFxUF();
4131
4132 // If the tail is to be folded by masking, round the number of iterations N
4133 // up to a multiple of Step instead of rounding down. This is done by first
4134 // adding Step-1 and then rounding down. Note that it's ok if this addition
4135 // overflows: the vector induction variable will eventually wrap to zero given
4136 // that it starts at zero and its Step is a power of two; the loop will then
4137 // exit, with the last early-exit vector comparison also producing all-true.
4138 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
4139 // is accounted for in emitIterationCountCheck that adds an overflow check.
4140 if (TailByMasking) {
4141 TC = Builder.createNaryOp(
4142 Instruction::Add,
4143 {TC, Builder.createNaryOp(Instruction::Sub,
4144 {Step, Plan.getConstantInt(TCTy, 1)})},
4145 DebugLoc::getCompilerGenerated(), "n.rnd.up");
4146 }
4147
4148 // Now we need to generate the expression for the part of the loop that the
4149 // vectorized body will execute. This is equal to N - (N % Step) if scalar
4150 // iterations are not required for correctness, or N - Step, otherwise. Step
4151 // is equal to the vectorization factor (number of SIMD elements) times the
4152 // unroll factor (number of SIMD instructions).
4153 VPValue *R =
4154 Builder.createNaryOp(Instruction::URem, {TC, Step},
4155 DebugLoc::getCompilerGenerated(), "n.mod.vf");
4156
4157 // There are cases where we *must* run at least one iteration in the remainder
4158 // loop. See the cost model for when this can happen. If the step evenly
4159 // divides the trip count, we set the remainder to be equal to the step. If
4160 // the step does not evenly divide the trip count, no adjustment is necessary
4161 // since there will already be scalar iterations. Note that the minimum
4162 // iterations check ensures that N >= Step.
4163 if (RequiresScalarEpilogue) {
4164 assert(!TailByMasking &&
4165 "requiring scalar epilogue is not supported with fail folding");
4166 VPValue *IsZero =
4167 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0));
4168 R = Builder.createSelect(IsZero, Step, R);
4169 }
4170
4171 VPValue *Res = Builder.createNaryOp(
4172 Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
4173 VectorTC.replaceAllUsesWith(Res);
4174}
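// Worked example (hypothetical): for trip count N = 1000 and Step = VF * UF =
// 16 without tail folding, n.mod.vf = 1000 urem 16 = 8 and n.vec = 992. If a
// scalar epilogue is required and N were exactly 992, the zero remainder is
// bumped to Step, giving n.vec = 976 so at least one scalar iteration runs.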
4175
4177 ElementCount VFEC) {
4178 VPBuilder Builder(VectorPH, VectorPH->begin());
4179 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4180 VPValue &VF = Plan.getVF();
4181 VPValue &VFxUF = Plan.getVFxUF();
4182 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
4183 // used.
4184 // TODO: Assert that they aren't used.
4185
4186 // If there are no users of the runtime VF, compute VFxUF by constant folding
4187 // the multiplication of VF and UF.
4188 if (VF.getNumUsers() == 0) {
4189 VPValue *RuntimeVFxUF =
4190 Builder.createElementCount(TCTy, VFEC * Plan.getUF());
4191 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
4192 return;
4193 }
4194
4195 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
4196 // vscale) * UF.
4197 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
4198 if (any_of(VF.users(), [&VF](VPUser *U) { return !U->usesScalars(&VF); })) {
4199 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
4200 VF.replaceUsesWithIf(
4201 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
4202 }
4203 VF.replaceAllUsesWith(RuntimeVF);
4204
4205 VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF());
4206 VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
4207 VFxUF.replaceAllUsesWith(MulByUF);
4208}
4209
4212 const DataLayout &DL = SE.getDataLayout();
4213 SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/false);
4214
4215 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
4216 BasicBlock *EntryBB = Entry->getIRBasicBlock();
4217 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
4218 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
4219 if (isa<VPIRInstruction>(&R))
4220 continue;
4221 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
4222 if (!ExpSCEV)
4223 break;
4224 const SCEV *Expr = ExpSCEV->getSCEV();
4225 Value *Res =
4226 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
4227 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
4228 VPValue *Exp = Plan.getOrAddLiveIn(Res);
4229 ExpSCEV->replaceAllUsesWith(Exp);
4230 if (Plan.getTripCount() == ExpSCEV)
4231 Plan.resetTripCount(Exp);
4232 ExpSCEV->eraseFromParent();
4233 }
4235 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
4236 "after any VPIRInstructions");
4237 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
4238 // to the VPIRBasicBlock.
4239 auto EI = Entry->begin();
4240 for (Instruction &I : drop_end(*EntryBB)) {
4241 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
4242 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
4243 EI++;
4244 continue;
4245 }
4246 VPIRInstruction::create(I)->insertBefore(*Entry, EI);
4247
4248
4249 return ExpandedSCEVs;
4250}
4251
4252/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
4253/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
4254/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
4255/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
4256/// an index-independent load if it feeds all wide ops at all indices (\p OpV
4257/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
4258/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
4259/// is defined at \p Idx of a load interleave group.
4260static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
4261 VPValue *OpV, unsigned Idx) {
4262 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
4263 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
4264 if (!Member0OpR)
4265 return Member0Op == OpV;
4266 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
4267 return !W->getMask() && Member0Op == OpV;
4268 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
4269 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
4270 return false;
4271}
4272
4273 /// Returns true if \p InterleaveR is a full interleave group with factor and number of
4274/// members both equal to \p VF. The interleave group must also access the full
4275/// vector width \p VectorRegWidth.
4276 static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
4277 ElementCount VF,
4278 VPTypeAnalysis &TypeInfo,
4279 TypeSize VectorRegWidth) {
4280 if (!InterleaveR || InterleaveR->getMask())
4281 return false;
4282
4283 Type *GroupElementTy = nullptr;
4284 if (InterleaveR->getStoredValues().empty()) {
4285 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
4286 if (!all_of(InterleaveR->definedValues(),
4287 [&TypeInfo, GroupElementTy](VPValue *Op) {
4288 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4289 }))
4290 return false;
4291 } else {
4292 GroupElementTy =
4293 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
4294 if (!all_of(InterleaveR->getStoredValues(),
4295 [&TypeInfo, GroupElementTy](VPValue *Op) {
4296 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4297 }))
4298 return false;
4299 }
4300
4301 unsigned VFMin = VF.getKnownMinValue();
4302 TypeSize GroupSize = TypeSize::get(
4303 GroupElementTy->getScalarSizeInBits() * VFMin, VF.isScalable());
4304 const auto *IG = InterleaveR->getInterleaveGroup();
4305 return IG->getFactor() == VFMin && IG->getNumMembers() == VFMin &&
4306 GroupSize == VectorRegWidth;
4307}
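// E.g. (sketch): with VF = 4, i32 members and a 128-bit vector register, a
// group qualifies only if its factor is 4, all 4 members are present, and
// 4 * 32 = 128 bits exactly covers the register width.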
4308
4309 /// Returns true if \p VPV is a narrow VPValue.
4310static bool isAlreadyNarrow(VPValue *VPV) {
4311 if (VPV->isLiveIn())
4312 return true;
4313 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
4314 return RepR && RepR->isSingleScalar();
4315}
4316
4317// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
4318// a narrow variant.
4319static VPValue *
4320 narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) {
4321 auto *R = V->getDefiningRecipe();
4322 if (!R || NarrowedOps.contains(V))
4323 return V;
4324
4325 if (isAlreadyNarrow(V))
4326 return V;
4327
4328 if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(R)) {
4329 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
4330 WideMember0->setOperand(
4331 Idx,
4332 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
4333 return V;
4334 }
4335
4336 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
4337 // Narrow interleave group to wide load, as transformed VPlan will only
4338 // process one original iteration.
4339 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
4340 auto *L = new VPWidenLoadRecipe(
4341 *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
4342 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
4343 L->insertBefore(LoadGroup);
4344 NarrowedOps.insert(L);
4345 return L;
4346 }
4347
4348 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
4349 assert(RepR->isSingleScalar() &&
4350 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
4351 "must be a single scalar load");
4352 NarrowedOps.insert(RepR);
4353 return RepR;
4354 }
4355
4356 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
4357 VPValue *PtrOp = WideLoad->getAddr();
4358 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
4359 PtrOp = VecPtr->getOperand(0);
4360 // Narrow wide load to uniform scalar load, as transformed VPlan will only
4361 // process one original iteration.
4362 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
4363 /*IsUniform*/ true,
4364 /*Mask*/ nullptr, {}, *WideLoad);
4365 N->insertBefore(WideLoad);
4366 NarrowedOps.insert(N);
4367 return N;
4368}
4369
4370 void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
4371 TypeSize VectorRegWidth) {
4372 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
4373 if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0)
4374 return;
4375
4376 VPTypeAnalysis TypeInfo(Plan);
4377
4378 SmallVector<VPInterleaveRecipe *> StoreGroups;
4379 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
4381 continue;
4382
4385 continue;
4386
4387 // Bail out on recipes not supported at the moment:
4388 // * phi recipes other than the canonical induction
4389 // * recipes writing to memory except interleave groups
4390 // Only support plans with a canonical induction phi.
4391 if (R.isPhi())
4392 return;
4393
4394 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
4395 if (R.mayWriteToMemory() && !InterleaveR)
4396 return;
4397
4398 // Do not narrow interleave groups if there are VectorPointer recipes and
4399 // the plan was unrolled. The recipe implicitly uses VF from
4400 // VPTransformState.
4401 // TODO: Remove restriction once the VF for the VectorPointer offset is
4402 // modeled explicitly as operand.
4403 if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
4404 return;
4405
4406 // All other ops are allowed, but we reject uses that cannot be converted
4407 // when checking all allowed consumers (store interleave groups) below.
4408 if (!InterleaveR)
4409 continue;
4410
4411 // Bail out on non-consecutive interleave groups.
4412 if (!isConsecutiveInterleaveGroup(InterleaveR, VF, TypeInfo,
4413 VectorRegWidth))
4414 return;
4415
4416 // Skip read interleave groups.
4417 if (InterleaveR->getStoredValues().empty())
4418 continue;
4419
4420 // Narrow interleave groups, if all operands are already matching narrow
4421 // ops.
4422 auto *Member0 = InterleaveR->getStoredValues()[0];
4423 if (isAlreadyNarrow(Member0) &&
4424 all_of(InterleaveR->getStoredValues(),
4425 [Member0](VPValue *VPV) { return Member0 == VPV; })) {
4426 StoreGroups.push_back(InterleaveR);
4427 continue;
4428 }
4429
4430 // For now, we only support full interleave groups storing load interleave
4431 // groups.
4432 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
4433 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
4434 if (!DefR)
4435 return false;
4436 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
4437 return IR && IR->getInterleaveGroup()->isFull() &&
4438 IR->getVPValue(Op.index()) == Op.value();
4439 })) {
4440 StoreGroups.push_back(InterleaveR);
4441 continue;
4442 }
4443
4444 // Check if all values feeding InterleaveR are matching wide recipes whose
4445 // operands can be narrowed.
4446 auto *WideMember0 =
4447 dyn_cast_or_null<VPWidenRecipe>(InterleaveR->getStoredValues()[0]);
4448 if (!WideMember0)
4449 return;
4450 for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
4451 auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
4452 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
4453 R->getNumOperands() > 2)
4454 return;
4455 if (any_of(enumerate(R->operands()),
4456 [WideMember0, Idx = I](const auto &P) {
4457 const auto &[OpIdx, OpV] = P;
4458 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
4459 }))
4460 return;
4461 }
4462 StoreGroups.push_back(InterleaveR);
4463 }
4464
4465 if (StoreGroups.empty())
4466 return;
4467
4468 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
4469 SmallPtrSet<VPValue *, 4> NarrowedOps;
4470 // Narrow operation tree rooted at store groups.
4471 for (auto *StoreGroup : StoreGroups) {
4472 VPValue *Res =
4473 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
4474 auto *SI =
4475 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
4476 auto *S = new VPWidenStoreRecipe(
4477 *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
4478 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
4479 S->insertBefore(StoreGroup);
4480 StoreGroup->eraseFromParent();
4481 }
4482
4483 // Adjust induction to reflect that the transformed plan only processes one
4484 // original iteration.
4485 auto *CanIV = VectorLoop->getCanonicalIV();
4486 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
4487 VPBuilder PHBuilder(Plan.getVectorPreheader());
4488
4489 VPValue *UF = Plan.getOrAddLiveIn(
4490 ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF()));
4491 if (VF.isScalable()) {
4492 VPValue *VScale = PHBuilder.createElementCount(
4493 CanIV->getScalarType(), ElementCount::getScalable(1));
4494 VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
4495 Inc->setOperand(1, VScaleUF);
4496 Plan.getVF().replaceAllUsesWith(VScale);
4497 } else {
4498 Inc->setOperand(1, UF);
4499 Plan.getVF().replaceAllUsesWith(
4500 Plan.getConstantInt(CanIV->getScalarType(), 1));
4501 }
4502 removeDeadRecipes(Plan);
4503}
4504
4505/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
4506/// BranchOnCond recipe.
4508 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
4509 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
4510 auto *MiddleTerm =
4511 dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator());
4512 // Only add branch metadata if there is a (conditional) terminator.
4513 if (!MiddleTerm)
4514 return;
4515
4516 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
4517 "must have a BranchOnCond");
4518 // Assume that `TripCount % VectorStep ` is equally distributed.
4519 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
4520 if (VF.isScalable() && VScaleForTuning.has_value())
4521 VectorStep *= *VScaleForTuning;
4522 assert(VectorStep > 0 && "trip count should not be zero");
4523 MDBuilder MDB(Plan.getContext());
4524 MDNode *BranchWeights =
4525 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
4526 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
4527}
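// Worked example (hypothetical): with VF = 4, UF = 2 and no vscale tuning,
// VectorStep = 8, so the middle-block branch receives weights {1, 7},
// reflecting the assumption that the remainder TripCount % 8 is uniformly
// distributed.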
4528
4529/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
4530/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
4531/// the end value of the induction.
4532 static VPInstruction *addResumePhiRecipeForInduction(
4533 VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
4534 VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
4535 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
4536 // Truncated wide inductions resume from the last lane of their vector value
4537 // in the last vector iteration which is handled elsewhere.
4538 if (WideIntOrFp && WideIntOrFp->getTruncInst())
4539 return nullptr;
4540
4541 VPValue *Start = WideIV->getStartValue();
4542 VPValue *Step = WideIV->getStepValue();
4543 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
4544 VPValue *EndValue = VectorTC;
4545 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
4546 EndValue = VectorPHBuilder.createDerivedIV(
4547 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
4548 Start, VectorTC, Step);
4549 }
4550
4551 // EndValue is derived from the vector trip count (which has the same type as
4552 // the widest induction) and thus may be wider than the induction here.
4553 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
4554 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
4555 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
4556 ScalarTypeOfWideIV,
4557 WideIV->getDebugLoc());
4558 }
4559
4560 auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi(
4561 {EndValue, Start}, WideIV->getDebugLoc(), "bc.resume.val");
4562 return ResumePhiRecipe;
4563}
4564
4565 void VPlanTransforms::addScalarResumePhis(
4566 VPlan &Plan, VPRecipeBuilder &Builder,
4567 DenseMap<VPValue *, VPValue *> &IVEndValues) {
4568 VPTypeAnalysis TypeInfo(Plan);
4569 auto *ScalarPH = Plan.getScalarPreheader();
4570 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
4571 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
4572 VPBuilder VectorPHBuilder(
4573 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
4574 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
4575 VPBuilder ScalarPHBuilder(ScalarPH);
4576 for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) {
4577 auto *ScalarPhiIRI = cast<VPIRPhi>(&ScalarPhiR);
4578
4579 // TODO: Extract final value from induction recipe initially, optimize to
4580 // pre-computed end value together in optimizeInductionExitUsers.
4581 auto *VectorPhiR =
4582 cast<VPHeaderPHIRecipe>(Builder.getRecipe(&ScalarPhiIRI->getIRPhi()));
4583 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
4584 if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction(
4585 WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
4586 &Plan.getVectorTripCount())) {
4587 assert(isa<VPPhi>(ResumePhi) && "Expected a phi");
4588 IVEndValues[WideIVR] = ResumePhi->getOperand(0);
4589 ScalarPhiIRI->addOperand(ResumePhi);
4590 continue;
4591 }
4592 // TODO: Also handle truncated inductions here. Computing end-values
4593 // separately should be done as VPlan-to-VPlan optimization, after
4594 // legalizing all resume values to use the last lane from the loop.
4595 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
4596 "should only skip truncated wide inductions");
4597 continue;
4598 }
4599
4600 // The backedge value provides the value to resume coming out of a loop,
4601 // which for FORs is a vector whose last element needs to be extracted. The
4602 // start value provides the value if the loop is bypassed.
4603 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
4604 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
4605 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
4606 "Cannot handle loops with uncountable early exits");
4607 if (IsFOR)
4608 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
4609 VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {},
4610 "vector.recur.extract");
4611 StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
4612 auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
4613 {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
4614 ScalarPhiIRI->addOperand(ResumePhiR);
4615 }
4616}
4617
4618 void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
4619 VFRange &Range) {
4620 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
4621 auto *ScalarPHVPBB = Plan.getScalarPreheader();
4622 auto *MiddleVPBB = Plan.getMiddleBlock();
4623 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
4624 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
4625
4626 auto IsScalableOne = [](ElementCount VF) -> bool {
4627 return VF == ElementCount::getScalable(1);
4628 };
4629
4630 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
4631 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
4632 if (!FOR)
4633 continue;
4634
4635 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
4636 "Cannot handle loops with uncountable early exits");
4637
4638 // This is the second phase of vectorizing first-order recurrences, creating
4639 // extract for users outside the loop. An overview of the transformation is
4640 // described below. Suppose we have the following loop with some use after
4641 // the loop of the last a[i-1],
4642 //
4643 // for (int i = 0; i < n; ++i) {
4644 // t = a[i - 1];
4645 // b[i] = a[i] - t;
4646 // }
4647 // use t;
4648 //
4649 // There is a first-order recurrence on "a". For this loop, the shorthand
4650 // scalar IR looks like:
4651 //
4652 // scalar.ph:
4653 // s.init = a[-1]
4654 // br scalar.body
4655 //
4656 // scalar.body:
4657 // i = phi [0, scalar.ph], [i+1, scalar.body]
4658 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
4659 // s2 = a[i]
4660 // b[i] = s2 - s1
4661 // br cond, scalar.body, exit.block
4662 //
4663 // exit.block:
4664 // use = lcssa.phi [s1, scalar.body]
4665 //
4666 // In this example, s1 is a recurrence because it's value depends on the
4667 // previous iteration. In the first phase of vectorization, we created a
4668 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
4669 // for users in the scalar preheader and exit block.
4670 //
4671 // vector.ph:
4672 // v_init = vector(..., ..., ..., a[-1])
4673 // br vector.body
4674 //
4675 // vector.body
4676 // i = phi [0, vector.ph], [i+4, vector.body]
4677 // v1 = phi [v_init, vector.ph], [v2, vector.body]
4678 // v2 = a[i, i+1, i+2, i+3]
4679 // b[i] = v2 - v1
4680 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
4681 // b[i, i+1, i+2, i+3] = v2 - v1
4682 // br cond, vector.body, middle.block
4683 //
4684 // middle.block:
4685 // vector.recur.extract.for.phi = v2(2)
4686 // vector.recur.extract = v2(3)
4687 // br cond, scalar.ph, exit.block
4688 //
4689 // scalar.ph:
4690 // scalar.recur.init = phi [vector.recur.extract, middle.block],
4691 // [s.init, otherwise]
4692 // br scalar.body
4693 //
4694 // scalar.body:
4695 // i = phi [0, scalar.ph], [i+1, scalar.body]
4696 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
4697 // s2 = a[i]
4698 // b[i] = s2 - s1
4699 // br cond, scalar.body, exit.block
4700 //
4701 // exit.block:
4702 // lo = lcssa.phi [s1, scalar.body],
4703 // [vector.recur.extract.for.phi, middle.block]
4704 //
4705 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
4706 // Extract the penultimate value of the recurrence and use it as operand for
4707 // the VPIRInstruction modeling the phi.
4708 for (VPUser *U : FOR->users()) {
4709 using namespace llvm::VPlanPatternMatch;
4710 if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
4711 continue;
4712 // For VF vscale x 1, if vscale = 1, we are unable to extract the
4713 // penultimate value of the recurrence. Instead we rely on the existing
4714 // extract of the last element from the result of
4715 // VPInstruction::FirstOrderRecurrenceSplice.
4716 // TODO: Consider vscale_range info and UF.
4717 if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne,
4718 Range))
4719 return;
4720 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
4721 VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
4722 {}, "vector.recur.extract.for.phi");
4723 cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement);
4724 }
4725 }
4726}
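To summarize the rewrite applied to each exit user in the inner loop above, here is a minimal, hedged sketch (not part of the file). It assumes a MiddleBuilder positioned at the middle block, uses only the calls shown above, omits the VF = vscale x 1 range clamp for brevity, and the helper name is hypothetical.

// Sketch only: redirect one exit user U of a first-order recurrence phi FOR to
// the penultimate element of the recurrence.
static void rewriteFORExitUserSketch(VPUser *U,
                                     VPFirstOrderRecurrencePHIRecipe *FOR,
                                     VPBuilder &MiddleBuilder) {
  using namespace llvm::VPlanPatternMatch;
  // Only users extracting the last element of FOR model LCSSA phis of the
  // recurrence and need redirecting.
  if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
    return;
  // The scalar exit phi observes the previous iteration's value, i.e. the
  // penultimate element of the backedge vector (v2(2) in the comment above).
  VPValue *Penultimate = MiddleBuilder.createNaryOp(
      VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()}, {},
      "vector.recur.extract.for.phi");
  cast<VPInstruction>(U)->replaceAllUsesWith(Penultimate);
}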
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:80
static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, ScalarEvolution &SE)
#define I(x, y, z)
Definition MD5.cpp:57
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
#define P(N)
if(PassOpts->AAPipeline)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static bool sinkScalarOperands(VPlan &Plan)
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R)
Return true if we do not know how to (mechanically) hoist or sink R out of a loop region.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static VPInstruction * addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Create and return a ResumePhi for WideIV, unless it is truncated.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck)
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL)
Replace recipes with their EVL variants.
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static VPValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
From the definition of llvm.experimental.get.vector.length, VPInstruction::ExplicitVectorLength(AVL) ...
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, ScalarEvolution &SE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static void recursivelyDeleteDeadRecipes(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, ScalarEvolution &SE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ElementCount VF, VPTypeAnalysis &TypeInfo, TypeSize VectorRegWidth)
Returns true if IR is a full interleave group with factor and number of members both equal to VF.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1513
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1489
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:163
static DebugLoc getUnknown()
Definition DebugLoc.h:162
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:248
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1541
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1078
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:103
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:58
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:101
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:149
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Provides information about what library functions are available for the current target.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize get(ScalarTy Quantity, bool Scalable)
Definition TypeSize.h:340
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:88
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:97
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
op_range operands()
Definition User.h:292
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3622
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3983
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4058
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4010
iterator end()
Definition VPlan.h:4020
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4018
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4071
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:216
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:578
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:550
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:623
const VPRecipeBase & back() const
Definition VPlan.h:4032
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2480
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2514
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2504
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2520
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2500
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:80
VPRegionBlock * getParent()
Definition VPlan.h:172
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:186
size_t getNumSuccessors() const
Definition VPlan.h:218
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition VPlan.h:321
size_t getNumPredecessors() const
Definition VPlan.h:219
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:203
VPlan * getPlan()
Definition VPlan.cpp:161
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:214
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:166
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:263
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:208
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:197
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:211
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:232
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:151
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:170
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:189
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3033
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags={}, const VPIRMetadata &Metadata={})
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3564
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:432
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:427
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:405
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:417
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3733
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition VPlan.h:3654
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3078
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2055
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2095
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2084
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4136
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:4160
Class to record and manage LLVM IR flags.
Definition VPlan.h:609
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \I , if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:982
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetada object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1031
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1115
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1066
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1061
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1058
@ CanonicalIVIncrementForPart
Definition VPlan.h:1051
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2622
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2614
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2643
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2696
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2654
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3220
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:386
VPRegionBlock * getRegion()
Definition VPlan.h:4288
VPBasicBlock * getParent()
Definition VPlan.h:407
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:478
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:2907
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition VPlan.h:2746
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4171
const VPBlockBase * getEntry() const
Definition VPlan.h:4207
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4282
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4239
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4224
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4269
const VPBlockBase * getExiting() const
Definition VPlan.h:4219
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4232
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2952
bool isSingleScalar() const
Definition VPlan.h:2993
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3017
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3803
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:530
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:595
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:207
operand_range operands()
Definition VPlanValue.h:275
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:251
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:246
void addOperand(VPValue *Operand)
Definition VPlanValue.h:240
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:48
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1374
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:131
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition VPlanValue.h:183
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:85
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:193
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1377
unsigned getNumUsers() const
Definition VPlanValue.h:113
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition VPlanValue.h:178
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1381
user_range users()
Definition VPlanValue.h:134
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1917
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3696
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1545
Instruction::CastOps getOpcode() const
Definition VPlan.h:1581
A recipe for handling GEP instructions.
Definition VPlan.h:1841
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2119
PHINode * getPHINode() const
Definition VPlan.h:2161
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2147
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2164
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2195
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2271
A recipe for widening vector intrinsics.
Definition VPlan.h:1595
A common base class for widening memory operations.
Definition VPlan.h:3263
A recipe for widened phis.
Definition VPlan.h:2329
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1497
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition VPlan.h:4301
bool hasVF(ElementCount VF) const
Definition VPlan.h:4506
LLVMContext & getContext() const
Definition VPlan.h:4494
VPBasicBlock * getEntry()
Definition VPlan.h:4394
VPValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4485
bool hasScalableVF() const
Definition VPlan.h:4507
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4492
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4488
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4456
VPValue * getTrue()
Return a VPValue wrapping i1 true.
Definition VPlan.h:4563
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4477
unsigned getUF() const
Definition VPlan.h:4526
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4640
bool hasUF(unsigned UF) const
Definition VPlan.h:4524
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4446
VPValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4569
void setVF(ElementCount VF)
Definition VPlan.h:4500
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4539
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1011
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4470
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4419
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4618
VPValue * getFalse()
Return a VPValue wrapping i1 false.
Definition VPlan.h:4566
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4548
bool hasScalarVFOnly() const
Definition VPlan.h:4517
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4437
ArrayRef< VPValue * > getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4588
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4442
VPValue * getLiveIn(Value *V) const
Return the live-in VPValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4585
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4399
void setUF(unsigned UF)
Definition VPlan.h:4531
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4672
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2763
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
AllRecipe_commutative_match< Opcode, Op0_t, Op1_t > m_c_Binary(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLanePerPart, Op0_t > m_ExtractLastLanePerPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExtractLastElement, Op0_t > m_ExtractLastElement(const Op0_t &Op0)
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
specific_intval< 1 > m_True()
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector is matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPIRFlags getFlagsFromIndDesc(const InductionDescriptor &ID)
Extracts and returns NoWrap and FastMath flags from the induction binop in ID.
Definition VPlanUtils.h:85
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
const SCEV * getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:236
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1622
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:550
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1718
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
RecurKind
These are the kinds of recurrences that we support.
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1954
DWARFExpression::Operation Op
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1961
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1758
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
Definition STLExtras.h:2108
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2088
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:761
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:784
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:787
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2371
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3395
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3354
A recipe for widening select instructions.
Definition VPlan.h:1794
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3479
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3436
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void addScalarResumePhis(VPlan &Plan, VPRecipeBuilder &Builder, DenseMap< VPValue *, VPValue * > &IVEndValues)
Create resume phis in the scalar preheader for first-order recurrences, reductions and inductions,...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on Plan, forwarding extra arguments to the transform.
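A hedged sketch of the runPass call pattern, matching the bool-returning overload declared above and forwarding the extra VPBuilder argument of adjustFixedOrderRecurrences; Plan and Builder are assumed to be supplied by the caller, and only this file's own transforms header is included.
#include "VPlanTransforms.h"

// Run adjustFixedOrderRecurrences on Plan via the runPass helper, which
// forwards Builder as the extra argument and returns the transform's result.
static bool fixRecurrences(llvm::VPlan &Plan, llvm::VPBuilder &Builder) {
  return llvm::VPlanTransforms::runPass(
      llvm::VPlanTransforms::adjustFixedOrderRecurrences, Plan, Builder);
}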
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, TypeSize VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users of first-order recurrences in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.