VPlanTransforms.cpp (LLVM 22.0.0git)
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SetVector.h"
28#include "llvm/ADT/TypeSwitch.h"
34#include "llvm/IR/Intrinsics.h"
35#include "llvm/IR/MDBuilder.h"
39
40using namespace llvm;
41using namespace VPlanPatternMatch;
42
44 "enable-wide-lane-mask", cl::init(false), cl::Hidden,
45 cl::desc("Enable use of wide get active lane mask instructions"));
46
47bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
48 VPlan &Plan,
49 function_ref<const InductionDescriptor *(PHINode *)>
50 GetIntOrFpInductionDescriptor,
51 const TargetLibraryInfo &TLI) {
52
53 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
54 Plan.getVectorLoopRegion());
55 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
69 Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 const auto *II = GetIntOrFpInductionDescriptor(Phi);
75 if (!II) {
76 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
77 for (VPValue *Op : PhiR->operands())
78 NewRecipe->addOperand(Op);
79 } else {
80 VPValue *Start = Plan.getOrAddLiveIn(II->getStartValue());
81 VPValue *Step =
82 vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep());
83 NewRecipe = new VPWidenIntOrFpInductionRecipe(
84 Phi, Start, Step, &Plan.getVF(), *II, Ingredient.getDebugLoc());
85 }
86 } else {
87 assert(isa<VPInstruction>(&Ingredient) &&
88 "only VPInstructions expected here");
89 assert(!isa<PHINode>(Inst) && "phis should be handled above");
90 // Create VPWidenMemoryRecipe for loads and stores.
91 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
92 NewRecipe = new VPWidenLoadRecipe(
93 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
94 false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
95 Ingredient.getDebugLoc());
96 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
97 NewRecipe = new VPWidenStoreRecipe(
98 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
99 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
100 VPIRMetadata(*Store), Ingredient.getDebugLoc());
101 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
102 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
103 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
104 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
105 if (VectorID == Intrinsic::not_intrinsic)
106 return false;
107 NewRecipe = new VPWidenIntrinsicRecipe(
108 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
109 drop_end(Ingredient.operands()), CI->getType(),
110 CI->getDebugLoc());
111 } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
112 NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
113 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
114 NewRecipe = new VPWidenCastRecipe(
115 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
116 } else {
117 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
118 }
119 }
120
121 NewRecipe->insertBefore(&Ingredient);
122 if (NewRecipe->getNumDefinedValues() == 1)
123 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
124 else
125 assert(NewRecipe->getNumDefinedValues() == 0 &&
126 "Only recpies with zero or one defined values expected");
127 Ingredient.eraseFromParent();
128 }
129 }
130 return true;
131}
132
133static bool sinkScalarOperands(VPlan &Plan) {
134 auto Iter = vp_depth_first_deep(Plan.getEntry());
135 bool Changed = false;
136 // First, collect the operands of all recipes in replicate blocks as seeds for
137 // sinking.
138 SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
139 for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
140 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
141 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
142 continue;
143 VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]);
144 if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
145 continue;
146 for (auto &Recipe : *VPBB) {
147 for (VPValue *Op : Recipe.operands())
148 if (auto *Def =
149 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
150 WorkList.insert({VPBB, Def});
151 }
152 }
153
154 bool ScalarVFOnly = Plan.hasScalarVFOnly();
155 // Try to sink each replicate or scalar IV steps recipe in the worklist.
156 for (unsigned I = 0; I != WorkList.size(); ++I) {
157 VPBasicBlock *SinkTo;
158 VPSingleDefRecipe *SinkCandidate;
159 std::tie(SinkTo, SinkCandidate) = WorkList[I];
160 if (SinkCandidate->getParent() == SinkTo ||
161 SinkCandidate->mayHaveSideEffects() ||
162 SinkCandidate->mayReadOrWriteMemory())
163 continue;
164 if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
165 if (!ScalarVFOnly && RepR->isSingleScalar())
166 continue;
167 } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate))
168 continue;
169
170 bool NeedsDuplicating = false;
171 // All recipe users of the sink candidate must be in the same block SinkTo,
172 // or all users outside of SinkTo must be uniform-after-vectorization
173 // (i.e., only the first lane is used). In the latter case, we need to
174 // duplicate SinkCandidate.
175 auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,
176 SinkCandidate](VPUser *U) {
177 auto *UI = cast<VPRecipeBase>(U);
178 if (UI->getParent() == SinkTo)
179 return true;
180 NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate);
181 // We only know how to duplicate VPReplicateRecipes and
182 // VPScalarIVStepsRecipes for now.
183 return NeedsDuplicating &&
184 isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(SinkCandidate);
185 };
186 if (!all_of(SinkCandidate->users(), CanSinkWithUser))
187 continue;
188
189 if (NeedsDuplicating) {
190 if (ScalarVFOnly)
191 continue;
192 VPSingleDefRecipe *Clone;
193 if (auto *SinkCandidateRepR =
194 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
195 // TODO: Handle converting to uniform recipes as separate transform,
196 // then cloning should be sufficient here.
197 Instruction *I = SinkCandidate->getUnderlyingInstr();
198 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
199 nullptr /*Mask*/, *SinkCandidateRepR);
200 // TODO: add ".cloned" suffix to name of Clone's VPValue.
201 } else {
202 Clone = SinkCandidate->clone();
203 }
204
205 Clone->insertBefore(SinkCandidate);
206 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
207 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
208 });
209 }
210 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
211 for (VPValue *Op : SinkCandidate->operands())
212 if (auto *Def =
213 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
214 WorkList.insert({SinkTo, Def});
215 Changed = true;
216 }
217 return Changed;
218}
219
220/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
221/// the mask.
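/// Illustrative sketch (schematic, not an actual VPlan dump) of the shape this
/// helper recognizes; the returned value is %mask:
///   replicate.region.entry:
///     BRANCH-ON-MASK %mask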
222static VPValue *getPredicatedMask(VPRegionBlock *R) {
223 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
224 if (!EntryBB || EntryBB->size() != 1 ||
225 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
226 return nullptr;
227
228 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
229}
230
231/// If \p R is a triangle region, return the 'then' block of the triangle.
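/// Illustrative sketch (schematic only): in a triangle
///   entry -> then, continue
///   then  -> continue
/// the helper returns the "then" block; any other shape yields nullptr.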
232static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
233 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
234 if (EntryBB->getNumSuccessors() != 2)
235 return nullptr;
236
237 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
238 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
239 if (!Succ0 || !Succ1)
240 return nullptr;
241
242 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
243 return nullptr;
244 if (Succ0->getSingleSuccessor() == Succ1)
245 return Succ0;
246 if (Succ1->getSingleSuccessor() == Succ0)
247 return Succ1;
248 return nullptr;
249}
250
251// Merge replicate regions in their successor region, if a replicate region
252// is connected to a successor replicate region with the same predicate by a
253// single, empty VPBasicBlock.
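// For illustration (schematic only):
//   region.a (BRANCH-ON-MASK %m) -> empty.bb -> region.b (BRANCH-ON-MASK %m)
// becomes a single replicate region guarded by %m, with region.a's recipes
// moved into region.b's "then" block and region.a removed.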
254static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
255 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
256
257 // Collect replicate regions followed by an empty block, followed by another
258 // replicate region with matching masks to process front. This is to avoid
259 // iterator invalidation issues while merging regions.
260 SmallVector<VPRegionBlock *> WorkList;
261 for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
262 vp_depth_first_deep(Plan.getEntry()))) {
263 if (!Region1->isReplicator())
264 continue;
265 auto *MiddleBasicBlock =
266 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
267 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
268 continue;
269
270 auto *Region2 =
271 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
272 if (!Region2 || !Region2->isReplicator())
273 continue;
274
275 VPValue *Mask1 = getPredicatedMask(Region1);
276 VPValue *Mask2 = getPredicatedMask(Region2);
277 if (!Mask1 || Mask1 != Mask2)
278 continue;
279
280 assert(Mask1 && Mask2 && "both regions must have conditions");
281 WorkList.push_back(Region1);
282 }
283
284 // Move recipes from Region1 to its successor region, if both are triangles.
285 for (VPRegionBlock *Region1 : WorkList) {
286 if (TransformedRegions.contains(Region1))
287 continue;
288 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
289 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
290
291 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
292 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
293 if (!Then1 || !Then2)
294 continue;
295
296 // Note: No fusion-preventing memory dependencies are expected in either
297 // region. Such dependencies should be rejected during earlier dependence
298 // checks, which guarantee accesses can be re-ordered for vectorization.
299 //
300 // Move recipes to the successor region.
301 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
302 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
303
304 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
305 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
306
307 // Move VPPredInstPHIRecipes from the merge block to the successor region's
308 // merge block. Update all users inside the successor region to use the
309 // original values.
310 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
311 VPValue *PredInst1 =
312 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
313 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
314 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
315 return cast<VPRecipeBase>(&U)->getParent() == Then2;
316 });
317
318 // Remove phi recipes that are unused after merging the regions.
319 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
320 Phi1ToMove.eraseFromParent();
321 continue;
322 }
323 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
324 }
325
326 // Remove the dead recipes in Region1's entry block.
327 for (VPRecipeBase &R :
328 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
329 R.eraseFromParent();
330
331 // Finally, remove the first region.
332 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
333 VPBlockUtils::disconnectBlocks(Pred, Region1);
334 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
335 }
336 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
337 TransformedRegions.insert(Region1);
338 }
339
340 return !TransformedRegions.empty();
341}
342
343static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
344 VPlan &Plan) {
345 Instruction *Instr = PredRecipe->getUnderlyingInstr();
346 // Build the triangular if-then region.
347 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
348 assert(Instr->getParent() && "Predicated instruction not in any basic block");
349 auto *BlockInMask = PredRecipe->getMask();
350 auto *MaskDef = BlockInMask->getDefiningRecipe();
351 auto *BOMRecipe = new VPBranchOnMaskRecipe(
352 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
353 auto *Entry =
354 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
355
356 // Replace predicated replicate recipe with a replicate recipe without a
357 // mask but in the replicate region.
358 auto *RecipeWithoutMask = new VPReplicateRecipe(
359 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
360 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe);
361 auto *Pred =
362 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
363
364 VPPredInstPHIRecipe *PHIRecipe = nullptr;
365 if (PredRecipe->getNumUsers() != 0) {
366 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
367 RecipeWithoutMask->getDebugLoc());
368 PredRecipe->replaceAllUsesWith(PHIRecipe);
369 PHIRecipe->setOperand(0, RecipeWithoutMask);
370 }
371 PredRecipe->eraseFromParent();
372 auto *Exiting =
373 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
374 VPRegionBlock *Region =
375 Plan.createReplicateRegion(Entry, Exiting, RegionName);
376
377 // Note: first set Entry as region entry and then connect successors starting
378 // from it in order, to propagate the "parent" of each VPBasicBlock.
379 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
380 VPBlockUtils::connectBlocks(Pred, Exiting);
381
382 return Region;
383}
384
385static void addReplicateRegions(VPlan &Plan) {
386 SmallVector<VPReplicateRecipe *> WorkList;
387 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
388 vp_depth_first_deep(Plan.getEntry()))) {
389 for (VPRecipeBase &R : *VPBB)
390 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
391 if (RepR->isPredicated())
392 WorkList.push_back(RepR);
393 }
394 }
395
396 unsigned BBNum = 0;
397 for (VPReplicateRecipe *RepR : WorkList) {
398 VPBasicBlock *CurrentBlock = RepR->getParent();
399 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
400
401 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
402 SplitBlock->setName(
403 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
404 // Record predicated instructions for above packing optimizations.
405 VPRegionBlock *Region = createReplicateRegion(RepR, Plan);
406 Region->setParent(CurrentBlock->getParent());
408
409 VPRegionBlock *ParentRegion = Region->getParent();
410 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
411 ParentRegion->setExiting(SplitBlock);
412 }
413}
414
415/// Remove redundant VPBasicBlocks by merging them into their predecessor if
416/// the predecessor has a single successor.
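/// For example (schematic only), if vector.body is the single predecessor of
/// vector.latch and has it as its single successor, the recipes of
/// vector.latch are appended to vector.body, vector.latch's successors are
/// rewired to vector.body, and vector.latch becomes dead.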
417static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
418 SmallVector<VPBasicBlock *> WorkList;
419 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
420 vp_depth_first_deep(Plan.getEntry()))) {
421 // Don't fold the blocks in the skeleton of the Plan into their single
422 // predecessors for now.
423 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
424 if (!VPBB->getParent())
425 continue;
426 auto *PredVPBB =
427 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
428 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
429 isa<VPIRBasicBlock>(PredVPBB))
430 continue;
431 WorkList.push_back(VPBB);
432 }
433
434 for (VPBasicBlock *VPBB : WorkList) {
435 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
436 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
437 R.moveBefore(*PredVPBB, PredVPBB->end());
438 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
439 auto *ParentRegion = VPBB->getParent();
440 if (ParentRegion && ParentRegion->getExiting() == VPBB)
441 ParentRegion->setExiting(PredVPBB);
442 for (auto *Succ : to_vector(VPBB->successors())) {
443 VPBlockUtils::disconnectBlocks(VPBB, Succ);
444 VPBlockUtils::connectBlocks(PredVPBB, Succ);
445 }
446 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
447 }
448 return !WorkList.empty();
449}
450
451void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
452 // Convert masked VPReplicateRecipes to if-then region blocks.
453 addReplicateRegions(Plan);
454
455 bool ShouldSimplify = true;
456 while (ShouldSimplify) {
457 ShouldSimplify = sinkScalarOperands(Plan);
458 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
459 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
460 }
461}
462
463/// Remove redundant casts of inductions.
464///
465/// Such redundant casts are casts of induction variables that can be ignored,
466/// because we already proved that the casted phi is equal to the uncasted phi
467/// in the vectorized loop. There is no need to vectorize the cast - the same
468/// value can be used for both the phi and casts in the vector loop.
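/// For example (schematic only), if the induction descriptor of a widened IV
/// %iv records a cast chain ending in %cast, every user of %cast is rewired to
/// use %iv directly; the now-dead cast recipes are removed by later cleanup.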
469static void removeRedundantInductionCasts(VPlan &Plan) {
470 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
471 auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
472 if (!IV || IV->getTruncInst())
473 continue;
474
475 // A sequence of IR Casts has potentially been recorded for IV, which
476 // *must be bypassed* when the IV is vectorized, because the vectorized IV
477 // will produce the desired casted value. This sequence forms a def-use
478 // chain and is provided in reverse order, ending with the cast that uses
479 // the IV phi. Search for the recipe of the last cast in the chain and
480 // replace it with the original IV. Note that only the final cast is
481 // expected to have users outside the cast-chain and the dead casts left
482 // over will be cleaned up later.
483 auto &Casts = IV->getInductionDescriptor().getCastInsts();
484 VPValue *FindMyCast = IV;
485 for (Instruction *IRCast : reverse(Casts)) {
486 VPSingleDefRecipe *FoundUserCast = nullptr;
487 for (auto *U : FindMyCast->users()) {
488 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
489 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
490 FoundUserCast = UserCast;
491 break;
492 }
493 }
494 FindMyCast = FoundUserCast;
495 }
496 FindMyCast->replaceAllUsesWith(IV);
497 }
498}
499
500/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
501/// recipe, if it exists.
502static void removeRedundantCanonicalIVs(VPlan &Plan) {
503 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
504 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
505 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
506 for (VPUser *U : CanonicalIV->users()) {
507 WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
508 if (WidenNewIV)
509 break;
510 }
511
512 if (!WidenNewIV)
513 return;
514
515 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
516 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
517 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
518
519 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
520 continue;
521
522 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
523 // everything WidenNewIV's users need. That is, WidenOriginalIV will
524 // generate a vector phi or all users of WidenNewIV demand the first lane
525 // only.
526 if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
527 vputils::onlyFirstLaneUsed(WidenNewIV)) {
528 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
529 WidenNewIV->eraseFromParent();
530 return;
531 }
532 }
533}
534
535/// Returns true if \p R is dead and can be removed.
536static bool isDeadRecipe(VPRecipeBase &R) {
537 // Do remove conditional assume instructions as their conditions may be
538 // flattened.
539 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
540 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
542 if (IsConditionalAssume)
543 return true;
544
545 if (R.mayHaveSideEffects())
546 return false;
547
548 // Recipe is dead if no user keeps the recipe alive.
549 return all_of(R.definedValues(),
550 [](VPValue *V) { return V->getNumUsers() == 0; });
551}
552
553void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
554 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
555 vp_post_order_deep(Plan.getEntry()))) {
556 // The recipes in the block are processed in reverse order, to catch chains
557 // of dead recipes.
558 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
559 if (isDeadRecipe(R)) {
560 R.eraseFromParent();
561 continue;
562 }
563
564 // Check if R is a dead VPPhi <-> update cycle and remove it.
565 auto *PhiR = dyn_cast<VPPhi>(&R);
566 if (!PhiR || PhiR->getNumOperands() != 2 || PhiR->getNumUsers() != 1)
567 continue;
568 VPValue *Incoming = PhiR->getOperand(1);
569 if (*PhiR->user_begin() != Incoming->getDefiningRecipe() ||
570 Incoming->getNumUsers() != 1)
571 continue;
572 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
573 PhiR->eraseFromParent();
574 Incoming->getDefiningRecipe()->eraseFromParent();
575 }
576 }
577}
578
579static VPScalarIVStepsRecipe *
580createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
581 Instruction::BinaryOps InductionOpcode,
582 FPMathOperator *FPBinOp, Instruction *TruncI,
583 VPValue *StartV, VPValue *Step, DebugLoc DL,
584 VPBuilder &Builder) {
585 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
586 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
587 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
588 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
589 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
590
591 // Truncate base induction if needed.
592 VPTypeAnalysis TypeInfo(Plan);
593 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
594 if (TruncI) {
595 Type *TruncTy = TruncI->getType();
596 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
597 "Not truncating.");
598 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
599 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
600 ResultTy = TruncTy;
601 }
602
603 // Truncate step if needed.
604 Type *StepTy = TypeInfo.inferScalarType(Step);
605 if (ResultTy != StepTy) {
606 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
607 "Not truncating.");
608 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
609 auto *VecPreheader =
610 Plan.getVectorPreheader();
611 VPBuilder::InsertPointGuard Guard(Builder);
612 Builder.setInsertPoint(VecPreheader);
613 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
614 }
615 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
616 &Plan.getVF(), DL);
617}
618
619static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
620 SetVector<VPUser *> Users(V->user_begin(), V->user_end());
621 for (unsigned I = 0; I != Users.size(); ++I) {
622 VPUser *Cur = Users[I];
623 if (isa<VPHeaderPHIRecipe>(Cur))
624 continue;
625 for (VPValue *V : Cur->definedValues())
626 Users.insert_range(V->users());
627 }
628 return Users.takeVector();
629}
630
631/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
632/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
633/// VPWidenPointerInductionRecipe will generate vectors only. If some users
634/// require vectors while other require scalars, the scalar uses need to extract
635/// the scalars from the generated vectors (Note that this is different to how
636/// int/fp inductions are handled). Legalize extract-from-ends using uniform
637/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
638/// the correct end value is available. Also optimize
639/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
640/// providing them scalar steps built on the canonical scalar IV and update the
641/// original IV's users. This is an optional optimization to reduce the needs of
642/// vector extracts.
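/// For illustration (schematic only), a pointer induction whose users only
/// need scalar addresses,
///   WIDEN-POINTER-INDUCTION %ptr.iv = %start, %step
/// is rewritten as
///   %steps = SCALAR-STEPS 0, %step
///   %gep   = ptradd %start, %steps
/// so no wide vector of pointers has to be materialized.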
643static void legalizeAndOptimizeInductions(VPlan &Plan) {
644 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
645 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
646 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
647 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
648 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
649 if (!PhiR)
650 continue;
651
652 // Try to narrow wide and replicating recipes to uniform recipes, based on
653 // VPlan analysis.
654 // TODO: Apply to all recipes in the future, to replace legacy uniformity
655 // analysis.
656 auto Users = collectUsersRecursively(PhiR);
657 for (VPUser *U : reverse(Users)) {
658 auto *Def = dyn_cast<VPSingleDefRecipe>(U);
659 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
660 // Skip recipes that shouldn't be narrowed.
661 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
662 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
663 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
664 continue;
665
666 // Skip recipes that may have other lanes than their first used.
668 continue;
669
670 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
671 Def->operands(), /*IsUniform*/ true);
672 Clone->insertAfter(Def);
673 Def->replaceAllUsesWith(Clone);
674 }
675
676 // Replace wide pointer inductions which have only their scalars used by
677 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
678 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
679 if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
680 continue;
681
682 const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
683 VPValue *StartV =
684 Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0));
685 VPValue *StepV = PtrIV->getOperand(1);
686 VPValue *Steps = createScalarIVSteps(
687 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
688 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
689
690 VPValue *PtrAdd = Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
691 PtrIV->getDebugLoc(), "next.gep");
692
693 PtrIV->replaceAllUsesWith(PtrAdd);
694 continue;
695 }
696
697 // Replace widened induction with scalar steps for users that only use
698 // scalars.
699 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
700 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
701 return U->usesScalars(WideIV);
702 }))
703 continue;
704
705 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
706 VPValue *Steps = createScalarIVSteps(
707 Plan, ID.getKind(), ID.getInductionOpcode(),
708 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
709 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
710 WideIV->getDebugLoc(), Builder);
711
712 // Update scalar users of IV to use Step instead.
713 if (!HasOnlyVectorVFs)
714 WideIV->replaceAllUsesWith(Steps);
715 else
716 WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
717 return U.usesScalars(WideIV);
718 });
719 }
720}
721
722/// Check if \p VPV is an untruncated wide induction, either before or after the
723/// increment. If so return the header IV (before the increment), otherwise
724/// return null.
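/// For example (schematic only), given
///   WIDEN-INDUCTION %iv = phi %start, %iv.next, step %step
///   %iv.next = add %iv, %step
/// both %iv and %iv.next are optimizable and the header recipe %iv is
/// returned; a truncated int/fp induction returns nullptr.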
725static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV,
726 ScalarEvolution &SE) {
727 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
728 if (WideIV) {
729 // VPV itself is a wide induction, separately compute the end value for exit
730 // users if it is not a truncated IV.
731 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
732 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
733 }
734
735 // Check if VPV is an optimizable induction increment.
736 VPRecipeBase *Def = VPV->getDefiningRecipe();
737 if (!Def || Def->getNumOperands() != 2)
738 return nullptr;
739 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
740 if (!WideIV)
741 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
742 if (!WideIV)
743 return nullptr;
744
745 auto IsWideIVInc = [&]() {
746 auto &ID = WideIV->getInductionDescriptor();
747
748 // Check if VPV increments the induction by the induction step.
749 VPValue *IVStep = WideIV->getStepValue();
750 switch (ID.getInductionOpcode()) {
751 case Instruction::Add:
752 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
753 case Instruction::FAdd:
754 return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
755 m_Specific(IVStep)));
756 case Instruction::FSub:
757 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
758 m_Specific(IVStep)));
759 case Instruction::Sub: {
760 // IVStep will be the negated step of the subtraction. Check if Step == -1
761 // * IVStep.
762 VPValue *Step;
763 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
764 return false;
765 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, SE);
766 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, SE);
767 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
768 !isa<SCEVCouldNotCompute>(StepSCEV) &&
769 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
770 }
771 default:
772 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
773 match(VPV, m_GetElementPtr(m_Specific(WideIV),
774 m_Specific(WideIV->getStepValue())));
775 }
776 llvm_unreachable("should have been covered by switch above");
777 };
778 return IsWideIVInc() ? WideIV : nullptr;
779}
780
781/// Attempts to optimize the induction variable exit values for users in the
782/// early exit block.
783static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
784 VPTypeAnalysis &TypeInfo,
785 VPBlockBase *PredVPBB,
786 VPValue *Op,
787 ScalarEvolution &SE) {
788 VPValue *Incoming, *Mask;
791 return nullptr;
792
793 auto *WideIV = getOptimizableIVOf(Incoming, SE);
794 if (!WideIV)
795 return nullptr;
796
797 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
798 if (WideIntOrFp && WideIntOrFp->getTruncInst())
799 return nullptr;
800
801 // Calculate the final index.
802 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
803 auto *CanonicalIV = LoopRegion->getCanonicalIV();
804 Type *CanonicalIVType = CanonicalIV->getScalarType();
805 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
806
807 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
808 VPValue *FirstActiveLane =
809 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
810 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
811 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
812 FirstActiveLaneType, DL);
813 VPValue *EndValue =
814 B.createNaryOp(Instruction::Add, {CanonicalIV, FirstActiveLane}, DL);
815
816 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
817 // changed it means the exit is using the incremented value, so we need to
818 // add the step.
819 if (Incoming != WideIV) {
820 VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1));
821 EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
822 }
823
824 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
825 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
826 VPValue *Start = WideIV->getStartValue();
827 VPValue *Step = WideIV->getStepValue();
828 EndValue = B.createDerivedIV(
829 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
830 Start, EndValue, Step);
831 }
832
833 return EndValue;
834}
835
836/// Attempts to optimize the induction variable exit values for users in the
837/// exit block coming from the latch in the original scalar loop.
838static VPValue *optimizeLatchExitInductionUser(
839 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
840 DenseMap<VPValue *, VPValue *> &EndValues, ScalarEvolution &SE) {
841 VPValue *Incoming;
842 if (!match(Op, m_ExtractLastElement(m_VPValue(Incoming))))
843 return nullptr;
844
845 auto *WideIV = getOptimizableIVOf(Incoming, SE);
846 if (!WideIV)
847 return nullptr;
848
849 VPValue *EndValue = EndValues.lookup(WideIV);
850 assert(EndValue && "end value must have been pre-computed");
851
852 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
853 // changed it means the exit is using the incremented value, so we don't
854 // need to subtract the step.
855 if (Incoming != WideIV)
856 return EndValue;
857
858 // Otherwise, subtract the step from the EndValue.
859 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
860 VPValue *Step = WideIV->getStepValue();
861 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
862 if (ScalarTy->isIntegerTy())
863 return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
864 if (ScalarTy->isPointerTy()) {
865 Type *StepTy = TypeInfo.inferScalarType(Step);
866 auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0));
867 return B.createPtrAdd(EndValue,
868 B.createNaryOp(Instruction::Sub, {Zero, Step}),
869 DebugLoc::getUnknown(), "ind.escape");
870 }
871 if (ScalarTy->isFloatingPointTy()) {
872 const auto &ID = WideIV->getInductionDescriptor();
873 return B.createNaryOp(
874 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
875 ? Instruction::FSub
876 : Instruction::FAdd,
877 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
878 }
879 llvm_unreachable("all possible induction types must be handled");
880 return nullptr;
881}
882
883void VPlanTransforms::optimizeInductionExitUsers(
884 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
885 ScalarEvolution &SE) {
886 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
887 VPTypeAnalysis TypeInfo(Plan);
888 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
889 for (VPRecipeBase &R : ExitVPBB->phis()) {
890 auto *ExitIRI = cast<VPIRPhi>(&R);
891
892 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
893 VPValue *Escape = nullptr;
894 if (PredVPBB == MiddleVPBB)
895 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
896 ExitIRI->getOperand(Idx),
897 EndValues, SE);
898 else
899 Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
900 ExitIRI->getOperand(Idx), SE);
901 if (Escape)
902 ExitIRI->setOperand(Idx, Escape);
903 }
904 }
905 }
906}
907
908 /// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
909/// them with already existing recipes expanding the same SCEV expression.
910static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
911 DenseMap<const SCEV *, VPValue *> SCEV2VPV;
912
913 for (VPRecipeBase &R :
914 make_early_inc_range(*Plan.getEntry())) {
915 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
916 if (!ExpR)
917 continue;
918
919 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
920 if (Inserted)
921 continue;
922 ExpR->replaceAllUsesWith(V->second);
923 ExpR->eraseFromParent();
924 }
925}
926
927static void recursivelyDeleteDeadRecipes(VPValue *V) {
928 SmallVector<VPValue *> WorkList;
929 SmallPtrSet<VPValue *, 8> Seen;
930 WorkList.push_back(V);
931
932 while (!WorkList.empty()) {
933 VPValue *Cur = WorkList.pop_back_val();
934 if (!Seen.insert(Cur).second)
935 continue;
936 VPRecipeBase *R = Cur->getDefiningRecipe();
937 if (!R)
938 continue;
939 if (!isDeadRecipe(*R))
940 continue;
941 append_range(WorkList, R->operands());
942 R->eraseFromParent();
943 }
944}
945
946/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
947/// Returns an optional pair, where the first element indicates whether it is
948/// an intrinsic ID.
949static std::optional<std::pair<bool, unsigned>>
951 return TypeSwitch<const VPSingleDefRecipe *,
952 std::optional<std::pair<bool, unsigned>>>(R)
955 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
956 .Case<VPWidenIntrinsicRecipe>([](auto *I) {
957 return std::make_pair(true, I->getVectorIntrinsicID());
958 })
959 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
960 // For recipes that do not directly map to LLVM IR instructions,
961 // assign opcodes after the last VPInstruction opcode (which is also
962 // after the last IR Instruction opcode), based on the VPDefID.
963 return std::make_pair(false,
964 VPInstruction::OpsEnd + 1 + I->getVPDefID());
965 })
966 .Default([](auto *) { return std::nullopt; });
967}
968
969/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
970/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
971/// Operands are foldable live-ins.
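/// For example (a hedged sketch): an add whose operands are the live-in
/// constants 2 and 3 is folded to the live-in constant 5, and a GEP or compare
/// over live-in operands folds the same way, provided the InstSimplifyFolder
/// can prove the result.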
973 ArrayRef<VPValue *> Operands,
974 const DataLayout &DL,
975 VPTypeAnalysis &TypeInfo) {
976 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
977 if (!OpcodeOrIID)
978 return nullptr;
979
980 SmallVector<Value *, 4> Ops;
981 for (VPValue *Op : Operands) {
982 if (!Op->isLiveIn() || !Op->getLiveInIRValue())
983 return nullptr;
984 Ops.push_back(Op->getLiveInIRValue());
985 }
986
987 auto FoldToIRValue = [&]() -> Value * {
988 InstSimplifyFolder Folder(DL);
989 if (OpcodeOrIID->first) {
990 if (R.getNumOperands() != 2)
991 return nullptr;
992 unsigned ID = OpcodeOrIID->second;
993 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
994 TypeInfo.inferScalarType(&R));
995 }
996 unsigned Opcode = OpcodeOrIID->second;
997 if (Instruction::isBinaryOp(Opcode))
998 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
999 Ops[0], Ops[1]);
1000 if (Instruction::isCast(Opcode))
1001 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1002 TypeInfo.inferScalarType(R.getVPSingleValue()));
1003 switch (Opcode) {
1004 case VPInstruction::LogicalAnd:
1005 return Folder.FoldSelect(Ops[0], Ops[1],
1006 ConstantInt::getNullValue(Ops[1]->getType()));
1007 case VPInstruction::Not:
1008 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1009 Constant::getAllOnesValue(Ops[0]->getType()));
1010 case Instruction::Select:
1011 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1012 case Instruction::ICmp:
1013 case Instruction::FCmp:
1014 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1015 Ops[1]);
1016 case Instruction::GetElementPtr: {
1017 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1018 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1019 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1020 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1021 }
1024 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1025 Ops[0], Ops[1],
1026 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1027 // An extract of a live-in is an extract of a broadcast, so return the
1028 // broadcasted element.
1029 case Instruction::ExtractElement:
1030 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1031 return Ops[0];
1032 }
1033 return nullptr;
1034 };
1035
1036 if (Value *V = FoldToIRValue())
1037 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1038 return nullptr;
1039}
1040
1041/// Try to simplify recipe \p R.
1042static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
1043 VPlan *Plan = R.getParent()->getPlan();
1044
1045 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
1046 if (!Def)
1047 return;
1048
1049 // Simplification of live-in IR values for SingleDef recipes using
1050 // InstSimplifyFolder.
1051 const DataLayout &DL =
1052 Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout();
1053 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1054 return Def->replaceAllUsesWith(V);
1055
1056 // Fold PredPHI LiveIn -> LiveIn.
1057 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) {
1058 VPValue *Op = PredPHI->getOperand(0);
1059 if (Op->isLiveIn())
1060 PredPHI->replaceAllUsesWith(Op);
1061 }
1062
1063 VPBuilder Builder(Def);
1064 VPValue *A;
1065 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1066 Type *TruncTy = TypeInfo.inferScalarType(Def);
1067 Type *ATy = TypeInfo.inferScalarType(A);
1068 if (TruncTy == ATy) {
1069 Def->replaceAllUsesWith(A);
1070 } else {
1071 // Don't replace a scalarizing recipe with a widened cast.
1072 if (isa<VPReplicateRecipe>(Def))
1073 return;
1074 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1075
1076 unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
1077 ? Instruction::SExt
1078 : Instruction::ZExt;
1079 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1080 TruncTy);
1081 if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
1082 // UnderlyingExt has distinct return type, used to retain legacy cost.
1083 Ext->setUnderlyingValue(UnderlyingExt);
1084 }
1085 Def->replaceAllUsesWith(Ext);
1086 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1087 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1088 Def->replaceAllUsesWith(Trunc);
1089 }
1090 }
1091#ifndef NDEBUG
1092 // Verify that the cached type info is for both A and its users is still
1093 // accurate by comparing it to freshly computed types.
1094 VPTypeAnalysis TypeInfo2(*Plan);
1095 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1096 for (VPUser *U : A->users()) {
1097 auto *R = cast<VPRecipeBase>(U);
1098 for (VPValue *VPV : R->definedValues())
1099 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1100 }
1101#endif
1102 }
1103
1104 // Simplify (X && Y) || (X && !Y) -> X.
1105 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1106 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1107 // recipes to be visited during simplification.
1108 VPValue *X, *Y, *Z;
1109 if (match(Def,
1110 m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
1111 m_LogicalAnd(m_Deferred(X), m_Not(m_Deferred(Y)))))) {
1112 Def->replaceAllUsesWith(X);
1113 Def->eraseFromParent();
1114 return;
1115 }
1116
1117 // x | 1 -> 1
1118 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1119 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1120
1121 // x | 0 -> x
1122 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1123 return Def->replaceAllUsesWith(X);
1124
1125 // x & 0 -> 0
1126 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1127 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1128
1129 // x && false -> false
1130 if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
1131 return Def->replaceAllUsesWith(Def->getOperand(1));
1132
1133 // (x && y) || (x && z) -> x && (y || z)
1134 if (match(Def, m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
1135 m_LogicalAnd(m_Deferred(X), m_VPValue(Z)))) &&
1136 // Simplify only if one of the operands has one use to avoid creating an
1137 // extra recipe.
1138 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1139 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1140 return Def->replaceAllUsesWith(
1141 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1142
1143 // x && !x -> 0
1144 if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X)))))
1145 return Def->replaceAllUsesWith(Plan->getFalse());
1146
1147 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1148 return Def->replaceAllUsesWith(X);
1149
1150 // select !c, x, y -> select c, y, x
1151 VPValue *C;
1152 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1153 Def->setOperand(0, C);
1154 Def->setOperand(1, Y);
1155 Def->setOperand(2, X);
1156 return;
1157 }
1158
1159 // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
1160 // tail folding it is likely that x is a header mask and can be simplified
1161 // further.
1162 if (match(Def, m_LogicalAnd(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
1163 m_VPValue(Z))) &&
1164 X->hasMoreThanOneUniqueUser())
1165 return Def->replaceAllUsesWith(
1166 Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
1167
1168 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1169 return Def->replaceAllUsesWith(A);
1170
1171 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1172 return Def->replaceAllUsesWith(R.getOperand(0) == A ? R.getOperand(1)
1173 : R.getOperand(0));
1174
1175 if (match(Def, m_Not(m_VPValue(A)))) {
1176 if (match(A, m_Not(m_VPValue(A))))
1177 return Def->replaceAllUsesWith(A);
1178
1179 // Try to fold Not into compares by adjusting the predicate in-place.
1180 CmpPredicate Pred;
1181 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1182 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1183 if (all_of(Cmp->users(),
1185 m_Not(m_Specific(Cmp)),
1186 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1187 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1188 for (VPUser *U : to_vector(Cmp->users())) {
1189 auto *R = cast<VPSingleDefRecipe>(U);
1190 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1191 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1192 R->setOperand(1, Y);
1193 R->setOperand(2, X);
1194 } else {
1195 // not (cmp pred) -> cmp inv_pred
1196 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1197 R->replaceAllUsesWith(Cmp);
1198 }
1199 }
1200 // If Cmp doesn't have a debug location, use the one from the negation,
1201 // to preserve the location.
1202 if (!Cmp->getDebugLoc() && R.getDebugLoc())
1203 Cmp->setDebugLoc(R.getDebugLoc());
1204 }
1205 }
1206 }
1207
1208 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1209 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1210 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1211 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1212 TypeInfo.inferScalarType(Def))
1213 return Def->replaceAllUsesWith(Def->getOperand(1));
1214
1216 m_One()))) {
1217 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1218 if (TypeInfo.inferScalarType(X) != WideStepTy)
1219 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1220 Def->replaceAllUsesWith(X);
1221 return;
1222 }
1223
1224 // For i1 vp.merges produced by AnyOf reductions:
1225 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1227 m_VPValue(X), m_VPValue())) &&
1229 TypeInfo.inferScalarType(R.getVPSingleValue())->isIntegerTy(1)) {
1230 Def->setOperand(1, Def->getOperand(0));
1231 Def->setOperand(0, Y);
1232 return;
1233 }
1234
1235 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1236 if (Phi->getOperand(0) == Phi->getOperand(1))
1237 Def->replaceAllUsesWith(Phi->getOperand(0));
1238 return;
1239 }
1240
1241 // Look through ExtractLastElement (BuildVector ....).
1244 auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1245 Def->replaceAllUsesWith(
1246 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1247 return;
1248 }
1249
1250 // Look through ExtractPenultimateElement (BuildVector ....).
1252 m_BuildVector()))) {
1253 auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1254 Def->replaceAllUsesWith(
1255 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1256 return;
1257 }
1258
1259 uint64_t Idx;
1261 auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1262 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1263 return;
1264 }
1265
1266 if (auto *Phi = dyn_cast<VPPhi>(Def)) {
1267 if (Phi->getNumOperands() == 1)
1268 Phi->replaceAllUsesWith(Phi->getOperand(0));
1269 return;
1270 }
1271
1272 // Some simplifications can only be applied after unrolling. Perform them
1273 // below.
1274 if (!Plan->isUnrolled())
1275 return;
1276
1277 // Hoist an invariant increment Y of a phi X, by having X start at Y.
1278 if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && Y->isLiveIn() &&
1279 isa<VPPhi>(X)) {
1280 auto *Phi = cast<VPPhi>(X);
1281 if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
1282 Phi->getNumUsers() == 1 && (*Phi->user_begin() == &R)) {
1283 Phi->setOperand(0, Y);
1284 Def->replaceAllUsesWith(Phi);
1285 return;
1286 }
1287 }
1288
1289 // VPVectorPointer for part 0 can be replaced by their start pointer.
1290 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(&R)) {
1291 if (VecPtr->isFirstPart()) {
1292 VecPtr->replaceAllUsesWith(VecPtr->getOperand(0));
1293 return;
1294 }
1295 }
1296
1297 // VPScalarIVSteps for part 0 can be replaced by their start value, if only
1298 // the first lane is demanded.
1299 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1300 if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
1301 Steps->replaceAllUsesWith(Steps->getOperand(0));
1302 return;
1303 }
1304 }
1305 // Simplify redundant ReductionStartVector recipes after unrolling.
1306 VPValue *StartV;
1308 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1309 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1310 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1311 return PhiR && PhiR->isInLoop();
1312 });
1313 return;
1314 }
1315
1316 if (match(Def,
1319 Def->replaceAllUsesWith(A);
1320 return;
1321 }
1322
1327 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1328 all_of(A->users(),
1329 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1330 return Def->replaceAllUsesWith(A);
1331 }
1332
1333 if (Plan->getUF() == 1 &&
1335 return Def->replaceAllUsesWith(
1336 Builder.createNaryOp(VPInstruction::ExtractLastElement, {A}));
1337 }
1338}
1339
1340void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
1341 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
1342 Plan.getEntry());
1343 VPTypeAnalysis TypeInfo(Plan);
1344 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
1345 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1346 simplifyRecipe(R, TypeInfo);
1347 }
1348 }
1349}
1350
1352 if (Plan.hasScalarVFOnly())
1353 return;
1354
1355 // Try to narrow wide and replicating recipes to single scalar recipes,
1356 // based on VPlan analysis. Only process blocks in the loop region for now,
1357 // without traversing into nested regions, as recipes in replicate regions
1358 // cannot be converted yet.
1361 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1363 continue;
1364 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1365 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1366 continue;
1367
1368 auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
1369 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1370 vputils::isSingleScalar(RepR->getOperand(1))) {
1371 auto *Clone = new VPReplicateRecipe(
1372 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1373 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/);
1374 Clone->insertBefore(RepOrWidenR);
1375 unsigned ExtractOpc =
1376 vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1))
1379 auto *Ext = new VPInstruction(ExtractOpc, {Clone->getOperand(0)});
1380 Ext->insertBefore(Clone);
1381 Clone->setOperand(0, Ext);
1382 RepR->eraseFromParent();
1383 continue;
1384 }
1385
1386 // Skip recipes that aren't single scalars or don't have only their
1387 // scalar results used. In the latter case, we would introduce extra
1388 // broadcasts.
1389 if (!vputils::isSingleScalar(RepOrWidenR) ||
1390 !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
1391 return U->usesScalars(RepOrWidenR) ||
1392 match(cast<VPRecipeBase>(U),
1393 m_CombineOr(m_ExtractLastElement(m_VPValue()),
1394 m_ExtractLastLanePerPart(m_VPValue())));
1395 }))
1396 continue;
1397
1398 auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
1399 RepOrWidenR->operands(),
1400 true /*IsSingleScalar*/);
1401 Clone->insertBefore(RepOrWidenR);
1402 RepOrWidenR->replaceAllUsesWith(Clone);
1403 }
1404 }
1405}
1406
1407/// Try to see if all of \p Blend's masks share a common value logically and'ed
1408/// and remove it from the masks.
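/// For example (schematic only), a blend whose masks are
///   (logical-and %edge, %m0), (logical-and %edge, %m1)
/// keeps only %m0 and %m1 as masks, dropping the shared %edge factor from each
/// of them.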
1409static void removeCommonBlendMask(VPBlendRecipe *Blend) {
1410 if (Blend->isNormalized())
1411 return;
1412 VPValue *CommonEdgeMask;
1413 if (!match(Blend->getMask(0),
1414 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1415 return;
1416 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1417 if (!match(Blend->getMask(I),
1418 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1419 return;
1420 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1421 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1422}
1423
1424/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1425/// to make sure the masks are simplified.
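/// For example (schematic only), a blend whose incoming values collapse to one
/// unique value (or whose other masks are all-false) is replaced by that
/// value; otherwise it is rebuilt so that an incoming value whose mask is only
/// used by this blend comes first, allowing that mask to be removed as dead.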
1426static void simplifyBlends(VPlan &Plan) {
1429 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1430 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1431 if (!Blend)
1432 continue;
1433
1434 removeCommonBlendMask(Blend);
1435
1436 // Try to remove redundant blend recipes.
1437 SmallPtrSet<VPValue *, 4> UniqueValues;
1438 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1439 UniqueValues.insert(Blend->getIncomingValue(0));
1440 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1441 if (!match(Blend->getMask(I), m_False()))
1442 UniqueValues.insert(Blend->getIncomingValue(I));
1443
1444 if (UniqueValues.size() == 1) {
1445 Blend->replaceAllUsesWith(*UniqueValues.begin());
1446 Blend->eraseFromParent();
1447 continue;
1448 }
1449
1450 if (Blend->isNormalized())
1451 continue;
1452
1453 // Normalize the blend so its first incoming value is used as the initial
1454 // value with the others blended into it.
1455
1456 unsigned StartIndex = 0;
1457 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1458 // If a value's mask is used only by the blend then it can be deadcoded.
1459 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1460 // that's used by multiple blends where it can be removed from them all.
1461 VPValue *Mask = Blend->getMask(I);
1462 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1463 StartIndex = I;
1464 break;
1465 }
1466 }
1467
1468 SmallVector<VPValue *, 4> OperandsWithMask;
1469 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1470
1471 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1472 if (I == StartIndex)
1473 continue;
1474 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1475 OperandsWithMask.push_back(Blend->getMask(I));
1476 }
1477
1478 auto *NewBlend =
1479 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1480 OperandsWithMask, Blend->getDebugLoc());
1481 NewBlend->insertBefore(&R);
1482
1483 VPValue *DeadMask = Blend->getMask(StartIndex);
1484 Blend->replaceAllUsesWith(NewBlend);
1485 Blend->eraseFromParent();
1486 recursivelyDeleteDeadRecipes(DeadMask);
1487
1488 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1489 VPValue *NewMask;
1490 if (NewBlend->getNumOperands() == 3 &&
1491 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1492 VPValue *Inc0 = NewBlend->getOperand(0);
1493 VPValue *Inc1 = NewBlend->getOperand(1);
1494 VPValue *OldMask = NewBlend->getOperand(2);
1495 NewBlend->setOperand(0, Inc1);
1496 NewBlend->setOperand(1, Inc0);
1497 NewBlend->setOperand(2, NewMask);
1498 if (OldMask->getNumUsers() == 0)
1499 cast<VPInstruction>(OldMask)->eraseFromParent();
1500 }
1501 }
1502 }
1503}
1504
1505/// Optimize the width of vector induction variables in \p Plan based on a known
1506/// constant Trip Count, \p BestVF and \p BestUF.
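/// Worked example (assuming these constants): for a known trip count of 100
/// with BestVF = 4 and BestUF = 2, the trip count is rounded up to 104, whose
/// maximum value (103) needs 7 active bits; rounded up to a power of two and
/// clamped to at least 8 this gives an 8-bit type, so a canonical i64 wide IV
/// used only by the header-mask compare can be narrowed to i8 together with a
/// truncated backedge-taken count.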
1507static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
1508 ElementCount BestVF,
1509 unsigned BestUF) {
1510 // Only proceed if we have not completely removed the vector region.
1511 if (!Plan.getVectorLoopRegion())
1512 return false;
1513
1514 const APInt *TC;
1515 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
1516 return false;
1517
1518 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1519 // and UF. Returns at least 8.
1520 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1521 APInt AlignedTC =
1524 APInt MaxVal = AlignedTC - 1;
1525 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1526 };
1527 unsigned NewBitWidth =
1528 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1529
1530 LLVMContext &Ctx = Plan.getContext();
1531 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1532
1533 bool MadeChange = false;
1534
1535 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1536 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1537 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1538
1539 // Currently only handle canonical IVs as it is trivial to replace the start
1540 // and stop values, and we currently only perform the optimization when the
1541 // IV has a single use.
1542 if (!WideIV || !WideIV->isCanonical() ||
1543 WideIV->hasMoreThanOneUniqueUser() ||
1544 NewIVTy == WideIV->getScalarType())
1545 continue;
1546
1547 // Currently only handle cases where the single user is a header-mask
1548 // comparison with the backedge-taken-count.
1549 if (!match(*WideIV->user_begin(),
1550 m_ICmp(m_Specific(WideIV),
1553 continue;
1554
1555 // Update IV operands and comparison bound to use new narrower type.
1556 auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0));
1557 WideIV->setStartValue(NewStart);
1558 auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1));
1559 WideIV->setStepValue(NewStep);
1560
1561 auto *NewBTC = new VPWidenCastRecipe(
1562 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1563 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1564 auto *Cmp = cast<VPInstruction>(*WideIV->user_begin());
1565 Cmp->setOperand(1, NewBTC);
1566
1567 MadeChange = true;
1568 }
1569
1570 return MadeChange;
1571}
1572
1573/// Return true if \p Cond is known to be true for given \p BestVF and \p
1574/// BestUF.
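/// Worked example (assuming these constants): with a trip count SCEV of 8,
/// BestVF = 4 and BestUF = 2, VF * UF covers the whole trip count, so the
/// latch compare against the vector trip count is known true on the first
/// iteration.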
1575static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
1576 ElementCount BestVF, unsigned BestUF,
1577 ScalarEvolution &SE) {
1578 if (match(Cond, m_BinaryOr(m_VPValue(), m_VPValue())))
1579 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1580 &SE](VPValue *C) {
1581 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE);
1582 });
1583
1584 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
1586 m_Specific(CanIV->getBackedgeValue()),
1587 m_Specific(&Plan.getVectorTripCount()))))
1588 return false;
1589
1590 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1591 // count is not conveniently available as SCEV so far, so we compare directly
1592 // against the original trip count. This is stricter than necessary, as we
1593 // will only return true if the trip count == vector trip count.
1594 const SCEV *VectorTripCount =
1595 vputils::getSCEVExprForVPValue(&Plan.getVectorTripCount(), SE);
1596 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1597 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1598 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1599 "Trip count SCEV must be computable");
1600 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1601 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1602 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1603}
1604
1605/// Try to replace multiple active lane masks used for control flow with
1606/// a single, wide active lane mask instruction followed by multiple
1607/// extract subvector intrinsics. This applies to the active lane mask
1608/// instructions both in the loop and in the preheader.
1609/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1610 /// new extracts from the first active lane mask, which has its last
1611/// operand (multiplier) set to UF.
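/// For illustration (schematic only), with UF = 2 and VF = vscale x 4, the two
/// per-part lane masks are replaced by a single get.active.lane.mask producing
/// a <vscale x 8 x i1> value plus two llvm.vector.extract calls at element
/// offsets 0 and 4, which feed the existing active-lane-mask phis.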
1613 unsigned UF) {
1614 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1615 return false;
1616
1617 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1618 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1619 auto *Term = &ExitingVPBB->back();
1620
1621 using namespace llvm::VPlanPatternMatch;
1622 if (!match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
1623 m_VPValue(), m_VPValue(), m_VPValue())))))
1624 return false;
1625
1626 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1627 LLVMContext &Ctx = Plan.getContext();
1628
1629 auto ExtractFromALM = [&](VPInstruction *ALM,
1630 SmallVectorImpl<VPValue *> &Extracts) {
1631 DebugLoc DL = ALM->getDebugLoc();
1632 for (unsigned Part = 0; Part < UF; ++Part) {
1633 SmallVector<VPValue *> Ops;
1634 Ops.append({ALM, Plan.getOrAddLiveIn(
1635 ConstantInt::get(IntegerType::getInt64Ty(Ctx),
1636 VF.getKnownMinValue() * Part))});
1637 auto *Ext = new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1638 VectorType::get(Type::getInt1Ty(Ctx), VF), DL);
1639 Extracts[Part] = Ext;
1640 Ext->insertAfter(ALM);
1641 }
1642 };
1643
1644 // Create a list of each active lane mask phi, ordered by unroll part.
1645 SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr);
1646 for (VPRecipeBase &R : Header->phis()) {
1647 auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R);
1648 if (!Phi)
1649 continue;
1650 VPValue *Index = nullptr;
1651 match(Phi->getBackedgeValue(),
1652 m_ActiveLaneMask(m_VPValue(Index), m_VPValue(), m_VPValue()));
1653 assert(Index && "Expected index from ActiveLaneMask instruction");
1654
1655 uint64_t Part;
1656 if (match(Index,
1657 m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
1658 m_VPValue(), m_ConstantInt(Part))))
1659 Phis[Part] = Phi;
1660 else
1661 // Anything other than a CanonicalIVIncrementForPart is part 0
1662 Phis[0] = Phi;
1663 }
1664
1665 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1666 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
1667
1668 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
1669 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
1670
1671 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
1672 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
1673 "Expected incoming values of Phi to be ActiveLaneMasks");
1674
1675 // When using wide lane masks, the return type of the get.active.lane.mask
1676 // intrinsic is VF x UF elements; its last operand is the multiplier UF.
1677 VPValue *ALMMultiplier =
1678 Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF));
1679 EntryALM->setOperand(2, ALMMultiplier);
1680 LoopALM->setOperand(2, ALMMultiplier);
1681
1682 // Create UF x extract vectors and insert into preheader.
1683 SmallVector<VPValue *> EntryExtracts(UF);
1684 ExtractFromALM(EntryALM, EntryExtracts);
1685
1686 // Create UF x extract vectors and insert before the loop compare & branch,
1687 // updating the compare to use the first extract.
1688 SmallVector<VPValue *> LoopExtracts(UF);
1689 ExtractFromALM(LoopALM, LoopExtracts);
1690 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
1691 Not->setOperand(0, LoopExtracts[0]);
1692
1693 // Update the incoming values of active lane mask phis.
1694 for (unsigned Part = 0; Part < UF; ++Part) {
1695 Phis[Part]->setStartValue(EntryExtracts[Part]);
1696 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
1697 }
1698
1699 return true;
1700}
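// Illustrative sketch (hedged; assumes UF = 2 and simplified VPlan notation):
// two per-part lane masks that control the loop,
//   EMIT vp<%alm.0> = active-lane-mask vp<%index.0>, vp<%tc>, ir<1>
//   EMIT vp<%alm.1> = active-lane-mask vp<%index.1>, vp<%tc>, ir<1>
// are replaced by one wide mask whose multiplier operand is UF, plus one
// llvm.vector.extract per part (at offsets 0, VF, ...) feeding the existing
// active-lane-mask phis and the latch branch:
//   EMIT vp<%alm.wide> = active-lane-mask vp<%index.0>, vp<%tc>, ir<2>
//   WIDEN-INTRINSIC vp<%part.0> = call llvm.vector.extract(vp<%alm.wide>, 0)
//   WIDEN-INTRINSIC vp<%part.1> = call llvm.vector.extract(vp<%alm.wide>, VF)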
1701
1702/// Try to simplify the branch condition of \p Plan. This may restrict the
1703/// resulting plan to \p BestVF and \p BestUF.
1704static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
1705 unsigned BestUF,
1706 PredicatedScalarEvolution &PSE) {
1707 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1708 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1709 auto *Term = &ExitingVPBB->back();
1710 VPValue *Cond;
1711 ScalarEvolution &SE = *PSE.getSE();
1712 if (match(Term, m_BranchOnCount()) ||
1713 match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
1714 m_VPValue(), m_VPValue(), m_VPValue()))))) {
1715 // Try to simplify the branch condition if TC <= VF * UF when the latch
1716 // terminator is BranchOnCount or BranchOnCond where the input is
1717 // Not(ActiveLaneMask).
1718 const SCEV *TripCount =
1719 vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1720 assert(!isa<SCEVCouldNotCompute>(TripCount) &&
1721 "Trip count SCEV must be computable");
1722 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1723 const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
1724 if (TripCount->isZero() ||
1725 !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
1726 return false;
1727 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) {
1728 // For BranchOnCond, check if we can prove the condition to be true using VF
1729 // and UF.
1730 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE))
1731 return false;
1732 } else {
1733 return false;
1734 }
1735
1736 // The vector loop region only executes once. If possible, completely remove
1737 // the region, otherwise replace the terminator controlling the latch with
1738 // (BranchOnCond true).
1739 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
1740 // support for other non-canonical widen induction recipes (e.g.,
1741 // VPWidenPointerInductionRecipe).
1742 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1743 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
1744 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
1745 return R->isCanonical();
1746 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
1747 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
1748 })) {
1749 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
1750 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
1751 VPBuilder Builder(Plan.getVectorPreheader());
1752 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
1753 R->getScalarType());
1754 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
1755 HeaderR.eraseFromParent();
1756 continue;
1757 }
1758 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
1759 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
1760 HeaderR.eraseFromParent();
1761 }
1762
1763 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
1764 VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
1765 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
1766 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
1767
1768 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
1769 B->setParent(nullptr);
1770
1771 VPBlockUtils::connectBlocks(Preheader, Header);
1772 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
1774 } else {
1775 // The vector region contains header phis for which we cannot remove the
1776 // loop region yet.
1777 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
1778 Term->getDebugLoc());
1779 ExitingVPBB->appendRecipe(BOC);
1780 }
1781
1782 Term->eraseFromParent();
1783
1784 return true;
1785}
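// Worked example (hypothetical numbers): with a trip count of 10, BestVF = 8
// and BestUF = 2, SCEV proves TC <= VF * UF (10 <= 16), so the vector loop
// body executes exactly once. If all remaining header phis are handled by the
// checks above, the region is dissolved and its blocks are wired directly
// between the preheader and the exit; otherwise only the latch terminator is
// rewritten to
//   EMIT branch-on-cond ir<true>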
1786
1787void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
1788 unsigned BestUF,
1789 PredicatedScalarEvolution &PSE) {
1790 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
1791 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
1792
1793 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
1794 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
1795 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
1796
1797 if (MadeChange) {
1798 Plan.setVF(BestVF);
1799 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
1800 }
1801 // TODO: Further simplifications are possible
1802 // 1. Replace inductions with constants.
1803 // 2. Replace vector loop region with VPBasicBlock.
1804}
1805
1806/// Sink users of \p FOR after the recipe defining the previous value \p
1807/// Previous of the recurrence. \returns true if all users of \p FOR could be
1808/// re-arranged as needed or false if it is not possible.
1809static bool
1810 sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
1811 VPRecipeBase *Previous,
1812 VPDominatorTree &VPDT) {
1813 // Collect recipes that need sinking.
1814 SmallVector<VPRecipeBase *> WorkList;
1815 SmallPtrSet<VPRecipeBase *, 8> Seen;
1816 Seen.insert(Previous);
1817 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
1818 // The previous value must not depend on the users of the recurrence phi. In
1819 // that case, FOR is not a fixed order recurrence.
1820 if (SinkCandidate == Previous)
1821 return false;
1822
1823 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
1824 !Seen.insert(SinkCandidate).second ||
1825 VPDT.properlyDominates(Previous, SinkCandidate))
1826 return true;
1827
1828 if (SinkCandidate->mayHaveSideEffects())
1829 return false;
1830
1831 WorkList.push_back(SinkCandidate);
1832 return true;
1833 };
1834
1835 // Recursively sink users of FOR after Previous.
1836 WorkList.push_back(FOR);
1837 for (unsigned I = 0; I != WorkList.size(); ++I) {
1838 VPRecipeBase *Current = WorkList[I];
1839 assert(Current->getNumDefinedValues() == 1 &&
1840 "only recipes with a single defined value expected");
1841
1842 for (VPUser *User : Current->getVPSingleValue()->users()) {
1843 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
1844 return false;
1845 }
1846 }
1847
1848 // Keep recipes to sink ordered by dominance so earlier instructions are
1849 // processed first.
1850 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1851 return VPDT.properlyDominates(A, B);
1852 });
1853
1854 for (VPRecipeBase *SinkCandidate : WorkList) {
1855 if (SinkCandidate == FOR)
1856 continue;
1857
1858 SinkCandidate->moveAfter(Previous);
1859 Previous = SinkCandidate;
1860 }
1861 return true;
1862}
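// Illustrative sketch (hedged, simplified notation): for a recurrence whose
// previous value is defined after one of its users,
//   vp<%for> = FIRST-ORDER-RECURRENCE-PHI ir<%init>, vp<%prev>
//   vp<%use> = add vp<%for>, 1        ; user of the recurrence
//   vp<%prev> = ...                   ; Previous, defines the backedge value
// the user is moved after Previous (transitively, in dominance order), so the
// splice created by the caller can later be placed right after vp<%prev>:
//   vp<%for> = FIRST-ORDER-RECURRENCE-PHI ir<%init>, vp<%prev>
//   vp<%prev> = ...
//   vp<%use> = add vp<%for>, 1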
1863
1864/// Try to hoist \p Previous and its operands before all users of \p FOR.
1865static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
1866 VPRecipeBase *Previous,
1867 VPDominatorTree &VPDT) {
1868 if (Previous->mayHaveSideEffects() || Previous->mayReadFromMemory())
1869 return false;
1870
1871 // Collect recipes that need hoisting.
1872 SmallVector<VPRecipeBase *> HoistCandidates;
1873 SmallPtrSet<VPRecipeBase *, 8> Visited;
1874 VPRecipeBase *HoistPoint = nullptr;
1875 // Find the closest hoist point by looking at all users of FOR and selecting
1876 // the recipe dominating all other users.
1877 for (VPUser *U : FOR->users()) {
1878 auto *R = cast<VPRecipeBase>(U);
1879 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
1880 HoistPoint = R;
1881 }
1882 assert(all_of(FOR->users(),
1883 [&VPDT, HoistPoint](VPUser *U) {
1884 auto *R = cast<VPRecipeBase>(U);
1885 return HoistPoint == R ||
1886 VPDT.properlyDominates(HoistPoint, R);
1887 }) &&
1888 "HoistPoint must dominate all users of FOR");
1889
1890 auto NeedsHoisting = [HoistPoint, &VPDT,
1891 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
1892 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
1893 if (!HoistCandidate)
1894 return nullptr;
1895 VPRegionBlock *EnclosingLoopRegion =
1896 HoistCandidate->getParent()->getEnclosingLoopRegion();
1897 assert((!HoistCandidate->getRegion() ||
1898 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
1899 "CFG in VPlan should still be flat, without replicate regions");
1900 // Hoist candidate was already visited, no need to hoist.
1901 if (!Visited.insert(HoistCandidate).second)
1902 return nullptr;
1903
1904 // Candidate is outside loop region or a header phi, dominates FOR users w/o
1905 // hoisting.
1906 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
1907 return nullptr;
1908
1909 // If we reached a recipe that dominates HoistPoint, we don't need to
1910 // hoist the recipe.
1911 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
1912 return nullptr;
1913 return HoistCandidate;
1914 };
1915 auto CanHoist = [&](VPRecipeBase *HoistCandidate) {
1916 // Avoid hoisting candidates with side-effects, as we do not yet analyze
1917 // associated dependencies.
1918 return !HoistCandidate->mayHaveSideEffects();
1919 };
1920
1921 if (!NeedsHoisting(Previous->getVPSingleValue()))
1922 return true;
1923
1924 // Recursively try to hoist Previous and its operands before all users of FOR.
1925 HoistCandidates.push_back(Previous);
1926
1927 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
1928 VPRecipeBase *Current = HoistCandidates[I];
1929 assert(Current->getNumDefinedValues() == 1 &&
1930 "only recipes with a single defined value expected");
1931 if (!CanHoist(Current))
1932 return false;
1933
1934 for (VPValue *Op : Current->operands()) {
1935 // If we reach FOR, it means the original Previous depends on some other
1936 // recurrence that in turn depends on FOR. If that is the case, we would
1937 // also need to hoist recipes involving the other FOR, which may break
1938 // dependencies.
1939 if (Op == FOR)
1940 return false;
1941
1942 if (auto *R = NeedsHoisting(Op))
1943 HoistCandidates.push_back(R);
1944 }
1945 }
1946
1947 // Order recipes to hoist by dominance so earlier instructions are processed
1948 // first.
1949 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1950 return VPDT.properlyDominates(A, B);
1951 });
1952
1953 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
1954 HoistCandidate->moveBefore(*HoistPoint->getParent(),
1955 HoistPoint->getIterator());
1956 }
1957
1958 return true;
1959}
1960
1961bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
1962 VPBuilder &LoopBuilder) {
1963 VPDominatorTree VPDT(Plan);
1964
1965 SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
1966 for (VPRecipeBase &R :
1967 Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis())
1968 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
1969 RecurrencePhis.push_back(FOR);
1970
1971 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
1972 SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
1973 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
1974 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
1975 // to terminate.
1976 while (auto *PrevPhi =
1977 dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Previous)) {
1978 assert(PrevPhi->getParent() == FOR->getParent());
1979 assert(SeenPhis.insert(PrevPhi).second);
1980 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
1981 }
1982
1983 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
1984 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
1985 return false;
1986
1987 // Introduce a recipe to combine the incoming and previous values of a
1988 // fixed-order recurrence.
1989 VPBasicBlock *InsertBlock = Previous->getParent();
1990 if (isa<VPHeaderPHIRecipe>(Previous))
1991 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
1992 else
1993 LoopBuilder.setInsertPoint(InsertBlock,
1994 std::next(Previous->getIterator()));
1995
1996 auto *RecurSplice =
1997 LoopBuilder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
1998 {FOR, FOR->getBackedgeValue()});
1999
2000 FOR->replaceAllUsesWith(RecurSplice);
2001 // Set the first operand of RecurSplice to FOR again, after replacing
2002 // all users.
2003 RecurSplice->setOperand(0, FOR);
2004 }
2005 return true;
2006}
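// Illustrative sketch (hedged): once sinking or hoisting has succeeded, each
// recurrence phi vp<%for> with backedge value vp<%prev> is combined via
//   EMIT vp<%splice> = first-order-recurrence-splice vp<%for>, vp<%prev>
// and all former users of vp<%for> are rewired to vp<%splice>, mirroring the
// scalar semantics where iteration i observes the value from iteration i-1.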
2007
2008void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
2009 for (VPRecipeBase &R :
2010 Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
2011 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2012 if (!PhiR)
2013 continue;
2014 RecurKind RK = PhiR->getRecurrenceKind();
2015 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2017 continue;
2018
2019 for (VPUser *U : collectUsersRecursively(PhiR))
2020 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2021 RecWithFlags->dropPoisonGeneratingFlags();
2022 }
2023 }
2024}
2025
2026namespace {
2027struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2028 static bool isSentinel(const VPSingleDefRecipe *Def) {
2029 return Def == getEmptyKey() || Def == getTombstoneKey();
2030 }
2031
2032 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2033 /// return that source element type.
2034 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2035 // All VPInstructions that lower to GEPs must have the i8 source element
2036 // type (as they are PtrAdds), so we omit it.
2037 return TypeSwitch<const VPSingleDefRecipe *, Type *>(R)
2038 .Case<VPReplicateRecipe>([](auto *I) -> Type * {
2039 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2040 return GEP->getSourceElementType();
2041 return nullptr;
2042 })
2043 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2044 [](auto *I) { return I->getSourceElementType(); })
2045 .Default([](auto *) { return nullptr; });
2046 }
2047
2048 /// Returns true if recipe \p Def can be safely handled for CSE.
2049 static bool canHandle(const VPSingleDefRecipe *Def) {
2050 // We can extend the list of handled recipes in the future,
2051 // provided we account for the data embedded in them while checking for
2052 // equality or hashing.
2053 auto C = getOpcodeOrIntrinsicID(Def);
2054
2055 // The issue with (Insert|Extract)Value is that the index of the
2056 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2057 // VPlan.
2058 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2059 C->second == Instruction::ExtractValue)))
2060 return false;
2061
2062 // During CSE, we can only handle recipes that don't read from memory: if
2063 // they read from memory, there could be an intervening write to memory
2064 // before the next instance is CSE'd, leading to an incorrect result.
2065 return !Def->mayReadFromMemory();
2066 }
2067
2068 /// Hash the underlying data of \p Def.
2069 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2070 const VPlan *Plan = Def->getParent()->getPlan();
2071 VPTypeAnalysis TypeInfo(*Plan);
2072 hash_code Result = hash_combine(
2073 Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
2074 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2075 hash_combine_range(Def->operands()));
2076 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2077 if (RFlags->hasPredicate())
2078 return hash_combine(Result, RFlags->getPredicate());
2079 return Result;
2080 }
2081
2082 /// Check equality of underlying data of \p L and \p R.
2083 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2084 if (isSentinel(L) || isSentinel(R))
2085 return L == R;
2086 if (L->getVPDefID() != R->getVPDefID() ||
2087 getOpcodeOrIntrinsicID(L) != getOpcodeOrIntrinsicID(R) ||
2088 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2090 !equal(L->operands(), R->operands()))
2091 return false;
2093 "must have valid opcode info for both recipes");
2094 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2095 if (LFlags->hasPredicate() &&
2096 LFlags->getPredicate() !=
2097 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2098 return false;
2099 // Recipes in replicate regions implicitly depend on predicate. If either
2100 // recipe is in a replicate region, only consider them equal if both have
2101 // the same parent.
2102 const VPRegionBlock *RegionL = L->getRegion();
2103 const VPRegionBlock *RegionR = R->getRegion();
2104 if (((RegionL && RegionL->isReplicator()) ||
2105 (RegionR && RegionR->isReplicator())) &&
2106 L->getParent() != R->getParent())
2107 return false;
2108 const VPlan *Plan = L->getParent()->getPlan();
2109 VPTypeAnalysis TypeInfo(*Plan);
2110 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2111 }
2112};
2113} // end anonymous namespace
2114
2115/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2116/// Plan.
2117static void cse(VPlan &Plan) {
2118 VPDominatorTree VPDT(Plan);
2119 DenseMap<VPSingleDefRecipe *, VPSingleDefRecipe *, VPCSEDenseMapInfo> CSEMap;
2120
2121 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2122 vp_depth_first_deep(Plan.getEntry()))) {
2123 for (VPRecipeBase &R : *VPBB) {
2124 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2125 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2126 continue;
2127 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2128 // V must dominate Def for a valid replacement.
2129 if (!VPDT.dominates(V->getParent(), VPBB))
2130 continue;
2131 // Only keep flags present on both V and Def.
2132 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2133 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2134 Def->replaceAllUsesWith(V);
2135 continue;
2136 }
2137 CSEMap[Def] = Def;
2138 }
2139 }
2140}
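// Illustrative sketch (hedged, simplified notation): two identical recipes
// that do not read memory, e.g.
//   EMIT vp<%a> = ptradd ir<%base>, vp<%off>
//   ...
//   EMIT vp<%b> = ptradd ir<%base>, vp<%off>
// hash and compare equal above, so uses of vp<%b> are redirected to vp<%a>
// (keeping only the IR flags common to both), provided vp<%a>'s block
// dominates vp<%b>'s.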
2141
2142/// Move loop-invariant recipes out of the vector loop region in \p Plan.
2143static void licm(VPlan &Plan) {
2144 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2145
2146 // Return true if we do not know how to (mechanically) hoist a given recipe
2147 // out of a loop region.
2148 auto CannotHoistRecipe = [](VPRecipeBase &R) {
2149 // Assumes don't alias anything or throw; as long as they're guaranteed to
2150 // execute, they're safe to hoist.
2152 return false;
2153
2154 // TODO: Relax checks in the future, e.g. we could also hoist reads, if
2155 // their memory location is not modified in the vector loop.
2156 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
2157 return true;
2158
2159 // Allocas cannot be hoisted.
2160 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
2161 return RepR && RepR->getOpcode() == Instruction::Alloca;
2162 };
2163
2164 // Hoist any loop invariant recipes from the vector loop region to the
2165 // preheader. Perform a shallow traversal of the vector loop region, to
2166 // exclude recipes in replicate regions. Since the top-level blocks in the
2167 // vector loop region are guaranteed to execute if the vector pre-header is,
2168 // we don't need to check speculation safety.
2169 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2170 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2171 "Expected vector prehader's successor to be the vector loop region");
2172 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2173 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2174 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2175 if (CannotHoistRecipe(R))
2176 continue;
2177 if (any_of(R.operands(), [](VPValue *Op) {
2178 return !Op->isDefinedOutsideLoopRegions();
2179 }))
2180 continue;
2181 R.moveBefore(*Preheader, Preheader->end());
2182 }
2183 }
2184}
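// Illustrative sketch (hedged): a recipe inside the vector loop region whose
// operands are all defined outside any loop region, e.g. a broadcast of a
// loop-invariant scalar
//   EMIT vp<%splat> = broadcast ir<%x>
// is moved to the end of the vector preheader; recipes that read or write
// memory, phis and allocas are left in place per the checks above.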
2185
2186void VPlanTransforms::truncateToMinimalBitwidths(
2187 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2188 if (Plan.hasScalarVFOnly())
2189 return;
2190 // Keep track of created truncates, so they can be re-used. Note that we
2191 // cannot use RAUW after creating a new truncate, as this could make
2192 // other uses have different types for their operands, making them invalidly
2193 // typed.
2194 DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
2195 VPTypeAnalysis TypeInfo(Plan);
2196 VPBasicBlock *PH = Plan.getVectorPreheader();
2197 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2198 vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
2199 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2202 &R))
2203 continue;
2204
2205 VPValue *ResultVPV = R.getVPSingleValue();
2206 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2207 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2208 if (!NewResSizeInBits)
2209 continue;
2210
2211 // If the value wasn't vectorized, we must maintain the original scalar
2212 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2213 // skip casts which do not need to be handled explicitly here, as
2214 // redundant casts will be removed during recipe simplification.
2216 continue;
2217
2218 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2219 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2220 assert(OldResTy->isIntegerTy() && "only integer types supported");
2221 (void)OldResSizeInBits;
2222
2223 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2224
2225 // Any wrapping introduced by shrinking this operation shouldn't be
2226 // considered undefined behavior. So, we can't unconditionally copy
2227 // arithmetic wrapping flags to VPW.
2228 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2229 VPW->dropPoisonGeneratingFlags();
2230
2231 if (OldResSizeInBits != NewResSizeInBits &&
2232 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2233 // Extend result to original width.
2234 auto *Ext =
2235 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
2236 Ext->insertAfter(&R);
2237 ResultVPV->replaceAllUsesWith(Ext);
2238 Ext->setOperand(0, ResultVPV);
2239 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2240 } else {
2241 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2242 "Only ICmps should not need extending the result.");
2243 }
2244
2245 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2247 continue;
2248
2249 // Shrink operands by introducing truncates as needed.
2250 unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
2251 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2252 auto *Op = R.getOperand(Idx);
2253 unsigned OpSizeInBits =
2255 if (OpSizeInBits == NewResSizeInBits)
2256 continue;
2257 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2258 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2259 if (!IterIsEmpty) {
2260 R.setOperand(Idx, ProcessedIter->second);
2261 continue;
2262 }
2263
2264 VPBuilder Builder;
2265 if (Op->isLiveIn())
2266 Builder.setInsertPoint(PH);
2267 else
2268 Builder.setInsertPoint(&R);
2269 VPWidenCastRecipe *NewOp =
2270 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2271 ProcessedIter->second = NewOp;
2272 R.setOperand(Idx, NewOp);
2273 }
2274
2275 }
2276 }
2277}
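// Illustrative sketch (hedged; the widths are made up): if MinBWs records that
// a widened i32 add only needs 8 bits, its operands are fed through newly
// created (and re-used) truncates, the add is performed in i8, and the result
// is zero-extended back to i32 for the existing users:
//   EMIT vp<%a.i8> = trunc ir<%a> to i8
//   EMIT vp<%b.i8> = trunc ir<%b> to i8
//   WIDEN vp<%add> = add vp<%a.i8>, vp<%b.i8>
//   WIDEN-CAST vp<%add.ext> = zext vp<%add> to i32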
2278
2279void VPlanTransforms::removeBranchOnConst(VPlan &Plan) {
2280 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2281 vp_depth_first_shallow(Plan.getEntry()))) {
2282 VPValue *Cond;
2283 // Skip blocks that are not terminated by BranchOnCond.
2284 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2285 continue;
2286
2287 assert(VPBB->getNumSuccessors() == 2 &&
2288 "Two successors expected for BranchOnCond");
2289 unsigned RemovedIdx;
2290 if (match(Cond, m_True()))
2291 RemovedIdx = 1;
2292 else if (match(Cond, m_False()))
2293 RemovedIdx = 0;
2294 else
2295 continue;
2296
2297 VPBasicBlock *RemovedSucc =
2298 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2299 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2300 "There must be a single edge between VPBB and its successor");
2301 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2302 // these recipes.
2303 for (VPRecipeBase &R : RemovedSucc->phis())
2304 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2305
2306 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2307 // automatically on VPlan destruction if it becomes unreachable.
2308 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2309 VPBB->back().eraseFromParent();
2310 }
2311}
2312
2331
2332// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2333// the loop terminator with a branch-on-cond recipe with the negated
2334// active-lane-mask as operand. Note that this turns the loop into an
2335 // uncountable one. Only the existing terminator is replaced; all other existing
2336// recipes/users remain unchanged, except for poison-generating flags being
2337// dropped from the canonical IV increment. Return the created
2338// VPActiveLaneMaskPHIRecipe.
2339//
2340// The function uses the following definitions:
2341//
2342// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
2343// calculate-trip-count-minus-VF (original TC) : original TC
2344// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
2345// CanonicalIVPhi : CanonicalIVIncrement
2346// %StartV is the canonical induction start value.
2347//
2348// The function adds the following recipes:
2349//
2350// vector.ph:
2351// %TripCount = calculate-trip-count-minus-VF (original TC)
2352// [if DataWithControlFlowWithoutRuntimeCheck]
2353// %EntryInc = canonical-iv-increment-for-part %StartV
2354// %EntryALM = active-lane-mask %EntryInc, %TripCount
2355//
2356// vector.body:
2357// ...
2358// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2359// ...
2360// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2361// %ALM = active-lane-mask %InLoopInc, TripCount
2362// %Negated = Not %ALM
2363// branch-on-cond %Negated
2364//
2365static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
2366 VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
2367 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2368 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2369 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2370 VPValue *StartV = CanonicalIVPHI->getStartValue();
2371
2372 auto *CanonicalIVIncrement =
2373 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2374 // TODO: Check if dropping the flags is needed if
2375 // !DataAndControlFlowWithoutRuntimeCheck.
2376 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2377 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2378 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2379 // we have to take unrolling into account. Each part needs to start at
2380 // Part * VF
2381 auto *VecPreheader = Plan.getVectorPreheader();
2382 VPBuilder Builder(VecPreheader);
2383
2384 // Create the ActiveLaneMask instruction using the correct start values.
2385 VPValue *TC = Plan.getTripCount();
2386
2387 VPValue *TripCount, *IncrementValue;
2388 if (!DataAndControlFlowWithoutRuntimeCheck) {
2389 // When the loop is guarded by a runtime overflow check for the loop
2390 // induction variable increment by VF, we can increment the value before
2391 // the get.active.lane.mask and use the unmodified trip count.
2392 IncrementValue = CanonicalIVIncrement;
2393 TripCount = TC;
2394 } else {
2395 // When avoiding a runtime check, the active.lane.mask inside the loop
2396 // uses a modified trip count and the induction variable increment is
2397 // done after the active.lane.mask intrinsic is called.
2398 IncrementValue = CanonicalIVPHI;
2399 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2400 {TC}, DL);
2401 }
2402 auto *EntryIncrement = Builder.createOverflowingOp(
2403 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2404 "index.part.next");
2405
2406 // Create the active lane mask instruction in the VPlan preheader.
2407 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2408 ConstantInt::get(TopRegion->getCanonicalIV()->getScalarType(), 1));
2409 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2410 {EntryIncrement, TC, ALMMultiplier}, DL,
2411 "active.lane.mask.entry");
2412
2413 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2414 // preheader ActiveLaneMask instruction.
2415 auto *LaneMaskPhi =
2417 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2418
2419 // Create the active lane mask for the next iteration of the loop before the
2420 // original terminator.
2421 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2422 Builder.setInsertPoint(OriginalTerminator);
2423 auto *InLoopIncrement =
2424 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2425 {IncrementValue}, {false, false}, DL);
2426 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2427 {InLoopIncrement, TripCount, ALMMultiplier},
2428 DL, "active.lane.mask.next");
2429 LaneMaskPhi->addOperand(ALM);
2430
2431 // Replace the original terminator with BranchOnCond. We have to invert the
2432 // mask here because a true condition means jumping to the exit block.
2433 auto *NotMask = Builder.createNot(ALM, DL);
2434 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2435 OriginalTerminator->eraseFromParent();
2436 return LaneMaskPhi;
2437}
2438
2439/// Collect the header mask with the pattern:
2440/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2441/// TODO: Introduce explicit recipe for header-mask instead of searching
2442/// for the header-mask pattern manually.
2443static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) {
2444 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2445 SmallVector<VPValue *> WideCanonicalIVs;
2446 auto *FoundWidenCanonicalIVUser = find_if(
2448 assert(count_if(LoopRegion->getCanonicalIV()->users(),
2450 "Must have at most one VPWideCanonicalIVRecipe");
2451 if (FoundWidenCanonicalIVUser !=
2452 LoopRegion->getCanonicalIV()->users().end()) {
2453 auto *WideCanonicalIV =
2454 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2455 WideCanonicalIVs.push_back(WideCanonicalIV);
2456 }
2457
2458 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2459 // version of the canonical induction.
2460 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
2461 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2462 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2463 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2464 WideCanonicalIVs.push_back(WidenOriginalIV);
2465 }
2466
2467 // Walk users of wide canonical IVs and find the single compare of the form
2468 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2469 VPSingleDefRecipe *HeaderMask = nullptr;
2470 for (auto *Wide : WideCanonicalIVs) {
2471 for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
2472 auto *VPI = dyn_cast<VPInstruction>(U);
2473 if (!VPI || !vputils::isHeaderMask(VPI, Plan))
2474 continue;
2475
2476 assert(VPI->getOperand(0) == Wide &&
2477 "WidenCanonicalIV must be the first operand of the compare");
2478 assert(!HeaderMask && "Multiple header masks found?");
2479 HeaderMask = VPI;
2480 }
2481 }
2482 return HeaderMask;
2483}
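// Illustrative sketch (hedged): the header mask located above has the shape
//   EMIT vp<%wide.iv> = WIDEN-CANONICAL-INDUCTION vp<%canonical-iv>
//   EMIT vp<%mask> = icmp ule vp<%wide.iv>, backedge-taken-count
// i.e. lane l is active while (iv + l) <= BTC. Callers replace vp<%mask> with
// an active-lane-mask or an EVL-based compare.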
2484
2485void VPlanTransforms::addActiveLaneMask(
2486 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2487 bool DataAndControlFlowWithoutRuntimeCheck) {
2488 assert((!DataAndControlFlowWithoutRuntimeCheck ||
2489 UseActiveLaneMaskForControlFlow) &&
2490 "DataAndControlFlowWithoutRuntimeCheck implies "
2491 "UseActiveLaneMaskForControlFlow");
2492
2493 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2494 auto *FoundWidenCanonicalIVUser = find_if(
2495 LoopRegion->getCanonicalIV()->users(), IsaPred<VPWidenCanonicalIVRecipe>);
2496 assert(FoundWidenCanonicalIVUser &&
2497 "Must have widened canonical IV when tail folding!");
2498 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2499 auto *WideCanonicalIV =
2500 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2501 VPSingleDefRecipe *LaneMask;
2502 if (UseActiveLaneMaskForControlFlow) {
2503 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
2504 Plan, DataAndControlFlowWithoutRuntimeCheck);
2505 } else {
2506 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2507 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2508 ConstantInt::get(LoopRegion->getCanonicalIV()->getScalarType(), 1));
2509 LaneMask =
2510 B.createNaryOp(VPInstruction::ActiveLaneMask,
2511 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2512 nullptr, "active.lane.mask");
2513 }
2514
2515 // Walk users of WideCanonicalIV and replace the header mask of the form
2516 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2517 // removing the old one to ensure there is always only a single header mask.
2518 HeaderMask->replaceAllUsesWith(LaneMask);
2519 HeaderMask->eraseFromParent();
2520}
2521
2522/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2523/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2524/// recipe could be created.
2525/// \p HeaderMask Header Mask.
2526/// \p CurRecipe Recipe to be transformed.
2527/// \p TypeInfo VPlan-based type analysis.
2528/// \p AllOneMask The vector mask parameter of vector-predication intrinsics.
2529/// \p EVL The explicit vector length parameter of vector-predication
2530/// intrinsics.
2531static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
2532 VPRecipeBase &CurRecipe,
2533 VPTypeAnalysis &TypeInfo,
2534 VPValue &AllOneMask, VPValue &EVL) {
2535 // FIXME: Don't transform recipes to EVL recipes if they're not masked by the
2536 // header mask.
2537 auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
2538 assert(OrigMask && "Unmasked recipe when folding tail");
2539 // HeaderMask will be handled using EVL.
2540 VPValue *Mask;
2541 if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask))))
2542 return Mask;
2543 return HeaderMask == OrigMask ? nullptr : OrigMask;
2544 };
2545
2546 /// Adjust any end pointers so that they point to the end of EVL lanes, not VF.
2547 auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * {
2548 auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr);
2549 if (!EndPtr)
2550 return Addr;
2551 assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF() &&
2552 "VPVectorEndPointerRecipe with non-VF VF operand?");
2553 assert(
2554 all_of(EndPtr->users(),
2555 [](VPUser *U) {
2556 return cast<VPWidenMemoryRecipe>(U)->isReverse();
2557 }) &&
2558 "VPVectorEndPointRecipe not used by reversed widened memory recipe?");
2559 VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone();
2560 EVLAddr->insertBefore(&CurRecipe);
2561 EVLAddr->setOperand(1, &EVL);
2562 return EVLAddr;
2563 };
2564
2565 return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe)
2566 .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) {
2567 VPValue *NewMask = GetNewMask(L->getMask());
2568 VPValue *NewAddr = GetNewAddr(L->getAddr());
2569 return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask);
2570 })
2571 .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
2572 VPValue *NewMask = GetNewMask(S->getMask());
2573 VPValue *NewAddr = GetNewAddr(S->getAddr());
2574 return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask);
2575 })
2576 .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) {
2577 VPValue *NewMask = GetNewMask(IR->getMask());
2578 return new VPInterleaveEVLRecipe(*IR, EVL, NewMask);
2579 })
2580 .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
2581 VPValue *NewMask = GetNewMask(Red->getCondOp());
2582 return new VPReductionEVLRecipe(*Red, EVL, NewMask);
2583 })
2584 .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
2585 VPValue *LHS, *RHS;
2586 // Transform select with a header mask condition
2587 // select(header_mask, LHS, RHS)
2588 // into vector predication merge.
2589 // vp.merge(all-true, LHS, RHS, EVL)
2590 if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS),
2591 m_VPValue(RHS))))
2592 return nullptr;
2593 // Use all true as the condition because this transformation is
2594 // limited to selects whose condition is a header mask.
2595 return new VPWidenIntrinsicRecipe(
2596 Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL},
2597 TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
2598 })
2599 .Default([&](VPRecipeBase *R) { return nullptr; });
2600}
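// Illustrative sketch (hedged, simplified notation): a load masked only by the
// header mask, and a select on the header mask,
//   WIDEN ir<%v> = load vp<%addr>, vp<%header.mask>
//   EMIT vp<%r> = select vp<%header.mask>, vp<%lhs>, vp<%rhs>
// become EVL-based recipes where the header mask is dropped (lanes >= EVL are
// inactive by construction):
//   WIDEN ir<%v> = vp.load vp<%addr>, all-true, vp<%evl>
//   WIDEN-INTRINSIC vp<%r> = call llvm.vp.merge(all-true, vp<%lhs>, vp<%rhs>, vp<%evl>)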
2601
2602/// Replace recipes with their EVL variants.
2603static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
2604 VPTypeAnalysis TypeInfo(Plan);
2605 VPValue *AllOneMask = Plan.getTrue();
2606 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2607 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2608
2609 assert(all_of(Plan.getVF().users(),
2612 "User of VF that we can't transform to EVL.");
2613 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2615 });
2616
2617 assert(all_of(Plan.getVFxUF().users(),
2618 [&LoopRegion, &Plan](VPUser *U) {
2619 return match(U,
2620 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
2621 m_Specific(&Plan.getVFxUF()))) ||
2622 isa<VPWidenPointerInductionRecipe>(U);
2623 }) &&
2624 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
2625 "increment of the canonical induction.");
2626 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2627 // Only replace uses in VPWidenPointerInductionRecipe; the increment of the
2628 // canonical induction must not be updated.
2629 return isa<VPWidenPointerInductionRecipe>(&U);
2630 });
2631
2632 // Defer erasing recipes till the end so that we don't invalidate the
2633 // VPTypeAnalysis cache.
2634 SmallVector<VPRecipeBase *> ToErase;
2635
2636 // Create a scalar phi to track the previous EVL if the plan contains any
2637 // fixed-order recurrences.
2638 bool ContainsFORs =
2639 any_of(Header->phis(), IsaPred<VPFirstOrderRecurrencePHIRecipe>);
2640 if (ContainsFORs) {
2641 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
2642 VPValue *MaxEVL = &Plan.getVF();
2643 // Emit VPScalarCastRecipe in preheader if VF is not a 32-bit integer.
2644 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
2645 MaxEVL = Builder.createScalarZExtOrTrunc(
2646 MaxEVL, Type::getInt32Ty(Plan.getContext()),
2647 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
2648
2649 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
2650 VPValue *PrevEVL = Builder.createScalarPhi(
2651 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
2652
2655 for (VPRecipeBase &R : *VPBB) {
2656 VPValue *V1, *V2;
2657 if (!match(&R,
2658 m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
2659 m_VPValue(V1), m_VPValue(V2))))
2660 continue;
2661 VPValue *Imm = Plan.getOrAddLiveIn(
2662 ConstantInt::getSigned(Type::getInt32Ty(Plan.getContext()), -1));
2663 auto *VPSplice = new VPWidenIntrinsicRecipe(
2664 Intrinsic::experimental_vp_splice,
2665 {V1, V2, Imm, AllOneMask, PrevEVL, &EVL},
2666 TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc());
2667 VPSplice->insertBefore(&R);
2668 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
2669 ToErase.push_back(&R);
2670 }
2671 }
2672 }
2673
2674 VPValue *HeaderMask = findHeaderMask(Plan);
2675 if (!HeaderMask)
2676 return;
2677
2678 // Replace header masks with a mask equivalent to predicating by EVL:
2679 //
2680 // icmp ule widen-canonical-iv backedge-taken-count
2681 // ->
2682 // icmp ult step-vector, EVL
2683 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
2684 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
2685 Type *EVLType = TypeInfo.inferScalarType(&EVL);
2686 VPValue *EVLMask = Builder.createICmp(
2687 CmpInst::ICMP_ULT,
2688 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
2689 HeaderMask->replaceAllUsesWith(EVLMask);
2690 ToErase.push_back(HeaderMask->getDefiningRecipe());
2691
2692 // Try to optimize header mask recipes away to their EVL variants.
2693 // TODO: Split optimizeMaskToEVL out and move into
2694 // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
2695 // tryToBuildVPlanWithVPRecipes beforehand.
2696 for (VPUser *U : collectUsersRecursively(EVLMask)) {
2697 auto *CurRecipe = cast<VPRecipeBase>(U);
2698 VPRecipeBase *EVLRecipe =
2699 optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, *AllOneMask, EVL);
2700 if (!EVLRecipe)
2701 continue;
2702
2703 unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
2704 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
2705 "New recipe must define the same number of values as the "
2706 "original.");
2707 EVLRecipe->insertBefore(CurRecipe);
2709 EVLRecipe)) {
2710 for (unsigned I = 0; I < NumDefVal; ++I) {
2711 VPValue *CurVPV = CurRecipe->getVPValue(I);
2712 CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
2713 }
2714 }
2715 ToErase.push_back(CurRecipe);
2716 }
2717 // Remove dead EVL mask.
2718 if (EVLMask->getNumUsers() == 0)
2719 ToErase.push_back(EVLMask->getDefiningRecipe());
2720
2721 for (VPRecipeBase *R : reverse(ToErase)) {
2722 SmallVector<VPValue *> PossiblyDead(R->operands());
2723 R->eraseFromParent();
2724 for (VPValue *Op : PossiblyDead)
2725 recursivelyDeleteDeadRecipes(Op);
2726 }
2727}
2728
2729/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
2730/// replaces all uses except the canonical IV increment of
2731/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe
2732/// is used only for counting loop iterations after this transformation.
2733///
2734/// The function uses the following definitions:
2735/// %StartV is the canonical induction start value.
2736///
2737/// The function adds the following recipes:
2738///
2739/// vector.ph:
2740/// ...
2741///
2742/// vector.body:
2743/// ...
2744/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2745/// [ %NextEVLIV, %vector.body ]
2746/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2747/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
2748/// ...
2749/// %OpEVL = cast i32 %VPEVL to IVSize
2750/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2751/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2752/// ...
2753///
2754/// If MaxSafeElements is provided, the function adds the following recipes:
2755/// vector.ph:
2756/// ...
2757///
2758/// vector.body:
2759/// ...
2760/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2761/// [ %NextEVLIV, %vector.body ]
2762/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2763/// %cmp = cmp ult %AVL, MaxSafeElements
2764/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
2765/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
2766/// ...
2767/// %OpEVL = cast i32 %VPEVL to IVSize
2768/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2769/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2770/// ...
2771///
2772void VPlanTransforms::addExplicitVectorLength(
2773 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
2774 if (Plan.hasScalarVFOnly())
2775 return;
2776 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2777 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2778
2779 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
2780 auto *CanIVTy = CanonicalIVPHI->getScalarType();
2781 VPValue *StartV = CanonicalIVPHI->getStartValue();
2782
2783 // Create the ExplicitVectorLengthPhi recipe in the main loop.
2784 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
2785 EVLPhi->insertAfter(CanonicalIVPHI);
2786 VPBuilder Builder(Header, Header->getFirstNonPhi());
2787 // Create the AVL (application vector length), starting from TC -> 0 in steps
2788 // of EVL.
2789 VPPhi *AVLPhi = Builder.createScalarPhi(
2790 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
2791 VPValue *AVL = AVLPhi;
2792
2793 if (MaxSafeElements) {
2794 // Support for MaxSafeDist for correct loop emission.
2795 VPValue *AVLSafe =
2796 Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements));
2797 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
2798 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
2799 "safe_avl");
2800 }
2801 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
2803
2804 auto *CanonicalIVIncrement =
2805 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2806 Builder.setInsertPoint(CanonicalIVIncrement);
2807 VPValue *OpVPEVL = VPEVL;
2808
2809 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
2810 OpVPEVL = Builder.createScalarZExtOrTrunc(
2811 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
2812
2813 auto *NextEVLIV = Builder.createOverflowingOp(
2814 Instruction::Add, {OpVPEVL, EVLPhi},
2815 {CanonicalIVIncrement->hasNoUnsignedWrap(),
2816 CanonicalIVIncrement->hasNoSignedWrap()},
2817 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
2818 EVLPhi->addOperand(NextEVLIV);
2819
2820 VPValue *NextAVL = Builder.createOverflowingOp(
2821 Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},
2822 DebugLoc::getCompilerGenerated(), "avl.next");
2823 AVLPhi->addOperand(NextAVL);
2824
2825 transformRecipestoEVLRecipes(Plan, *VPEVL);
2826
2827 // Replace all uses of VPCanonicalIVPHIRecipe by
2828 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
2829 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
2830 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
2831 // TODO: support unroll factor > 1.
2832 Plan.setUF(1);
2833}
2834
2835void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
2836 // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
2837 // There should be only one EVL PHI in the entire plan.
2838 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
2839
2842 for (VPRecipeBase &R : VPBB->phis())
2843 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
2844 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
2845 EVLPhi = PhiR;
2846 }
2847
2848 // Early return if no EVL PHI is found.
2849 if (!EVLPhi)
2850 return;
2851
2852 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
2853 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
2854 VPValue *AVL;
2855 [[maybe_unused]] bool FoundAVL =
2856 match(EVLIncrement,
2857 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
2858 assert(FoundAVL && "Didn't find AVL?");
2859
2860 // The AVL may be capped to a safe distance.
2861 VPValue *SafeAVL;
2862 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
2863 AVL = SafeAVL;
2864
2865 VPValue *AVLNext;
2866 [[maybe_unused]] bool FoundAVLNext =
2867 match(AVL, m_VPInstruction<Instruction::PHI>(
2868 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
2869 assert(FoundAVLNext && "Didn't find AVL backedge?");
2870
2871 // Convert EVLPhi to concrete recipe.
2872 auto *ScalarR =
2873 VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
2874 EVLPhi->getDebugLoc(), "evl.based.iv");
2875 EVLPhi->replaceAllUsesWith(ScalarR);
2876 EVLPhi->eraseFromParent();
2877
2878 // Replace CanonicalIVInc with EVL-PHI increment.
2879 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
2880 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
2881 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
2882 m_Specific(&Plan.getVFxUF()))) &&
2883 "Unexpected canonical iv");
2884 Backedge->replaceAllUsesWith(EVLIncrement);
2885
2886 // Remove unused phi and increment.
2887 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
2888 CanonicalIVIncrement->eraseFromParent();
2889 CanonicalIV->eraseFromParent();
2890
2891 // Replace the use of VectorTripCount in the latch-exiting block.
2892 // Before: (branch-on-count EVLIVInc, VectorTripCount)
2893 // After: (branch-on-cond eq AVLNext, 0)
2894
2895 VPBasicBlock *LatchExiting =
2896 HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
2897 auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
2898 // Skip single-iteration loop region
2899 if (match(LatchExitingBr, m_BranchOnCond(m_True())))
2900 return;
2901 assert(LatchExitingBr &&
2902 match(LatchExitingBr,
2903 m_BranchOnCount(m_VPValue(EVLIncrement),
2904 m_Specific(&Plan.getVectorTripCount()))) &&
2905 "Unexpected terminator in EVL loop");
2906
2907 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
2908 VPBuilder Builder(LatchExitingBr);
2909 VPValue *Cmp =
2910 Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
2911 Plan.getOrAddLiveIn(ConstantInt::get(AVLTy, 0)));
2912 Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
2913 LatchExitingBr->eraseFromParent();
2914}
2915
2916void VPlanTransforms::replaceSymbolicStrides(
2917 VPlan &Plan, PredicatedScalarEvolution &PSE,
2918 const DenseMap<Value *, const SCEV *> &StridesMap) {
2919 // Replace VPValues for known constant strides guaranteed by predicate scalar
2920 // evolution.
2921 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
2922 auto *R = cast<VPRecipeBase>(&U);
2923 return R->getRegion() ||
2924 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
2925 };
2926 ValueToSCEVMapTy RewriteMap;
2927 for (const SCEV *Stride : StridesMap.values()) {
2928 using namespace SCEVPatternMatch;
2929 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
2930 const APInt *StrideConst;
2931 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
2932 // Only handle constant strides for now.
2933 continue;
2934
2935 auto *CI =
2936 Plan.getOrAddLiveIn(ConstantInt::get(Stride->getType(), *StrideConst));
2937 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
2938 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
2939
2940 // The versioned value may not be used in the loop directly but through a
2941 // sext/zext. Add new live-ins in those cases.
2942 for (Value *U : StrideV->users()) {
2943 if (!isa<SExtInst, ZExtInst>(U))
2944 continue;
2945 VPValue *StrideVPV = Plan.getLiveIn(U);
2946 if (!StrideVPV)
2947 continue;
2948 unsigned BW = U->getType()->getScalarSizeInBits();
2949 APInt C =
2950 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
2951 VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C));
2952 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
2953 }
2954 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
2955 }
2956
2957 for (VPRecipeBase &R : *Plan.getEntry()) {
2958 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
2959 if (!ExpSCEV)
2960 continue;
2961 const SCEV *ScevExpr = ExpSCEV->getSCEV();
2962 auto *NewSCEV =
2963 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
2964 if (NewSCEV != ScevExpr) {
2965 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
2966 ExpSCEV->replaceAllUsesWith(NewExp);
2967 if (Plan.getTripCount() == ExpSCEV)
2968 Plan.resetTripCount(NewExp);
2969 }
2970 }
2971}
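// Worked example (hypothetical values): if PSE's predicate guarantees that the
// symbolic stride %stride equals 1, uses of the corresponding live-in in the
// vector preheader and loop region (and of its sext/zext copies, with the
// constant extended to the wider type) are replaced by the constant 1, and
// SCEV expansions in the entry block are rewritten with the versioned stride
// so address computations can fold.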
2972
2973void VPlanTransforms::dropPoisonGeneratingRecipes(
2974 VPlan &Plan,
2975 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
2976 // Collect recipes in the backward slice of `Root` that may generate a poison
2977 // value that is used after vectorization.
2978 SmallPtrSet<VPRecipeBase *, 16> Visited;
2979 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
2980 SmallVector<VPRecipeBase *, 16> Worklist;
2981 Worklist.push_back(Root);
2982
2983 // Traverse the backward slice of Root through its use-def chain.
2984 while (!Worklist.empty()) {
2985 VPRecipeBase *CurRec = Worklist.pop_back_val();
2986
2987 if (!Visited.insert(CurRec).second)
2988 continue;
2989
2990 // Prune search if we find another recipe generating a widen memory
2991 // instruction. Widen memory instructions involved in address computation
2992 // will lead to gather/scatter instructions, which don't need to be
2993 // handled.
2994 if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,
2995 VPHeaderPHIRecipe>(CurRec))
2996 continue;
2997
2998 // This recipe contributes to the address computation of a widen
2999 // load/store. If the underlying instruction has poison-generating flags,
3000 // drop them directly.
3001 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3002 VPValue *A, *B;
3003 // Dropping disjoint from an OR may yield incorrect results, as some
3004 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3005 // for dependence analysis). Instead, replace it with an equivalent Add.
3006 // This is possible as all users of the disjoint OR only access lanes
3007 // where the operands are disjoint or poison otherwise.
3008 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3009 RecWithFlags->isDisjoint()) {
3010 VPBuilder Builder(RecWithFlags);
3011 VPInstruction *New = Builder.createOverflowingOp(
3012 Instruction::Add, {A, B}, {false, false},
3013 RecWithFlags->getDebugLoc());
3014 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3015 RecWithFlags->replaceAllUsesWith(New);
3016 RecWithFlags->eraseFromParent();
3017 CurRec = New;
3018 } else
3019 RecWithFlags->dropPoisonGeneratingFlags();
3020 } else {
3021 Instruction *Instr = dyn_cast_or_null<Instruction>(
3022 CurRec->getVPSingleValue()->getUnderlyingValue());
3023 (void)Instr;
3024 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3025 "found instruction with poison generating flags not covered by "
3026 "VPRecipeWithIRFlags");
3027 }
3028
3029 // Add new definitions to the worklist.
3030 for (VPValue *Operand : CurRec->operands())
3031 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3032 Worklist.push_back(OpDef);
3033 }
3034 });
3035
3036 // Traverse all the recipes in the VPlan and collect the poison-generating
3037 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3038 // VPInterleaveRecipe.
3039 auto Iter = vp_depth_first_deep(Plan.getEntry());
3040 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3041 for (VPRecipeBase &Recipe : *VPBB) {
3042 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3043 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3044 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3045 if (AddrDef && WidenRec->isConsecutive() &&
3046 BlockNeedsPredication(UnderlyingInstr.getParent()))
3047 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3048 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3049 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3050 if (AddrDef) {
3051 // Check if any member of the interleave group needs predication.
3052 const InterleaveGroup<Instruction> *InterGroup =
3053 InterleaveRec->getInterleaveGroup();
3054 bool NeedPredication = false;
3055 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3056 I < NumMembers; ++I) {
3057 Instruction *Member = InterGroup->getMember(I);
3058 if (Member)
3059 NeedPredication |= BlockNeedsPredication(Member->getParent());
3060 }
3061
3062 if (NeedPredication)
3063 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3064 }
3065 }
3066 }
3067 }
3068}
3069
3070void VPlanTransforms::createInterleaveGroups(
3071 VPlan &Plan,
3072 const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
3073 &InterleaveGroups,
3074 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3075 if (InterleaveGroups.empty())
3076 return;
3077
3078 // Interleave memory: for each Interleave Group we marked earlier as relevant
3079 // for this VPlan, replace the Recipes widening its memory instructions with a
3080 // single VPInterleaveRecipe at its insertion point.
3081 VPDominatorTree VPDT(Plan);
3082 for (const auto *IG : InterleaveGroups) {
3083 auto *Start =
3084 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3085 VPIRMetadata InterleaveMD(*Start);
3086 SmallVector<VPValue *, 4> StoredValues;
3087 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3088 StoredValues.push_back(StoreR->getStoredValue());
3089 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3090 Instruction *MemberI = IG->getMember(I);
3091 if (!MemberI)
3092 continue;
3093 VPWidenMemoryRecipe *MemoryR =
3094 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3095 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3096 StoredValues.push_back(StoreR->getStoredValue());
3097 InterleaveMD.intersect(*MemoryR);
3098 }
3099
3100 bool NeedsMaskForGaps =
3101 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3102 (!StoredValues.empty() && !IG->isFull());
3103
3104 Instruction *IRInsertPos = IG->getInsertPos();
3105 auto *InsertPos =
3106 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3107
3109 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3110 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3111 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3112
3113 // Get or create the start address for the interleave group.
3114 VPValue *Addr = Start->getAddr();
3115 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3116 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3117 // We cannot re-use the address of member zero because it does not
3118 // dominate the insert position. Instead, use the address of the insert
3119 // position and create a PtrAdd adjusting it to the address of member
3120 // zero.
3121 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3122 // InsertPos or sink loads above zero members to join it.
3123 assert(IG->getIndex(IRInsertPos) != 0 &&
3124 "index of insert position shouldn't be zero");
3125 auto &DL = IRInsertPos->getDataLayout();
3126 APInt Offset(32,
3127 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3128 IG->getIndex(IRInsertPos),
3129 /*IsSigned=*/true);
3130 VPValue *OffsetVPV =
3131 Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset));
3132 VPBuilder B(InsertPos);
3133 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3134 }
3135 // If the group is reverse, adjust the index to refer to the last vector
3136 // lane instead of the first. We adjust the index from the first vector
3137 // lane, rather than directly getting the pointer for lane VF - 1, because
3138 // the pointer operand of the interleaved access is supposed to be uniform.
3139 if (IG->isReverse()) {
3140 auto *ReversePtr = new VPVectorEndPointerRecipe(
3141 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3142 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3143 ReversePtr->insertBefore(InsertPos);
3144 Addr = ReversePtr;
3145 }
3146 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3147 InsertPos->getMask(), NeedsMaskForGaps,
3148 InterleaveMD, InsertPos->getDebugLoc());
3149 VPIG->insertBefore(InsertPos);
3150
3151 unsigned J = 0;
3152 for (unsigned i = 0; i < IG->getFactor(); ++i)
3153 if (Instruction *Member = IG->getMember(i)) {
3154 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3155 if (!Member->getType()->isVoidTy()) {
3156 VPValue *OriginalV = MemberR->getVPSingleValue();
3157 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3158 J++;
3159 }
3160 MemberR->eraseFromParent();
3161 }
3162 }
3163}
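// Illustrative worked example of the address adjustment above (values are
// assumptions, not taken from the source): for a factor-3 group of i32
// members whose insert position is the member at index 2,
//   Offset = alloc-size(i32) * index = 4 * 2 = 8 bytes,
//   vp<%addr.m0> = ptradd vp<%addr.insertpos>, -8
// yields the address of member 0, which the single VPInterleaveRecipe uses.
// For a reverse group, a VPVectorEndPointerRecipe then rebases that address
// toward the last vector lane using a stride of -factor elements.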
3164
3165/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3166/// value, phi and backedge value. In the following example:
3167///
3168/// vector.ph:
3169/// Successor(s): vector loop
3170///
3171/// <x1> vector loop: {
3172/// vector.body:
3173/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3174/// ...
3175/// EMIT branch-on-count ...
3176/// No successors
3177/// }
3178///
3179/// WIDEN-INDUCTION will get expanded to:
3180///
3181/// vector.ph:
3182/// ...
3183/// vp<%induction.start> = ...
3184/// vp<%induction.increment> = ...
3185///
3186/// Successor(s): vector loop
3187///
3188/// <x1> vector loop: {
3189/// vector.body:
3190/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3191/// ...
3192/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3193/// EMIT branch-on-count ...
3194/// No successors
3195/// }
3196static void
3197 expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
3198 VPTypeAnalysis &TypeInfo) {
3199 VPlan *Plan = WidenIVR->getParent()->getPlan();
3200 VPValue *Start = WidenIVR->getStartValue();
3201 VPValue *Step = WidenIVR->getStepValue();
3202 VPValue *VF = WidenIVR->getVFValue();
3203 DebugLoc DL = WidenIVR->getDebugLoc();
3204
3205 // The value from the original loop to which we are mapping the new induction
3206 // variable.
3207 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3208
3209 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3210 Instruction::BinaryOps AddOp;
3211 Instruction::BinaryOps MulOp;
3212 // FIXME: The newly created binary instructions should contain nsw/nuw
3213 // flags, which can be found from the original scalar operations.
3214 VPIRFlags Flags;
3215 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3216 AddOp = Instruction::Add;
3217 MulOp = Instruction::Mul;
3218 } else {
3219 AddOp = ID.getInductionOpcode();
3220 MulOp = Instruction::FMul;
3221 Flags = ID.getInductionBinOp()->getFastMathFlags();
3222 }
3223
3224 // If the phi is truncated, truncate the start and step values.
3225 VPBuilder Builder(Plan->getVectorPreheader());
3226 Type *StepTy = TypeInfo.inferScalarType(Step);
3227 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3228 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3229 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3230 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3231 StepTy = Ty;
3232 }
3233
3234 // Construct the initial value of the vector IV in the vector loop preheader.
3235 Type *IVIntTy =
3236 IntegerType::get(Plan->getContext(), StepTy->getScalarSizeInBits());
3237 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3238 if (StepTy->isFloatingPointTy())
3239 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3240
3241 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3242 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3243
3244 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3245 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3246 DebugLoc::getUnknown(), "induction");
3247
3248 // Create the widened phi of the vector IV.
3249 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3250 WidenIVR->getDebugLoc(), "vec.ind");
3251 WidePHI->insertBefore(WidenIVR);
3252
3253 // Create the backedge value for the vector IV.
3254 VPValue *Inc;
3255 VPValue *Prev;
3256 // If unrolled, use the increment and prev value from the operands.
3257 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3258 Inc = SplatVF;
3259 Prev = WidenIVR->getLastUnrolledPartOperand();
3260 } else {
3261 if (VPRecipeBase *R = VF->getDefiningRecipe())
3262 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3263 // Multiply the vectorization factor by the step using integer or
3264 // floating-point arithmetic as appropriate.
3265 if (StepTy->isFloatingPointTy())
3266 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3267 DL);
3268 else
3269 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3270 TypeInfo.inferScalarType(VF), DL);
3271
3272 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3273 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3274 Prev = WidePHI;
3275 }
3276
3277 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3278 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3279 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3280 WidenIVR->getDebugLoc(), "vec.ind.next");
3281
3282 WidePHI->addOperand(Next);
3283
3284 WidenIVR->replaceAllUsesWith(WidePHI);
3285}
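// Illustrative instance of the expansion above, assuming an integer induction
// with start 0, step 2 and a fixed VF of 4 (all values hypothetical):
//   vector.ph:
//     vp<%induction.start>     = <0, 2, 4, 6>   ; step-vector * step + start
//     vp<%induction.increment> = <8, 8, 8, 8>   ; broadcast(step * VF)
//   vector loop:
//     ir<%i>            = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
//     vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>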
3286
3287/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3288/// initial value, phi and backedge value. In the following example:
3289///
3290/// <x1> vector loop: {
3291/// vector.body:
3292/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3293/// ...
3294/// EMIT branch-on-count ...
3295/// }
3296///
3297/// WIDEN-POINTER-INDUCTION will get expanded to:
3298///
3299/// <x1> vector loop: {
3300/// vector.body:
3301/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3302/// EMIT %mul = mul %stepvector, %step
3303/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3304/// ...
3305/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3306/// EMIT branch-on-count ...
3307/// }
3308 static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
3309 VPTypeAnalysis &TypeInfo) {
3310 VPlan *Plan = R->getParent()->getPlan();
3311 VPValue *Start = R->getStartValue();
3312 VPValue *Step = R->getStepValue();
3313 VPValue *VF = R->getVFValue();
3314
3315 assert(R->getInductionDescriptor().getKind() ==
3317 "Not a pointer induction according to InductionDescriptor!");
3318 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3319 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3320 "Recipe should have been replaced");
3321
3322 VPBuilder Builder(R);
3323 DebugLoc DL = R->getDebugLoc();
3324
3325 // Build a scalar pointer phi.
3326 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3327
3328 // Create actual address geps that use the pointer phi as base and a
3329 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3330 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3331 Type *StepTy = TypeInfo.inferScalarType(Step);
3332 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3333 Offset = Builder.createNaryOp(Instruction::Mul, {Offset, Step});
3334 VPValue *PtrAdd = Builder.createNaryOp(
3335 VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
3336 R->replaceAllUsesWith(PtrAdd);
3337
3338 // Create the backedge value for the scalar pointer phi.
3339 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3340 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3341 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3342 DL);
3343 VPValue *Inc = Builder.createNaryOp(Instruction::Mul, {Step, VF});
3344
3345 VPValue *InductionGEP =
3346 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3347 ScalarPtrPhi->addOperand(InductionGEP);
3348}
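// Illustrative instance of the expansion above, assuming a pointer induction
// over i32 elements (so a step of 4 bytes) and a fixed VF of 4:
//   EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
//   EMIT %mul        = mul <0, 1, 2, 3>, 4       ; byte offsets <0, 4, 8, 12>
//   EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
//   ...
//   EMIT %ptr.ind    = ptradd %pointer.phi, 16   ; step * VF advances the phi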
3349
3350 void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
3351 // Replace loop regions with explicit CFG.
3352 SmallVector<VPRegionBlock *> LoopRegions;
3353 for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
3354 vp_depth_first_deep(Plan.getEntry()))) {
3355 if (!R->isReplicator())
3356 LoopRegions.push_back(R);
3357 }
3358 for (VPRegionBlock *R : LoopRegions)
3359 R->dissolveToCFGLoop();
3360}
3361
3362 void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
3363 VPTypeAnalysis TypeInfo(Plan);
3364 SmallVector<VPRecipeBase *> ToRemove;
3365 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3366 vp_depth_first_deep(Plan.getEntry()))) {
3367 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3368 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3369 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3370 ToRemove.push_back(WidenIVR);
3371 continue;
3372 }
3373
3374 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3375 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3376 ToRemove.push_back(WidenIVR);
3377 continue;
3378 }
3379
3380 // Expand VPBlendRecipe into VPInstruction::Select.
3381 VPBuilder Builder(&R);
3382 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3383 VPValue *Select = Blend->getIncomingValue(0);
3384 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3385 Select = Builder.createSelect(Blend->getMask(I),
3386 Blend->getIncomingValue(I), Select,
3387 R.getDebugLoc(), "predphi");
3388 Blend->replaceAllUsesWith(Select);
3389 ToRemove.push_back(Blend);
3390 }
3391
3392 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3393 Expr->decompose();
3394 ToRemove.push_back(Expr);
3395 }
3396
3397 VPValue *VectorStep;
3398 VPValue *ScalarStep;
3399 if (!match(&R, m_VPInstruction<VPInstruction::WideIVStep>(
3400 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
3401 continue;
3402
3403 // Expand WideIVStep.
3404 auto *VPI = cast<VPInstruction>(&R);
3405 Type *IVTy = TypeInfo.inferScalarType(VPI);
3406 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
3407 Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
3408 ? Instruction::UIToFP
3409 : Instruction::Trunc;
3410 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
3411 }
3412
3413 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
3414 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
3415 ScalarStep =
3416 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
3417 }
3418
3419 VPIRFlags Flags;
3420 if (IVTy->isFloatingPointTy())
3421 Flags = {VPI->getFastMathFlags()};
3422
3423 unsigned MulOpc =
3424 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
3425 VPInstruction *Mul = Builder.createNaryOp(
3426 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
3427 VectorStep = Mul;
3428 VPI->replaceAllUsesWith(VectorStep);
3429 ToRemove.push_back(VPI);
3430 }
3431 }
3432
3433 for (VPRecipeBase *R : ToRemove)
3434 R->eraseFromParent();
3435}
3436
3437 void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
3438 VPBasicBlock *EarlyExitVPBB,
3439 VPlan &Plan,
3440 VPBasicBlock *HeaderVPBB,
3441 VPBasicBlock *LatchVPBB) {
3442 VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
3443 if (!EarlyExitVPBB->getSinglePredecessor() &&
3444 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
3445 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
3446 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
3447 "unsupported early exit VPBB");
3448 // Early exit operand should always be last phi operand. If EarlyExitVPBB
3449 // has two predecessors and EarlyExitingVPBB is the first, swap the operands
3450 // of the phis.
3451 for (VPRecipeBase &R : EarlyExitVPBB->phis())
3452 cast<VPIRPhi>(&R)->swapOperands();
3453 }
3454
3455 VPBuilder Builder(LatchVPBB->getTerminator());
3456 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
3457 assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
3458 "Terminator must be be BranchOnCond");
3459 VPValue *CondOfEarlyExitingVPBB =
3460 EarlyExitingVPBB->getTerminator()->getOperand(0);
3461 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
3462 ? CondOfEarlyExitingVPBB
3463 : Builder.createNot(CondOfEarlyExitingVPBB);
3464
3465 // Split the middle block and have it conditionally branch to the early exit
3466 // block if CondToEarlyExit.
3467 VPValue *IsEarlyExitTaken =
3468 Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
3469 VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
3470 VPBasicBlock *VectorEarlyExitVPBB =
3471 Plan.createVPBasicBlock("vector.early.exit");
3472 VPBlockUtils::insertOnEdge(LatchVPBB, MiddleVPBB, NewMiddle);
3473 VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
3474 NewMiddle->swapSuccessors();
3475
3476 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
3477
3478 // Update the exit phis in the early exit block.
3479 VPBuilder MiddleBuilder(NewMiddle);
3480 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
3481 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
3482 auto *ExitIRI = cast<VPIRPhi>(&R);
3483 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
3484 // a single predecessor and 1 if it has two.
3485 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
3486 if (ExitIRI->getNumOperands() != 1) {
3487 // The first of two operands corresponds to the latch exit, via MiddleVPBB
3488 // predecessor. Extract its last lane.
3489 ExitIRI->extractLastLaneOfFirstOperand(MiddleBuilder);
3490 }
3491
3492 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
3493 if (!IncomingFromEarlyExit->isLiveIn()) {
3494 // Update the incoming value from the early exit.
3495 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
3496 VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,
3497 "first.active.lane");
3498 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
3499 VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
3500 nullptr, "early.exit.value");
3501 ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
3502 }
3503 }
3504 MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
3505
3506 // Replace the condition controlling the non-early exit from the vector loop
3507 // with one exiting if either the original condition of the vector latch is
3508 // true or the early exit has been taken.
3509 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
3510 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
3511 "Unexpected terminator");
3512 auto *IsLatchExitTaken =
3513 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
3514 LatchExitingBranch->getOperand(1));
3515 auto *AnyExitTaken = Builder.createNaryOp(
3516 Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});
3517 Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
3518 LatchExitingBranch->eraseFromParent();
3519}
3520
3521/// This function tries to convert extended in-loop reductions to
3522/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
3523/// valid. The created recipe must be decomposed to its constituent
3524/// recipes before execution.
3525static VPExpressionRecipe *
3526 tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
3527 VFRange &Range) {
3528 Type *RedTy = Ctx.Types.inferScalarType(Red);
3529 VPValue *VecOp = Red->getVecOp();
3530
3531 // Clamp the range if using extended-reduction is profitable.
3532 auto IsExtendedRedValidAndClampRange =
3533 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
3534 return LoopVectorizationPlanner::getDecisionAndClampRange(
3535 [&](ElementCount VF) {
3536 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3538
3539 InstructionCost ExtRedCost;
3540 InstructionCost ExtCost =
3541 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
3542 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3543
3547 // FIXME: Move partial reduction creation, costing and clamping
3548 // here from LoopVectorize.cpp.
3549 ExtRedCost = Ctx.TTI.getPartialReductionCost(
3550 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
3551 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind);
3552 } else {
3553 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
3554 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
3555 Red->getFastMathFlags(), CostKind);
3556 }
3557 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
3558 },
3559 Range);
3560 };
3561
3562 VPValue *A;
3563 // Match reduce(ext).
3564 if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
3565 IsExtendedRedValidAndClampRange(
3566 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
3567 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
3568 Ctx.Types.inferScalarType(A)))
3569 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
3570
3571 return nullptr;
3572}
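// Illustrative shape of the pattern matched above, in LLVM IR terms (the
// types are assumptions):
//   %a.ext = zext <16 x i8> %a to <16 x i32>
//   %red   = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a.ext)
// The extend and the reduction are only bundled into a VPExpressionRecipe when
// TTI reports ExtRedCost < ExtCost + RedCost, i.e. the target can perform the
// widening reduction as a single operation.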
3573
3574/// This function tries to convert extended in-loop reductions to
3575/// VPExpressionRecipe and clamp the \p Range if it is beneficial
3576/// and valid. The created VPExpressionRecipe must be decomposed to its
3577/// constituent recipes before execution. Patterns of the
3578/// VPExpressionRecipe:
3579/// reduce.add(mul(...)),
3580/// reduce.add(mul(ext(A), ext(B))),
3581/// reduce.add(ext(mul(ext(A), ext(B)))).
3582static VPExpressionRecipe *
3583 tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
3584 VPCostContext &Ctx, VFRange &Range) {
3585 bool IsPartialReduction = isa<VPPartialReductionRecipe>(Red);
3586
3587 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3588 if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
3589 return nullptr;
3590
3591 Type *RedTy = Ctx.Types.inferScalarType(Red);
3592
3593 // Clamp the range if using multiply-accumulate-reduction is profitable.
3594 auto IsMulAccValidAndClampRange =
3595 [&](VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
3596 VPWidenCastRecipe *OuterExt) -> bool {
3597 return LoopVectorizationPlanner::getDecisionAndClampRange(
3598 [&](ElementCount VF) {
3600 Type *SrcTy =
3601 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
3602 InstructionCost MulAccCost;
3603
3604 if (IsPartialReduction) {
3605 Type *SrcTy2 =
3606 Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
3607 // FIXME: Move partial reduction creation, costing and clamping
3608 // here from LoopVectorize.cpp.
3609 MulAccCost = Ctx.TTI.getPartialReductionCost(
3610 Opcode, SrcTy, SrcTy2, RedTy, VF,
3612 Ext0->getOpcode())
3615 Ext1->getOpcode())
3617 Mul->getOpcode(), CostKind);
3618 } else {
3619 // Only partial reductions support mixed extends at the moment.
3620 if (Ext0 && Ext1 && Ext0->getOpcode() != Ext1->getOpcode())
3621 return false;
3622
3623 bool IsZExt =
3624 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
3625 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3626 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
3627 SrcVecTy, CostKind);
3628 }
3629
3630 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
3631 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3632 InstructionCost ExtCost = 0;
3633 if (Ext0)
3634 ExtCost += Ext0->computeCost(VF, Ctx);
3635 if (Ext1)
3636 ExtCost += Ext1->computeCost(VF, Ctx);
3637 if (OuterExt)
3638 ExtCost += OuterExt->computeCost(VF, Ctx);
3639
3640 return MulAccCost.isValid() &&
3641 MulAccCost < ExtCost + MulCost + RedCost;
3642 },
3643 Range);
3644 };
3645
3646 VPValue *VecOp = Red->getVecOp();
3647 VPRecipeBase *Sub = nullptr;
3648 VPValue *A, *B;
3649 VPValue *Tmp = nullptr;
3650 // Sub reductions could have a sub between the add reduction and vec op.
3651 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
3652 Sub = VecOp->getDefiningRecipe();
3653 VecOp = Tmp;
3654 }
3655 // Try to match reduce.add(mul(...)).
3656 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
3657 auto *RecipeA =
3658 dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
3659 auto *RecipeB =
3660 dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
3661 auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
3662
3663 // Match reduce.add/sub(mul(ext, ext)).
3664 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
3665 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
3666 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
3667 if (Sub)
3668 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
3669 cast<VPWidenRecipe>(Sub), Red);
3670 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
3671 }
3672 // Match reduce.add(mul).
3673 // TODO: Add an expression type for this variant with a negated mul
3674 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
3675 return new VPExpressionRecipe(Mul, Red);
3676 }
3677 // TODO: Add an expression type for negated versions of other expression
3678 // variants.
3679 if (Sub)
3680 return nullptr;
3681 // Match reduce.add(ext(mul(ext(A), ext(B)))).
3682 // All extend recipes must have the same opcode or A == B, which can be
3683 // transformed to reduce.add(zext(mul(sext(A), sext(B)))).
3684 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()),
3685 m_ZExtOrSExt(m_VPValue()))))) {
3686 auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe());
3687 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe());
3688 auto *Ext0 =
3689 cast<VPWidenCastRecipe>(Mul->getOperand(0)->getDefiningRecipe());
3690 auto *Ext1 =
3691 cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe());
3692 if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
3693 Ext0->getOpcode() == Ext1->getOpcode() &&
3694 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
3695 auto *NewExt0 = new VPWidenCastRecipe(
3696 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), *Ext0,
3697 *Ext0, Ext0->getDebugLoc());
3698 NewExt0->insertBefore(Ext0);
3699
3700 VPWidenCastRecipe *NewExt1 = NewExt0;
3701 if (Ext0 != Ext1) {
3702 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
3703 Ext->getResultType(), *Ext1, *Ext1,
3704 Ext1->getDebugLoc());
3705 NewExt1->insertBefore(Ext1);
3706 }
3707 Mul->setOperand(0, NewExt0);
3708 Mul->setOperand(1, NewExt1);
3709 Red->setOperand(1, Mul);
3710 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
3711 }
3712 }
3713 return nullptr;
3714}
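// Illustrative shape of the reduce.add(mul(ext, ext)) case handled above (the
// types are assumptions):
//   %a.ext = sext <16 x i8> %a to <16 x i32>
//   %b.ext = sext <16 x i8> %b to <16 x i32>
//   %mul   = mul <16 x i32> %a.ext, %b.ext
//   %red   = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
// The recipes are bundled only when the multiply-accumulate (or partial
// reduction) cost beats ExtCost + MulCost + RedCost for the clamped VF range.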
3715
3716/// This function tries to create abstract recipes from the reduction recipe for
3717/// subsequent optimizations and cost estimation.
3718 static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
3719 VPCostContext &Ctx,
3720 VFRange &Range) {
3721 VPExpressionRecipe *AbstractR = nullptr;
3722 auto IP = std::next(Red->getIterator());
3723 auto *VPBB = Red->getParent();
3724 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
3725 AbstractR = MulAcc;
3726 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
3727 AbstractR = ExtRed;
3728 // Cannot create abstract inloop reduction recipes.
3729 if (!AbstractR)
3730 return;
3731
3732 AbstractR->insertBefore(*VPBB, IP);
3733 Red->replaceAllUsesWith(AbstractR);
3734}
3735
3746
3748 if (Plan.hasScalarVFOnly())
3749 return;
3750
3751#ifndef NDEBUG
3752 VPDominatorTree VPDT(Plan);
3753#endif
3754
3755 SmallVector<VPValue *> VPValues;
3758 append_range(VPValues, Plan.getLiveIns());
3759 for (VPRecipeBase &R : *Plan.getEntry())
3760 append_range(VPValues, R.definedValues());
3761
3762 auto *VectorPreheader = Plan.getVectorPreheader();
3763 for (VPValue *VPV : VPValues) {
3765 (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
3766 isa<Constant>(VPV->getLiveInIRValue())))
3767 continue;
3768
3769 // Add explicit broadcast at the insert point that dominates all users.
3770 VPBasicBlock *HoistBlock = VectorPreheader;
3771 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
3772 for (VPUser *User : VPV->users()) {
3773 if (User->usesScalars(VPV))
3774 continue;
3775 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
3776 HoistPoint = HoistBlock->begin();
3777 else
3778 assert(VPDT.dominates(VectorPreheader,
3779 cast<VPRecipeBase>(User)->getParent()) &&
3780 "All users must be in the vector preheader or dominated by it");
3781 }
3782
3783 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
3784 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
3785 VPV->replaceUsesWithIf(Broadcast,
3786 [VPV, Broadcast](VPUser &U, unsigned Idx) {
3787 return Broadcast != &U && !U.usesScalars(VPV);
3788 });
3789 }
3790}
3791
3792 void VPlanTransforms::materializeConstantVectorTripCount(
3793 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
3794 PredicatedScalarEvolution &PSE) {
3795 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
3796 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
3797
3798 VPValue *TC = Plan.getTripCount();
3799 // Skip cases for which the trip count may be non-trivial to materialize.
3800 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
3801 // tail is required.
3802 if (!Plan.hasScalarTail() ||
3803 Plan.getMiddleBlock()->getSingleSuccessor() ==
3804 Plan.getScalarPreheader() ||
3805 !TC->isLiveIn())
3806 return;
3807
3808 // Materialize the vector trip count early for constant trip counts, if it
3809 // can simply be computed as (Original TC / (VF * UF)) * (VF * UF).
3810 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
3811 // tail-folded loops.
3812 ScalarEvolution &SE = *PSE.getSE();
3813 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
3814 if (!isa<SCEVConstant>(TCScev))
3815 return;
3816 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
3817 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
3818 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
3819 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
3820}
3821
3822 void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
3823 VPBasicBlock *VectorPH) {
3824 VPValue *BTC = Plan.getBackedgeTakenCount();
3825 if (BTC->getNumUsers() == 0)
3826 return;
3827
3828 VPBuilder Builder(VectorPH, VectorPH->begin());
3829 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
3830 auto *TCMO = Builder.createNaryOp(
3831 Instruction::Sub,
3832 {Plan.getTripCount(), Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))},
3833 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
3834 BTC->replaceAllUsesWith(TCMO);
3835}
3836
3838 if (Plan.hasScalarVFOnly())
3839 return;
3840
3841 VPTypeAnalysis TypeInfo(Plan);
3842 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3843 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
3844 vp_depth_first_shallow(Plan.getEntry()));
3845 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
3846 vp_depth_first_shallow(LoopRegion->getEntry()));
3847 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
3848 // VPInstructions, excluding ones in replicate regions. Those are not
3849 // materialized explicitly yet. Those vector users are still handled in
3850 // VPReplicateRecipe::execute(), via shouldPack().
3851 // TODO: materialize build vectors for replicating recipes in replicating
3852 // regions.
3853 for (VPBasicBlock *VPBB :
3854 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
3855 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3857 continue;
3858 auto *DefR = cast<VPRecipeWithIRFlags>(&R);
3859 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
3860 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
3861 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
3862 };
3863 if ((isa<VPReplicateRecipe>(DefR) &&
3864 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
3865 (isa<VPInstruction>(DefR) &&
3867 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
3868 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
3869 continue;
3870
3871 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
3872 unsigned Opcode = ScalarTy->isStructTy()
3873 ? VPInstruction::BuildStructVector
3874 : VPInstruction::BuildVector;
3875 auto *BuildVector = new VPInstruction(Opcode, {DefR});
3876 BuildVector->insertAfter(DefR);
3877
3878 DefR->replaceUsesWithIf(
3879 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
3880 VPUser &U, unsigned) {
3881 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
3882 });
3883 }
3884 }
3885
3886 // Create explicit VPInstructions to convert vectors to scalars. The current
3887 // implementation is conservative - it may miss some cases that may or may not
3888 // be vector values. TODO: introduce Unpacks speculatively - remove them later
3889 // if they are known to operate on scalar values.
3890 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
3891 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3894 continue;
3895 for (VPValue *Def : R.definedValues()) {
3896 // Skip recipes that are single-scalar or only have their first lane
3897 // used.
3898 // TODO: The Defs skipped here may or may not be vector values.
3899 // Introduce Unpacks, and remove them later, if they are guaranteed to
3900 // produce scalar values.
3902 continue;
3903
3904 // At the moment, we create unpacks only for scalar users outside
3905 // replicate regions. Recipes inside replicate regions still extract the
3906 // required lanes implicitly.
3907 // TODO: Remove once replicate regions are unrolled completely.
3908 auto IsCandidateUnpackUser = [Def](VPUser *U) {
3909 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
3910 return U->usesScalars(Def) &&
3911 (!ParentRegion || !ParentRegion->isReplicator());
3912 };
3913 if (none_of(Def->users(), IsCandidateUnpackUser))
3914 continue;
3915
3916 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
3917 if (R.isPhi())
3918 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
3919 else
3920 Unpack->insertAfter(&R);
3921 Def->replaceUsesWithIf(Unpack,
3922 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
3923 return IsCandidateUnpackUser(&U);
3924 });
3925 }
3926 }
3927 }
3928}
3929
3930 void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
3931 VPBasicBlock *VectorPHVPBB,
3932 bool TailByMasking,
3933 bool RequiresScalarEpilogue) {
3934 VPValue &VectorTC = Plan.getVectorTripCount();
3935 assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");
3936 // There's nothing to do if there are no users of the vector trip count or its
3937 // IR value has already been set.
3938 if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())
3939 return;
3940
3941 VPValue *TC = Plan.getTripCount();
3942 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
3943 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
3944 VPValue *Step = &Plan.getVFxUF();
3945
3946 // If the tail is to be folded by masking, round the number of iterations N
3947 // up to a multiple of Step instead of rounding down. This is done by first
3948 // adding Step-1 and then rounding down. Note that it's ok if this addition
3949 // overflows: the vector induction variable will eventually wrap to zero given
3950 // that it starts at zero and its Step is a power of two; the loop will then
3951 // exit, with the last early-exit vector comparison also producing all-true.
3952 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
3953 // is accounted for in emitIterationCountCheck that adds an overflow check.
3954 if (TailByMasking) {
3955 TC = Builder.createNaryOp(
3956 Instruction::Add,
3957 {TC, Builder.createNaryOp(
3958 Instruction::Sub,
3959 {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})},
3960 DebugLoc::getCompilerGenerated(), "n.rnd.up");
3961 }
3962
3963 // Now we need to generate the expression for the part of the loop that the
3964 // vectorized body will execute. This is equal to N - (N % Step) if scalar
3965 // iterations are not required for correctness, or N - Step, otherwise. Step
3966 // is equal to the vectorization factor (number of SIMD elements) times the
3967 // unroll factor (number of SIMD instructions).
3968 VPValue *R =
3969 Builder.createNaryOp(Instruction::URem, {TC, Step},
3970 DebugLoc::getCompilerGenerated(), "n.mod.vf");
3971
3972 // There are cases where we *must* run at least one iteration in the remainder
3973 // loop. See the cost model for when this can happen. If the step evenly
3974 // divides the trip count, we set the remainder to be equal to the step. If
3975 // the step does not evenly divide the trip count, no adjustment is necessary
3976 // since there will already be scalar iterations. Note that the minimum
3977 // iterations check ensures that N >= Step.
3978 if (RequiresScalarEpilogue) {
3979 assert(!TailByMasking &&
3980 "requiring scalar epilogue is not supported with fail folding");
3981 VPValue *IsZero = Builder.createICmp(
3982 CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0)));
3983 R = Builder.createSelect(IsZero, Step, R);
3984 }
3985
3986 VPValue *Res = Builder.createNaryOp(
3987 Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
3988 VectorTC.replaceAllUsesWith(Res);
3989}
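// Worked instances of the cases above, using an illustrative trip count and
// Step = VF * UF = 8:
//   default (N = 17):          n.mod.vf = 17 urem 8 = 1, n.vec = 17 - 1 = 16
//   tail folded (N = 17):      n.rnd.up = 17 + 7 = 24, 24 urem 8 = 0, n.vec = 24
//   scalar epilogue required
//   and N = 16 divides evenly: n.mod.vf = 0 -> select yields 8, n.vec = 16 - 8 = 8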
3990
3991 void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
3992 ElementCount VFEC) {
3993 VPBuilder Builder(VectorPH, VectorPH->begin());
3994 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
3995 VPValue &VF = Plan.getVF();
3996 VPValue &VFxUF = Plan.getVFxUF();
3997 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
3998 // used.
3999 // TODO: Assert that they aren't used.
4000
4001 // If there are no users of the runtime VF, compute VFxUF by constant folding
4002 // the multiplication of VF and UF.
4003 if (VF.getNumUsers() == 0) {
4004 VPValue *RuntimeVFxUF =
4005 Builder.createElementCount(TCTy, VFEC * Plan.getUF());
4006 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
4007 return;
4008 }
4009
4010 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
4011 // vscale) * UF.
4012 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
4014 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
4016 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
4017 }
4018 VF.replaceAllUsesWith(RuntimeVF);
4019
4020 VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
4021 VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
4022 VFxUF.replaceAllUsesWith(MulByUF);
4023}
4024
4025 DenseMap<const SCEV *, Value *> VPlanTransforms::expandSCEVs(VPlan &Plan,
4026 ScalarEvolution &SE) {
4027 const DataLayout &DL = SE.getDataLayout();
4028 SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/true);
4029
4030 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
4031 BasicBlock *EntryBB = Entry->getIRBasicBlock();
4032 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
4033 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
4034 if (isa<VPIRInstruction>(&R))
4035 continue;
4036 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
4037 if (!ExpSCEV)
4038 break;
4039 const SCEV *Expr = ExpSCEV->getSCEV();
4040 Value *Res =
4041 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
4042 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
4043 VPValue *Exp = Plan.getOrAddLiveIn(Res);
4044 ExpSCEV->replaceAllUsesWith(Exp);
4045 if (Plan.getTripCount() == ExpSCEV)
4046 Plan.resetTripCount(Exp);
4047 ExpSCEV->eraseFromParent();
4048 }
4050 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
4051 "after any VPIRInstructions");
4052 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
4053 // to the VPIRBasicBlock.
4054 auto EI = Entry->begin();
4055 for (Instruction &I : drop_end(*EntryBB)) {
4056 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
4057 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
4058 EI++;
4059 continue;
4060 }
4062 }
4063
4064 return ExpandedSCEVs;
4065}
4066
4067/// Returns true if \p OpV is a VPWidenLoadRecipe or VPInterleaveRecipe that can
4068/// be converted to a narrower recipe. \p OpV is used by a wide recipe that
4069/// feeds a store interleave group at index \p Idx; \p WideMember0 is the recipe
4070/// feeding the same interleave group at index 0. A VPWidenLoadRecipe can be
4071/// narrowed to an index-independent load if it feeds all wide ops at all
4072/// indices (\p OpV must be the operand at index \p OpIdx of \p WideMember0, the
4073/// recipe at lane 0). A VPInterleaveRecipe can be narrowed to a wide load if
4074/// \p OpV is defined at \p Idx of a load interleave group.
4075static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
4076 VPValue *OpV, unsigned Idx) {
4077 auto *DefR = OpV->getDefiningRecipe();
4078 if (!DefR)
4079 return WideMember0->getOperand(OpIdx) == OpV;
4080 if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
4081 return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV;
4082
4083 if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
4084 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
4085 return false;
4086}
4087
4088/// Returns true if \p IR is a full interleave group with factor and number of
4089/// members both equal to \p VF. The interleave group must also access the full
4090/// vector width \p VectorRegWidth.
4091 static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
4092 unsigned VF, VPTypeAnalysis &TypeInfo,
4093 unsigned VectorRegWidth) {
4094 if (!InterleaveR || InterleaveR->getMask())
4095 return false;
4096
4097 Type *GroupElementTy = nullptr;
4098 if (InterleaveR->getStoredValues().empty()) {
4099 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
4100 if (!all_of(InterleaveR->definedValues(),
4101 [&TypeInfo, GroupElementTy](VPValue *Op) {
4102 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4103 }))
4104 return false;
4105 } else {
4106 GroupElementTy =
4107 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
4108 if (!all_of(InterleaveR->getStoredValues(),
4109 [&TypeInfo, GroupElementTy](VPValue *Op) {
4110 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4111 }))
4112 return false;
4113 }
4114
4115 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
4116 auto IG = InterleaveR->getInterleaveGroup();
4117 return IG->getFactor() == VF && IG->getNumMembers() == VF &&
4118 GroupSize == VectorRegWidth;
4119}
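// Numeric instance of the check above (illustrative): with VF = 4 and i32
// group members, GroupSize = 32 * 4 = 128 bits, so the group qualifies only if
// it has factor 4 with all 4 members present and VectorRegWidth is 128.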
4120
4121/// Returns true if \p VPV is a narrow VPValue.
4122static bool isAlreadyNarrow(VPValue *VPV) {
4123 if (VPV->isLiveIn())
4124 return true;
4125 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
4126 return RepR && RepR->isSingleScalar();
4127}
4128
4129 void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
4130 unsigned VectorRegWidth) {
4131 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
4132 if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0)
4133 return;
4134
4135 VPTypeAnalysis TypeInfo(Plan);
4136
4137 unsigned VFMinVal = VF.getKnownMinValue();
4138 SmallVector<VPInterleaveRecipe *> StoreGroups;
4139 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
4141 continue;
4142
4145 continue;
4146
4147 // Bail out on recipes not supported at the moment:
4148 // * phi recipes other than the canonical induction
4149 // * recipes writing to memory except interleave groups
4150 // Only support plans with a canonical induction phi.
4151 if (R.isPhi())
4152 return;
4153
4154 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
4155 if (R.mayWriteToMemory() && !InterleaveR)
4156 return;
4157
4158 // Do not narrow interleave groups if there are VectorPointer recipes and
4159 // the plan was unrolled. The recipe implicitly uses VF from
4160 // VPTransformState.
4161 // TODO: Remove restriction once the VF for the VectorPointer offset is
4162 // modeled explicitly as operand.
4163 if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
4164 return;
4165
4166 // All other ops are allowed, but we reject uses that cannot be converted
4167 // when checking all allowed consumers (store interleave groups) below.
4168 if (!InterleaveR)
4169 continue;
4170
4171 // Bail out on non-consecutive interleave groups.
4172 if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo,
4173 VectorRegWidth))
4174 return;
4175
4176 // Skip read interleave groups.
4177 if (InterleaveR->getStoredValues().empty())
4178 continue;
4179
4180 // Narrow interleave groups, if all operands are already matching narrow
4181 // ops.
4182 auto *Member0 = InterleaveR->getStoredValues()[0];
4183 if (isAlreadyNarrow(Member0) &&
4184 all_of(InterleaveR->getStoredValues(),
4185 [Member0](VPValue *VPV) { return Member0 == VPV; })) {
4186 StoreGroups.push_back(InterleaveR);
4187 continue;
4188 }
4189
4190 // For now, we only support full interleave groups storing load interleave
4191 // groups.
4192 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
4193 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
4194 if (!DefR)
4195 return false;
4196 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
4197 return IR && IR->getInterleaveGroup()->isFull() &&
4198 IR->getVPValue(Op.index()) == Op.value();
4199 })) {
4200 StoreGroups.push_back(InterleaveR);
4201 continue;
4202 }
4203
4204 // Check if all values feeding InterleaveR are matching wide recipes, with
4205 // operands that can be narrowed.
4206 auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
4207 InterleaveR->getStoredValues()[0]->getDefiningRecipe());
4208 if (!WideMember0)
4209 return;
4210 for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
4211 auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
4212 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
4213 R->getNumOperands() > 2)
4214 return;
4215 if (any_of(enumerate(R->operands()),
4216 [WideMember0, Idx = I](const auto &P) {
4217 const auto &[OpIdx, OpV] = P;
4218 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
4219 }))
4220 return;
4221 }
4222 StoreGroups.push_back(InterleaveR);
4223 }
4224
4225 if (StoreGroups.empty())
4226 return;
4227
4228 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
4229 SmallPtrSet<VPValue *, 4> NarrowedOps;
4230 auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * {
4231 auto *R = V->getDefiningRecipe();
4232 if (!R || NarrowedOps.contains(V))
4233 return V;
4234 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
4235 // Narrow interleave group to wide load, as transformed VPlan will only
4236 // process one original iteration.
4237 auto *L = new VPWidenLoadRecipe(
4238 *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
4239 LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
4240 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
4241 L->insertBefore(LoadGroup);
4242 NarrowedOps.insert(L);
4243 return L;
4244 }
4245
4246 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
4247 assert(RepR->isSingleScalar() &&
4248 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
4249 "must be a single scalar load");
4250 NarrowedOps.insert(RepR);
4251 return RepR;
4252 }
4253 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
4254
4255 VPValue *PtrOp = WideLoad->getAddr();
4256 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
4257 PtrOp = VecPtr->getOperand(0);
4258 // Narrow wide load to uniform scalar load, as transformed VPlan will only
4259 // process one original iteration.
4260 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
4261 /*IsUniform*/ true,
4262 /*Mask*/ nullptr, *WideLoad);
4263 N->insertBefore(WideLoad);
4264 NarrowedOps.insert(N);
4265 return N;
4266 };
4267
4268 // Narrow operation tree rooted at store groups.
4269 for (auto *StoreGroup : StoreGroups) {
4270 VPValue *Res = nullptr;
4271 VPValue *Member0 = StoreGroup->getStoredValues()[0];
4272 if (isAlreadyNarrow(Member0)) {
4273 Res = Member0;
4274 } else if (auto *WideMember0 =
4275 dyn_cast<VPWidenRecipe>(Member0->getDefiningRecipe())) {
4276 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
4277 WideMember0->setOperand(Idx, NarrowOp(WideMember0->getOperand(Idx)));
4278 Res = WideMember0;
4279 } else {
4280 Res = NarrowOp(Member0);
4281 }
4282
4283 auto *S = new VPWidenStoreRecipe(
4284 *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
4285 StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
4286 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
4287 S->insertBefore(StoreGroup);
4288 StoreGroup->eraseFromParent();
4289 }
4290
4291 // Adjust induction to reflect that the transformed plan only processes one
4292 // original iteration.
4293 auto *CanIV = VectorLoop->getCanonicalIV();
4294 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
4295 VPBuilder PHBuilder(Plan.getVectorPreheader());
4296
4297 VPValue *UF = Plan.getOrAddLiveIn(
4298 ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF()));
4299 if (VF.isScalable()) {
4300 VPValue *VScale = PHBuilder.createElementCount(
4301 CanIV->getScalarType(), ElementCount::getScalable(1));
4302 VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
4303 Inc->setOperand(1, VScaleUF);
4304 Plan.getVF().replaceAllUsesWith(VScale);
4305 } else {
4306 Inc->setOperand(1, UF);
4307 Plan.getVF().replaceAllUsesWith(
4308 Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
4309 }
4310 removeDeadRecipes(Plan);
4311}
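// Illustrative before/after sketch of the narrowing above, assuming a full
// factor-2 load group feeding a full factor-2 store group with a fixed VF of 2
// (names are hypothetical):
//   before: INTERLEAVE-GROUP load  {ir<%a0>, ir<%a1>} from vp<%src>
//           INTERLEAVE-GROUP store {ir<%a0>, ir<%a1>} to   vp<%dst>
//   after:  WIDEN  vp<%v> = load vp<%src>        ; one consecutive wide load
//           WIDEN  store vp<%v>, vp<%dst>        ; one consecutive wide store
// and the canonical IV increment becomes UF (instead of VF * UF), matching one
// original iteration per unrolled part.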
4312
4313/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
4314/// BranchOnCond recipe.
4316 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
4317 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
4318 auto *MiddleTerm =
4319 dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator());
4320 // Only add branch metadata if there is a (conditional) terminator.
4321 if (!MiddleTerm)
4322 return;
4323
4324 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
4325 "must have a BranchOnCond");
4326 // Assume that `TripCount % VectorStep` is equally distributed.
4327 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
4328 if (VF.isScalable() && VScaleForTuning.has_value())
4329 VectorStep *= *VScaleForTuning;
4330 assert(VectorStep > 0 && "trip count should not be zero");
4331 MDBuilder MDB(Plan.getContext());
4332 MDNode *BranchWeights =
4333 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
4334 MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
4335}
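// Worked instances of the weight computation above (numbers are illustrative):
//   UF = 2, VF = 4 (fixed):                      VectorStep = 8,  weights {1, 7}
//   UF = 2, VF = vscale x 4, VScaleForTuning 2:  VectorStep = 16, weights {1, 15}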
4336
4337/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
4338/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
4339/// the end value of the induction.
4341 VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
4342 VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
4343 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
4344 // Truncated wide inductions resume from the last lane of their vector value
4345 // in the last vector iteration which is handled elsewhere.
4346 if (WideIntOrFp && WideIntOrFp->getTruncInst())
4347 return nullptr;
4348
4349 VPValue *Start = WideIV->getStartValue();
4350 VPValue *Step = WideIV->getStepValue();
4352 VPValue *EndValue = VectorTC;
4353 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
4354 EndValue = VectorPHBuilder.createDerivedIV(
4355 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
4356 Start, VectorTC, Step);
4357 }
4358
4359 // EndValue is derived from the vector trip count (which has the same type as
4360 // the widest induction) and thus may be wider than the induction here.
4361 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
4362 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
4363 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
4364 ScalarTypeOfWideIV,
4365 WideIV->getDebugLoc());
4366 }
4367
4368 auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi(
4369 {EndValue, Start}, WideIV->getDebugLoc(), "bc.resume.val");
4370 return ResumePhiRecipe;
4371}
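// Illustrative instance of the end-value computation above: for a
// non-canonical integer induction with start 5, step 3 and vector trip count
// VTC (values are assumptions), the derived end value is 5 + VTC * 3 and the
// resume phi becomes
//   bc.resume.val = phi [ 5 + VTC * 3, middle.block ], [ 5, otherwise ]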
4372
4373 void VPlanTransforms::addScalarResumePhis(
4374 VPlan &Plan, VPRecipeBuilder &Builder,
4375 DenseMap<VPValue *, VPValue *> &IVEndValues) {
4376 VPTypeAnalysis TypeInfo(Plan);
4377 auto *ScalarPH = Plan.getScalarPreheader();
4378 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
4379 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
4380 VPBuilder VectorPHBuilder(
4381 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
4382 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
4383 VPBuilder ScalarPHBuilder(ScalarPH);
4384 for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) {
4385 auto *ScalarPhiIRI = cast<VPIRPhi>(&ScalarPhiR);
4386
4387 // TODO: Extract final value from induction recipe initially, optimize to
4388 // pre-computed end value together in optimizeInductionExitUsers.
4389 auto *VectorPhiR =
4390 cast<VPHeaderPHIRecipe>(Builder.getRecipe(&ScalarPhiIRI->getIRPhi()));
4391 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
4392 if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction(
4393 WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
4394 &Plan.getVectorTripCount())) {
4395 assert(isa<VPPhi>(ResumePhi) && "Expected a phi");
4396 IVEndValues[WideIVR] = ResumePhi->getOperand(0);
4397 ScalarPhiIRI->addOperand(ResumePhi);
4398 continue;
4399 }
4400 // TODO: Also handle truncated inductions here. Computing end-values
4401 // separately should be done as VPlan-to-VPlan optimization, after
4402 // legalizing all resume values to use the last lane from the loop.
4403 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
4404 "should only skip truncated wide inductions");
4405 continue;
4406 }
4407
4408 // The backedge value provides the value to resume coming out of a loop,
4409 // which for FORs is a vector whose last element needs to be extracted. The
4410 // start value provides the value if the loop is bypassed.
4411 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
4412 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
4413 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
4414 "Cannot handle loops with uncountable early exits");
4415 if (IsFOR)
4416 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
4417 VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {},
4418 "vector.recur.extract");
4419 StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
4420 auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
4421 {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
4422 ScalarPhiIRI->addOperand(ResumePhiR);
4423 }
4424}
4425
4426 void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
4427 VFRange &Range) {
4428 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
4429 auto *ScalarPHVPBB = Plan.getScalarPreheader();
4430 auto *MiddleVPBB = Plan.getMiddleBlock();
4431 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
4432 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
4433
4434 auto IsScalableOne = [](ElementCount VF) -> bool {
4435 return VF == ElementCount::getScalable(1);
4436 };
4437
4438 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
4439 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
4440 if (!FOR)
4441 continue;
4442
4443 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
4444 "Cannot handle loops with uncountable early exits");
4445
4446 // This is the second phase of vectorizing first-order recurrences, creating
4447 // extracts for users outside the loop. An overview of the transformation is
4448 // described below. Suppose we have the following loop with some use after
4449 // the loop of the last a[i-1],
4450 //
4451 // for (int i = 0; i < n; ++i) {
4452 // t = a[i - 1];
4453 // b[i] = a[i] - t;
4454 // }
4455 // use t;
4456 //
4457 // There is a first-order recurrence on "a". For this loop, the shorthand
4458 // scalar IR looks like:
4459 //
4460 // scalar.ph:
4461 // s.init = a[-1]
4462 // br scalar.body
4463 //
4464 // scalar.body:
4465 // i = phi [0, scalar.ph], [i+1, scalar.body]
4466 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
4467 // s2 = a[i]
4468 // b[i] = s2 - s1
4469 // br cond, scalar.body, exit.block
4470 //
4471 // exit.block:
4472 // use = lcssa.phi [s1, scalar.body]
4473 //
4474 // In this example, s1 is a recurrence because its value depends on the
4475 // previous iteration. In the first phase of vectorization, we created a
4476 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
4477 // for users in the scalar preheader and exit block.
4478 //
4479 // vector.ph:
4480 // v_init = vector(..., ..., ..., a[-1])
4481 // br vector.body
4482 //
4483 // vector.body
4484 // i = phi [0, vector.ph], [i+4, vector.body]
4485 // v1 = phi [v_init, vector.ph], [v2, vector.body]
4486 // v2 = a[i, i+1, i+2, i+3]
4487 // b[i] = v2 - v1
4488 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
4489 // b[i, i+1, i+2, i+3] = v2 - v1
4490 // br cond, vector.body, middle.block
4491 //
4492 // middle.block:
4493 // vector.recur.extract.for.phi = v2(2)
4494 // vector.recur.extract = v2(3)
4495 // br cond, scalar.ph, exit.block
4496 //
4497 // scalar.ph:
4498 // scalar.recur.init = phi [vector.recur.extract, middle.block],
4499 // [s.init, otherwise]
4500 // br scalar.body
4501 //
4502 // scalar.body:
4503 // i = phi [0, scalar.ph], [i+1, scalar.body]
4504 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
4505 // s2 = a[i]
4506 // b[i] = s2 - s1
4507 // br cond, scalar.body, exit.block
4508 //
4509 // exit.block:
4510 // lo = lcssa.phi [s1, scalar.body],
4511 // [vector.recur.extract.for.phi, middle.block]
4512 //
4513 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
4514 // Extract the penultimate value of the recurrence and use it as operand for
4515 // the VPIRInstruction modeling the phi.
4516 for (VPUser *U : FOR->users()) {
4517 using namespace llvm::VPlanPatternMatch;
4518 if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
4519 continue;
4520 // For VF vscale x 1, if vscale = 1, we are unable to extract the
4521 // penultimate value of the recurrence. Instead we rely on the existing
4522 // extract of the last element from the result of
4523 // VPInstruction::FirstOrderRecurrenceSplice.
4524 // TODO: Consider vscale_range info and UF.
4525 if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne,
4526 Range))
4527 return;
4528 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
4529 VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
4530 {}, "vector.recur.extract.for.phi");
4531 cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement);
4532 }
4533 }
4534}
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static bool sinkScalarOperands(VPlan &Plan)
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static VPInstruction * addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Create and return a ResumePhi for WideIV, unless it is truncated.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck)
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL)
Replace recipes with their EVL variants.
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static VPValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo)
Try to simplify recipe R.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant ExpandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, ScalarEvolution &SE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, unsigned VF, VPTypeAnalysis &TypeInfo, unsigned VectorRegWidth)
Returns true if InterleaveR is a full interleave group with factor and number of members both equal to VF.
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static void recursivelyDeleteDeadRecipes(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static cl::opt< bool > EnableWideActiveLaneMask("enable-wide-lane-mask", cl::init(false), cl::Hidden, cl::desc("Enable use of wide get active lane mask instructions"))
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, ScalarEvolution &SE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPV is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to convert extended in-loop reductions to VPExpressionRecipe and clamps the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &AllOneMask, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is a VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1512
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
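The APInt helpers listed above (zext, sext, getActiveBits, getBitWidth) are the building blocks for width reasoning in the bit-width transforms. A minimal standalone sketch, not code from this file:

  #include "llvm/ADT/APInt.h"
  #include <cassert>

  using namespace llvm;

  int main() {
    APInt Small(/*numBits=*/8, /*val=*/42);
    APInt Wide = Small.zext(32);            // zero-extend to 32 bits
    assert(Wide.getBitWidth() == 32);
    assert(Wide.getActiveBits() == 6);      // 42 == 0b101010 needs 6 bits
    APInt Neg(/*numBits=*/8, /*val=*/-2, /*isSigned=*/true);
    assert(Neg.sext(16).getSExtValue() == -2);
    return 0;
  }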
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:163
static DebugLoc getUnknown()
Definition DebugLoc.h:162
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:248
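A small sketch of the DenseMap members listed above: lookup returns a default-constructed value for a missing key, and try_emplace inserts only when the key is absent. The map contents are illustrative only:

  #include "llvm/ADT/DenseMap.h"
  #include <cassert>
  #include <cstring>

  using namespace llvm;

  int main() {
    DenseMap<int, const char *> M;
    auto [It, Inserted] = M.try_emplace(1, "one"); // no-op if key 1 already exists
    assert(Inserted && std::strcmp(It->second, "one") == 0);
    assert(M.lookup(2) == nullptr);                // missing key -> default value
    return 0;
  }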
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:325
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:313
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VFs.

Definition VPlan.cpp:1557
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
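A short sketch of MDBuilder::createBranchWeights, the helper used when attaching branch-weight metadata such as on the middle-block terminator (see addBranchWeightToMiddleTerminator below). The 1:127 ratio is an illustrative value only:

  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/MDBuilder.h"
  #include "llvm/IR/Metadata.h"

  using namespace llvm;

  MDNode *makeWeights(LLVMContext &Ctx) {
    MDBuilder MDB(Ctx);
    // Branch assumed taken once for every 127 times it is not taken.
    return MDB.createBranchWeights(/*TrueWeight=*/1, /*FalseWeight=*/127);
  }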
Metadata node.
Definition Metadata.h:1078
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:103
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:59
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:102
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:150
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Provides information about what library functions are available for the current target.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:88
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:97
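A minimal sketch of the TypeSwitch dispatch described above, applied to plain IR instructions purely for illustration:

  #include "llvm/ADT/StringRef.h"
  #include "llvm/ADT/TypeSwitch.h"
  #include "llvm/IR/Instructions.h"

  using namespace llvm;

  static StringRef classify(const Instruction *I) {
    return TypeSwitch<const Instruction *, StringRef>(I)
        .Case<LoadInst>([](const LoadInst *) { return "load"; })
        .Case<StoreInst>([](const StoreInst *) { return "store"; })
        .Default([](const Instruction *) { return "other"; });
  }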
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:295
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
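A small sketch of the Type/IntegerType queries listed above, the kind used when reasoning about induction and truncation bit widths; the 57-bit type only shows that arbitrary widths are legal:

  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Type.h"
  #include <cassert>

  using namespace llvm;

  int main() {
    LLVMContext Ctx;
    Type *I32 = Type::getInt32Ty(Ctx);
    IntegerType *I57 = IntegerType::get(Ctx, 57); // arbitrary bit widths are fine
    assert(I32->isIntegerTy() && I57->isIntegerTy());
    assert(I32->getScalarSizeInBits() == 32);
    assert(!I32->isPointerTy() && !I32->isFloatingPointTy());
    return 0;
  }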
op_range operands()
Definition User.h:292
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3522
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3809
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:3884
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:3836
iterator end()
Definition VPlan.h:3846
iterator begin()
Recipe iterator methods.
Definition VPlan.h:3844
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:3897
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:220
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:593
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:565
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:638
const VPRecipeBase & back() const
Definition VPlan.h:3858
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2406
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2440
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2430
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2446
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2426
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:80
VPRegionBlock * getParent()
Definition VPlan.h:172
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:190
size_t getNumSuccessors() const
Definition VPlan.h:218
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition VPlan.h:321
size_t getNumPredecessors() const
Definition VPlan.h:219
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:203
VPlan * getPlan()
Definition VPlan.cpp:165
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:214
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:170
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:263
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:208
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:197
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:186
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:207
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:126
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:145
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:164
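A hedged sketch of the VPBlockUtils helpers above. It assumes the LLVM-internal VPlan headers; Pred, OldSucc and NewBB are hypothetical, pre-existing blocks, and the snippet only illustrates the rewiring pattern:

  #include "VPlanUtils.h"

  using namespace llvm;

  static void insertBlockOnEdge(VPBlockBase *Pred, VPBlockBase *OldSucc,
                                VPBlockBase *NewBB) {
    // Manual rewiring with the helpers listed above ...
    VPBlockUtils::disconnectBlocks(Pred, OldSucc);
    VPBlockUtils::connectBlocks(Pred, NewBB);
    VPBlockUtils::connectBlocks(NewBB, OldSucc);
    // ... which insertOnEdge(Pred, OldSucc, NewBB) wraps into a single call
    // while preserving the original predecessor/successor positions.
  }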
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:2950
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL)
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
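A hedged sketch of the VPBuilder interface listed above. It assumes the LLVM-internal VPlan headers (which header declares VPBuilder is an assumption here), and VPBB, X and Y are hypothetical, pre-existing objects:

  #include "VPlan.h"
  #include "VPlanHelpers.h" // assumed to declare VPBuilder in this tree
  #include "llvm/IR/Instruction.h"

  using namespace llvm;

  static VPInstruction *emitWideAdd(VPBasicBlock *VPBB, VPValue *X, VPValue *Y) {
    VPBuilder Builder;
    Builder.setInsertPoint(VPBB); // append new VPInstructions at the end of VPBB
    // createNaryOp(Opcode, Operands, Inst = nullptr, Name = "") as listed above.
    return Builder.createNaryOp(Instruction::Add, {X, Y});
  }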
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3465
Type * getScalarType() const
Returns the scalar type of the induction.
Definition VPlan.h:3492
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:424
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:419
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:397
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:409
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3630
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition VPlan.h:3553
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:2995
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:1981
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2029
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2018
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:3962
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:3986
Class to record and manage LLVM IR flags.
Definition VPlan.h:596
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for I, if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:938
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:979
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1063
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1014
@ FirstOrderRecurrenceSplice
Definition VPlan.h:985
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1009
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1006
@ CanonicalIVIncrementForPart
Definition VPlan.h:999
@ CalculateTripCountMinusVF
Definition VPlan.h:997
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2547
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2539
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2568
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2620
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2579
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3136
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:386
bool mayReadFromMemory() const
Returns true if the recipe may read from memory.
bool mayReadOrWriteMemory() const
Returns true if the recipe may read from or write to memory.
Definition VPlan.h:473
bool mayHaveSideEffects() const
Returns true if the recipe may have side-effects.
VPRegionBlock * getRegion()
Definition VPlan.h:4108
VPBasicBlock * getParent()
Definition VPlan.h:407
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:478
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
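The insertion/erasure hooks above combine with the VPValue use-list API into a common replace-a-recipe idiom. A hedged sketch (not a quote of this file's code), where OldR and NewR are hypothetical recipes that each define a single VPValue:

  #include "VPlan.h"

  using namespace llvm;

  static void replaceRecipe(VPRecipeBase *OldR, VPRecipeBase *NewR) {
    NewR->insertBefore(OldR);                   // place NewR right before OldR
    OldR->getVPSingleValue()->replaceAllUsesWith(NewR->getVPSingleValue());
    OldR->eraseFromParent();                    // unlink and delete OldR
  }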
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:2828
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition VPlan.h:2669
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:3997
const VPBlockBase * getEntry() const
Definition VPlan.h:4033
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4065
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4050
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4095
const VPBlockBase * getExiting() const
Definition VPlan.h:4045
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4058
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2872
bool isSingleScalar() const
Definition VPlan.h:2917
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:2941
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3699
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:517
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:582
virtual VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:199
operand_range operands()
Definition VPlanValue.h:267
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:243
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:238
void addOperand(VPValue *Operand)
Definition VPlanValue.h:232
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:48
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:135
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition VPlanValue.h:176
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:85
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:186
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1393
unsigned getNumUsers() const
Definition VPlanValue.h:113
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition VPlanValue.h:171
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1397
user_range users()
Definition VPlanValue.h:134
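A hedged sketch of replaceUsesWithIf from the VPValue API above; Old, New and Keep are hypothetical, and the predicate is only meant to show the callback shape:

  #include "VPlan.h"

  using namespace llvm;

  // Reroute every use of Old to New, except the operands of one designated user.
  static void replaceUsesExcept(VPValue *Old, VPValue *New, VPUser *Keep) {
    Old->replaceUsesWithIf(New, [Keep](VPUser &U, unsigned /*Idx*/) {
      return &U != Keep; // leave Keep's operands untouched
    });
  }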
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1845
VPVectorEndPointerRecipe * clone() override
Clone the current recipe.
Definition VPlan.h:1889
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3594
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1486
A recipe for handling GEP instructions.
Definition VPlan.h:1773
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2046
PHINode * getPHINode() const
Definition VPlan.h:2088
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2074
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2091
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2121
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2202
A recipe for widening vector intrinsics.
Definition VPlan.h:1544
A common base class for widening memory operations.
Definition VPlan.h:3178
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3240
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:3233
A recipe for widened phis.
Definition VPlan.h:2257
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1443
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4121
bool hasVF(ElementCount VF) const
Definition VPlan.h:4327
LLVMContext & getContext() const
Definition VPlan.h:4315
VPBasicBlock * getEntry()
Definition VPlan.h:4215
VPValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4306
bool hasScalableVF() const
Definition VPlan.h:4328
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4313
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4309
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4277
VPValue * getTrue()
Return a VPValue wrapping i1 true.
Definition VPlan.h:4384
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4298
unsigned getUF() const
Definition VPlan.h:4347
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4451
bool hasUF(unsigned UF) const
Definition VPlan.h:4345
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4267
void setVF(ElementCount VF)
Definition VPlan.h:4321
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4360
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1027
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4291
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4240
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4429
VPValue * getFalse()
Return a VPValue wrapping i1 false.
Definition VPlan.h:4390
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4369
bool hasScalarVFOnly() const
Definition VPlan.h:4338
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4258
ArrayRef< VPValue * > getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4399
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4263
VPValue * getLiveIn(Value *V) const
Return the live-in VPValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4396
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4220
void setUF(unsigned UF)
Definition VPlan.h:4352
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4483
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:169
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:257
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:172
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
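A minimal standalone sketch of the ElementCount/TypeSize queries listed above, as used when distinguishing fixed from scalable VFs:

  #include "llvm/Support/TypeSize.h"
  #include <cassert>

  using namespace llvm;

  int main() {
    ElementCount VF = ElementCount::getScalable(4); // <vscale x 4 x ...>
    assert(VF.isScalable() && !VF.isFixed());
    assert(VF.getKnownMinValue() == 4);
    ElementCount VF2 = VF.multiplyCoefficientBy(2); // <vscale x 8 x ...>
    assert(VF2.getKnownMinValue() == 8);
    return 0;
  }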
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2763
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
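A small sketch of the IR-level matchers listed above; m_Value is a standard PatternMatch binder not shown in this index:

  #include "llvm/ADT/APInt.h"
  #include "llvm/IR/PatternMatch.h"
  #include "llvm/IR/Value.h"

  using namespace llvm;
  using namespace llvm::PatternMatch;

  // Recognise "X * C" with C a constant integer, binding both pieces.
  static bool isMulByConstant(Value *V, Value *&X, const APInt *&C) {
    return match(V, m_c_Mul(m_Value(X), m_APInt(C)));
  }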
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
AllRecipe_commutative_match< Opcode, Op0_t, Op1_t > m_c_Binary(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLanePerPart, Op0_t > m_ExtractLastLanePerPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExtractLastElement, Op0_t > m_ExtractLastElement(const Op0_t &Op0)
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
specific_intval< 1 > m_True()
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
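A hedged sketch of the VPlanPatternMatch helpers above (LLVM-internal headers assumed). V is a hypothetical VPValue, and the operand-binding overload of m_VPValue(VPValue *&) is assumed here, analogous to the m_VPInstruction binder listed above:

  #include "VPlan.h"
  #include "VPlanPatternMatch.h"

  using namespace llvm;
  using namespace llvm::VPlanPatternMatch;

  // Detect an active-lane-mask VPInstruction and bind its first two operands.
  static bool isActiveLaneMask(VPValue *V, VPValue *&IV, VPValue *&TC) {
    return match(V, m_ActiveLaneMask(m_VPValue(IV), m_VPValue(TC), m_VPValue()));
  }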
initializer< Ty > init(const Ty &Val)
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
const SCEV * getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE)
Return the SCEV expression for V.
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
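A hedged sketch combining the traversal helpers above with VPBlockUtils::blocksOnly (LLVM-internal headers assumed): visit a plan's top-level VPBasicBlocks in shallow depth-first order, without descending into regions:

  #include "VPlan.h"
  #include "VPlanCFG.h"
  #include "VPlanUtils.h"

  using namespace llvm;

  static void visitTopLevelBlocks(VPlan &Plan) {
    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
             vp_depth_first_shallow(Plan.getEntry()))) {
      (void)VPBB; // ... inspect or transform recipes in VPBB here ...
    }
  }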
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:236
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1622
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
RecurKind
These are the kinds of recurrences that we support.
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1954
DWARFExpression::Operation Op
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1961
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1758
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2088
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:830
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
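A small sketch of hash_combine/hash_combine_range, the kind of key a CSE map over recipes might use; the opcode-plus-operands key here is illustrative only:

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/Hashing.h"

  using namespace llvm;

  // Build one hash key from an opcode plus a list of operand pointers.
  static hash_code makeKey(unsigned Opcode, ArrayRef<void *> Operands) {
    return hash_combine(Opcode,
                        hash_combine_range(Operands.begin(), Operands.end()));
  }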
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2299
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3300
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3260
A recipe for widening select instructions.
Definition VPlan.h:1727
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3382
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3340
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void addScalarResumePhis(VPlan &Plan, VPRecipeBuilder &Builder, DenseMap< VPValue *, VPValue * > &IVEndValues)
Create resume phis in the scalar preheader for first-order recurrences, reductions and inductions,...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first-order recurrences in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB by.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.