1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SetVector.h"
28#include "llvm/ADT/TypeSwitch.h"
34#include "llvm/IR/Intrinsics.h"
35#include "llvm/IR/MDBuilder.h"
39
40using namespace llvm;
41using namespace VPlanPatternMatch;
42
44 "enable-wide-lane-mask", cl::init(false), cl::Hidden,
45 cl::desc("Enable use of wide get active lane mask instructions"));
46
47bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
48 VPlan &Plan,
49 function_ref<const InductionDescriptor *(PHINode *)>
50 GetIntOrFpInductionDescriptor,
51 const TargetLibraryInfo &TLI) {
52
53 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
54 Plan.getVectorLoopRegion());
55 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
69 Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 const auto *II = GetIntOrFpInductionDescriptor(Phi);
75 if (!II) {
76 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
77 for (VPValue *Op : PhiR->operands())
78 NewRecipe->addOperand(Op);
79 } else {
80 VPValue *Start = Plan.getOrAddLiveIn(II->getStartValue());
81 VPValue *Step =
82 vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep());
83 NewRecipe = new VPWidenIntOrFpInductionRecipe(
84 Phi, Start, Step, &Plan.getVF(), *II, Ingredient.getDebugLoc());
85 }
86 } else {
87 assert(isa<VPInstruction>(&Ingredient) &&
88 "only VPInstructions expected here");
89 assert(!isa<PHINode>(Inst) && "phis should be handled above");
90 // Create VPWidenMemoryRecipe for loads and stores.
91 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
92 NewRecipe = new VPWidenLoadRecipe(
93 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
94 false /*Consecutive*/, false /*Reverse*/, Load->getAlign(),
95 VPIRMetadata(*Load), Ingredient.getDebugLoc());
96 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
97 NewRecipe = new VPWidenStoreRecipe(
98 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
99 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
100 Store->getAlign(), VPIRMetadata(*Store),
101 Ingredient.getDebugLoc());
102 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
103 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
104 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
105 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
106 if (VectorID == Intrinsic::not_intrinsic)
107 return false;
108 NewRecipe = new VPWidenIntrinsicRecipe(
109 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
110 drop_end(Ingredient.operands()), CI->getType(),
111 CI->getDebugLoc());
112 } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
113 NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
114 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
115 NewRecipe = new VPWidenCastRecipe(
116 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
117 } else {
118 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
119 }
120 }
121
122 NewRecipe->insertBefore(&Ingredient);
123 if (NewRecipe->getNumDefinedValues() == 1)
124 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
125 else
126 assert(NewRecipe->getNumDefinedValues() == 0 &&
127 "Only recpies with zero or one defined values expected");
128 Ingredient.eraseFromParent();
129 }
130 }
131 return true;
132}
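// Illustrative sketch (names assumed, not taken from a real plan printout):
// a plain VPInstruction cloned from "%l = load i32, ptr %p" is rewritten by
// the loop above into
//   WIDEN ir<%l> = load ir<%p>
// while a phi with a known int/fp induction descriptor becomes a
// WIDEN-INDUCTION recipe built from the descriptor's start and step values.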
133
134/// Return true if we do not know how to (mechanically) hoist or sink \p R out
135/// of a loop region.
136static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) {
137 // Assumes don't alias anything or throw; as long as they're guaranteed to
138 // execute, they're safe to hoist.
139 if (match(&R, m_Intrinsic<Intrinsic::assume>()))
140 return false;
141
142 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
143 // memory location is not modified in the vector loop.
144 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
145 return true;
146
147 // Allocas cannot be hoisted.
148 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
149 return RepR && RepR->getOpcode() == Instruction::Alloca;
150}
151
152static bool sinkScalarOperands(VPlan &Plan) {
153 auto Iter = vp_depth_first_deep(Plan.getEntry());
154 bool Changed = false;
155 // First, collect the operands of all recipes in replicate blocks as seeds for
156 // sinking.
157 SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
158 for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
159 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
160 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
161 continue;
162 VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]);
163 if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
164 continue;
165 for (auto &Recipe : *VPBB) {
166 for (VPValue *Op : Recipe.operands())
167 if (auto *Def =
168 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
169 WorkList.insert({VPBB, Def});
170 }
171 }
172
173 bool ScalarVFOnly = Plan.hasScalarVFOnly();
174 // Try to sink each replicate or scalar IV steps recipe in the worklist.
175 for (unsigned I = 0; I != WorkList.size(); ++I) {
176 VPBasicBlock *SinkTo;
177 VPSingleDefRecipe *SinkCandidate;
178 std::tie(SinkTo, SinkCandidate) = WorkList[I];
179 if (SinkCandidate->getParent() == SinkTo ||
180 SinkCandidate->mayHaveSideEffects() ||
181 SinkCandidate->mayReadOrWriteMemory())
182 continue;
183 if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
184 if (!ScalarVFOnly && RepR->isSingleScalar())
185 continue;
186 } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate))
187 continue;
188
189 bool NeedsDuplicating = false;
190 // All recipe users of the sink candidate must be in the same block SinkTo
191 // or all users outside of SinkTo must be uniform-after-vectorization
192 // (i.e., only the first lane is used). In the latter case, we need to duplicate
193 // SinkCandidate.
194 auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,
195 SinkCandidate](VPUser *U) {
196 auto *UI = cast<VPRecipeBase>(U);
197 if (UI->getParent() == SinkTo)
198 return true;
199 NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate);
200 // We only know how to duplicate VPReplicateRecipes and
201 // VPScalarIVStepsRecipes for now.
202 return NeedsDuplicating &&
203 isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(SinkCandidate);
204 };
205 if (!all_of(SinkCandidate->users(), CanSinkWithUser))
206 continue;
207
208 if (NeedsDuplicating) {
209 if (ScalarVFOnly)
210 continue;
211 VPSingleDefRecipe *Clone;
212 if (auto *SinkCandidateRepR =
213 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
214 // TODO: Handle converting to uniform recipes as separate transform,
215 // then cloning should be sufficient here.
216 Instruction *I = SinkCandidate->getUnderlyingInstr();
217 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
218 nullptr /*Mask*/, *SinkCandidateRepR);
219 // TODO: add ".cloned" suffix to name of Clone's VPValue.
220 } else {
221 Clone = SinkCandidate->clone();
222 }
223
224 Clone->insertBefore(SinkCandidate);
225 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
226 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
227 });
228 }
229 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
230 for (VPValue *Op : SinkCandidate->operands())
231 if (auto *Def =
232 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
233 WorkList.insert({SinkTo, Def});
234 Changed = true;
235 }
236 return Changed;
237}
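// Illustrative sketch (assumed shapes): if the replicate block of a predicated
// region uses vp<%a> = add ..., and vp<%a> is defined in the loop body with no
// other users, sinkScalarOperands moves its defining recipe into the replicate
// block so it only executes for active lanes. If vp<%a> also has users outside
// the block, a single-scalar clone is sunk instead and only the in-block uses
// are redirected to the clone.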
238
239/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
240/// the mask.
241static VPValue *getPredicatedMask(VPRegionBlock *R) {
242 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
243 if (!EntryBB || EntryBB->size() != 1 ||
244 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
245 return nullptr;
246
247 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
248}
249
250/// If \p R is a triangle region, return the 'then' block of the triangle.
251static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
252 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
253 if (EntryBB->getNumSuccessors() != 2)
254 return nullptr;
255
256 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
257 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
258 if (!Succ0 || !Succ1)
259 return nullptr;
260
261 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
262 return nullptr;
263 if (Succ0->getSingleSuccessor() == Succ1)
264 return Succ0;
265 if (Succ1->getSingleSuccessor() == Succ0)
266 return Succ1;
267 return nullptr;
268}
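// For illustration (assumed shape), the triangle recognized above looks like:
//
//   region.entry:              successors: region.then, region.continue
//     BRANCH-ON-MASK vp<%mask>
//   region.then:               successor:  region.continue
//     <predicated recipes>
//   region.continue:
//
// getPredicatedThenBlock returns region.then; any other shape yields nullptr.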
269
270// Merge replicate regions in their successor region, if a replicate region
271// is connected to a successor replicate region with the same predicate by a
272// single, empty VPBasicBlock.
273static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
274 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
275
276 // Collect replicate regions followed by an empty block, followed by another
277 // replicate region with matching masks to process front. This is to avoid
278 // iterator invalidation issues while merging regions.
279 SmallVector<VPRegionBlock *, 8> WorkList;
280 for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
281 vp_depth_first_deep(Plan.getEntry()))) {
282 if (!Region1->isReplicator())
283 continue;
284 auto *MiddleBasicBlock =
285 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
286 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
287 continue;
288
289 auto *Region2 =
290 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
291 if (!Region2 || !Region2->isReplicator())
292 continue;
293
294 VPValue *Mask1 = getPredicatedMask(Region1);
295 VPValue *Mask2 = getPredicatedMask(Region2);
296 if (!Mask1 || Mask1 != Mask2)
297 continue;
298
299 assert(Mask1 && Mask2 && "both regions must have conditions");
300 WorkList.push_back(Region1);
301 }
302
303 // Move recipes from Region1 to its successor region, if both are triangles.
304 for (VPRegionBlock *Region1 : WorkList) {
305 if (TransformedRegions.contains(Region1))
306 continue;
307 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
308 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
309
310 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
311 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
312 if (!Then1 || !Then2)
313 continue;
314
315 // Note: No fusion-preventing memory dependencies are expected in either
316 // region. Such dependencies should be rejected during earlier dependence
317 // checks, which guarantee accesses can be re-ordered for vectorization.
318 //
319 // Move recipes to the successor region.
320 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
321 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
322
323 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
324 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
325
326 // Move VPPredInstPHIRecipes from the merge block to the successor region's
327 // merge block. Update all users inside the successor region to use the
328 // original values.
329 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
330 VPValue *PredInst1 =
331 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
332 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
333 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
334 return cast<VPRecipeBase>(&U)->getParent() == Then2;
335 });
336
337 // Remove phi recipes that are unused after merging the regions.
338 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
339 Phi1ToMove.eraseFromParent();
340 continue;
341 }
342 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
343 }
344
345 // Remove the dead recipes in Region1's entry block.
346 for (VPRecipeBase &R :
347 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
348 R.eraseFromParent();
349
350 // Finally, remove the first region.
351 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
352 VPBlockUtils::disconnectBlocks(Pred, Region1);
353 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
354 }
355 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
356 TransformedRegions.insert(Region1);
357 }
358
359 return !TransformedRegions.empty();
360}
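// Illustrative example (block names assumed): two replicate regions guarded by
// the same mask vp<%m>,
//   pred.load -> (empty VPBB) -> pred.store
// are merged by moving the recipes of pred.load's 'then' block into
// pred.store's 'then' block and its VPPredInstPHIRecipes into pred.store's
// merge block, after which pred.load is disconnected and left for cleanup.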
361
362static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
363 VPlan &Plan) {
364 Instruction *Instr = PredRecipe->getUnderlyingInstr();
365 // Build the triangular if-then region.
366 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
367 assert(Instr->getParent() && "Predicated instruction not in any basic block");
368 auto *BlockInMask = PredRecipe->getMask();
369 auto *MaskDef = BlockInMask->getDefiningRecipe();
370 auto *BOMRecipe = new VPBranchOnMaskRecipe(
371 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
372 auto *Entry =
373 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
374
375 // Replace predicated replicate recipe with a replicate recipe without a
376 // mask but in the replicate region.
377 auto *RecipeWithoutMask = new VPReplicateRecipe(
378 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
379 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe);
380 auto *Pred =
381 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
382
383 VPPredInstPHIRecipe *PHIRecipe = nullptr;
384 if (PredRecipe->getNumUsers() != 0) {
385 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
386 RecipeWithoutMask->getDebugLoc());
387 PredRecipe->replaceAllUsesWith(PHIRecipe);
388 PHIRecipe->setOperand(0, RecipeWithoutMask);
389 }
390 PredRecipe->eraseFromParent();
391 auto *Exiting =
392 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
393 VPRegionBlock *Region =
394 Plan.createReplicateRegion(Entry, Exiting, RegionName);
395
396 // Note: first set Entry as region entry and then connect successors starting
397 // from it in order, to propagate the "parent" of each VPBasicBlock.
398 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
399 VPBlockUtils::connectBlocks(Pred, Exiting);
400
401 return Region;
402}
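// The region built above has the following shape (illustrative names):
//
//   pred.<opcode>.entry:     BRANCH-ON-MASK vp<%mask>
//   pred.<opcode>.if:        vp<%v> = <replicate recipe without mask>
//   pred.<opcode>.continue:  PHI-PREDICATED-INSTRUCTION vp<%v>
//                            (the phi is only created if the result has users)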
403
404static void addReplicateRegions(VPlan &Plan) {
405 SmallVector<VPReplicateRecipe *> WorkList;
406 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
407 vp_depth_first_deep(Plan.getEntry()))) {
408 for (VPRecipeBase &R : *VPBB)
409 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
410 if (RepR->isPredicated())
411 WorkList.push_back(RepR);
412 }
413 }
414
415 unsigned BBNum = 0;
416 for (VPReplicateRecipe *RepR : WorkList) {
417 VPBasicBlock *CurrentBlock = RepR->getParent();
418 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
419
420 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
421 SplitBlock->setName(
422 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
423 // Record predicated instructions for above packing optimizations.
424 VPRegionBlock *Region = createReplicateRegion(RepR, Plan);
425 Region->setParent(CurrentBlock->getParent());
426 VPBlockUtils::insertOnEdge(CurrentBlock, SplitBlock, Region);
427
428 VPRegionBlock *ParentRegion = Region->getParent();
429 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
430 ParentRegion->setExiting(SplitBlock);
431 }
432}
433
434/// Remove redundant VPBasicBlocks by merging them into their predecessor if
435/// the predecessor has a single successor.
436static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
437 SmallVector<VPBasicBlock *> WorkList;
438 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
439 vp_depth_first_deep(Plan.getEntry()))) {
440 // Don't fold the blocks in the skeleton of the Plan into their single
441 // predecessors for now.
442 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
443 if (!VPBB->getParent())
444 continue;
445 auto *PredVPBB =
446 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
447 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
448 isa<VPIRBasicBlock>(PredVPBB))
449 continue;
450 WorkList.push_back(VPBB);
451 }
452
453 for (VPBasicBlock *VPBB : WorkList) {
454 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
455 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
456 R.moveBefore(*PredVPBB, PredVPBB->end());
457 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
458 auto *ParentRegion = VPBB->getParent();
459 if (ParentRegion && ParentRegion->getExiting() == VPBB)
460 ParentRegion->setExiting(PredVPBB);
461 for (auto *Succ : to_vector(VPBB->successors())) {
462 VPBlockUtils::disconnectBlocks(VPBB, Succ);
463 VPBlockUtils::connectBlocks(PredVPBB, Succ);
464 }
465 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
466 }
467 return !WorkList.empty();
468}
469
470void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
471 // Convert masked VPReplicateRecipes to if-then region blocks.
472 addReplicateRegions(Plan);
473
474 bool ShouldSimplify = true;
475 while (ShouldSimplify) {
476 ShouldSimplify = sinkScalarOperands(Plan);
477 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
478 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
479 }
480}
481
482/// Remove redundant casts of inductions.
483///
484/// Such redundant casts are casts of induction variables that can be ignored,
485/// because we already proved that the casted phi is equal to the uncasted phi
486/// in the vectorized loop. There is no need to vectorize the cast - the same
487/// value can be used for both the phi and casts in the vector loop.
488static void removeRedundantInductionCasts(VPlan &Plan) {
489 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
490 auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
491 if (!IV || IV->getTruncInst())
492 continue;
493
494 // A sequence of IR Casts has potentially been recorded for IV, which
495 // *must be bypassed* when the IV is vectorized, because the vectorized IV
496 // will produce the desired casted value. This sequence forms a def-use
497 // chain and is provided in reverse order, ending with the cast that uses
498 // the IV phi. Search for the recipe of the last cast in the chain and
499 // replace it with the original IV. Note that only the final cast is
500 // expected to have users outside the cast-chain and the dead casts left
501 // over will be cleaned up later.
502 auto &Casts = IV->getInductionDescriptor().getCastInsts();
503 VPValue *FindMyCast = IV;
504 for (Instruction *IRCast : reverse(Casts)) {
505 VPSingleDefRecipe *FoundUserCast = nullptr;
506 for (auto *U : FindMyCast->users()) {
507 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
508 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
509 FoundUserCast = UserCast;
510 break;
511 }
512 }
513 FindMyCast = FoundUserCast;
514 }
515 FindMyCast->replaceAllUsesWith(IV);
516 }
517}
518
519/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
520/// recipe, if it exists.
521static void removeRedundantCanonicalIVs(VPlan &Plan) {
522 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
523 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
524 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
525 for (VPUser *U : CanonicalIV->users()) {
526 WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
527 if (WidenNewIV)
528 break;
529 }
530
531 if (!WidenNewIV)
532 return;
533
534 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
535 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
536 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
537
538 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
539 continue;
540
541 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
542 // everything WidenNewIV's users need. That is, WidenOriginalIV will
543 // generate a vector phi or all users of WidenNewIV demand the first lane
544 // only.
545 if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
546 vputils::onlyFirstLaneUsed(WidenNewIV)) {
547 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
548 WidenNewIV->eraseFromParent();
549 return;
550 }
551 }
552}
553
554/// Returns true if \p R is dead and can be removed.
555static bool isDeadRecipe(VPRecipeBase &R) {
556 // Do remove conditional assume instructions as their conditions may be
557 // flattened.
558 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
559 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
560 match(RepR, m_Intrinsic<Intrinsic::assume>());
561 if (IsConditionalAssume)
562 return true;
563
564 if (R.mayHaveSideEffects())
565 return false;
566
567 // Recipe is dead if no user keeps the recipe alive.
568 return all_of(R.definedValues(),
569 [](VPValue *V) { return V->getNumUsers() == 0; });
570}
571
572void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
573 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
574 vp_post_order_deep(Plan.getEntry()))) {
575 // The recipes in the block are processed in reverse order, to catch chains
576 // of dead recipes.
577 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
578 if (isDeadRecipe(R)) {
579 R.eraseFromParent();
580 continue;
581 }
582
583 // Check if R is a dead VPPhi <-> update cycle and remove it.
584 auto *PhiR = dyn_cast<VPPhi>(&R);
585 if (!PhiR || PhiR->getNumOperands() != 2 || PhiR->getNumUsers() != 1)
586 continue;
587 VPValue *Incoming = PhiR->getOperand(1);
588 if (*PhiR->user_begin() != Incoming->getDefiningRecipe() ||
589 Incoming->getNumUsers() != 1)
590 continue;
591 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
592 PhiR->eraseFromParent();
593 Incoming->getDefiningRecipe()->eraseFromParent();
594 }
595 }
596}
597
598static VPScalarIVStepsRecipe *
599createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
600 Instruction::BinaryOps InductionOpcode,
601 FPMathOperator *FPBinOp, Instruction *TruncI,
602 VPValue *StartV, VPValue *Step, DebugLoc DL,
603 VPBuilder &Builder) {
604 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
605 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
606 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
607 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
608 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
609
610 // Truncate base induction if needed.
611 VPTypeAnalysis TypeInfo(Plan);
612 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
613 if (TruncI) {
614 Type *TruncTy = TruncI->getType();
615 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
616 "Not truncating.");
617 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
618 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
619 ResultTy = TruncTy;
620 }
621
622 // Truncate step if needed.
623 Type *StepTy = TypeInfo.inferScalarType(Step);
624 if (ResultTy != StepTy) {
625 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
626 "Not truncating.");
627 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
628 auto *VecPreheader =
629 cast<VPBasicBlock>(HeaderVPBB->getSingleHierarchicalPredecessor());
630 VPBuilder::InsertPointGuard Guard(Builder);
631 Builder.setInsertPoint(VecPreheader);
632 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
633 }
634 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
635 &Plan.getVF(), DL);
636}
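// For example (illustrative output, assuming an induction with start ir<%s>
// and step ir<%st>), the helper emits roughly:
//   vp<%offset.idx> = DERIVED-IV ir<%s> + vp<%canonical.iv> * ir<%st>
//   vp<%steps>      = SCALAR-STEPS vp<%offset.idx>, ir<%st>, vf
// with scalar truncations of the base IV and the step inserted first when a
// narrower result type is requested.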
637
638static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
639 SetVector<VPUser *> Users(V->user_begin(), V->user_end());
640 for (unsigned I = 0; I != Users.size(); ++I) {
641 VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]);
642 if (isa<VPHeaderPHIRecipe>(Cur))
643 continue;
644 for (VPValue *V : Cur->definedValues())
645 Users.insert_range(V->users());
646 }
647 return Users.takeVector();
648}
649
650/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
651/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
652/// VPWidenPointerInductionRecipe will generate vectors only. If some users
653/// require vectors while others require scalars, the scalar uses need to extract
654/// the scalars from the generated vectors (Note that this is different to how
655/// int/fp inductions are handled). Legalize extract-from-ends using uniform
656/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
657/// the correct end value is available. Also optimize
658/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
659/// providing them scalar steps built on the canonical scalar IV and updating the
660/// original IV's users. This is an optional optimization to reduce the need for
661/// vector extracts.
662void VPlanTransforms::legalizeAndOptimizeInductions(VPlan &Plan) {
663 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
664 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
665 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
666 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
667 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
668 if (!PhiR)
669 continue;
670
671 // Try to narrow wide and replicating recipes to uniform recipes, based on
672 // VPlan analysis.
673 // TODO: Apply to all recipes in the future, to replace legacy uniformity
674 // analysis.
675 auto Users = collectUsersRecursively(PhiR);
676 for (VPUser *U : reverse(Users)) {
677 auto *Def = dyn_cast<VPSingleDefRecipe>(U);
678 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
679 // Skip recipes that shouldn't be narrowed.
680 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
681 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
682 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
683 continue;
684
685 // Skip recipes that may have other lanes than their first used.
686 if (!vputils::isSingleScalar(Def) && !vputils::onlyFirstLaneUsed(Def))
687 continue;
688
689 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
690 Def->operands(), /*IsUniform*/ true);
691 Clone->insertAfter(Def);
692 Def->replaceAllUsesWith(Clone);
693 }
694
695 // Replace wide pointer inductions which have only their scalars used by
696 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
697 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
698 if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
699 continue;
700
701 const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
702 VPValue *StartV =
703 Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0));
704 VPValue *StepV = PtrIV->getOperand(1);
705 VPValue *Steps = createScalarIVSteps(
706 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
707 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
708
709 VPValue *PtrAdd = Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
710 PtrIV->getDebugLoc(), "next.gep");
711
712 PtrIV->replaceAllUsesWith(PtrAdd);
713 continue;
714 }
715
716 // Replace widened induction with scalar steps for users that only use
717 // scalars.
718 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
719 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
720 return U->usesScalars(WideIV);
721 }))
722 continue;
723
724 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
725 VPValue *Steps = createScalarIVSteps(
726 Plan, ID.getKind(), ID.getInductionOpcode(),
727 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
728 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
729 WideIV->getDebugLoc(), Builder);
730
731 // Update scalar users of IV to use Step instead.
732 if (!HasOnlyVectorVFs)
733 WideIV->replaceAllUsesWith(Steps);
734 else
735 WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
736 return U.usesScalars(WideIV);
737 });
738 }
739}
740
741/// Check if \p VPV is an untruncated wide induction, either before or after the
742/// increment. If so return the header IV (before the increment), otherwise
743/// return null.
744static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV,
745 ScalarEvolution &SE) {
746 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
747 if (WideIV) {
748 // VPV itself is a wide induction, separately compute the end value for exit
749 // users if it is not a truncated IV.
750 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
751 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
752 }
753
754 // Check if VPV is an optimizable induction increment.
755 VPRecipeBase *Def = VPV->getDefiningRecipe();
756 if (!Def || Def->getNumOperands() != 2)
757 return nullptr;
758 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
759 if (!WideIV)
760 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
761 if (!WideIV)
762 return nullptr;
763
764 auto IsWideIVInc = [&]() {
765 auto &ID = WideIV->getInductionDescriptor();
766
767 // Check if VPV increments the induction by the induction step.
768 VPValue *IVStep = WideIV->getStepValue();
769 switch (ID.getInductionOpcode()) {
770 case Instruction::Add:
771 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
772 case Instruction::FAdd:
773 return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
774 m_Specific(IVStep)));
775 case Instruction::FSub:
776 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
777 m_Specific(IVStep)));
778 case Instruction::Sub: {
779 // IVStep will be the negated step of the subtraction. Check if Step == -1
780 // * IVStep.
781 VPValue *Step;
782 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
783 return false;
784 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, SE);
785 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, SE);
786 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
787 !isa<SCEVCouldNotCompute>(StepSCEV) &&
788 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
789 }
790 default:
791 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
792 match(VPV, m_GetElementPtr(m_Specific(WideIV),
793 m_Specific(WideIV->getStepValue())));
794 }
795 llvm_unreachable("should have been covered by switch above");
796 };
797 return IsWideIVInc() ? WideIV : nullptr;
798}
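// Example (illustrative): for a wide induction phi %iv with step %step, both
// %iv itself and the increment %iv.next = add %iv, %step are recognized and
// return the header phi, whereas a truncated int/fp induction or an increment
// by a value other than the step returns nullptr.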
799
800/// Attempts to optimize the induction variable exit values for users in the
801/// early exit block.
802static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
803 VPTypeAnalysis &TypeInfo,
804 VPBlockBase *PredVPBB,
805 VPValue *Op,
806 ScalarEvolution &SE) {
807 VPValue *Incoming, *Mask;
810 return nullptr;
811
812 auto *WideIV = getOptimizableIVOf(Incoming, SE);
813 if (!WideIV)
814 return nullptr;
815
816 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
817 if (WideIntOrFp && WideIntOrFp->getTruncInst())
818 return nullptr;
819
820 // Calculate the final index.
821 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
822 auto *CanonicalIV = LoopRegion->getCanonicalIV();
823 Type *CanonicalIVType = CanonicalIV->getScalarType();
824 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
825
826 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
827 VPValue *FirstActiveLane =
828 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
829 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
830 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
831 FirstActiveLaneType, DL);
832 VPValue *EndValue =
833 B.createNaryOp(Instruction::Add, {CanonicalIV, FirstActiveLane}, DL);
834
835 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
836 // changed it means the exit is using the incremented value, so we need to
837 // add the step.
838 if (Incoming != WideIV) {
839 VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1));
840 EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
841 }
842
843 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
844 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
845 VPValue *Start = WideIV->getStartValue();
846 VPValue *Step = WideIV->getStepValue();
847 EndValue = B.createDerivedIV(
848 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
849 Start, EndValue, Step);
850 }
851
852 return EndValue;
853}
854
855/// Attempts to optimize the induction variable exit values for users in the
856/// exit block coming from the latch in the original scalar loop.
857static VPValue *optimizeLatchExitInductionUser(
858 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
859 DenseMap<VPValue *, VPValue *> &EndValues, ScalarEvolution &SE) {
860 VPValue *Incoming;
861 if (!match(Op, m_ExtractLastElement(m_VPValue(Incoming))))
862 return nullptr;
863
864 auto *WideIV = getOptimizableIVOf(Incoming, SE);
865 if (!WideIV)
866 return nullptr;
867
868 VPValue *EndValue = EndValues.lookup(WideIV);
869 assert(EndValue && "end value must have been pre-computed");
870
871 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
872 // changed it means the exit is using the incremented value, so we don't
873 // need to subtract the step.
874 if (Incoming != WideIV)
875 return EndValue;
876
877 // Otherwise, subtract the step from the EndValue.
878 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
879 VPValue *Step = WideIV->getStepValue();
880 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
881 if (ScalarTy->isIntegerTy())
882 return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
883 if (ScalarTy->isPointerTy()) {
884 Type *StepTy = TypeInfo.inferScalarType(Step);
885 auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0));
886 return B.createPtrAdd(EndValue,
887 B.createNaryOp(Instruction::Sub, {Zero, Step}),
888 DebugLoc::getUnknown(), "ind.escape");
889 }
890 if (ScalarTy->isFloatingPointTy()) {
891 const auto &ID = WideIV->getInductionDescriptor();
892 return B.createNaryOp(
893 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
894 ? Instruction::FSub
895 : Instruction::FAdd,
896 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
897 }
898 llvm_unreachable("all possible induction types must be handled");
899 return nullptr;
900}
901
902void VPlanTransforms::optimizeInductionExitUsers(
903 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
904 ScalarEvolution &SE) {
905 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
906 VPTypeAnalysis TypeInfo(Plan);
907 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
908 for (VPRecipeBase &R : ExitVPBB->phis()) {
909 auto *ExitIRI = cast<VPIRPhi>(&R);
910
911 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
912 VPValue *Escape = nullptr;
913 if (PredVPBB == MiddleVPBB)
914 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
915 ExitIRI->getOperand(Idx),
916 EndValues, SE);
917 else
918 Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
919 ExitIRI->getOperand(Idx), SE);
920 if (Escape)
921 ExitIRI->setOperand(Idx, Escape);
922 }
923 }
924 }
925}
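// Example (illustrative): for an exit phi fed from the middle block that uses
// the post-increment value of a wide induction, the escape value is simply the
// pre-computed end value; a use of the pre-increment value instead becomes
// "end value - step" (or the FP/pointer equivalent built above).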
926
927/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
928/// them with already existing recipes expanding the same SCEV expression.
929static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
930 DenseMap<const SCEV *, VPValue *> SCEV2VPV;
931
932 for (VPRecipeBase &R :
933 make_early_inc_range(*Plan.getEntry())) {
934 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
935 if (!ExpR)
936 continue;
937
938 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
939 if (Inserted)
940 continue;
941 ExpR->replaceAllUsesWith(V->second);
942 ExpR->eraseFromParent();
943 }
944}
945
946static void recursivelyDeleteDeadRecipes(VPValue *V) {
947 SmallVector<VPValue *> WorkList;
948 SmallPtrSet<VPValue *, 8> Seen;
949 WorkList.push_back(V);
950
951 while (!WorkList.empty()) {
952 VPValue *Cur = WorkList.pop_back_val();
953 if (!Seen.insert(Cur).second)
954 continue;
955 VPRecipeBase *R = Cur->getDefiningRecipe();
956 if (!R)
957 continue;
958 if (!isDeadRecipe(*R))
959 continue;
960 append_range(WorkList, R->operands());
961 R->eraseFromParent();
962 }
963}
964
965/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
966/// Returns an optional pair, where the first element indicates whether it is
967/// an intrinsic ID.
968static std::optional<std::pair<bool, unsigned>>
969getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
970 return TypeSwitch<const VPSingleDefRecipe *,
971 std::optional<std::pair<bool, unsigned>>>(R)
972 .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
973 VPReplicateRecipe>(
974 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
975 .Case<VPWidenIntrinsicRecipe>([](auto *I) {
976 return std::make_pair(true, I->getVectorIntrinsicID());
977 })
978 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
979 // For recipes that do not directly map to LLVM IR instructions,
980 // assign opcodes after the last VPInstruction opcode (which is also
981 // after the last IR Instruction opcode), based on the VPDefID.
982 return std::make_pair(false,
983 VPInstruction::OpsEnd + 1 + I->getVPDefID());
984 })
985 .Default([](auto *) { return std::nullopt; });
986}
987
988/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
989/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
990/// Operands are foldable live-ins.
992 ArrayRef<VPValue *> Operands,
993 const DataLayout &DL,
994 VPTypeAnalysis &TypeInfo) {
995 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
996 if (!OpcodeOrIID)
997 return nullptr;
998
999 SmallVector<Value *, 4> Ops;
1000 for (VPValue *Op : Operands) {
1001 if (!Op->isLiveIn() || !Op->getLiveInIRValue())
1002 return nullptr;
1003 Ops.push_back(Op->getLiveInIRValue());
1004 }
1005
1006 auto FoldToIRValue = [&]() -> Value * {
1007 InstSimplifyFolder Folder(DL);
1008 if (OpcodeOrIID->first) {
1009 if (R.getNumOperands() != 2)
1010 return nullptr;
1011 unsigned ID = OpcodeOrIID->second;
1012 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1013 TypeInfo.inferScalarType(&R));
1014 }
1015 unsigned Opcode = OpcodeOrIID->second;
1016 if (Instruction::isBinaryOp(Opcode))
1017 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1018 Ops[0], Ops[1]);
1019 if (Instruction::isCast(Opcode))
1020 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1021 TypeInfo.inferScalarType(R.getVPSingleValue()));
1022 switch (Opcode) {
1023 case VPInstruction::LogicalAnd:
1024 return Folder.FoldSelect(Ops[0], Ops[1],
1025 Constant::getNullValue(Ops[1]->getType()));
1026 case VPInstruction::Not:
1027 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1028 Constant::getAllOnesValue(Ops[0]->getType()));
1029 case Instruction::Select:
1030 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1031 case Instruction::ICmp:
1032 case Instruction::FCmp:
1033 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1034 Ops[1]);
1035 case Instruction::GetElementPtr: {
1036 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1037 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1038 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1039 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1040 }
1041 case VPInstruction::PtrAdd:
1042 case VPInstruction::WidePtrAdd:
1043 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1044 Ops[0], Ops[1],
1045 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1046 // An extract of a live-in is an extract of a broadcast, so return the
1047 // broadcasted element.
1048 case Instruction::ExtractElement:
1049 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1050 return Ops[0];
1051 }
1052 return nullptr;
1053 };
1054
1055 if (Value *V = FoldToIRValue())
1056 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1057 return nullptr;
1058}
1059
1060/// Try to simplify recipe \p R.
1061static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
1062 VPlan *Plan = R.getParent()->getPlan();
1063
1064 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
1065 if (!Def)
1066 return;
1067
1068 // Simplification of live-in IR values for SingleDef recipes using
1069 // InstSimplifyFolder.
1070 const DataLayout &DL =
1071 Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout();
1072 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1073 return Def->replaceAllUsesWith(V);
1074
1075 // Fold PredPHI LiveIn -> LiveIn.
1076 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) {
1077 VPValue *Op = PredPHI->getOperand(0);
1078 if (Op->isLiveIn())
1079 PredPHI->replaceAllUsesWith(Op);
1080 }
1081
1082 VPBuilder Builder(Def);
1083 VPValue *A;
1084 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1085 Type *TruncTy = TypeInfo.inferScalarType(Def);
1086 Type *ATy = TypeInfo.inferScalarType(A);
1087 if (TruncTy == ATy) {
1088 Def->replaceAllUsesWith(A);
1089 } else {
1090 // Don't replace a scalarizing recipe with a widened cast.
1091 if (isa<VPReplicateRecipe>(Def))
1092 return;
1093 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1094
1095 unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
1096 ? Instruction::SExt
1097 : Instruction::ZExt;
1098 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1099 TruncTy);
1100 if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
1101 // UnderlyingExt has distinct return type, used to retain legacy cost.
1102 Ext->setUnderlyingValue(UnderlyingExt);
1103 }
1104 Def->replaceAllUsesWith(Ext);
1105 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1106 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1107 Def->replaceAllUsesWith(Trunc);
1108 }
1109 }
1110#ifndef NDEBUG
1111 // Verify that the cached type info for both A and its users is still
1112 // accurate by comparing it to freshly computed types.
1113 VPTypeAnalysis TypeInfo2(*Plan);
1114 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1115 for (VPUser *U : A->users()) {
1116 auto *R = cast<VPRecipeBase>(U);
1117 for (VPValue *VPV : R->definedValues())
1118 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1119 }
1120#endif
1121 }
1122
1123 // Simplify (X && Y) || (X && !Y) -> X.
1124 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1125 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1126 // recipes to be visited during simplification.
1127 VPValue *X, *Y, *Z;
1128 if (match(Def,
1129 m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
1130 m_LogicalAnd(m_Deferred(X), m_Not(m_Deferred(Y)))))) {
1131 Def->replaceAllUsesWith(X);
1132 Def->eraseFromParent();
1133 return;
1134 }
1135
1136 // x | 1 -> 1
1137 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1138 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1139
1140 // x | 0 -> x
1141 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1142 return Def->replaceAllUsesWith(X);
1143
1144 // x & 0 -> 0
1145 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1146 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1147
1148 // x && false -> false
1149 if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
1150 return Def->replaceAllUsesWith(Def->getOperand(1));
1151
1152 // (x && y) || (x && z) -> x && (y || z)
1153 if (match(Def, m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
1154 m_LogicalAnd(m_Deferred(X), m_VPValue(Z)))) &&
1155 // Simplify only if one of the operands has one use to avoid creating an
1156 // extra recipe.
1157 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1158 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1159 return Def->replaceAllUsesWith(
1160 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1161
1162 // x && !x -> 0
1163 if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X)))))
1164 return Def->replaceAllUsesWith(Plan->getFalse());
1165
1166 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1167 return Def->replaceAllUsesWith(X);
1168
1169 // select !c, x, y -> select c, y, x
1170 VPValue *C;
1171 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1172 Def->setOperand(0, C);
1173 Def->setOperand(1, Y);
1174 Def->setOperand(2, X);
1175 return;
1176 }
1177
1178 // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
1179 // tail folding it is likely that x is a header mask and can be simplified
1180 // further.
1181 if (match(Def, m_LogicalAnd(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
1182 m_VPValue(Z))) &&
1183 X->hasMoreThanOneUniqueUser())
1184 return Def->replaceAllUsesWith(
1185 Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
1186
1187 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1188 return Def->replaceAllUsesWith(A);
1189
1190 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1191 return Def->replaceAllUsesWith(R.getOperand(0) == A ? R.getOperand(1)
1192 : R.getOperand(0));
1193
1194 if (match(Def, m_Not(m_VPValue(A)))) {
1195 if (match(A, m_Not(m_VPValue(A))))
1196 return Def->replaceAllUsesWith(A);
1197
1198 // Try to fold Not into compares by adjusting the predicate in-place.
1199 CmpPredicate Pred;
1200 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1201 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1202 if (all_of(Cmp->users(),
1204 m_Not(m_Specific(Cmp)),
1205 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1206 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1207 for (VPUser *U : to_vector(Cmp->users())) {
1208 auto *R = cast<VPSingleDefRecipe>(U);
1209 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1210 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1211 R->setOperand(1, Y);
1212 R->setOperand(2, X);
1213 } else {
1214 // not (cmp pred) -> cmp inv_pred
1215 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1216 R->replaceAllUsesWith(Cmp);
1217 }
1218 }
1219 // If Cmp doesn't have a debug location, use the one from the negation,
1220 // to preserve the location.
1221 if (!Cmp->getDebugLoc() && R.getDebugLoc())
1222 Cmp->setDebugLoc(R.getDebugLoc());
1223 }
1224 }
1225 }
1226
1227 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1228 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1229 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1230 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1231 TypeInfo.inferScalarType(Def))
1232 return Def->replaceAllUsesWith(Def->getOperand(1));
1233
1234 if (match(Def, m_VPInstruction<VPInstruction::WideIVStep>(m_VPValue(X),
1235 m_One()))) {
1236 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1237 if (TypeInfo.inferScalarType(X) != WideStepTy)
1238 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1239 Def->replaceAllUsesWith(X);
1240 return;
1241 }
1242
1243 // For i1 vp.merges produced by AnyOf reductions:
1244 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1246 m_VPValue(X), m_VPValue())) &&
1248 TypeInfo.inferScalarType(R.getVPSingleValue())->isIntegerTy(1)) {
1249 Def->setOperand(1, Def->getOperand(0));
1250 Def->setOperand(0, Y);
1251 return;
1252 }
1253
1254 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1255 if (Phi->getOperand(0) == Phi->getOperand(1))
1256 Def->replaceAllUsesWith(Phi->getOperand(0));
1257 return;
1258 }
1259
1260 // Look through ExtractLastElement (BuildVector ....).
1261 if (match(Def,
1262 m_ExtractLastElement(m_BuildVector()))) {
1263 auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1264 Def->replaceAllUsesWith(
1265 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1266 return;
1267 }
1268
1269 // Look through ExtractPenultimateElement (BuildVector ....).
1270 if (match(Def, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
1271 m_BuildVector()))) {
1272 auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1273 Def->replaceAllUsesWith(
1274 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1275 return;
1276 }
1277
1278 uint64_t Idx;
1280 auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1281 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1282 return;
1283 }
1284
1285 if (auto *Phi = dyn_cast<VPPhi>(Def)) {
1286 if (Phi->getNumOperands() == 1)
1287 Phi->replaceAllUsesWith(Phi->getOperand(0));
1288 return;
1289 }
1290
1291 // Some simplifications can only be applied after unrolling. Perform them
1292 // below.
1293 if (!Plan->isUnrolled())
1294 return;
1295
1296 // Hoist an invariant increment Y of a phi X, by having X start at Y.
1297 if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && Y->isLiveIn() &&
1298 isa<VPPhi>(X)) {
1299 auto *Phi = cast<VPPhi>(X);
1300 if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
1301 Phi->getNumUsers() == 1 && (*Phi->user_begin() == &R)) {
1302 Phi->setOperand(0, Y);
1303 Def->replaceAllUsesWith(Phi);
1304 return;
1305 }
1306 }
1307
1308 // VPVectorPointer for part 0 can be replaced by their start pointer.
1309 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(&R)) {
1310 if (VecPtr->isFirstPart()) {
1311 VecPtr->replaceAllUsesWith(VecPtr->getOperand(0));
1312 return;
1313 }
1314 }
1315
1316 // VPScalarIVSteps for part 0 can be replaced by their start value, if only
1317 // the first lane is demanded.
1318 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1319 if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
1320 Steps->replaceAllUsesWith(Steps->getOperand(0));
1321 return;
1322 }
1323 }
1324 // Simplify redundant ReductionStartVector recipes after unrolling.
1325 VPValue *StartV;
1327 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1328 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1329 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1330 return PhiR && PhiR->isInLoop();
1331 });
1332 return;
1333 }
1334
1335 if (match(Def,
1338 Def->replaceAllUsesWith(A);
1339 return;
1340 }
1341
1346 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1347 all_of(A->users(),
1348 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1349 return Def->replaceAllUsesWith(A);
1350 }
1351
1352 if (Plan->getUF() == 1 &&
1353 match(Def, m_ExtractLastLanePerPart(m_VPValue(A)))) {
1354 return Def->replaceAllUsesWith(
1355 Builder.createNaryOp(VPInstruction::ExtractLastElement, {A}));
1356 }
1357}
1358
1359void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
1360 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
1361 Plan.getEntry());
1362 VPTypeAnalysis TypeInfo(Plan);
1363 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
1364 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1365 simplifyRecipe(R, TypeInfo);
1366 }
1367 }
1368}
1369
1370void VPlanTransforms::narrowToSingleScalarRecipes(VPlan &Plan) {
1371 if (Plan.hasScalarVFOnly())
1372 return;
1373
1374 // Try to narrow wide and replicating recipes to single scalar recipes,
1375 // based on VPlan analysis. Only process blocks in the loop region for now,
1376 // without traversing into nested regions, as recipes in replicate regions
1377 // cannot be converted yet.
1378 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1379 vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
1380 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1381 if (!isa<VPWidenRecipe, VPWidenSelectRecipe, VPReplicateRecipe>(&R))
1382 continue;
1383 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1384 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1385 continue;
1386
1387 auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
1388 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1389 vputils::isSingleScalar(RepR->getOperand(1))) {
1390 auto *Clone = new VPReplicateRecipe(
1391 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1392 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/);
1393 Clone->insertBefore(RepOrWidenR);
1394 unsigned ExtractOpc =
1395 vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1))
1396 ? VPInstruction::ExtractLastElement
1397 : VPInstruction::ExtractLastLanePerPart;
1398 auto *Ext = new VPInstruction(ExtractOpc, {Clone->getOperand(0)});
1399 Ext->insertBefore(Clone);
1400 Clone->setOperand(0, Ext);
1401 RepR->eraseFromParent();
1402 continue;
1403 }
1404
1405 // Skip recipes that aren't single scalars or don't have only their
1406 // scalar results used. In the latter case, we would introduce extra
1407 // broadcasts.
1408 if (!vputils::isSingleScalar(RepOrWidenR) ||
1409 !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
1410 return U->usesScalars(RepOrWidenR) ||
1411 match(cast<VPRecipeBase>(U),
1412 m_CombineOr(m_ExtractLastElement(m_VPValue()),
1413 m_ExtractLastLanePerPart(m_VPValue())));
1414 }))
1415 continue;
1416
1417 auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
1418 RepOrWidenR->operands(),
1419 true /*IsSingleScalar*/);
1420 Clone->insertBefore(RepOrWidenR);
1421 RepOrWidenR->replaceAllUsesWith(Clone);
1422 }
1423 }
1424}
1425
1426/// Try to see if all of \p Blend's masks share a common value that is logically
1427/// and'ed into them, and remove it from the masks.
1428static void removeCommonBlendMask(VPBlendRecipe *Blend) {
1429 if (Blend->isNormalized())
1430 return;
1431 VPValue *CommonEdgeMask;
1432 if (!match(Blend->getMask(0),
1433 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1434 return;
1435 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1436 if (!match(Blend->getMask(I),
1437 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1438 return;
1439 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1440 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1441}
1442
1443/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1444/// to make sure the masks are simplified.
1445static void simplifyBlends(VPlan &Plan) {
1446 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1447 vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
1448 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1449 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1450 if (!Blend)
1451 continue;
1452
1453 removeCommonBlendMask(Blend);
1454
1455 // Try to remove redundant blend recipes.
1456 SmallPtrSet<VPValue *, 4> UniqueValues;
1457 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1458 UniqueValues.insert(Blend->getIncomingValue(0));
1459 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1460 if (!match(Blend->getMask(I), m_False()))
1461 UniqueValues.insert(Blend->getIncomingValue(I));
1462
1463 if (UniqueValues.size() == 1) {
1464 Blend->replaceAllUsesWith(*UniqueValues.begin());
1465 Blend->eraseFromParent();
1466 continue;
1467 }
1468
1469 if (Blend->isNormalized())
1470 continue;
1471
1472 // Normalize the blend so its first incoming value is used as the initial
1473 // value with the others blended into it.
1474
1475 unsigned StartIndex = 0;
1476 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1477 // If a value's mask is used only by the blend then it can be deadcoded.
1478 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1479 // that's used by multiple blends where it can be removed from them all.
1480 VPValue *Mask = Blend->getMask(I);
1481 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1482 StartIndex = I;
1483 break;
1484 }
1485 }
1486
1487 SmallVector<VPValue *, 4> OperandsWithMask;
1488 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1489
1490 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1491 if (I == StartIndex)
1492 continue;
1493 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1494 OperandsWithMask.push_back(Blend->getMask(I));
1495 }
1496
1497 auto *NewBlend =
1498 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1499 OperandsWithMask, Blend->getDebugLoc());
1500 NewBlend->insertBefore(&R);
1501
1502 VPValue *DeadMask = Blend->getMask(StartIndex);
1503 Blend->replaceAllUsesWith(NewBlend);
1504 Blend->eraseFromParent();
1505 recursivelyDeleteDeadRecipes(DeadMask);
1506
1507 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1508 VPValue *NewMask;
1509 if (NewBlend->getNumOperands() == 3 &&
1510 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1511 VPValue *Inc0 = NewBlend->getOperand(0);
1512 VPValue *Inc1 = NewBlend->getOperand(1);
1513 VPValue *OldMask = NewBlend->getOperand(2);
1514 NewBlend->setOperand(0, Inc1);
1515 NewBlend->setOperand(1, Inc0);
1516 NewBlend->setOperand(2, NewMask);
1517 if (OldMask->getNumUsers() == 0)
1518 cast<VPInstruction>(OldMask)->eraseFromParent();
1519 }
1520 }
1521 }
1522}
1523
1524/// Optimize the width of vector induction variables in \p Plan based on a known
1525/// constant Trip Count, \p BestVF and \p BestUF.
1526static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
1527 ElementCount BestVF,
1528 unsigned BestUF) {
1529 // Only proceed if we have not completely removed the vector region.
1530 if (!Plan.getVectorLoopRegion())
1531 return false;
1532
1533 const APInt *TC;
1534 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
1535 return false;
1536
1537 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1538 // and UF. Returns at least 8.
1539 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1540 APInt AlignedTC =
1541 Align * APIntOps::RoundingUDiv(TC, APInt(TC.getBitWidth(), Align),
1542 APInt::Rounding::UP);
1543 APInt MaxVal = AlignedTC - 1;
1544 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1545 };
1546 unsigned NewBitWidth =
1547 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1548
1549 LLVMContext &Ctx = Plan.getContext();
1550 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1551
1552 bool MadeChange = false;
1553
1554 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1555 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1556 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1557
1558 // Currently only handle canonical IVs as it is trivial to replace the start
1559 // and stop values, and we currently only perform the optimization when the
1560 // IV has a single use.
1561 if (!WideIV || !WideIV->isCanonical() ||
1562 WideIV->hasMoreThanOneUniqueUser() ||
1563 NewIVTy == WideIV->getScalarType())
1564 continue;
1565
1566 // Currently only handle cases where the single user is a header-mask
1567 // comparison with the backedge-taken-count.
1568 if (!match(*WideIV->user_begin(),
1569 m_ICmp(m_Specific(WideIV),
1570 m_Broadcast(
1571 m_Specific(Plan.getOrCreateBackedgeTakenCount())))))
1572 continue;
1573
1574 // Update IV operands and comparison bound to use new narrower type.
1575 auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0));
1576 WideIV->setStartValue(NewStart);
1577 auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1));
1578 WideIV->setStepValue(NewStep);
1579
1580 auto *NewBTC = new VPWidenCastRecipe(
1581 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1582 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1583 auto *Cmp = cast<VPInstruction>(*WideIV->user_begin());
1584 Cmp->setOperand(1, NewBTC);
1585
1586 MadeChange = true;
1587 }
1588
1589 return MadeChange;
1590}
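// Worked example (illustrative): with a known trip count of 100, VF=4 and
// UF=2, ComputeBitWidth rounds 100 up to the next multiple of VF*UF=8, giving
// 104; the largest IV value is then 103, which needs 7 active bits, and the
// next power of two (clamped to at least 8) is 8, so the canonical wide IV and
// the truncated backedge-taken count are narrowed to i8.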
1591
1592/// Return true if \p Cond is known to be true for given \p BestVF and \p
1593/// BestUF.
1594static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
1595 ElementCount BestVF, unsigned BestUF,
1596 ScalarEvolution &SE) {
1597 if (match(Cond, m_BinaryOr(m_VPValue(), m_VPValue())))
1598 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1599 &SE](VPValue *C) {
1600 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE);
1601 });
1602
1603 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
1605 m_Specific(CanIV->getBackedgeValue()),
1606 m_Specific(&Plan.getVectorTripCount()))))
1607 return false;
1608
1609 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1610 // count is not conveniently available as SCEV so far, so we compare directly
1611 // against the original trip count. This is stricter than necessary, as we
1612 // will only return true if the trip count == vector trip count.
1613 const SCEV *VectorTripCount =
1614 vputils::getSCEVExprForVPValue(&Plan.getVectorTripCount(), SE);
1615 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1616 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1617 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1618 "Trip count SCEV must be computable");
1619 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1620 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1621 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1622}
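// Example (illustrative): with a trip count of 16, VF=8 and UF=2 the vector
// loop runs exactly once, so the exit compare "canonical IV + VFxUF ==
// vector-trip-count" is provably true and callers can simplify the terminating
// branch accordingly.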
1623
1624/// Try to replace multiple active lane masks used for control flow with
1625/// a single, wide active lane mask instruction followed by multiple
1626/// extract subvector intrinsics. This applies to the active lane mask
1627/// instructions both in the loop and in the preheader.
1628/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1629/// new extracts from the first active lane mask, which has its last
1630/// operand (multiplier) set to UF.
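/// For example (an illustrative sketch in simplified notation, not copied
/// from any test), with UF = 2 the two per-part masks
///   %alm.0 = active-lane-mask %index.0, %tc, 1
///   %alm.1 = active-lane-mask %index.1, %tc, 1
/// become one wide mask plus subvector extracts:
///   %alm   = active-lane-mask %index.0, %tc, 2
///   %ext.0 = vector-extract %alm, 0
///   %ext.1 = vector-extract %alm, VF
/// and the active-lane-mask phis take %ext.0/%ext.1 as incoming values.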
1631static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
1632                                       unsigned UF) {
1633 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1634 return false;
1635
1636 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1637 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1638 auto *Term = &ExitingVPBB->back();
1639
1640 using namespace llvm::VPlanPatternMatch;
1641  if (!match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
1642                       m_VPValue(), m_VPValue(), m_VPValue())))))
1643 return false;
1644
1645 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1646 LLVMContext &Ctx = Plan.getContext();
1647
1648 auto ExtractFromALM = [&](VPInstruction *ALM,
1649 SmallVectorImpl<VPValue *> &Extracts) {
1650 DebugLoc DL = ALM->getDebugLoc();
1651 for (unsigned Part = 0; Part < UF; ++Part) {
1652      SmallVector<VPValue *> Ops;
1653      Ops.append({ALM, Plan.getOrAddLiveIn(
1654 ConstantInt::get(IntegerType::getInt64Ty(Ctx),
1655 VF.getKnownMinValue() * Part))});
1656 auto *Ext = new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1657                                             IntegerType::getInt1Ty(Ctx), DL);
1658      Extracts[Part] = Ext;
1659 Ext->insertAfter(ALM);
1660 }
1661 };
1662
1663 // Create a list of each active lane mask phi, ordered by unroll part.
1664  SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr);
1665  for (VPRecipeBase &R : Header->phis()) {
1666    auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R);
1667    if (!Phi)
1668 continue;
1669 VPValue *Index = nullptr;
1670 match(Phi->getBackedgeValue(),
1671          m_ActiveLaneMask(m_VPValue(Index), m_VPValue(), m_VPValue()));
1672    assert(Index && "Expected index from ActiveLaneMask instruction");
1673
1674 uint64_t Part;
1675 if (match(Index,
1676              m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
1677                  m_VPValue(), m_ConstantInt(Part))))
1678 Phis[Part] = Phi;
1679 else
1680 // Anything other than a CanonicalIVIncrementForPart is part 0
1681 Phis[0] = Phi;
1682 }
1683
1684 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1685 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
1686
1687 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
1688 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
1689
1690 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
1691 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
1692 "Expected incoming values of Phi to be ActiveLaneMasks");
1693
1694 // When using wide lane masks, the return type of the get.active.lane.mask
1695 // intrinsic is VF x UF (last operand).
1696 VPValue *ALMMultiplier =
1697 Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF));
1698 EntryALM->setOperand(2, ALMMultiplier);
1699 LoopALM->setOperand(2, ALMMultiplier);
1700
1701 // Create UF x extract vectors and insert into preheader.
1702 SmallVector<VPValue *> EntryExtracts(UF);
1703 ExtractFromALM(EntryALM, EntryExtracts);
1704
1705 // Create UF x extract vectors and insert before the loop compare & branch,
1706 // updating the compare to use the first extract.
1707 SmallVector<VPValue *> LoopExtracts(UF);
1708 ExtractFromALM(LoopALM, LoopExtracts);
1709 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
1710 Not->setOperand(0, LoopExtracts[0]);
1711
1712 // Update the incoming values of active lane mask phis.
1713 for (unsigned Part = 0; Part < UF; ++Part) {
1714 Phis[Part]->setStartValue(EntryExtracts[Part]);
1715 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
1716 }
1717
1718 return true;
1719}
1720
1721/// Try to simplify the branch condition of \p Plan. This may restrict the
1722/// resulting plan to \p BestVF and \p BestUF.
1723static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
1724                                              unsigned BestUF,
1725                                              PredicatedScalarEvolution &PSE) {
1726  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1727 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1728 auto *Term = &ExitingVPBB->back();
1729 VPValue *Cond;
1730 ScalarEvolution &SE = *PSE.getSE();
1731 if (match(Term, m_BranchOnCount()) ||
1732      match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
1733                      m_VPValue(), m_VPValue(), m_VPValue()))))) {
1734 // Try to simplify the branch condition if TC <= VF * UF when the latch
1735 // terminator is BranchOnCount or BranchOnCond where the input is
1736 // Not(ActiveLaneMask).
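    // For example (illustrative values), with TC = 7, VF = 4 and UF = 2 we
    // have 7 <= 8, so the vector loop body is known to execute exactly once.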
1737    const SCEV *TripCount =
1738        vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1739 assert(!isa<SCEVCouldNotCompute>(TripCount) &&
1740 "Trip count SCEV must be computable");
1741 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1742 const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
1743 if (TripCount->isZero() ||
1744 !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
1745 return false;
1746 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) {
1747 // For BranchOnCond, check if we can prove the condition to be true using VF
1748 // and UF.
1749 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE))
1750 return false;
1751 } else {
1752 return false;
1753 }
1754
1755 // The vector loop region only executes once. If possible, completely remove
1756 // the region, otherwise replace the terminator controlling the latch with
1757 // (BranchOnCond true).
1758 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
1759 // support for other non-canonical widen induction recipes (e.g.,
1760 // VPWidenPointerInductionRecipe).
1761 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1762 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
1763 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
1764 return R->isCanonical();
1765 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
1766 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
1767 })) {
1768 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
1769 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
1770 VPBuilder Builder(Plan.getVectorPreheader());
1771 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
1772 R->getScalarType());
1773 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
1774 HeaderR.eraseFromParent();
1775 continue;
1776 }
1777 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
1778 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
1779 HeaderR.eraseFromParent();
1780 }
1781
1782 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
1783 VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
1784 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
1785 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
1786
1787 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
1788 B->setParent(nullptr);
1789
1790 VPBlockUtils::connectBlocks(Preheader, Header);
1791 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
1793 } else {
1794 // The vector region contains header phis for which we cannot remove the
1795 // loop region yet.
1796 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
1797 Term->getDebugLoc());
1798 ExitingVPBB->appendRecipe(BOC);
1799 }
1800
1801 Term->eraseFromParent();
1802
1803 return true;
1804}
1805
1806void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
1807                                         unsigned BestUF,
1808                                         PredicatedScalarEvolution &PSE) {
1809  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
1810 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
1811
1812 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
1813 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
1814 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
1815
1816 if (MadeChange) {
1817 Plan.setVF(BestVF);
1818 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
1819 }
1820 // TODO: Further simplifications are possible
1821 // 1. Replace inductions with constants.
1822 // 2. Replace vector loop region with VPBasicBlock.
1823}
1824
1825/// Sink users of \p FOR after the recipe defining the previous value \p
1826/// Previous of the recurrence. \returns true if all users of \p FOR could be
1827/// re-arranged as needed or false if it is not possible.
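/// For example (an illustrative sketch), with
///   %for  = FIRST-ORDER-RECURRENCE-PHI %init, %prev
///   %use  = add %for, 1
///   %prev = load %ptr
/// the user %use is moved after %prev, so the recurrence splice of %for and
/// %prev that is inserted after %prev dominates all users of %for.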
1828static bool
1829sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
1830                                 VPRecipeBase *Previous,
1831 VPDominatorTree &VPDT) {
1832 // Collect recipes that need sinking.
1833  SmallVector<VPRecipeBase *> WorkList;
1834  SmallPtrSet<VPRecipeBase *, 8> Seen;
1835  Seen.insert(Previous);
1836 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
1837    // The previous value must not depend on the users of the recurrence
1838    // phi; if it did, FOR would not be a fixed-order recurrence.
1839 if (SinkCandidate == Previous)
1840 return false;
1841
1842 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
1843 !Seen.insert(SinkCandidate).second ||
1844 VPDT.properlyDominates(Previous, SinkCandidate))
1845 return true;
1846
1847 if (cannotHoistOrSinkRecipe(*SinkCandidate))
1848 return false;
1849
1850 WorkList.push_back(SinkCandidate);
1851 return true;
1852 };
1853
1854 // Recursively sink users of FOR after Previous.
1855 WorkList.push_back(FOR);
1856 for (unsigned I = 0; I != WorkList.size(); ++I) {
1857 VPRecipeBase *Current = WorkList[I];
1858 assert(Current->getNumDefinedValues() == 1 &&
1859 "only recipes with a single defined value expected");
1860
1861 for (VPUser *User : Current->getVPSingleValue()->users()) {
1862 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
1863 return false;
1864 }
1865 }
1866
1867 // Keep recipes to sink ordered by dominance so earlier instructions are
1868 // processed first.
1869 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1870 return VPDT.properlyDominates(A, B);
1871 });
1872
1873 for (VPRecipeBase *SinkCandidate : WorkList) {
1874 if (SinkCandidate == FOR)
1875 continue;
1876
1877 SinkCandidate->moveAfter(Previous);
1878 Previous = SinkCandidate;
1879 }
1880 return true;
1881}
1882
1883/// Try to hoist \p Previous and its operands before all users of \p FOR.
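/// For example (an illustrative sketch), if the previous value
///   %prev = add %x, %y
/// is defined after the first user of \p FOR and sinking is not possible,
/// %prev (and, transitively, any operands defined inside the loop) is moved
/// before that user instead.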
1884static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
1885                                        VPRecipeBase *Previous,
1886 VPDominatorTree &VPDT) {
1887 if (cannotHoistOrSinkRecipe(*Previous))
1888 return false;
1889
1890 // Collect recipes that need hoisting.
1891 SmallVector<VPRecipeBase *> HoistCandidates;
1892  SmallPtrSet<VPRecipeBase *, 4> Visited;
1893  VPRecipeBase *HoistPoint = nullptr;
1894 // Find the closest hoist point by looking at all users of FOR and selecting
1895 // the recipe dominating all other users.
1896 for (VPUser *U : FOR->users()) {
1897 auto *R = cast<VPRecipeBase>(U);
1898 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
1899 HoistPoint = R;
1900 }
1901 assert(all_of(FOR->users(),
1902 [&VPDT, HoistPoint](VPUser *U) {
1903 auto *R = cast<VPRecipeBase>(U);
1904 return HoistPoint == R ||
1905 VPDT.properlyDominates(HoistPoint, R);
1906 }) &&
1907 "HoistPoint must dominate all users of FOR");
1908
1909 auto NeedsHoisting = [HoistPoint, &VPDT,
1910 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
1911 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
1912 if (!HoistCandidate)
1913 return nullptr;
1914 VPRegionBlock *EnclosingLoopRegion =
1915 HoistCandidate->getParent()->getEnclosingLoopRegion();
1916 assert((!HoistCandidate->getRegion() ||
1917 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
1918 "CFG in VPlan should still be flat, without replicate regions");
1919 // Hoist candidate was already visited, no need to hoist.
1920 if (!Visited.insert(HoistCandidate).second)
1921 return nullptr;
1922
1923 // Candidate is outside loop region or a header phi, dominates FOR users w/o
1924 // hoisting.
1925 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
1926 return nullptr;
1927
1928 // If we reached a recipe that dominates HoistPoint, we don't need to
1929 // hoist the recipe.
1930 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
1931 return nullptr;
1932 return HoistCandidate;
1933 };
1934
1935 if (!NeedsHoisting(Previous->getVPSingleValue()))
1936 return true;
1937
1938 // Recursively try to hoist Previous and its operands before all users of FOR.
1939 HoistCandidates.push_back(Previous);
1940
1941 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
1942 VPRecipeBase *Current = HoistCandidates[I];
1943 assert(Current->getNumDefinedValues() == 1 &&
1944 "only recipes with a single defined value expected");
1945 if (cannotHoistOrSinkRecipe(*Current))
1946 return false;
1947
1948 for (VPValue *Op : Current->operands()) {
1949 // If we reach FOR, it means the original Previous depends on some other
1950 // recurrence that in turn depends on FOR. If that is the case, we would
1951 // also need to hoist recipes involving the other FOR, which may break
1952 // dependencies.
1953 if (Op == FOR)
1954 return false;
1955
1956 if (auto *R = NeedsHoisting(Op))
1957 HoistCandidates.push_back(R);
1958 }
1959 }
1960
1961 // Order recipes to hoist by dominance so earlier instructions are processed
1962 // first.
1963 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1964 return VPDT.properlyDominates(A, B);
1965 });
1966
1967 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
1968 HoistCandidate->moveBefore(*HoistPoint->getParent(),
1969 HoistPoint->getIterator());
1970 }
1971
1972 return true;
1973}
1974
1975bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
1976                                                  VPBuilder &LoopBuilder) {
1977 VPDominatorTree VPDT(Plan);
1978
1979  SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
1980  for (VPRecipeBase &R :
1981       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis())
1982    if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
1983      RecurrencePhis.push_back(FOR);
1984
1985 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
1986    SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
1987    VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
1988 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
1989 // to terminate.
1990 while (auto *PrevPhi =
1991               dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Previous)) {
1992      assert(PrevPhi->getParent() == FOR->getParent());
1993 assert(SeenPhis.insert(PrevPhi).second);
1994 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
1995 }
1996
1997 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
1998 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
1999 return false;
2000
2001 // Introduce a recipe to combine the incoming and previous values of a
2002 // fixed-order recurrence.
2003 VPBasicBlock *InsertBlock = Previous->getParent();
2004 if (isa<VPHeaderPHIRecipe>(Previous))
2005 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2006 else
2007 LoopBuilder.setInsertPoint(InsertBlock,
2008 std::next(Previous->getIterator()));
2009
2010 auto *RecurSplice =
2011        LoopBuilder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
2012                                 {FOR, FOR->getBackedgeValue()});
2013
2014 FOR->replaceAllUsesWith(RecurSplice);
2015 // Set the first operand of RecurSplice to FOR again, after replacing
2016 // all users.
2017 RecurSplice->setOperand(0, FOR);
2018 }
2019 return true;
2020}
2021
2023 for (VPRecipeBase &R :
2024       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
2025    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2026 if (!PhiR)
2027 continue;
2028 RecurKind RK = PhiR->getRecurrenceKind();
2029 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2031 continue;
2032
2033 for (VPUser *U : collectUsersRecursively(PhiR))
2034 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2035 RecWithFlags->dropPoisonGeneratingFlags();
2036 }
2037 }
2038}
2039
2040namespace {
2041struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2042 static bool isSentinel(const VPSingleDefRecipe *Def) {
2043 return Def == getEmptyKey() || Def == getTombstoneKey();
2044 }
2045
2046 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2047 /// return that source element type.
2048 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2049 // All VPInstructions that lower to GEPs must have the i8 source element
2050 // type (as they are PtrAdds), so we omit it.
2051    return TypeSwitch<const VPSingleDefRecipe *, Type *>(R)
2052        .Case<VPReplicateRecipe>([](auto *I) -> Type * {
2053 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2054 return GEP->getSourceElementType();
2055 return nullptr;
2056 })
2057 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2058 [](auto *I) { return I->getSourceElementType(); })
2059 .Default([](auto *) { return nullptr; });
2060 }
2061
2062  /// Returns true if recipe \p Def can be safely handled for CSE.
2063 static bool canHandle(const VPSingleDefRecipe *Def) {
2064 // We can extend the list of handled recipes in the future,
2065 // provided we account for the data embedded in them while checking for
2066 // equality or hashing.
2067 auto C = getOpcodeOrIntrinsicID(Def);
2068
2069 // The issue with (Insert|Extract)Value is that the index of the
2070 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2071 // VPlan.
2072 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2073 C->second == Instruction::ExtractValue)))
2074 return false;
2075
2076 // During CSE, we can only handle recipes that don't read from memory: if
2077 // they read from memory, there could be an intervening write to memory
2078 // before the next instance is CSE'd, leading to an incorrect result.
2079 return !Def->mayReadFromMemory();
2080 }
2081
2082 /// Hash the underlying data of \p Def.
2083 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2084 const VPlan *Plan = Def->getParent()->getPlan();
2085 VPTypeAnalysis TypeInfo(*Plan);
2086 hash_code Result = hash_combine(
2087 Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
2088 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2089        hash_combine_range(Def->operands()));
2090    if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2091 if (RFlags->hasPredicate())
2092 return hash_combine(Result, RFlags->getPredicate());
2093 return Result;
2094 }
2095
2096 /// Check equality of underlying data of \p L and \p R.
2097 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2098 if (isSentinel(L) || isSentinel(R))
2099 return L == R;
2100 if (L->getVPDefID() != R->getVPDefID() ||
2101        getOpcodeOrIntrinsicID(L) != getOpcodeOrIntrinsicID(R) ||
2102        getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2104 !equal(L->operands(), R->operands()))
2105 return false;
2106    assert(getOpcodeOrIntrinsicID(L) && getOpcodeOrIntrinsicID(R) &&
2107           "must have valid opcode info for both recipes");
2108 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2109 if (LFlags->hasPredicate() &&
2110 LFlags->getPredicate() !=
2111 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2112 return false;
2113 // Recipes in replicate regions implicitly depend on predicate. If either
2114 // recipe is in a replicate region, only consider them equal if both have
2115 // the same parent.
2116 const VPRegionBlock *RegionL = L->getRegion();
2117 const VPRegionBlock *RegionR = R->getRegion();
2118 if (((RegionL && RegionL->isReplicator()) ||
2119 (RegionR && RegionR->isReplicator())) &&
2120 L->getParent() != R->getParent())
2121 return false;
2122 const VPlan *Plan = L->getParent()->getPlan();
2123 VPTypeAnalysis TypeInfo(*Plan);
2124 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2125 }
2126};
2127} // end anonymous namespace
2128
2129/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2130/// Plan.
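/// For example (an illustrative sketch), given two identical recipes
///   %a = add %x, %y
///   ...
///   %b = add %x, %y
/// all uses of %b are replaced by %a, provided %a dominates %b and the recipe
/// does not read from memory.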
2131static void cse(VPlan &Plan) {
2132  VPDominatorTree VPDT(Plan);
2133  DenseMap<VPSingleDefRecipe *, VPSingleDefRecipe *, VPCSEDenseMapInfo> CSEMap;
2134
2135  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2136           vp_depth_first_deep(Plan.getEntry()))) {
2137 for (VPRecipeBase &R : *VPBB) {
2138 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2139 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2140 continue;
2141 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2142 // V must dominate Def for a valid replacement.
2143 if (!VPDT.dominates(V->getParent(), VPBB))
2144 continue;
2145 // Only keep flags present on both V and Def.
2146 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2147 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2148 Def->replaceAllUsesWith(V);
2149 continue;
2150 }
2151 CSEMap[Def] = Def;
2152 }
2153 }
2154}
2155
2156/// Move loop-invariant recipes out of the vector loop region in \p Plan.
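/// For example (an illustrative sketch), a recipe such as
///   %inv = mul %live.in.x, %live.in.y
/// whose operands are all defined outside the loop region is moved from the
/// loop body into the vector preheader.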
2157static void licm(VPlan &Plan) {
2158 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2159
2160 // Hoist any loop invariant recipes from the vector loop region to the
2161  // preheader. Perform a shallow traversal of the vector loop region, to
2162 // exclude recipes in replicate regions. Since the top-level blocks in the
2163 // vector loop region are guaranteed to execute if the vector pre-header is,
2164 // we don't need to check speculation safety.
2165 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2166 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2167         "Expected vector preheader's successor to be the vector loop region");
2168  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2169           vp_depth_first_shallow(LoopRegion->getEntry()))) {
2170 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2171      if (cannotHoistOrSinkRecipe(R))
2172        continue;
2173 if (any_of(R.operands(), [](VPValue *Op) {
2174 return !Op->isDefinedOutsideLoopRegions();
2175 }))
2176 continue;
2177 R.moveBefore(*Preheader, Preheader->end());
2178 }
2179 }
2180}
2181
2182void VPlanTransforms::truncateToMinimalBitwidths(
2183    VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2184 if (Plan.hasScalarVFOnly())
2185 return;
2186 // Keep track of created truncates, so they can be re-used. Note that we
2187  // cannot use RAUW after creating a new truncate, as this could make other
2188  // uses have different types for their operands, making them invalidly
2189  // typed.
2190  DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
2191  VPTypeAnalysis TypeInfo(Plan);
2192 VPBasicBlock *PH = Plan.getVectorPreheader();
2193  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2194           vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
2195    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2198 &R))
2199 continue;
2200
2201 VPValue *ResultVPV = R.getVPSingleValue();
2202 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2203 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2204 if (!NewResSizeInBits)
2205 continue;
2206
2207 // If the value wasn't vectorized, we must maintain the original scalar
2208 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2209 // skip casts which do not need to be handled explicitly here, as
2210 // redundant casts will be removed during recipe simplification.
2212 continue;
2213
2214 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2215 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2216 assert(OldResTy->isIntegerTy() && "only integer types supported");
2217 (void)OldResSizeInBits;
2218
2219 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2220
2221 // Any wrapping introduced by shrinking this operation shouldn't be
2222 // considered undefined behavior. So, we can't unconditionally copy
2223 // arithmetic wrapping flags to VPW.
2224 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2225 VPW->dropPoisonGeneratingFlags();
2226
2227 if (OldResSizeInBits != NewResSizeInBits &&
2228 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2229 // Extend result to original width.
2230 auto *Ext =
2231 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
2232 Ext->insertAfter(&R);
2233 ResultVPV->replaceAllUsesWith(Ext);
2234 Ext->setOperand(0, ResultVPV);
2235 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2236 } else {
2237 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2238 "Only ICmps should not need extending the result.");
2239 }
2240
2241 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2243 continue;
2244
2245 // Shrink operands by introducing truncates as needed.
2246 unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
2247 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2248 auto *Op = R.getOperand(Idx);
2249        unsigned OpSizeInBits =
2250            TypeInfo.inferScalarType(Op)->getScalarSizeInBits();
2251 if (OpSizeInBits == NewResSizeInBits)
2252 continue;
2253 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2254 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2255 if (!IterIsEmpty) {
2256 R.setOperand(Idx, ProcessedIter->second);
2257 continue;
2258 }
2259
2260 VPBuilder Builder;
2261 if (Op->isLiveIn())
2262 Builder.setInsertPoint(PH);
2263 else
2264 Builder.setInsertPoint(&R);
2265 VPWidenCastRecipe *NewOp =
2266 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2267 ProcessedIter->second = NewOp;
2268 R.setOperand(Idx, NewOp);
2269 }
2270
2271 }
2272 }
2273}
2274
2275void VPlanTransforms::removeBranchOnConst(VPlan &Plan) {
2276  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2277           vp_depth_first_shallow(Plan.getEntry()))) {
2278    VPValue *Cond;
2279 // Skip blocks that are not terminated by BranchOnCond.
2280 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2281 continue;
2282
2283 assert(VPBB->getNumSuccessors() == 2 &&
2284 "Two successors expected for BranchOnCond");
2285 unsigned RemovedIdx;
2286 if (match(Cond, m_True()))
2287 RemovedIdx = 1;
2288 else if (match(Cond, m_False()))
2289 RemovedIdx = 0;
2290 else
2291 continue;
2292
2293 VPBasicBlock *RemovedSucc =
2294 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2295 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2296 "There must be a single edge between VPBB and its successor");
2297    // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2298 // these recipes.
2299 for (VPRecipeBase &R : RemovedSucc->phis())
2300 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2301
2302 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2303 // automatically on VPlan destruction if it becomes unreachable.
2304 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2305 VPBB->back().eraseFromParent();
2306 }
2307}
2308
2327
2328// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2329// the loop terminator with a branch-on-cond recipe with the negated
2330// active-lane-mask as operand. Note that this turns the loop into an
2331// uncountable one. Only the existing terminator is replaced, all other existing
2332// recipes/users remain unchanged, except for poison-generating flags being
2333// dropped from the canonical IV increment. Return the created
2334// VPActiveLaneMaskPHIRecipe.
2335//
2336// The function uses the following definitions:
2337//
2338// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
2339// calculate-trip-count-minus-VF (original TC) : original TC
2340// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
2341// CanonicalIVPhi : CanonicalIVIncrement
2342// %StartV is the canonical induction start value.
2343//
2344// The function adds the following recipes:
2345//
2346// vector.ph:
2347// %TripCount = calculate-trip-count-minus-VF (original TC)
2348// [if DataWithControlFlowWithoutRuntimeCheck]
2349// %EntryInc = canonical-iv-increment-for-part %StartV
2350// %EntryALM = active-lane-mask %EntryInc, %TripCount
2351//
2352// vector.body:
2353// ...
2354// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2355// ...
2356// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2357// %ALM = active-lane-mask %InLoopInc, TripCount
2358// %Negated = Not %ALM
2359// branch-on-cond %Negated
2360//
2361static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
2362    VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
2363  VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2364 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2365 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2366 VPValue *StartV = CanonicalIVPHI->getStartValue();
2367
2368 auto *CanonicalIVIncrement =
2369 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2370 // TODO: Check if dropping the flags is needed if
2371 // !DataAndControlFlowWithoutRuntimeCheck.
2372 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2373 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2374 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2375 // we have to take unrolling into account. Each part needs to start at
2376 // Part * VF
2377 auto *VecPreheader = Plan.getVectorPreheader();
2378 VPBuilder Builder(VecPreheader);
2379
2380 // Create the ActiveLaneMask instruction using the correct start values.
2381 VPValue *TC = Plan.getTripCount();
2382
2383 VPValue *TripCount, *IncrementValue;
2384  if (!DataAndControlFlowWithoutRuntimeCheck) {
2385    // When the loop is guarded by a runtime overflow check for the loop
2386 // induction variable increment by VF, we can increment the value before
2387    // the get.active.lane.mask intrinsic and use the unmodified trip count.
2388 IncrementValue = CanonicalIVIncrement;
2389 TripCount = TC;
2390 } else {
2391 // When avoiding a runtime check, the active.lane.mask inside the loop
2392 // uses a modified trip count and the induction variable increment is
2393 // done after the active.lane.mask intrinsic is called.
2394 IncrementValue = CanonicalIVPHI;
2395 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2396 {TC}, DL);
2397 }
2398 auto *EntryIncrement = Builder.createOverflowingOp(
2399 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2400 "index.part.next");
2401
2402 // Create the active lane mask instruction in the VPlan preheader.
2403 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2404 ConstantInt::get(TopRegion->getCanonicalIV()->getScalarType(), 1));
2405 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2406 {EntryIncrement, TC, ALMMultiplier}, DL,
2407 "active.lane.mask.entry");
2408
2409 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2410 // preheader ActiveLaneMask instruction.
2411 auto *LaneMaskPhi =
2412      new VPActiveLaneMaskPHIRecipe(EntryALM, DL);
2413  LaneMaskPhi->insertAfter(CanonicalIVPHI);
2414
2415 // Create the active lane mask for the next iteration of the loop before the
2416 // original terminator.
2417 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2418 Builder.setInsertPoint(OriginalTerminator);
2419 auto *InLoopIncrement =
2420 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2421 {IncrementValue}, {false, false}, DL);
2422 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2423 {InLoopIncrement, TripCount, ALMMultiplier},
2424 DL, "active.lane.mask.next");
2425 LaneMaskPhi->addOperand(ALM);
2426
2427 // Replace the original terminator with BranchOnCond. We have to invert the
2428 // mask here because a true condition means jumping to the exit block.
2429 auto *NotMask = Builder.createNot(ALM, DL);
2430 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2431 OriginalTerminator->eraseFromParent();
2432 return LaneMaskPhi;
2433}
2434
2435/// Collect the header mask with the pattern:
2436/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2437/// TODO: Introduce explicit recipe for header-mask instead of searching
2438/// for the header-mask pattern manually.
2439static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) {
2440  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2441 SmallVector<VPValue *> WideCanonicalIVs;
2442  auto *FoundWidenCanonicalIVUser = find_if(
2443      LoopRegion->getCanonicalIV()->users(), IsaPred<VPWidenCanonicalIVRecipe>);
2444  assert(count_if(LoopRegion->getCanonicalIV()->users(),
2445                  IsaPred<VPWidenCanonicalIVRecipe>) <= 1 &&
2446                  "Must have at most one VPWideCanonicalIVRecipe");
2447 if (FoundWidenCanonicalIVUser !=
2448 LoopRegion->getCanonicalIV()->users().end()) {
2449 auto *WideCanonicalIV =
2450 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2451 WideCanonicalIVs.push_back(WideCanonicalIV);
2452 }
2453
2454 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2455 // version of the canonical induction.
2456 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
2457 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2458 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2459 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2460 WideCanonicalIVs.push_back(WidenOriginalIV);
2461 }
2462
2463 // Walk users of wide canonical IVs and find the single compare of the form
2464 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2465 VPSingleDefRecipe *HeaderMask = nullptr;
2466 for (auto *Wide : WideCanonicalIVs) {
2467 for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
2468 auto *VPI = dyn_cast<VPInstruction>(U);
2469 if (!VPI || !vputils::isHeaderMask(VPI, Plan))
2470 continue;
2471
2472 assert(VPI->getOperand(0) == Wide &&
2473 "WidenCanonicalIV must be the first operand of the compare");
2474 assert(!HeaderMask && "Multiple header masks found?");
2475 HeaderMask = VPI;
2476 }
2477 }
2478 return HeaderMask;
2479}
2480
2481void VPlanTransforms::addActiveLaneMask(
2482    VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2483    bool DataAndControlFlowWithoutRuntimeCheck) {
2484  assert((!DataAndControlFlowWithoutRuntimeCheck ||
2485          UseActiveLaneMaskForControlFlow) &&
2486 "DataAndControlFlowWithoutRuntimeCheck implies "
2487 "UseActiveLaneMaskForControlFlow");
2488
2489 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2490 auto *FoundWidenCanonicalIVUser = find_if(
2491      LoopRegion->getCanonicalIV()->users(), IsaPred<VPWidenCanonicalIVRecipe>);
2492  assert(FoundWidenCanonicalIVUser &&
2493 "Must have widened canonical IV when tail folding!");
2494 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2495 auto *WideCanonicalIV =
2496 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2497 VPSingleDefRecipe *LaneMask;
2498 if (UseActiveLaneMaskForControlFlow) {
2499    LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
2500        Plan, DataAndControlFlowWithoutRuntimeCheck);
2501  } else {
2502 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2503 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2504 ConstantInt::get(LoopRegion->getCanonicalIV()->getScalarType(), 1));
2505 LaneMask =
2506 B.createNaryOp(VPInstruction::ActiveLaneMask,
2507 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2508 nullptr, "active.lane.mask");
2509 }
2510
2511 // Walk users of WideCanonicalIV and replace the header mask of the form
2512 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2513 // removing the old one to ensure there is always only a single header mask.
2514 HeaderMask->replaceAllUsesWith(LaneMask);
2515 HeaderMask->eraseFromParent();
2516}
2517
2518/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2519/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2520/// recipe could be created.
2521/// \p HeaderMask Header Mask.
2522/// \p CurRecipe Recipe to be transformed.
2523/// \p TypeInfo VPlan-based type analysis.
2524/// \p AllOneMask The vector mask parameter of vector-predication intrinsics.
2525/// \p EVL The explicit vector length parameter of vector-predication
2526/// intrinsics.
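/// For example (an illustrative description, not actual recipe syntax), a
/// widened load whose mask is exactly the header mask becomes an EVL-based
/// load with no mask, and a mask of the form logical-and(header-mask, M)
/// keeps only M as the mask of the new EVL recipe.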
2527static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
2528                                       VPRecipeBase &CurRecipe,
2529 VPTypeAnalysis &TypeInfo,
2530 VPValue &AllOneMask, VPValue &EVL) {
2531 // FIXME: Don't transform recipes to EVL recipes if they're not masked by the
2532 // header mask.
2533 auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
2534 assert(OrigMask && "Unmasked recipe when folding tail");
2535 // HeaderMask will be handled using EVL.
2536 VPValue *Mask;
2537 if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask))))
2538 return Mask;
2539 return HeaderMask == OrigMask ? nullptr : OrigMask;
2540 };
2541
2542 /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
2543 auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * {
2544 auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr);
2545 if (!EndPtr)
2546 return Addr;
2547 assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF() &&
2548 "VPVectorEndPointerRecipe with non-VF VF operand?");
2549 assert(
2550 all_of(EndPtr->users(),
2551 [](VPUser *U) {
2552 return cast<VPWidenMemoryRecipe>(U)->isReverse();
2553 }) &&
2554 "VPVectorEndPointRecipe not used by reversed widened memory recipe?");
2555 VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone();
2556 EVLAddr->insertBefore(&CurRecipe);
2557 EVLAddr->setOperand(1, &EVL);
2558 return EVLAddr;
2559 };
2560
2561  return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe)
2562      .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) {
2563        VPValue *NewMask = GetNewMask(L->getMask());
2564 VPValue *NewAddr = GetNewAddr(L->getAddr());
2565 return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask);
2566 })
2567 .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
2568 VPValue *NewMask = GetNewMask(S->getMask());
2569 VPValue *NewAddr = GetNewAddr(S->getAddr());
2570 return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask);
2571 })
2572 .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) {
2573 VPValue *NewMask = GetNewMask(IR->getMask());
2574 return new VPInterleaveEVLRecipe(*IR, EVL, NewMask);
2575 })
2576 .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
2577 VPValue *NewMask = GetNewMask(Red->getCondOp());
2578 return new VPReductionEVLRecipe(*Red, EVL, NewMask);
2579 })
2580 .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
2581 VPValue *LHS, *RHS;
2582 // Transform select with a header mask condition
2583 // select(header_mask, LHS, RHS)
2584 // into vector predication merge.
2585 // vp.merge(all-true, LHS, RHS, EVL)
2586 if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS),
2587 m_VPValue(RHS))))
2588 return nullptr;
2589 // Use all true as the condition because this transformation is
2590 // limited to selects whose condition is a header mask.
2591 return new VPWidenIntrinsicRecipe(
2592 Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL},
2593 TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
2594 })
2595 .Default([&](VPRecipeBase *R) { return nullptr; });
2596}
2597
2598/// Replace recipes with their EVL variants.
2599static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
2600  VPTypeAnalysis TypeInfo(Plan);
2601 VPValue *AllOneMask = Plan.getTrue();
2602 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2603 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2604
2605 assert(all_of(Plan.getVF().users(),
2608 "User of VF that we can't transform to EVL.");
2609 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2611 });
2612
2613 assert(all_of(Plan.getVFxUF().users(),
2614 [&LoopRegion, &Plan](VPUser *U) {
2615 return match(U,
2616 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
2617 m_Specific(&Plan.getVFxUF()))) ||
2618 isa<VPWidenPointerInductionRecipe>(U);
2619 }) &&
2620 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
2621 "increment of the canonical induction.");
2622 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2623 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
2624 // canonical induction must not be updated.
2625    return isa<VPWidenPointerInductionRecipe>(U);
2626  });
2627
2628 // Defer erasing recipes till the end so that we don't invalidate the
2629 // VPTypeAnalysis cache.
2630  SmallVector<VPRecipeBase *> ToErase;
2631
2632 // Create a scalar phi to track the previous EVL if fixed-order recurrence is
2633 // contained.
2634  bool ContainsFORs =
2635      any_of(Header->phis(), IsaPred<VPFirstOrderRecurrencePHIRecipe>);
2636 if (ContainsFORs) {
2637 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
2638 VPValue *MaxEVL = &Plan.getVF();
2639    // Emit VPScalarCastRecipe in the preheader if VF is not a 32-bit integer.
2640 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
2641 MaxEVL = Builder.createScalarZExtOrTrunc(
2642 MaxEVL, Type::getInt32Ty(Plan.getContext()),
2643 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
2644
2645 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
2646 VPValue *PrevEVL = Builder.createScalarPhi(
2647 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
2648
2649    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2650             vp_depth_first_deep(LoopRegion->getEntry()))) {
2651      for (VPRecipeBase &R : *VPBB) {
2652 VPValue *V1, *V2;
2653 if (!match(&R,
2654                   m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
2655                       m_VPValue(V1), m_VPValue(V2))))
2656 continue;
2657 VPValue *Imm = Plan.getOrAddLiveIn(
2658            ConstantInt::getSigned(Type::getInt32Ty(Plan.getContext()), -1));
2659        VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(
2660            Intrinsic::experimental_vp_splice,
2661 {V1, V2, Imm, AllOneMask, PrevEVL, &EVL},
2662 TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc());
2663 VPSplice->insertBefore(&R);
2664 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
2665 ToErase.push_back(&R);
2666 }
2667 }
2668 }
2669
2670 VPValue *HeaderMask = findHeaderMask(Plan);
2671 if (!HeaderMask)
2672 return;
2673
2674 // Replace header masks with a mask equivalent to predicating by EVL:
2675 //
2676 // icmp ule widen-canonical-iv backedge-taken-count
2677 // ->
2678 // icmp ult step-vector, EVL
2679 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
2680 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
2681 Type *EVLType = TypeInfo.inferScalarType(&EVL);
2682 VPValue *EVLMask = Builder.createICmp(
2683      CmpInst::ICMP_ULT,
2684      Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
2685 HeaderMask->replaceAllUsesWith(EVLMask);
2686 ToErase.push_back(HeaderMask->getDefiningRecipe());
2687
2688 // Try to optimize header mask recipes away to their EVL variants.
2689 // TODO: Split optimizeMaskToEVL out and move into
2690 // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
2691 // tryToBuildVPlanWithVPRecipes beforehand.
2692 for (VPUser *U : collectUsersRecursively(EVLMask)) {
2693 auto *CurRecipe = cast<VPRecipeBase>(U);
2694 VPRecipeBase *EVLRecipe =
2695 optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, *AllOneMask, EVL);
2696 if (!EVLRecipe)
2697 continue;
2698
2699 unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
2700 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
2701 "New recipe must define the same number of values as the "
2702 "original.");
2703 EVLRecipe->insertBefore(CurRecipe);
2705 EVLRecipe)) {
2706 for (unsigned I = 0; I < NumDefVal; ++I) {
2707 VPValue *CurVPV = CurRecipe->getVPValue(I);
2708 CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
2709 }
2710 }
2711 ToErase.push_back(CurRecipe);
2712 }
2713 // Remove dead EVL mask.
2714 if (EVLMask->getNumUsers() == 0)
2715 ToErase.push_back(EVLMask->getDefiningRecipe());
2716
2717 for (VPRecipeBase *R : reverse(ToErase)) {
2718 SmallVector<VPValue *> PossiblyDead(R->operands());
2719 R->eraseFromParent();
2720 for (VPValue *Op : PossiblyDead)
2721      recursivelyDeleteDeadRecipes(Op);
2722  }
2723}
2724
2725/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
2726/// replaces all uses except the canonical IV increment of
2727/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe
2728/// is used only for loop iterations counting after this transformation.
2729///
2730/// The function uses the following definitions:
2731/// %StartV is the canonical induction start value.
2732///
2733/// The function adds the following recipes:
2734///
2735/// vector.ph:
2736/// ...
2737///
2738/// vector.body:
2739/// ...
2740/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2741/// [ %NextEVLIV, %vector.body ]
2742/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2743/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
2744/// ...
2745/// %OpEVL = cast i32 %VPEVL to IVSize
2746/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2747/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2748/// ...
2749///
2750/// If MaxSafeElements is provided, the function adds the following recipes:
2751/// vector.ph:
2752/// ...
2753///
2754/// vector.body:
2755/// ...
2756/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2757/// [ %NextEVLIV, %vector.body ]
2758/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2759/// %cmp = cmp ult %AVL, MaxSafeElements
2760/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
2761/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
2762/// ...
2763/// %OpEVL = cast i32 %VPEVL to IVSize
2764/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2765/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2766/// ...
2767///
2768void VPlanTransforms::addExplicitVectorLength(
2769    VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
2770 if (Plan.hasScalarVFOnly())
2771 return;
2772 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2773 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2774
2775 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
2776 auto *CanIVTy = CanonicalIVPHI->getScalarType();
2777 VPValue *StartV = CanonicalIVPHI->getStartValue();
2778
2779 // Create the ExplicitVectorLengthPhi recipe in the main loop.
2780 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
2781 EVLPhi->insertAfter(CanonicalIVPHI);
2782 VPBuilder Builder(Header, Header->getFirstNonPhi());
2783 // Create the AVL (application vector length), starting from TC -> 0 in steps
2784 // of EVL.
2785 VPPhi *AVLPhi = Builder.createScalarPhi(
2786 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
2787 VPValue *AVL = AVLPhi;
2788
2789 if (MaxSafeElements) {
2790 // Support for MaxSafeDist for correct loop emission.
2791 VPValue *AVLSafe =
2792 Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements));
2793 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
2794 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
2795 "safe_avl");
2796 }
2797 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
2799
2800 auto *CanonicalIVIncrement =
2801 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2802 Builder.setInsertPoint(CanonicalIVIncrement);
2803 VPValue *OpVPEVL = VPEVL;
2804
2805 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
2806 OpVPEVL = Builder.createScalarZExtOrTrunc(
2807 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
2808
2809 auto *NextEVLIV = Builder.createOverflowingOp(
2810 Instruction::Add, {OpVPEVL, EVLPhi},
2811 {CanonicalIVIncrement->hasNoUnsignedWrap(),
2812 CanonicalIVIncrement->hasNoSignedWrap()},
2813 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
2814 EVLPhi->addOperand(NextEVLIV);
2815
2816 VPValue *NextAVL = Builder.createOverflowingOp(
2817 Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},
2818 DebugLoc::getCompilerGenerated(), "avl.next");
2819 AVLPhi->addOperand(NextAVL);
2820
2821 transformRecipestoEVLRecipes(Plan, *VPEVL);
2822
2823 // Replace all uses of VPCanonicalIVPHIRecipe by
2824 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
2825 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
2826 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
2827 // TODO: support unroll factor > 1.
2828 Plan.setUF(1);
2829}
2830
2832 // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
2833 // There should be only one EVL PHI in the entire plan.
2834 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
2835
2838 for (VPRecipeBase &R : VPBB->phis())
2839 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
2840 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
2841 EVLPhi = PhiR;
2842 }
2843
2844 // Early return if no EVL PHI is found.
2845 if (!EVLPhi)
2846 return;
2847
2848 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
2849 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
2850 VPValue *AVL;
2851 [[maybe_unused]] bool FoundAVL =
2852 match(EVLIncrement,
2853 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
2854 assert(FoundAVL && "Didn't find AVL?");
2855
2856 // The AVL may be capped to a safe distance.
2857 VPValue *SafeAVL;
2858 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
2859 AVL = SafeAVL;
2860
2861 VPValue *AVLNext;
2862 [[maybe_unused]] bool FoundAVLNext =
2864 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
2865 assert(FoundAVLNext && "Didn't find AVL backedge?");
2866
2867 // Convert EVLPhi to concrete recipe.
2868 auto *ScalarR =
2869 VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
2870 EVLPhi->getDebugLoc(), "evl.based.iv");
2871 EVLPhi->replaceAllUsesWith(ScalarR);
2872 EVLPhi->eraseFromParent();
2873
2874 // Replace CanonicalIVInc with EVL-PHI increment.
2875 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
2876 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
2877 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
2878 m_Specific(&Plan.getVFxUF()))) &&
2879 "Unexpected canonical iv");
2880 Backedge->replaceAllUsesWith(EVLIncrement);
2881
2882 // Remove unused phi and increment.
2883 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
2884 CanonicalIVIncrement->eraseFromParent();
2885 CanonicalIV->eraseFromParent();
2886
2887 // Replace the use of VectorTripCount in the latch-exiting block.
2888 // Before: (branch-on-count EVLIVInc, VectorTripCount)
2889 // After: (branch-on-cond eq AVLNext, 0)
2890
2891 VPBasicBlock *LatchExiting =
2892 HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
2893 auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
2894 // Skip single-iteration loop region
2895 if (match(LatchExitingBr, m_BranchOnCond(m_True())))
2896 return;
2897 assert(LatchExitingBr &&
2898 match(LatchExitingBr,
2899 m_BranchOnCount(m_VPValue(EVLIncrement),
2900 m_Specific(&Plan.getVectorTripCount()))) &&
2901 "Unexpected terminator in EVL loop");
2902
2903 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
2904 VPBuilder Builder(LatchExitingBr);
2905 VPValue *Cmp =
2906 Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
2907                         Plan.getOrAddLiveIn(ConstantInt::get(AVLTy, 0)));
2908  Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
2909 LatchExitingBr->eraseFromParent();
2910}
2911
2913 VPlan &Plan, PredicatedScalarEvolution &PSE,
2914 const DenseMap<Value *, const SCEV *> &StridesMap) {
2915 // Replace VPValues for known constant strides guaranteed by predicate scalar
2916 // evolution.
2917 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
2918 auto *R = cast<VPRecipeBase>(&U);
2919 return R->getRegion() ||
2920 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
2921 };
2922 ValueToSCEVMapTy RewriteMap;
2923 for (const SCEV *Stride : StridesMap.values()) {
2924 using namespace SCEVPatternMatch;
2925 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
2926 const APInt *StrideConst;
2927 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
2928 // Only handle constant strides for now.
2929 continue;
2930
2931 auto *CI =
2932 Plan.getOrAddLiveIn(ConstantInt::get(Stride->getType(), *StrideConst));
2933 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
2934 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
2935
2936 // The versioned value may not be used in the loop directly but through a
2937 // sext/zext. Add new live-ins in those cases.
2938 for (Value *U : StrideV->users()) {
2939    if (!isa<SExtInst, ZExtInst>(U))
2940      continue;
2941 VPValue *StrideVPV = Plan.getLiveIn(U);
2942 if (!StrideVPV)
2943 continue;
2944 unsigned BW = U->getType()->getScalarSizeInBits();
2945 APInt C =
2946 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
2947 VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C));
2948 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
2949 }
2950 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
2951 }
2952
2953 for (VPRecipeBase &R : *Plan.getEntry()) {
2954 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
2955 if (!ExpSCEV)
2956 continue;
2957 const SCEV *ScevExpr = ExpSCEV->getSCEV();
2958 auto *NewSCEV =
2959 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
2960 if (NewSCEV != ScevExpr) {
2961 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
2962 ExpSCEV->replaceAllUsesWith(NewExp);
2963 if (Plan.getTripCount() == ExpSCEV)
2964 Plan.resetTripCount(NewExp);
2965 }
2966 }
2967}
2968
2969void VPlanTransforms::dropPoisonGeneratingRecipes(
2970    VPlan &Plan,
2971 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
2972 // Collect recipes in the backward slice of `Root` that may generate a poison
2973 // value that is used after vectorization.
2974  SmallPtrSet<VPRecipeBase *, 16> Visited;
2975  auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
2976    SmallVector<VPRecipeBase *, 16> Worklist;
2977    Worklist.push_back(Root);
2978
2979 // Traverse the backward slice of Root through its use-def chain.
2980 while (!Worklist.empty()) {
2981 VPRecipeBase *CurRec = Worklist.pop_back_val();
2982
2983 if (!Visited.insert(CurRec).second)
2984 continue;
2985
2986 // Prune search if we find another recipe generating a widen memory
2987 // instruction. Widen memory instructions involved in address computation
2988 // will lead to gather/scatter instructions, which don't need to be
2989 // handled.
2990      if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,
2991              VPHeaderPHIRecipe>(CurRec))
2992 continue;
2993
2994 // This recipe contributes to the address computation of a widen
2995 // load/store. If the underlying instruction has poison-generating flags,
2996 // drop them directly.
2997 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
2998 VPValue *A, *B;
2999 // Dropping disjoint from an OR may yield incorrect results, as some
3000 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3001 // for dependence analysis). Instead, replace it with an equivalent Add.
3002 // This is possible as all users of the disjoint OR only access lanes
3003 // where the operands are disjoint or poison otherwise.
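        // For example, "or disjoint %a, %b" is rewritten here to "add %a, %b",
        // which yields the same value on every lane where the result is used.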
3004 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3005 RecWithFlags->isDisjoint()) {
3006 VPBuilder Builder(RecWithFlags);
3007 VPInstruction *New = Builder.createOverflowingOp(
3008 Instruction::Add, {A, B}, {false, false},
3009 RecWithFlags->getDebugLoc());
3010 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3011 RecWithFlags->replaceAllUsesWith(New);
3012 RecWithFlags->eraseFromParent();
3013 CurRec = New;
3014 } else
3015 RecWithFlags->dropPoisonGeneratingFlags();
3016 } else {
3017        Instruction *Instr = dyn_cast_or_null<Instruction>(
3018            CurRec->getVPSingleValue()->getUnderlyingValue());
3019        (void)Instr;
3020 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3021 "found instruction with poison generating flags not covered by "
3022 "VPRecipeWithIRFlags");
3023 }
3024
3025 // Add new definitions to the worklist.
3026 for (VPValue *Operand : CurRec->operands())
3027 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3028 Worklist.push_back(OpDef);
3029 }
3030 });
3031
3032 // Traverse all the recipes in the VPlan and collect the poison-generating
3033 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3034 // VPInterleaveRecipe.
3035 auto Iter = vp_depth_first_deep(Plan.getEntry());
3036  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3037    for (VPRecipeBase &Recipe : *VPBB) {
3038 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3039 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3040 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3041 if (AddrDef && WidenRec->isConsecutive() &&
3042 BlockNeedsPredication(UnderlyingInstr.getParent()))
3043 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3044 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3045 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3046 if (AddrDef) {
3047 // Check if any member of the interleave group needs predication.
3048 const InterleaveGroup<Instruction> *InterGroup =
3049 InterleaveRec->getInterleaveGroup();
3050 bool NeedPredication = false;
3051 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3052 I < NumMembers; ++I) {
3053 Instruction *Member = InterGroup->getMember(I);
3054 if (Member)
3055 NeedPredication |= BlockNeedsPredication(Member->getParent());
3056 }
3057
3058 if (NeedPredication)
3059 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3060 }
3061 }
3062 }
3063 }
3064}
3065
3066void VPlanTransforms::createInterleaveGroups(
3067    VPlan &Plan,
3068    const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
3069        &InterleaveGroups,
3070 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3071 if (InterleaveGroups.empty())
3072 return;
3073
3074 // Interleave memory: for each Interleave Group we marked earlier as relevant
3075 // for this VPlan, replace the Recipes widening its memory instructions with a
3076 // single VPInterleaveRecipe at its insertion point.
3077 VPDominatorTree VPDT(Plan);
3078 for (const auto *IG : InterleaveGroups) {
3079 auto *Start =
3080 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3081 VPIRMetadata InterleaveMD(*Start);
3082 SmallVector<VPValue *, 4> StoredValues;
3083 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3084 StoredValues.push_back(StoreR->getStoredValue());
3085 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3086 Instruction *MemberI = IG->getMember(I);
3087 if (!MemberI)
3088 continue;
3089 VPWidenMemoryRecipe *MemoryR =
3090 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3091 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3092 StoredValues.push_back(StoreR->getStoredValue());
3093 InterleaveMD.intersect(*MemoryR);
3094 }
3095
3096 bool NeedsMaskForGaps =
3097 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3098 (!StoredValues.empty() && !IG->isFull());
3099
3100 Instruction *IRInsertPos = IG->getInsertPos();
3101 auto *InsertPos =
3102 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3103
3105 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3106 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3107 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3108
3109 // Get or create the start address for the interleave group.
3110 VPValue *Addr = Start->getAddr();
3111 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3112 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3113 // We cannot re-use the address of member zero because it does not
3114 // dominate the insert position. Instead, use the address of the insert
3115 // position and create a PtrAdd adjusting it to the address of member
3116 // zero.
3117 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3118 // InsertPos or sink loads above zero members to join it.
3119 assert(IG->getIndex(IRInsertPos) != 0 &&
3120 "index of insert position shouldn't be zero");
3121 auto &DL = IRInsertPos->getDataLayout();
3122 APInt Offset(32,
3123 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3124 IG->getIndex(IRInsertPos),
3125 /*IsSigned=*/true);
3126 VPValue *OffsetVPV =
3127 Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset));
3128 VPBuilder B(InsertPos);
3129 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3130 }
3131 // If the group is reverse, adjust the index to refer to the last vector
3132 // lane instead of the first. We adjust the index from the first vector
3133 // lane, rather than directly getting the pointer for lane VF - 1, because
3134 // the pointer operand of the interleaved access is supposed to be uniform.
3135 if (IG->isReverse()) {
3136 auto *ReversePtr = new VPVectorEndPointerRecipe(
3137 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3138 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3139 ReversePtr->insertBefore(InsertPos);
3140 Addr = ReversePtr;
3141 }
3142 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3143 InsertPos->getMask(), NeedsMaskForGaps,
3144 InterleaveMD, InsertPos->getDebugLoc());
3145 VPIG->insertBefore(InsertPos);
3146
3147 unsigned J = 0;
3148 for (unsigned i = 0; i < IG->getFactor(); ++i)
3149 if (Instruction *Member = IG->getMember(i)) {
3150 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3151 if (!Member->getType()->isVoidTy()) {
3152 VPValue *OriginalV = MemberR->getVPSingleValue();
3153 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3154 J++;
3155 }
3156 MemberR->eraseFromParent();
3157 }
3158 }
3159}
3160
3161/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3162/// value, phi and backedge value. In the following example:
3163///
3164/// vector.ph:
3165/// Successor(s): vector loop
3166///
3167/// <x1> vector loop: {
3168/// vector.body:
3169/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3170/// ...
3171/// EMIT branch-on-count ...
3172/// No successors
3173/// }
3174///
3175/// WIDEN-INDUCTION will get expanded to:
3176///
3177/// vector.ph:
3178/// ...
3179/// vp<%induction.start> = ...
3180/// vp<%induction.increment> = ...
3181///
3182/// Successor(s): vector loop
3183///
3184/// <x1> vector loop: {
3185/// vector.body:
3186/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3187/// ...
3188/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3189/// EMIT branch-on-count ...
3190/// No successors
3191/// }
3192static void
3193 expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
3194                               VPTypeAnalysis &TypeInfo) {
3195 VPlan *Plan = WidenIVR->getParent()->getPlan();
3196 VPValue *Start = WidenIVR->getStartValue();
3197 VPValue *Step = WidenIVR->getStepValue();
3198 VPValue *VF = WidenIVR->getVFValue();
3199 DebugLoc DL = WidenIVR->getDebugLoc();
3200
3201 // The value from the original loop to which we are mapping the new induction
3202 // variable.
3203 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3204
3205 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3206   Instruction::BinaryOps AddOp;
3207   Instruction::BinaryOps MulOp;
3208   // FIXME: The newly created binary instructions should contain nsw/nuw
3209 // flags, which can be found from the original scalar operations.
3210 VPIRFlags Flags;
3211 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3212 AddOp = Instruction::Add;
3213 MulOp = Instruction::Mul;
3214 } else {
3215 AddOp = ID.getInductionOpcode();
3216 MulOp = Instruction::FMul;
3217 Flags = ID.getInductionBinOp()->getFastMathFlags();
3218 }
3219
3220 // If the phi is truncated, truncate the start and step values.
3221 VPBuilder Builder(Plan->getVectorPreheader());
3222 Type *StepTy = TypeInfo.inferScalarType(Step);
3223 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3224 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3225 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3226 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3227 StepTy = Ty;
3228 }
3229
3230 // Construct the initial value of the vector IV in the vector loop preheader.
3231 Type *IVIntTy =
3232       IntegerType::get(Plan->getContext(), StepTy->getScalarSizeInBits());
3233   VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3234 if (StepTy->isFloatingPointTy())
3235 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3236
3237 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3238 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3239
3240 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3241 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3242 DebugLoc::getUnknown(), "induction");
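  // At this point Init holds the vector
  //   <Start, Start + Step, ..., Start + (VF - 1) * Step>,
  // i.e. the induction values of the first VF scalar iterations.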
3243
3244 // Create the widened phi of the vector IV.
3245 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3246 WidenIVR->getDebugLoc(), "vec.ind");
3247 WidePHI->insertBefore(WidenIVR);
3248
3249 // Create the backedge value for the vector IV.
3250 VPValue *Inc;
3251 VPValue *Prev;
3252 // If unrolled, use the increment and prev value from the operands.
3253 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3254 Inc = SplatVF;
3255 Prev = WidenIVR->getLastUnrolledPartOperand();
3256 } else {
3257 if (VPRecipeBase *R = VF->getDefiningRecipe())
3258 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3259 // Multiply the vectorization factor by the step using integer or
3260 // floating-point arithmetic as appropriate.
3261 if (StepTy->isFloatingPointTy())
3262 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3263 DL);
3264 else
3265 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3266 TypeInfo.inferScalarType(VF), DL);
3267
3268 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3269 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3270 Prev = WidePHI;
3271 }
3272
3273   auto *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3274   Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3275 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3276 WidenIVR->getDebugLoc(), "vec.ind.next");
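  // Each vector iteration advances the IV by VF * Step (or by the splatted VF
  // when unrolled). For example, with VF = 4 and Step = 2 the backedge add
  // contributes <8, 8, 8, 8> per iteration.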
3277
3278 WidePHI->addOperand(Next);
3279
3280 WidenIVR->replaceAllUsesWith(WidePHI);
3281}
3282
3283/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3284/// initial value, phi and backedge value. In the following example:
3285///
3286/// <x1> vector loop: {
3287/// vector.body:
3288/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3289/// ...
3290/// EMIT branch-on-count ...
3291/// }
3292///
3293/// WIDEN-POINTER-INDUCTION will get expanded to:
3294///
3295/// <x1> vector loop: {
3296/// vector.body:
3297/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3298/// EMIT %mul = mul %stepvector, %step
3299/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3300/// ...
3301/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3302/// EMIT branch-on-count ...
3303/// }
3304 static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
3305                                           VPTypeAnalysis &TypeInfo) {
3306 VPlan *Plan = R->getParent()->getPlan();
3307 VPValue *Start = R->getStartValue();
3308 VPValue *Step = R->getStepValue();
3309 VPValue *VF = R->getVFValue();
3310
3311 assert(R->getInductionDescriptor().getKind() ==
3312              InductionDescriptor::IK_PtrInduction &&
3313          "Not a pointer induction according to InductionDescriptor!");
3314 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3315 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3316 "Recipe should have been replaced");
3317
3318 VPBuilder Builder(R);
3319 DebugLoc DL = R->getDebugLoc();
3320
3321 // Build a scalar pointer phi.
3322 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3323
3324 // Create actual address geps that use the pointer phi as base and a
3325 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3326 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3327 Type *StepTy = TypeInfo.inferScalarType(Step);
3328 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3329 Offset = Builder.createNaryOp(Instruction::Mul, {Offset, Step});
3330 VPValue *PtrAdd = Builder.createNaryOp(
3331 VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
3332 R->replaceAllUsesWith(PtrAdd);
3333
3334 // Create the backedge value for the scalar pointer phi.
3335   auto *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3336   Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3337 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3338 DL);
3339 VPValue *Inc = Builder.createNaryOp(Instruction::Mul, {Step, VF});
3340
3341 VPValue *InductionGEP =
3342 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3343 ScalarPtrPhi->addOperand(InductionGEP);
3344}
3345 
3346 void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
3347   // Replace loop regions with explicit CFG.
3348 SmallVector<VPRegionBlock *> LoopRegions;
3349   for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
3350            vp_depth_first_deep(Plan.getEntry()))) {
3351 if (!R->isReplicator())
3352 LoopRegions.push_back(R);
3353 }
3354 for (VPRegionBlock *R : LoopRegions)
3355 R->dissolveToCFGLoop();
3356}
3357
3358 void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
3359   VPTypeAnalysis TypeInfo(Plan);
3360   SmallVector<VPRecipeBase *> ToRemove;
3361   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3362            vp_depth_first_deep(Plan.getEntry()))) {
3363 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3364 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3365 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3366 ToRemove.push_back(WidenIVR);
3367 continue;
3368 }
3369
3370 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3371 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3372 ToRemove.push_back(WidenIVR);
3373 continue;
3374 }
3375
3376 // Expand VPBlendRecipe into VPInstruction::Select.
3377 VPBuilder Builder(&R);
3378 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3379 VPValue *Select = Blend->getIncomingValue(0);
3380 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3381 Select = Builder.createSelect(Blend->getMask(I),
3382 Blend->getIncomingValue(I), Select,
3383 R.getDebugLoc(), "predphi");
3384 Blend->replaceAllUsesWith(Select);
3385 ToRemove.push_back(Blend);
3386 }
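      // For example, a blend of incoming values v0, v1, v2 with masks m1, m2
      // becomes: predphi = select(m2, v2, select(m1, v1, v0)).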
3387
3388 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3389 Expr->decompose();
3390 ToRemove.push_back(Expr);
3391 }
3392
3393 VPValue *VectorStep;
3394 VPValue *ScalarStep;
3395       if (!match(&R, m_VPInstruction<VPInstruction::WideIVStep>(
3396                          m_VPValue(VectorStep), m_VPValue(ScalarStep))))
3397 continue;
3398
3399 // Expand WideIVStep.
3400 auto *VPI = cast<VPInstruction>(&R);
3401 Type *IVTy = TypeInfo.inferScalarType(VPI);
3402 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
3403       Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
3404                                      ? Instruction::UIToFP
3405 : Instruction::Trunc;
3406 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
3407 }
3408
3409 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
3410 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
3411 ScalarStep =
3412 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
3413 }
3414
3415 VPIRFlags Flags;
3416 if (IVTy->isFloatingPointTy())
3417 Flags = {VPI->getFastMathFlags()};
3418
3419 unsigned MulOpc =
3420 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
3421 VPInstruction *Mul = Builder.createNaryOp(
3422 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
3423 VectorStep = Mul;
3424 VPI->replaceAllUsesWith(VectorStep);
3425 ToRemove.push_back(VPI);
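      // WideIVStep is thus lowered to a plain (possibly FP) multiply of the
      // vector step by the scalar step, with both operands cast to the IV type
      // first where needed.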
3426 }
3427 }
3428
3429 for (VPRecipeBase *R : ToRemove)
3430 R->eraseFromParent();
3431}
3432
3433 void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
3434                                          VPBasicBlock *EarlyExitVPBB,
3435 VPlan &Plan,
3436 VPBasicBlock *HeaderVPBB,
3437 VPBasicBlock *LatchVPBB) {
3438 VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
3439 if (!EarlyExitVPBB->getSinglePredecessor() &&
3440 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
3441 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
3442 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
3443 "unsupported early exit VPBB");
3444 // Early exit operand should always be last phi operand. If EarlyExitVPBB
3445 // has two predecessors and EarlyExitingVPBB is the first, swap the operands
3446 // of the phis.
3447 for (VPRecipeBase &R : EarlyExitVPBB->phis())
3448 cast<VPIRPhi>(&R)->swapOperands();
3449 }
3450
3451 VPBuilder Builder(LatchVPBB->getTerminator());
3452 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
3453 assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
3454          "Terminator must be a BranchOnCond");
3455 VPValue *CondOfEarlyExitingVPBB =
3456 EarlyExitingVPBB->getTerminator()->getOperand(0);
3457 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
3458 ? CondOfEarlyExitingVPBB
3459 : Builder.createNot(CondOfEarlyExitingVPBB);
3460
3461 // Split the middle block and have it conditionally branch to the early exit
3462 // block if CondToEarlyExit.
3463 VPValue *IsEarlyExitTaken =
3464 Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
3465 VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
3466 VPBasicBlock *VectorEarlyExitVPBB =
3467 Plan.createVPBasicBlock("vector.early.exit");
3468 VPBlockUtils::insertOnEdge(LatchVPBB, MiddleVPBB, NewMiddle);
3469 VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
3470 NewMiddle->swapSuccessors();
3471
3472 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
3473
3474 // Update the exit phis in the early exit block.
3475 VPBuilder MiddleBuilder(NewMiddle);
3476 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
3477 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
3478 auto *ExitIRI = cast<VPIRPhi>(&R);
3479 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
3480 // a single predecessor and 1 if it has two.
3481 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
3482 if (ExitIRI->getNumOperands() != 1) {
3483 // The first of two operands corresponds to the latch exit, via MiddleVPBB
3484 // predecessor. Extract its last lane.
3485 ExitIRI->extractLastLaneOfFirstOperand(MiddleBuilder);
3486 }
3487
3488 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
3489 if (!IncomingFromEarlyExit->isLiveIn()) {
3490 // Update the incoming value from the early exit.
3491 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
3492 VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,
3493 "first.active.lane");
3494 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
3495 VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
3496 nullptr, "early.exit.value");
3497 ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
3498 }
3499 }
3500 MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
3501
3502 // Replace the condition controlling the non-early exit from the vector loop
3503 // with one exiting if either the original condition of the vector latch is
3504 // true or the early exit has been taken.
3505 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
3506 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
3507 "Unexpected terminator");
3508 auto *IsLatchExitTaken =
3509 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
3510 LatchExitingBranch->getOperand(1));
3511 auto *AnyExitTaken = Builder.createNaryOp(
3512 Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});
3513 Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
3514 LatchExitingBranch->eraseFromParent();
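  // With illustrative value names, the rewritten latch exit looks like:
  //   EMIT vp<%any.exit> = any-of vp<%cond.to.early.exit>
  //   EMIT vp<%latch.cmp> = icmp eq vp<%iv.next>, vp<%trip.count>
  //   EMIT branch-on-cond or vp<%any.exit>, vp<%latch.cmp>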
3515}
3516
3517 /// This function tries to convert extended in-loop reductions to
3518/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
3519/// valid. The created recipe must be decomposed to its constituent
3520/// recipes before execution.
3521static VPExpressionRecipe *
3522 tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
3523                                      VFRange &Range) {
3524 Type *RedTy = Ctx.Types.inferScalarType(Red);
3525 VPValue *VecOp = Red->getVecOp();
3526
3527 // Clamp the range if using extended-reduction is profitable.
3528 auto IsExtendedRedValidAndClampRange =
3529 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
3530     return LoopVectorizationPlanner::getDecisionAndClampRange(
3531         [&](ElementCount VF) {
3532 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3534
3535 InstructionCost ExtRedCost;
3536 InstructionCost ExtCost =
3537 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
3538 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3539
3539 
3540           if (isa<VPPartialReductionRecipe>(Red)) {
3541             TargetTransformInfo::PartialReductionExtendKind ExtKind =
3542                 TargetTransformInfo::getPartialReductionExtendKind(ExtOpc);
3543             // FIXME: Move partial reduction creation, costing and clamping
3544 // here from LoopVectorize.cpp.
3545 ExtRedCost = Ctx.TTI.getPartialReductionCost(
3546 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
3547 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind);
3548 } else {
3549 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
3550 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
3551 Red->getFastMathFlags(), CostKind);
3552 }
3553 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
3554 },
3555 Range);
3556 };
3557
3558 VPValue *A;
3559 // Match reduce(ext)).
3560 if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
3561 IsExtendedRedValidAndClampRange(
3562 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
3563 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
3564 Ctx.Types.inferScalarType(A)))
3565 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
3566
3567 return nullptr;
3568}
3569
3570 /// This function tries to convert extended in-loop reductions to
3571/// VPExpressionRecipe and clamp the \p Range if it is beneficial
3572/// and valid. The created VPExpressionRecipe must be decomposed to its
3573/// constituent recipes before execution. Patterns of the
3574/// VPExpressionRecipe:
3575/// reduce.add(mul(...)),
3576/// reduce.add(mul(ext(A), ext(B))),
3577/// reduce.add(ext(mul(ext(A), ext(B)))).
3578static VPExpressionRecipe *
3579 tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
3580                                           VPCostContext &Ctx, VFRange &Range) {
3581 bool IsPartialReduction = isa<VPPartialReductionRecipe>(Red);
3582
3583 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3584 if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
3585 return nullptr;
3586
3587 Type *RedTy = Ctx.Types.inferScalarType(Red);
3588
3589 // Clamp the range if using multiply-accumulate-reduction is profitable.
3590 auto IsMulAccValidAndClampRange =
3591       [&](VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
3592           VPWidenCastRecipe *OuterExt) -> bool {
3593     return LoopVectorizationPlanner::getDecisionAndClampRange(
3594         [&](ElementCount VF) {
3596 Type *SrcTy =
3597 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
3598 InstructionCost MulAccCost;
3599
3600 if (IsPartialReduction) {
3601 Type *SrcTy2 =
3602 Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
3603 // FIXME: Move partial reduction creation, costing and clamping
3604 // here from LoopVectorize.cpp.
3605 MulAccCost = Ctx.TTI.getPartialReductionCost(
3606 Opcode, SrcTy, SrcTy2, RedTy, VF,
3607                 Ext0 ? TargetTransformInfo::getPartialReductionExtendKind(
3608                            Ext0->getOpcode())
3609                      : TargetTransformInfo::PR_None,
3610                 Ext1 ? TargetTransformInfo::getPartialReductionExtendKind(
3611                            Ext1->getOpcode())
3612                      : TargetTransformInfo::PR_None,
3613                 Mul->getOpcode(), CostKind);
3614 } else {
3615 // Only partial reductions support mixed extends at the moment.
3616 if (Ext0 && Ext1 && Ext0->getOpcode() != Ext1->getOpcode())
3617 return false;
3618
3619 bool IsZExt =
3620 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
3621 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3622 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
3623 SrcVecTy, CostKind);
3624 }
3625
3626 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
3627 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3628 InstructionCost ExtCost = 0;
3629 if (Ext0)
3630 ExtCost += Ext0->computeCost(VF, Ctx);
3631 if (Ext1)
3632 ExtCost += Ext1->computeCost(VF, Ctx);
3633 if (OuterExt)
3634 ExtCost += OuterExt->computeCost(VF, Ctx);
3635
3636 return MulAccCost.isValid() &&
3637 MulAccCost < ExtCost + MulCost + RedCost;
3638 },
3639 Range);
3640 };
3641
3642 VPValue *VecOp = Red->getVecOp();
3643 VPRecipeBase *Sub = nullptr;
3644 VPValue *A, *B;
3645 VPValue *Tmp = nullptr;
3646 // Sub reductions could have a sub between the add reduction and vec op.
3647 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
3648 Sub = VecOp->getDefiningRecipe();
3649 VecOp = Tmp;
3650 }
3651
3652 // If ValB is a constant and can be safely extended, truncate it to the same
3653 // type as ExtA's operand, then extend it to the same type as ExtA. This
3654 // creates two uniform extends that can more easily be matched by the rest of
3655 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
3656 // replaced with the new extend of the constant.
3657 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
3658 VPWidenCastRecipe *&ExtB,
3659 VPValue *&ValB, VPWidenRecipe *Mul) {
3660 if (!ExtA || ExtB || !ValB->isLiveIn())
3661 return;
3662 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
3663 Instruction::CastOps ExtOpc = ExtA->getOpcode();
3664 const APInt *Const;
3665 if (!match(ValB, m_APInt(Const)) ||
3666         !canConstantBeExtended(
3667             Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
3668 return;
3669 // The truncate ensures that the type of each extended operand is the
3670 // same, and it's been proven that the constant can be extended from
3671 // NarrowTy safely. Necessary since ExtA's extended operand would be
3672 // e.g. an i8, while the const will likely be an i32. This will be
3673 // elided by later optimisations.
3674 VPBuilder Builder(Mul);
3675 auto *Trunc =
3676 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
3677 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
3678 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
3679 Mul->setOperand(1, ExtB);
3680 };
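  // For illustration (values assumed): reduce.add(mul(sext i8 %a to i32, 63))
  // becomes reduce.add(mul(sext %a, sext(trunc 63 to i8) to i32)), giving the
  // multiply two matching uniform extends that the matching below can handle.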
3681
3682 // Try to match reduce.add(mul(...)).
3683 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
3684 auto *RecipeA =
3685 dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
3686 auto *RecipeB =
3687 dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
3688 auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
3689
3690 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
3691 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
3692
3693 // Match reduce.add/sub(mul(ext, ext)).
3694 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
3695 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
3696 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
3697 if (Sub)
3698 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
3699 cast<VPWidenRecipe>(Sub), Red);
3700 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
3701 }
3702 // TODO: Add an expression type for this variant with a negated mul
3703 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
3704 return new VPExpressionRecipe(Mul, Red);
3705 }
3706 // TODO: Add an expression type for negated versions of other expression
3707 // variants.
3708 if (Sub)
3709 return nullptr;
3710
3711 // Match reduce.add(ext(mul(A, B))).
3712 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
3713 auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe());
3714 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe());
3715 auto *Ext0 = dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
3716 auto *Ext1 = dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
3717
3718 // reduce.add(ext(mul(ext, const)))
3719 // -> reduce.add(ext(mul(ext, ext(const))))
3720 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
3721
3722 // reduce.add(ext(mul(ext(A), ext(B))))
3723 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
3724 // The inner extends must either have the same opcode as the outer extend or
3725 // be the same, in which case the multiply can never result in a negative
3726 // value and the outer extend can be folded away by doing wider
3727 // extends for the operands of the mul.
3728 if (Ext0 && Ext1 &&
3729 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
3730 Ext0->getOpcode() == Ext1->getOpcode() &&
3731 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
3732 auto *NewExt0 = new VPWidenCastRecipe(
3733 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), *Ext0,
3734 *Ext0, Ext0->getDebugLoc());
3735 NewExt0->insertBefore(Ext0);
3736
3737 VPWidenCastRecipe *NewExt1 = NewExt0;
3738 if (Ext0 != Ext1) {
3739 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
3740 Ext->getResultType(), *Ext1, *Ext1,
3741 Ext1->getDebugLoc());
3742 NewExt1->insertBefore(Ext1);
3743 }
3744 Mul->setOperand(0, NewExt0);
3745 Mul->setOperand(1, NewExt1);
3746 Red->setOperand(1, Mul);
3747 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
3748 }
3749 }
3750 return nullptr;
3751}
3752
3753/// This function tries to create abstract recipes from the reduction recipe for
3754 /// the following optimizations and cost estimation.
3755 static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
3756                                                VPCostContext &Ctx,
3757 VFRange &Range) {
3758 VPExpressionRecipe *AbstractR = nullptr;
3759 auto IP = std::next(Red->getIterator());
3760 auto *VPBB = Red->getParent();
3761 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
3762 AbstractR = MulAcc;
3763 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
3764 AbstractR = ExtRed;
3765 // Cannot create abstract inloop reduction recipes.
3766 if (!AbstractR)
3767 return;
3768
3769 AbstractR->insertBefore(*VPBB, IP);
3770 Red->replaceAllUsesWith(AbstractR);
3771}
3772
3783
3784 void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
3785   if (Plan.hasScalarVFOnly())
3786 return;
3787
3788#ifndef NDEBUG
3789 VPDominatorTree VPDT(Plan);
3790#endif
3791
3792 SmallVector<VPValue *> VPValues;
3795 append_range(VPValues, Plan.getLiveIns());
3796 for (VPRecipeBase &R : *Plan.getEntry())
3797 append_range(VPValues, R.definedValues());
3798
3799 auto *VectorPreheader = Plan.getVectorPreheader();
3800 for (VPValue *VPV : VPValues) {
3802 (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
3803 isa<Constant>(VPV->getLiveInIRValue())))
3804 continue;
3805
3806 // Add explicit broadcast at the insert point that dominates all users.
3807 VPBasicBlock *HoistBlock = VectorPreheader;
3808 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
3809 for (VPUser *User : VPV->users()) {
3810 if (User->usesScalars(VPV))
3811 continue;
3812 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
3813 HoistPoint = HoistBlock->begin();
3814 else
3815 assert(VPDT.dominates(VectorPreheader,
3816 cast<VPRecipeBase>(User)->getParent()) &&
3817 "All users must be in the vector preheader or dominated by it");
3818 }
3819
3820 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
3821 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
3822 VPV->replaceUsesWithIf(Broadcast,
3823 [VPV, Broadcast](VPUser &U, unsigned Idx) {
3824 return Broadcast != &U && !U.usesScalars(VPV);
3825 });
3826 }
3827}
3828
3830 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
3831     PredicatedScalarEvolution &PSE) {
3832   assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
3833 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
3834
3835 VPValue *TC = Plan.getTripCount();
3836 // Skip cases for which the trip count may be non-trivial to materialize.
3837 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
3838 // tail is required.
3839 if (!Plan.hasScalarTail() ||
3840       Plan.getMiddleBlock()->getSingleSuccessor() ==
3841           Plan.getScalarPreheader() ||
3842 !TC->isLiveIn())
3843 return;
3844
3845 // Materialize vector trip counts for constants early if it can simply
3846 // be computed as (Original TC / VF * UF) * VF * UF.
3847 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
3848 // tail-folded loops.
3849 ScalarEvolution &SE = *PSE.getSE();
3850 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
3851 if (!isa<SCEVConstant>(TCScev))
3852 return;
3853 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
3854 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
3855 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
3856 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
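  // For example, a constant trip count of 100 with VF = 8 and UF = 2 gives
  // VFxUF = 16 and a folded vector trip count of (100 / 16) * 16 = 96.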
3857}
3858
3859 void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
3860                                                     VPBasicBlock *VectorPH) {
3861   VPValue *BTC = Plan.getBackedgeTakenCount();
3862   if (BTC->getNumUsers() == 0)
3863 return;
3864
3865 VPBuilder Builder(VectorPH, VectorPH->begin());
3866 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
3867 auto *TCMO = Builder.createNaryOp(
3868 Instruction::Sub,
3869 {Plan.getTripCount(), Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))},
3870 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
3871 BTC->replaceAllUsesWith(TCMO);
3872}
3873
3874 void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
3875   if (Plan.hasScalarVFOnly())
3876 return;
3877
3878 VPTypeAnalysis TypeInfo(Plan);
3879 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3880 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
3881       vp_depth_first_shallow(Plan.getEntry()));
3882   auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
3883 vp_depth_first_shallow(LoopRegion->getEntry()));
3884 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
3885 // VPInstructions, excluding ones in replicate regions. Those are not
3886 // materialized explicitly yet. Those vector users are still handled in
3887 // VPReplicateRegion::execute(), via shouldPack().
3888 // TODO: materialize build vectors for replicating recipes in replicating
3889 // regions.
3890 for (VPBasicBlock *VPBB :
3891 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
3892 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3894 continue;
3895 auto *DefR = cast<VPRecipeWithIRFlags>(&R);
3896 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
3897 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
3898 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
3899 };
3900 if ((isa<VPReplicateRecipe>(DefR) &&
3901 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
3902 (isa<VPInstruction>(DefR) &&
3904 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
3905 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
3906 continue;
3907
3908 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
3909 unsigned Opcode = ScalarTy->isStructTy()
3912 auto *BuildVector = new VPInstruction(Opcode, {DefR});
3913 BuildVector->insertAfter(DefR);
3914
3915 DefR->replaceUsesWithIf(
3916 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
3917 VPUser &U, unsigned) {
3918 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
3919 });
3920 }
3921 }
3922
3923 // Create explicit VPInstructions to convert vectors to scalars. The current
3924 // implementation is conservative - it may miss some cases that may or may not
3925 // be vector values. TODO: introduce Unpacks speculatively - remove them later
3926 // if they are known to operate on scalar values.
3927 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
3928 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3931 continue;
3932 for (VPValue *Def : R.definedValues()) {
3933 // Skip recipes that are single-scalar or only have their first lane
3934 // used.
3935 // TODO: The Defs skipped here may or may not be vector values.
3936 // Introduce Unpacks, and remove them later, if they are guaranteed to
3937 // produce scalar values.
3939 continue;
3940
3941 // At the moment, we create unpacks only for scalar users outside
3942 // replicate regions. Recipes inside replicate regions still extract the
3943 // required lanes implicitly.
3944 // TODO: Remove once replicate regions are unrolled completely.
3945 auto IsCandidateUnpackUser = [Def](VPUser *U) {
3946 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
3947 return U->usesScalars(Def) &&
3948 (!ParentRegion || !ParentRegion->isReplicator());
3949 };
3950 if (none_of(Def->users(), IsCandidateUnpackUser))
3951 continue;
3952
3953 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
3954 if (R.isPhi())
3955 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
3956 else
3957 Unpack->insertAfter(&R);
3958 Def->replaceUsesWithIf(Unpack,
3959 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
3960 return IsCandidateUnpackUser(&U);
3961 });
3962 }
3963 }
3964 }
3965}
3966
3967 void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
3968                                                  VPBasicBlock *VectorPHVPBB,
3969 bool TailByMasking,
3970 bool RequiresScalarEpilogue) {
3971 VPValue &VectorTC = Plan.getVectorTripCount();
3972 assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");
3973 // There's nothing to do if there are no users of the vector trip count or its
3974 // IR value has already been set.
3975 if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())
3976 return;
3977
3978 VPValue *TC = Plan.getTripCount();
3979 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
3980 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
3981 VPValue *Step = &Plan.getVFxUF();
3982
3983 // If the tail is to be folded by masking, round the number of iterations N
3984 // up to a multiple of Step instead of rounding down. This is done by first
3985 // adding Step-1 and then rounding down. Note that it's ok if this addition
3986 // overflows: the vector induction variable will eventually wrap to zero given
3987 // that it starts at zero and its Step is a power of two; the loop will then
3988 // exit, with the last early-exit vector comparison also producing all-true.
3989 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
3990 // is accounted for in emitIterationCountCheck that adds an overflow check.
3991 if (TailByMasking) {
3992 TC = Builder.createNaryOp(
3993 Instruction::Add,
3994 {TC, Builder.createNaryOp(
3995 Instruction::Sub,
3996 {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})},
3997 DebugLoc::getCompilerGenerated(), "n.rnd.up");
3998 }
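  // For example, a trip count of 10 with Step = 8 is rounded up to 17 here,
  // and n.vec below becomes 17 - (17 % 8) = 16, i.e. two masked vector
  // iterations.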
3999
4000 // Now we need to generate the expression for the part of the loop that the
4001 // vectorized body will execute. This is equal to N - (N % Step) if scalar
4002 // iterations are not required for correctness, or N - Step, otherwise. Step
4003 // is equal to the vectorization factor (number of SIMD elements) times the
4004 // unroll factor (number of SIMD instructions).
4005 VPValue *R =
4006 Builder.createNaryOp(Instruction::URem, {TC, Step},
4007 DebugLoc::getCompilerGenerated(), "n.mod.vf");
4008
4009 // There are cases where we *must* run at least one iteration in the remainder
4010 // loop. See the cost model for when this can happen. If the step evenly
4011 // divides the trip count, we set the remainder to be equal to the step. If
4012 // the step does not evenly divide the trip count, no adjustment is necessary
4013 // since there will already be scalar iterations. Note that the minimum
4014 // iterations check ensures that N >= Step.
4015 if (RequiresScalarEpilogue) {
4016 assert(!TailByMasking &&
4017            "requiring scalar epilogue is not supported with tail folding");
4018 VPValue *IsZero = Builder.createICmp(
4019 CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0)));
4020 R = Builder.createSelect(IsZero, Step, R);
4021 }
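  // For example, with trip count 16 and Step = 8 the remainder would be 0, so
  // it is bumped to 8 and n.vec becomes 8, leaving one full vector's worth of
  // iterations for the required scalar epilogue.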
4022
4023 VPValue *Res = Builder.createNaryOp(
4024 Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
4025 VectorTC.replaceAllUsesWith(Res);
4026}
4027
4028 void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
4029                                           ElementCount VFEC) {
4030 VPBuilder Builder(VectorPH, VectorPH->begin());
4031 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4032 VPValue &VF = Plan.getVF();
4033 VPValue &VFxUF = Plan.getVFxUF();
4034 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
4035 // used.
4036 // TODO: Assert that they aren't used.
4037
4038 // If there are no users of the runtime VF, compute VFxUF by constant folding
4039 // the multiplication of VF and UF.
4040 if (VF.getNumUsers() == 0) {
4041 VPValue *RuntimeVFxUF =
4042 Builder.createElementCount(TCTy, VFEC * Plan.getUF());
4043 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
4044 return;
4045 }
4046
4047 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
4048 // vscale) * UF.
4049 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
4051 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
4052     VF.replaceUsesWithIf(
4053         BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
4054 }
4055 VF.replaceAllUsesWith(RuntimeVF);
4056
4057 VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
4058 VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
4059 VFxUF.replaceAllUsesWith(MulByUF);
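  // For example, with VF = <vscale x 4> and UF = 2 this materializes
  // RuntimeVF = 4 * vscale and VFxUF = RuntimeVF * 2; for a fixed VF of 4 the
  // corresponding values are simply 4 and 8.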
4060}
4061
4062 DenseMap<const SCEV *, Value *>
4063 VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
4064   const DataLayout &DL = SE.getDataLayout();
4065 SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/false);
4066
4067 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
4068 BasicBlock *EntryBB = Entry->getIRBasicBlock();
4069 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
4070 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
4071     if (isa<VPIRInstruction>(&R))
4072       continue;
4073 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
4074 if (!ExpSCEV)
4075 break;
4076 const SCEV *Expr = ExpSCEV->getSCEV();
4077 Value *Res =
4078 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
4079 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
4080 VPValue *Exp = Plan.getOrAddLiveIn(Res);
4081 ExpSCEV->replaceAllUsesWith(Exp);
4082 if (Plan.getTripCount() == ExpSCEV)
4083 Plan.resetTripCount(Exp);
4084 ExpSCEV->eraseFromParent();
4085 }
4087 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
4088 "after any VPIRInstructions");
4089 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
4090 // to the VPIRBasicBlock.
4091 auto EI = Entry->begin();
4092 for (Instruction &I : drop_end(*EntryBB)) {
4093 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
4094 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
4095 EI++;
4096 continue;
4097 }
4099 }
4100
4101 return ExpandedSCEVs;
4102}
4103
4104/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
4105/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
4106/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
4107/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
4108/// an index-independent load if it feeds all wide ops at all indices (\p OpV
4109/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
4110/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
4111/// is defined at \p Idx of a load interleave group.
4112static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
4113 VPValue *OpV, unsigned Idx) {
4114 auto *DefR = OpV->getDefiningRecipe();
4115 if (!DefR)
4116 return WideMember0->getOperand(OpIdx) == OpV;
4117 if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
4118 return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV;
4119
4120 if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
4121 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
4122 return false;
4123}
4124
4125/// Returns true if \p IR is a full interleave group with factor and number of
4126/// members both equal to \p VF. The interleave group must also access the full
4127/// vector width \p VectorRegWidth.
4129 unsigned VF, VPTypeAnalysis &TypeInfo,
4130 unsigned VectorRegWidth) {
4131 if (!InterleaveR || InterleaveR->getMask())
4132 return false;
4133
4134 Type *GroupElementTy = nullptr;
4135 if (InterleaveR->getStoredValues().empty()) {
4136 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
4137 if (!all_of(InterleaveR->definedValues(),
4138 [&TypeInfo, GroupElementTy](VPValue *Op) {
4139 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4140 }))
4141 return false;
4142 } else {
4143 GroupElementTy =
4144 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
4145 if (!all_of(InterleaveR->getStoredValues(),
4146 [&TypeInfo, GroupElementTy](VPValue *Op) {
4147 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4148 }))
4149 return false;
4150 }
4151
4152 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
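  // For example, 4 i32 members with VF = 4 give GroupSize = 32 * 4 = 128 bits,
  // which only qualifies when the vector registers are 128 bits wide.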
4153 auto IG = InterleaveR->getInterleaveGroup();
4154 return IG->getFactor() == VF && IG->getNumMembers() == VF &&
4155 GroupSize == VectorRegWidth;
4156}
4157
4158 /// Returns true if \p VPV is a narrow VPValue.
4159static bool isAlreadyNarrow(VPValue *VPV) {
4160 if (VPV->isLiveIn())
4161 return true;
4162 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
4163 return RepR && RepR->isSingleScalar();
4164}
4165
4167 unsigned VectorRegWidth) {
4168 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
4169 if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0)
4170 return;
4171
4172 VPTypeAnalysis TypeInfo(Plan);
4173
4174 unsigned VFMinVal = VF.getKnownMinValue();
4175   SmallVector<VPInterleaveRecipe *> StoreGroups;
4176   for (auto &R : *VectorLoop->getEntryBasicBlock()) {
4178 continue;
4179
4182 continue;
4183
4184 // Bail out on recipes not supported at the moment:
4185 // * phi recipes other than the canonical induction
4186 // * recipes writing to memory except interleave groups
4187 // Only support plans with a canonical induction phi.
4188 if (R.isPhi())
4189 return;
4190
4191 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
4192 if (R.mayWriteToMemory() && !InterleaveR)
4193 return;
4194
4195 // Do not narrow interleave groups if there are VectorPointer recipes and
4196 // the plan was unrolled. The recipe implicitly uses VF from
4197 // VPTransformState.
4198 // TODO: Remove restriction once the VF for the VectorPointer offset is
4199 // modeled explicitly as operand.
4200 if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
4201 return;
4202
4203 // All other ops are allowed, but we reject uses that cannot be converted
4204 // when checking all allowed consumers (store interleave groups) below.
4205 if (!InterleaveR)
4206 continue;
4207
4208 // Bail out on non-consecutive interleave groups.
4209 if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo,
4210 VectorRegWidth))
4211 return;
4212
4213 // Skip read interleave groups.
4214 if (InterleaveR->getStoredValues().empty())
4215 continue;
4216
4217 // Narrow interleave groups, if all operands are already matching narrow
4218 // ops.
4219 auto *Member0 = InterleaveR->getStoredValues()[0];
4220 if (isAlreadyNarrow(Member0) &&
4221 all_of(InterleaveR->getStoredValues(),
4222 [Member0](VPValue *VPV) { return Member0 == VPV; })) {
4223 StoreGroups.push_back(InterleaveR);
4224 continue;
4225 }
4226
4227 // For now, we only support full interleave groups storing load interleave
4228 // groups.
4229 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
4230 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
4231 if (!DefR)
4232 return false;
4233 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
4234 return IR && IR->getInterleaveGroup()->isFull() &&
4235 IR->getVPValue(Op.index()) == Op.value();
4236 })) {
4237 StoreGroups.push_back(InterleaveR);
4238 continue;
4239 }
4240
4241     // Check if all values feeding InterleaveR are matching wide recipes whose
4242     // operands can be narrowed.
4243 auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
4244 InterleaveR->getStoredValues()[0]->getDefiningRecipe());
4245 if (!WideMember0)
4246 return;
4247 for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
4248 auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
4249 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
4250 R->getNumOperands() > 2)
4251 return;
4252 if (any_of(enumerate(R->operands()),
4253 [WideMember0, Idx = I](const auto &P) {
4254 const auto &[OpIdx, OpV] = P;
4255 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
4256 }))
4257 return;
4258 }
4259 StoreGroups.push_back(InterleaveR);
4260 }
4261
4262 if (StoreGroups.empty())
4263 return;
4264
4265 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
4266 SmallPtrSet<VPValue *, 4> NarrowedOps;
4267 auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * {
4268 auto *R = V->getDefiningRecipe();
4269 if (!R || NarrowedOps.contains(V))
4270 return V;
4271 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
4272 // Narrow interleave group to wide load, as transformed VPlan will only
4273 // process one original iteration.
4274 auto *LI =
4275 cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
4276 auto *L = new VPWidenLoadRecipe(
4277 *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
4278 /*Reverse=*/false, LI->getAlign(), {}, LoadGroup->getDebugLoc());
4279 L->insertBefore(LoadGroup);
4280 NarrowedOps.insert(L);
4281 return L;
4282 }
4283
4284 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
4285 assert(RepR->isSingleScalar() &&
4286 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
4287 "must be a single scalar load");
4288 NarrowedOps.insert(RepR);
4289 return RepR;
4290 }
4291 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
4292
4293 VPValue *PtrOp = WideLoad->getAddr();
4294 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
4295 PtrOp = VecPtr->getOperand(0);
4296 // Narrow wide load to uniform scalar load, as transformed VPlan will only
4297 // process one original iteration.
4298 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
4299 /*IsUniform*/ true,
4300 /*Mask*/ nullptr, *WideLoad);
4301 N->insertBefore(WideLoad);
4302 NarrowedOps.insert(N);
4303 return N;
4304 };
4305
4306 // Narrow operation tree rooted at store groups.
4307 for (auto *StoreGroup : StoreGroups) {
4308 VPValue *Res = nullptr;
4309 VPValue *Member0 = StoreGroup->getStoredValues()[0];
4310 if (isAlreadyNarrow(Member0)) {
4311 Res = Member0;
4312 } else if (auto *WideMember0 =
4313                    dyn_cast_or_null<VPWidenRecipe>(Member0->getDefiningRecipe())) {
4314       for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
4315 WideMember0->setOperand(Idx, NarrowOp(WideMember0->getOperand(Idx)));
4316 Res = WideMember0;
4317 } else {
4318 Res = NarrowOp(Member0);
4319 }
4320
4321 auto *SI =
4322 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
4323 auto *S = new VPWidenStoreRecipe(
4324 *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
4325 /*Reverse=*/false, SI->getAlign(), {}, StoreGroup->getDebugLoc());
4326 S->insertBefore(StoreGroup);
4327 StoreGroup->eraseFromParent();
4328 }
4329
4330 // Adjust induction to reflect that the transformed plan only processes one
4331 // original iteration.
4332 auto *CanIV = VectorLoop->getCanonicalIV();
4333 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
4334 VPBuilder PHBuilder(Plan.getVectorPreheader());
4335
4336 VPValue *UF = Plan.getOrAddLiveIn(
4337 ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF()));
4338 if (VF.isScalable()) {
4339 VPValue *VScale = PHBuilder.createElementCount(
4340 CanIV->getScalarType(), ElementCount::getScalable(1));
4341 VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
4342 Inc->setOperand(1, VScaleUF);
4343 Plan.getVF().replaceAllUsesWith(VScale);
4344 } else {
4345 Inc->setOperand(1, UF);
4346     Plan.getVF().replaceAllUsesWith(
4347         Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
4348 }
4349 removeDeadRecipes(Plan);
4350}
4351
4352/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
4353/// BranchOnCond recipe.
4354 void VPlanTransforms::addBranchWeightToMiddleTerminator(
4355     VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
4356 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
4357 auto *MiddleTerm =
4358       dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator());
4359   // Only add branch metadata if there is a (conditional) terminator.
4360 if (!MiddleTerm)
4361 return;
4362
4363 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
4364 "must have a BranchOnCond");
4365   // Assume that `TripCount % VectorStep` is equally distributed.
4366 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
4367 if (VF.isScalable() && VScaleForTuning.has_value())
4368 VectorStep *= *VScaleForTuning;
4369 assert(VectorStep > 0 && "trip count should not be zero");
4370 MDBuilder MDB(Plan.getContext());
4371 MDNode *BranchWeights =
4372 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
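  // For example, with VF = 4, UF = 2 and no vscale tuning, VectorStep is 8 and
  // the weights are {1, 7}: under the equal-distribution assumption only one
  // in eight trip counts leaves no scalar remainder.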
4373 MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
4374}
4375
4376/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
4377/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
4378/// the end value of the induction.
4379 static VPInstruction *addResumePhiRecipeForInduction(
4380     VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
4381 VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
4382 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
4383 // Truncated wide inductions resume from the last lane of their vector value
4384 // in the last vector iteration which is handled elsewhere.
4385 if (WideIntOrFp && WideIntOrFp->getTruncInst())
4386 return nullptr;
4387
4388 VPValue *Start = WideIV->getStartValue();
4389 VPValue *Step = WideIV->getStepValue();
4390   const InductionDescriptor &ID = WideIV->getInductionDescriptor();
4391   VPValue *EndValue = VectorTC;
4392 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
4393 EndValue = VectorPHBuilder.createDerivedIV(
4394 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
4395 Start, VectorTC, Step);
4396 }
4397
4398 // EndValue is derived from the vector trip count (which has the same type as
4399 // the widest induction) and thus may be wider than the induction here.
4400 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
4401 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
4402 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
4403 ScalarTypeOfWideIV,
4404 WideIV->getDebugLoc());
4405 }
4406
4407 auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi(
4408 {EndValue, Start}, WideIV->getDebugLoc(), "bc.resume.val");
4409 return ResumePhiRecipe;
4410}
4411
4412 void VPlanTransforms::addScalarResumePhis(
4413     VPlan &Plan, VPRecipeBuilder &Builder,
4414 DenseMap<VPValue *, VPValue *> &IVEndValues) {
4415 VPTypeAnalysis TypeInfo(Plan);
4416 auto *ScalarPH = Plan.getScalarPreheader();
4417 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
4418 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
4419 VPBuilder VectorPHBuilder(
4420 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
4421 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
4422 VPBuilder ScalarPHBuilder(ScalarPH);
4423 for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) {
4424 auto *ScalarPhiIRI = cast<VPIRPhi>(&ScalarPhiR);
4425
4426 // TODO: Extract final value from induction recipe initially, optimize to
4427 // pre-computed end value together in optimizeInductionExitUsers.
4428 auto *VectorPhiR =
4429 cast<VPHeaderPHIRecipe>(Builder.getRecipe(&ScalarPhiIRI->getIRPhi()));
4430 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
4431       if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction(
4432               WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
4433 &Plan.getVectorTripCount())) {
4434 assert(isa<VPPhi>(ResumePhi) && "Expected a phi");
4435 IVEndValues[WideIVR] = ResumePhi->getOperand(0);
4436 ScalarPhiIRI->addOperand(ResumePhi);
4437 continue;
4438 }
4439 // TODO: Also handle truncated inductions here. Computing end-values
4440 // separately should be done as VPlan-to-VPlan optimization, after
4441 // legalizing all resume values to use the last lane from the loop.
4442 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
4443 "should only skip truncated wide inductions");
4444 continue;
4445 }
4446
4447 // The backedge value provides the value to resume coming out of a loop,
4448 // which for FORs is a vector whose last element needs to be extracted. The
4449 // start value provides the value if the loop is bypassed.
4450 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
4451 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
4452 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
4453 "Cannot handle loops with uncountable early exits");
4454 if (IsFOR)
4455 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
4456 VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {},
4457 "vector.recur.extract");
4458 StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
4459 auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
4460 {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
4461 ScalarPhiIRI->addOperand(ResumePhiR);
4462 }
4463}
4464
4465 void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
4466                                                  VFRange &Range) {
4467 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
4468 auto *ScalarPHVPBB = Plan.getScalarPreheader();
4469 auto *MiddleVPBB = Plan.getMiddleBlock();
4470 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
4471 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
4472
4473 auto IsScalableOne = [](ElementCount VF) -> bool {
4474 return VF == ElementCount::getScalable(1);
4475 };
4476
4477 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
4478 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
4479 if (!FOR)
4480 continue;
4481
4482 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
4483 "Cannot handle loops with uncountable early exits");
4484
4485 // This is the second phase of vectorizing first-order recurrences, creating
4486 // extract for users outside the loop. An overview of the transformation is
4487 // described below. Suppose we have the following loop with some use after
4488 // the loop of the last a[i-1],
4489 //
4490 // for (int i = 0; i < n; ++i) {
4491 // t = a[i - 1];
4492 // b[i] = a[i] - t;
4493 // }
4494 // use t;
4495 //
4496 // There is a first-order recurrence on "a". For this loop, the shorthand
4497 // scalar IR looks like:
4498 //
4499 // scalar.ph:
4500 // s.init = a[-1]
4501 // br scalar.body
4502 //
4503 // scalar.body:
4504 // i = phi [0, scalar.ph], [i+1, scalar.body]
4505 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
4506 // s2 = a[i]
4507 // b[i] = s2 - s1
4508 // br cond, scalar.body, exit.block
4509 //
4510 // exit.block:
4511 // use = lcssa.phi [s1, scalar.body]
4512 //
4513 // In this example, s1 is a recurrence because it's value depends on the
4514 // previous iteration. In the first phase of vectorization, we created a
4515 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
4516 // for users in the scalar preheader and exit block.
4517 //
4518 // vector.ph:
4519 // v_init = vector(..., ..., ..., a[-1])
4520 // br vector.body
4521 //
4522 // vector.body
4523 // i = phi [0, vector.ph], [i+4, vector.body]
4524 // v1 = phi [v_init, vector.ph], [v2, vector.body]
4525 // v2 = a[i, i+1, i+2, i+3]
4526 // b[i] = v2 - v1
4527 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
4528 // b[i, i+1, i+2, i+3] = v2 - v1
4529 // br cond, vector.body, middle.block
4530 //
4531 // middle.block:
4532 // vector.recur.extract.for.phi = v2(2)
4533 // vector.recur.extract = v2(3)
4534 // br cond, scalar.ph, exit.block
4535 //
4536 // scalar.ph:
4537 // scalar.recur.init = phi [vector.recur.extract, middle.block],
4538 // [s.init, otherwise]
4539 // br scalar.body
4540 //
4541 // scalar.body:
4542 // i = phi [0, scalar.ph], [i+1, scalar.body]
4543 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
4544 // s2 = a[i]
4545 // b[i] = s2 - s1
4546 // br cond, scalar.body, exit.block
4547 //
4548 // exit.block:
4549 // lo = lcssa.phi [s1, scalar.body],
4550 // [vector.recur.extract.for.phi, middle.block]
4551 //
4552 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
4553 // Extract the penultimate value of the recurrence and use it as operand for
4554 // the VPIRInstruction modeling the phi.
4555 for (VPUser *U : FOR->users()) {
4556 using namespace llvm::VPlanPatternMatch;
4557 if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
4558 continue;
4559 // For VF vscale x 1, if vscale = 1, we are unable to extract the
4560 // penultimate value of the recurrence. Instead we rely on the existing
4561 // extract of the last element from the result of
4562 // VPInstruction::FirstOrderRecurrenceSplice.
4563 // TODO: Consider vscale_range info and UF.
4564       if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne,
4565                                                              Range))
4566 return;
4567 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
4568 VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
4569 {}, "vector.recur.extract.for.phi");
4570 cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement);
4571 }
4572 }
4573}
Return true if Cond is known to be true for given BestVF and BestUF.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, unsigned VF, VPTypeAnalysis &TypeInfo, unsigned VectorRegWidth)
Returns true if IR is a full interleave group with factor and number of members both equal to VF.
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static void recursivelyDeleteDeadRecipes(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static cl::opt< bool > EnableWideActiveLaneMask("enable-wide-lane-mask", cl::init(false), cl::Hidden, cl::desc("Enable use of wide get active lane mask instructions"))
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, ScalarEvolution &SE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &AllOneMask, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1512
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:163
static DebugLoc getUnknown()
Definition DebugLoc.h:162
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:248
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:325
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:313
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1557
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1078
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:103
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:59
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:102
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:150
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Provides information about what library functions are available for the current target.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:88
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:97
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:295
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
op_range operands()
Definition User.h:292
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3528
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3815
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:3890
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:3842
iterator end()
Definition VPlan.h:3852
iterator begin()
Recipe iterator methods.
Definition VPlan.h:3850
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:3903
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:220
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:593
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:565
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:638
const VPRecipeBase & back() const
Definition VPlan.h:3864
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2406
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2440
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2430
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2446
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2426
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:80
VPRegionBlock * getParent()
Definition VPlan.h:172
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:190
size_t getNumSuccessors() const
Definition VPlan.h:218
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition VPlan.h:321
size_t getNumPredecessors() const
Definition VPlan.h:219
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:203
VPlan * getPlan()
Definition VPlan.cpp:165
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:214
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:170
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:263
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:208
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:197
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:186
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:207
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:126
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:145
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:164
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:2950
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL)
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3471
Type * getScalarType() const
Returns the scalar type of the induction.
Definition VPlan.h:3498
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:424
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:419
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:397
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:409
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3636
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition VPlan.h:3559
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:2995
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:1981
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2029
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2018
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:3968
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:3992
Class to record and manage LLVM IR flags.
Definition VPlan.h:596
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \I , if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:938
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetada object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:979
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1063
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1014
@ FirstOrderRecurrenceSplice
Definition VPlan.h:985
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1009
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1006
@ CanonicalIVIncrementForPart
Definition VPlan.h:999
@ CalculateTripCountMinusVF
Definition VPlan.h:997
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2547
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2539
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2568
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2620
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2579
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3136
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:386
bool mayReadOrWriteMemory() const
Returns true if the recipe may read from or write to memory.
Definition VPlan.h:473
bool mayHaveSideEffects() const
Returns true if the recipe may have side-effects.
VPRegionBlock * getRegion()
Definition VPlan.h:4114
VPBasicBlock * getParent()
Definition VPlan.h:407
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:478
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:2828
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition VPlan.h:2669
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4003
const VPBlockBase * getEntry() const
Definition VPlan.h:4039
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4071
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4056
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4101
const VPBlockBase * getExiting() const
Definition VPlan.h:4051
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4064
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2872
bool isSingleScalar() const
Definition VPlan.h:2917
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:2941
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3705
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:517
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:582
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:199
operand_range operands()
Definition VPlanValue.h:267
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:243
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:238
void addOperand(VPValue *Operand)
Definition VPlanValue.h:232
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:48
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:135
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition VPlanValue.h:176
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:85
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:186
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1393
unsigned getNumUsers() const
Definition VPlanValue.h:113
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition VPlanValue.h:171
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1397
user_range users()
Definition VPlanValue.h:134
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1845
VPVectorEndPointerRecipe * clone() override
Clone the current recipe.
Definition VPlan.h:1889
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3600
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1486
Instruction::CastOps getOpcode() const
Definition VPlan.h:1537
A recipe for handling GEP instructions.
Definition VPlan.h:1773
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2046
PHINode * getPHINode() const
Definition VPlan.h:2088
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2074
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2091
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2121
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2202
A recipe for widening vector intrinsics.
Definition VPlan.h:1544
A common base class for widening memory operations.
Definition VPlan.h:3178
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3243
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:3236
A recipe for widened phis.
Definition VPlan.h:2257
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1443
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition VPlan.h:4127
bool hasVF(ElementCount VF) const
Definition VPlan.h:4333
LLVMContext & getContext() const
Definition VPlan.h:4321
VPBasicBlock * getEntry()
Definition VPlan.h:4221
VPValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4312
bool hasScalableVF() const
Definition VPlan.h:4334
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4319
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4315
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4283
VPValue * getTrue()
Return a VPValue wrapping i1 true.
Definition VPlan.h:4390
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4304
unsigned getUF() const
Definition VPlan.h:4353
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4457
bool hasUF(unsigned UF) const
Definition VPlan.h:4351
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4273
void setVF(ElementCount VF)
Definition VPlan.h:4327
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4366
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1027
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4297
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4246
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4435
VPValue * getFalse()
Return a VPValue wrapping i1 false.
Definition VPlan.h:4396
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4375
bool hasScalarVFOnly() const
Definition VPlan.h:4344
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4264
ArrayRef< VPValue * > getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4405
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4269
VPValue * getLiveIn(Value *V) const
Return the live-in VPValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4402
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4226
void setUF(unsigned UF)
Definition VPlan.h:4358
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4489
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:169
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:257
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:172
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2763
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
AllRecipe_commutative_match< Opcode, Op0_t, Op1_t > m_c_Binary(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLanePerPart, Op0_t > m_ExtractLastLanePerPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExtractLastElement, Op0_t > m_ExtractLastElement(const Op0_t &Op0)
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
specific_intval< 1 > m_True()
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector is matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
initializer< Ty > init(const Ty &Val)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
const SCEV * getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE)
Return the SCEV expression for V.
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:477
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:236
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1622
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1734
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
RecurKind
These are the kinds of recurrences that we support.
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1954
DWARFExpression::Operation Op
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1961
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1758
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2088
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:830
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2299
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3306
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3266
A recipe for widening select instructions.
Definition VPlan.h:1727
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3388
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3346
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void addScalarResumePhis(VPlan &Plan, VPRecipeBuilder &Builder, DenseMap< VPValue *, VPValue * > &IVEndValues)
Create resume phis in the scalar preheader for first-order recurrences, reductions and inductions,...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first order reductions in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB by.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.