VPlanTransforms.cpp
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
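/// They are applied to the initial VPlan after construction, e.g. to replace
/// plain VPInstructions with widened recipes, form and merge replicate
/// regions, and simplify, legalize and clean up recipes.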
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
44
45using namespace llvm;
46using namespace VPlanPatternMatch;
47using namespace SCEVPatternMatch;
48
49bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
50    VPlan &Plan,
51    function_ref<const InductionDescriptor *(PHINode *)>
52        GetIntOrFpInductionDescriptor,
53    const TargetLibraryInfo &TLI) {
54
55  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
56      Plan.getVectorLoopRegion());
57  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
58    // Skip blocks outside the region.
59 if (!VPBB->getParent())
60 break;
61 VPRecipeBase *Term = VPBB->getTerminator();
62 auto EndIter = Term ? Term->getIterator() : VPBB->end();
63 // Introduce each ingredient into VPlan.
64 for (VPRecipeBase &Ingredient :
65 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
66
67 VPValue *VPV = Ingredient.getVPSingleValue();
68 if (!VPV->getUnderlyingValue())
69 continue;
70
71      Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
72
73 VPRecipeBase *NewRecipe = nullptr;
74 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
75 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
76 const auto *II = GetIntOrFpInductionDescriptor(Phi);
77 if (!II) {
78 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
79 for (VPValue *Op : PhiR->operands())
80 NewRecipe->addOperand(Op);
81 } else {
82 VPIRValue *Start = Plan.getOrAddLiveIn(II->getStartValue());
83        VPValue *Step =
84            vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep());
85 // It is always safe to copy over the NoWrap and FastMath flags. In
86 // particular, when folding tail by masking, the masked-off lanes are
87 // never used, so it is safe.
89 NewRecipe = new VPWidenIntOrFpInductionRecipe(
90 Phi, Start, Step, &Plan.getVF(), *II, Flags,
91 Ingredient.getDebugLoc());
92 }
93 } else {
94 auto *VPI = cast<VPInstruction>(&Ingredient);
95 assert(!isa<PHINode>(Inst) && "phis should be handled above");
96 // Create VPWidenMemoryRecipe for loads and stores.
97 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
98 NewRecipe = new VPWidenLoadRecipe(
99 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
100 false /*Consecutive*/, false /*Reverse*/, *VPI,
101 Ingredient.getDebugLoc());
102 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
103 NewRecipe = new VPWidenStoreRecipe(
104 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
105 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
106              Ingredient.getDebugLoc());
107        } else if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
108          NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
109 Ingredient.getDebugLoc());
110 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
111 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
112 if (VectorID == Intrinsic::not_intrinsic)
113 return false;
114 NewRecipe = new VPWidenIntrinsicRecipe(
115 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
116 drop_end(Ingredient.operands()), CI->getType(), VPIRFlags(*CI),
117 *VPI, CI->getDebugLoc());
118 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
119 NewRecipe = new VPWidenCastRecipe(
120 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
121 VPIRFlags(*CI), VPIRMetadata(*CI));
122 } else {
123 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
124 *VPI, Ingredient.getDebugLoc());
125 }
126 }
127
128 NewRecipe->insertBefore(&Ingredient);
129 if (NewRecipe->getNumDefinedValues() == 1)
130 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
131 else
132 assert(NewRecipe->getNumDefinedValues() == 0 &&
133                 "Only recipes with zero or one defined values expected");
134 Ingredient.eraseFromParent();
135 }
136 }
137 return true;
138}
139
140/// Helper for extra no-alias checks via known-safe recipe and SCEV.
141class SinkStoreInfo {
142  const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
143  VPReplicateRecipe &GroupLeader;
144  PredicatedScalarEvolution &PSE;
145  const Loop &L;
146 VPTypeAnalysis &TypeInfo;
147
148  // Return true if \p A and \p B are known not to alias for all VFs in the
149  // plan, checked via the distance between the accesses.
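  // For example, two i32 stores whose addresses are a constant 32 bytes apart
  // cannot overlap as long as VF * 4 <= 32 bytes holds, i.e. the check below
  // succeeds for plans whose maximum fixed VF is at most 8.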
150 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
151 if (A->getOpcode() != Instruction::Store ||
152 B->getOpcode() != Instruction::Store)
153 return false;
154
155 VPValue *AddrA = A->getOperand(1);
156 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
157 VPValue *AddrB = B->getOperand(1);
158 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
159    if (isa<SCEVCouldNotCompute>(SCEVA) || isa<SCEVCouldNotCompute>(SCEVB))
160      return false;
161
162 const APInt *Distance;
163 ScalarEvolution &SE = *PSE.getSE();
164 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
165 return false;
166
167 const DataLayout &DL = SE.getDataLayout();
168 Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
169 uint64_t SizeA = DL.getTypeStoreSize(TyA);
170 Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
171 uint64_t SizeB = DL.getTypeStoreSize(TyB);
172
173 // Use the maximum store size to ensure no overlap from either direction.
174    // Currently only fixed sizes are handled, as this is only used for
175    // VPReplicateRecipes.
176 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
177
178 auto VFs = B->getParent()->getPlan()->vectorFactors();
180 return Distance->abs().uge(
181 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
182 }
183
184public:
185  SinkStoreInfo(const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes,
186                VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE,
187                const Loop &L, VPTypeAnalysis &TypeInfo)
188 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
189 L(L), TypeInfo(TypeInfo) {}
190
191 /// Return true if \p R should be skipped during alias checking, either
192 /// because it's in the exclude set or because no-alias can be proven via
193 /// SCEV.
194 bool shouldSkip(VPRecipeBase &R) const {
195 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
196 return ExcludeRecipes.contains(&R) ||
197 (Store && isNoAliasViaDistance(Store, &GroupLeader));
198 }
199};
200
201/// Check if a memory operation doesn't alias with memory operations in blocks
202/// between \p FirstBB and \p LastBB using scoped noalias metadata. If
203/// \p SinkInfo is std::nullopt, only recipes that may write to memory are
204/// checked (for load hoisting). Otherwise recipes that both read and write
205/// memory are checked, and SCEV is used to prove no-alias between the group
206/// leader and other replicate recipes (for store sinking).
207static bool
209 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
210 std::optional<SinkStoreInfo> SinkInfo = {}) {
211 bool CheckReads = SinkInfo.has_value();
212 if (!MemLoc.AATags.Scope)
213 return false;
214
215 const AAMDNodes &MemAA = MemLoc.AATags;
216
217 for (VPBlockBase *Block = FirstBB; Block;
218 Block = Block->getSingleSuccessor()) {
219 assert(Block->getNumSuccessors() <= 1 &&
220 "Expected at most one successor in block chain");
221 auto *VPBB = cast<VPBasicBlock>(Block);
222 for (VPRecipeBase &R : *VPBB) {
223 if (SinkInfo && SinkInfo->shouldSkip(R))
224 continue;
225
226 // Skip recipes that don't need checking.
227 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
228 continue;
229
231 if (!Loc)
232 // Conservatively assume aliasing for memory operations without
233 // location.
234 return false;
235
236 // For reads, check if they don't alias in the reverse direction and
237 // skip if so.
238 if (CheckReads && R.mayReadFromMemory() &&
240 MemAA.NoAlias))
241 continue;
242
243 // Check if the memory operations may alias in the forward direction.
245 Loc->AATags.NoAlias))
246 return false;
247 }
248
249 if (Block == LastBB)
250 break;
251 }
252 return true;
253}
254
255/// Return true if we do not know how to (mechanically) hoist or sink \p R out
256/// of a loop region.
257static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) {
258  // Assume intrinsics don't alias anything or throw; as long as they're
259  // guaranteed to execute, they're safe to hoist.
261 return false;
262
263 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
264 // memory location is not modified in the vector loop.
265 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
266 return true;
267
268 // Allocas cannot be hoisted.
269 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
270 return RepR && RepR->getOpcode() == Instruction::Alloca;
271}
272
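/// Sink recipes that feed recipes inside replicate regions into those regions,
/// when all their users are in the region or users outside only demand the
/// first lane (in which case the candidate is duplicated). Returns true if
/// anything was sunk.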
273static bool sinkScalarOperands(VPlan &Plan) {
274 auto Iter = vp_depth_first_deep(Plan.getEntry());
275 bool ScalarVFOnly = Plan.hasScalarVFOnly();
276 bool Changed = false;
277
278  SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
279  auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
280 VPBasicBlock *SinkTo, VPValue *Op) {
281 auto *Candidate =
282 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
283 if (!Candidate)
284 return;
285
286 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
287    // for now.
288    if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate))
289      return;
290
291 if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
292 return;
293
294 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
295 if (!ScalarVFOnly && RepR->isSingleScalar())
296 return;
297
298 WorkList.insert({SinkTo, Candidate});
299 };
300
301  // First, collect the operands of all recipes in replicate blocks as seeds
302  // for sinking.
303  for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
304    VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
305 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
306 continue;
307 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
308 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
309 continue;
310 for (auto &Recipe : *VPBB)
311 for (VPValue *Op : Recipe.operands())
312 InsertIfValidSinkCandidate(VPBB, Op);
313 }
314
315 // Try to sink each replicate or scalar IV steps recipe in the worklist.
316 for (unsigned I = 0; I != WorkList.size(); ++I) {
317 VPBasicBlock *SinkTo;
318 VPSingleDefRecipe *SinkCandidate;
319 std::tie(SinkTo, SinkCandidate) = WorkList[I];
320
321 // All recipe users of SinkCandidate must be in the same block SinkTo or all
322 // users outside of SinkTo must only use the first lane of SinkCandidate. In
323 // the latter case, we need to duplicate SinkCandidate.
324 auto UsersOutsideSinkTo =
325 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
326 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
327 });
328 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
329 return !U->usesFirstLaneOnly(SinkCandidate);
330 }))
331 continue;
332 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
333
334 if (NeedsDuplicating) {
335 if (ScalarVFOnly)
336 continue;
337 VPSingleDefRecipe *Clone;
338 if (auto *SinkCandidateRepR =
339 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
340 // TODO: Handle converting to uniform recipes as separate transform,
341 // then cloning should be sufficient here.
342 Instruction *I = SinkCandidate->getUnderlyingInstr();
343 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
344 nullptr /*Mask*/, *SinkCandidateRepR,
345 *SinkCandidateRepR);
346 // TODO: add ".cloned" suffix to name of Clone's VPValue.
347 } else {
348 Clone = SinkCandidate->clone();
349 }
350
351 Clone->insertBefore(SinkCandidate);
352 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
353 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
354 });
355 }
356 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
357 for (VPValue *Op : SinkCandidate->operands())
358 InsertIfValidSinkCandidate(SinkTo, Op);
359 Changed = true;
360 }
361 return Changed;
362}
363
364/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
365/// the mask.
366static VPValue *getPredicatedMask(VPRegionBlock *R) {
367  auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
368 if (!EntryBB || EntryBB->size() != 1 ||
369 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
370 return nullptr;
371
372 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
373}
374
375/// If \p R is a triangle region, return the 'then' block of the triangle.
376static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
377  auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
378 if (EntryBB->getNumSuccessors() != 2)
379 return nullptr;
380
381 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
382 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
383 if (!Succ0 || !Succ1)
384 return nullptr;
385
386 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
387 return nullptr;
388 if (Succ0->getSingleSuccessor() == Succ1)
389 return Succ0;
390 if (Succ1->getSingleSuccessor() == Succ0)
391 return Succ1;
392 return nullptr;
393}
394
395// Merge replicate regions into their successor region, if a replicate region
396// is connected to a successor replicate region with the same predicate by a
397// single, empty VPBasicBlock.
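// Roughly, for two replicate regions guarded by the same mask and separated by
// an empty VPBasicBlock:
//
//   preds -> Region1 -> empty VPBB -> Region2 -> succ
//
// the recipes of Region1's 'then' block are moved into Region2's 'then' block,
// Region1 is disconnected, and its predecessors are wired to the empty block.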
398static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
399  SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
400
401  // Collect, up front, replicate regions followed by an empty block, followed
402  // by another replicate region with matching masks. This avoids iterator
403  // invalidation issues while merging regions.
404  SmallVector<VPRegionBlock *> WorkList;
405  for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
406           vp_depth_first_deep(Plan.getEntry()))) {
407 if (!Region1->isReplicator())
408 continue;
409 auto *MiddleBasicBlock =
410 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
411 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
412 continue;
413
414 auto *Region2 =
415 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
416 if (!Region2 || !Region2->isReplicator())
417 continue;
418
419 VPValue *Mask1 = getPredicatedMask(Region1);
420 VPValue *Mask2 = getPredicatedMask(Region2);
421 if (!Mask1 || Mask1 != Mask2)
422 continue;
423
424    assert(Mask1 && Mask2 && "both regions must have conditions");
425 WorkList.push_back(Region1);
426 }
427
428 // Move recipes from Region1 to its successor region, if both are triangles.
429 for (VPRegionBlock *Region1 : WorkList) {
430 if (TransformedRegions.contains(Region1))
431 continue;
432 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
433 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
434
435 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
436 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
437 if (!Then1 || !Then2)
438 continue;
439
440 // Note: No fusion-preventing memory dependencies are expected in either
441 // region. Such dependencies should be rejected during earlier dependence
442 // checks, which guarantee accesses can be re-ordered for vectorization.
443 //
444 // Move recipes to the successor region.
445 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
446 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
447
448 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
449 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
450
451 // Move VPPredInstPHIRecipes from the merge block to the successor region's
452 // merge block. Update all users inside the successor region to use the
453 // original values.
454 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
455 VPValue *PredInst1 =
456 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
457 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
458 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
459 return cast<VPRecipeBase>(&U)->getParent() == Then2;
460 });
461
462 // Remove phi recipes that are unused after merging the regions.
463 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
464 Phi1ToMove.eraseFromParent();
465 continue;
466 }
467 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
468 }
469
470 // Remove the dead recipes in Region1's entry block.
471 for (VPRecipeBase &R :
472 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
473 R.eraseFromParent();
474
475 // Finally, remove the first region.
476 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
477 VPBlockUtils::disconnectBlocks(Pred, Region1);
478 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
479 }
480 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
481 TransformedRegions.insert(Region1);
482 }
483
484 return !TransformedRegions.empty();
485}
486
487static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
488                                            VPlan &Plan) {
489 Instruction *Instr = PredRecipe->getUnderlyingInstr();
490 // Build the triangular if-then region.
491 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
492 assert(Instr->getParent() && "Predicated instruction not in any basic block");
493 auto *BlockInMask = PredRecipe->getMask();
494 auto *MaskDef = BlockInMask->getDefiningRecipe();
495 auto *BOMRecipe = new VPBranchOnMaskRecipe(
496 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
497 auto *Entry =
498 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
499
500 // Replace predicated replicate recipe with a replicate recipe without a
501 // mask but in the replicate region.
502 auto *RecipeWithoutMask = new VPReplicateRecipe(
503 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
504 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
505 PredRecipe->getDebugLoc());
506 auto *Pred =
507 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
508
509 VPPredInstPHIRecipe *PHIRecipe = nullptr;
510 if (PredRecipe->getNumUsers() != 0) {
511 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
512 RecipeWithoutMask->getDebugLoc());
513 PredRecipe->replaceAllUsesWith(PHIRecipe);
514 PHIRecipe->setOperand(0, RecipeWithoutMask);
515 }
516 PredRecipe->eraseFromParent();
517 auto *Exiting =
518 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
519  VPRegionBlock *Region =
520      Plan.createReplicateRegion(Entry, Exiting, RegionName);
521
522 // Note: first set Entry as region entry and then connect successors starting
523 // from it in order, to propagate the "parent" of each VPBasicBlock.
524 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
525 VPBlockUtils::connectBlocks(Pred, Exiting);
526
527 return Region;
528}
529
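/// Wrap each predicated VPReplicateRecipe in its own if-then replicate region:
/// the recipe's block is split at the recipe and an entry block branching on
/// the recipe's mask guards the now-unmasked replicate recipe.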
530static void addReplicateRegions(VPlan &Plan) {
531  SmallVector<VPReplicateRecipe *> WorkList;
532  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
533           vp_depth_first_deep(Plan.getEntry()))) {
534 for (VPRecipeBase &R : *VPBB)
535 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
536 if (RepR->isPredicated())
537 WorkList.push_back(RepR);
538 }
539 }
540
541 unsigned BBNum = 0;
542 for (VPReplicateRecipe *RepR : WorkList) {
543 VPBasicBlock *CurrentBlock = RepR->getParent();
544 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
545
546 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
547 SplitBlock->setName(
548 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
549    // Record predicated instructions for above packing optimizations.
550    VPRegionBlock *Region = createReplicateRegion(RepR, Plan);
551    Region->setParent(CurrentBlock->getParent());
553
554 VPRegionBlock *ParentRegion = Region->getParent();
555 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
556 ParentRegion->setExiting(SplitBlock);
557 }
558}
559
560/// Remove redundant VPBasicBlocks by merging them into their predecessor if
561/// the predecessor has a single successor.
562static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
563  SmallVector<VPBasicBlock *> WorkList;
564  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
565           vp_depth_first_deep(Plan.getEntry()))) {
566 // Don't fold the blocks in the skeleton of the Plan into their single
567 // predecessors for now.
568 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
569 if (!VPBB->getParent())
570 continue;
571 auto *PredVPBB =
572 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
573 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
574 isa<VPIRBasicBlock>(PredVPBB))
575 continue;
576 WorkList.push_back(VPBB);
577 }
578
579 for (VPBasicBlock *VPBB : WorkList) {
580 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
581 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
582 R.moveBefore(*PredVPBB, PredVPBB->end());
583 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
584 auto *ParentRegion = VPBB->getParent();
585 if (ParentRegion && ParentRegion->getExiting() == VPBB)
586 ParentRegion->setExiting(PredVPBB);
587 for (auto *Succ : to_vector(VPBB->successors())) {
588      VPBlockUtils::disconnectBlocks(VPBB, Succ);
589      VPBlockUtils::connectBlocks(PredVPBB, Succ);
590 }
591 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
592 }
593 return !WorkList.empty();
594}
595
596void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
597  // Convert masked VPReplicateRecipes to if-then region blocks.
598  addReplicateRegions(Plan);
599
600 bool ShouldSimplify = true;
601 while (ShouldSimplify) {
602 ShouldSimplify = sinkScalarOperands(Plan);
603 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
604 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
605 }
606}
607
608/// Remove redundant casts of inductions.
609///
610/// Such redundant casts are casts of induction variables that can be ignored,
611/// because we already proved that the casted phi is equal to the uncasted phi
612/// in the vectorized loop. There is no need to vectorize the cast - the same
613/// value can be used for both the phi and casts in the vector loop.
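/// For example, if the scalar loop contained
///   %iv     = phi i32 ...
///   %iv.ext = sext i32 %iv to i64
/// and the sext was recorded as a redundant cast of the induction, users of
/// %iv.ext can simply use the widened %iv instead.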
614static void removeRedundantInductionCasts(VPlan &Plan) {
615  for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
616    auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
617    if (!IV || IV->getTruncInst())
618 continue;
619
620 // A sequence of IR Casts has potentially been recorded for IV, which
621 // *must be bypassed* when the IV is vectorized, because the vectorized IV
622 // will produce the desired casted value. This sequence forms a def-use
623 // chain and is provided in reverse order, ending with the cast that uses
624 // the IV phi. Search for the recipe of the last cast in the chain and
625 // replace it with the original IV. Note that only the final cast is
626 // expected to have users outside the cast-chain and the dead casts left
627 // over will be cleaned up later.
628 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
629 VPValue *FindMyCast = IV;
630 for (Instruction *IRCast : reverse(Casts)) {
631 VPSingleDefRecipe *FoundUserCast = nullptr;
632 for (auto *U : FindMyCast->users()) {
633 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
634 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
635 FoundUserCast = UserCast;
636 break;
637 }
638 }
639 FindMyCast = FoundUserCast;
640 }
641 FindMyCast->replaceAllUsesWith(IV);
642 }
643}
644
645/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
646/// recipe, if it exists.
647static void removeRedundantCanonicalIVs(VPlan &Plan) {
648  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
649 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
650 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
651 for (VPUser *U : CanonicalIV->users()) {
652    WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
653    if (WidenNewIV)
654 break;
655 }
656
657 if (!WidenNewIV)
658 return;
659
660 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
661 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
662 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
663
664 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
665 continue;
666
667 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
668 // everything WidenNewIV's users need. That is, WidenOriginalIV will
669 // generate a vector phi or all users of WidenNewIV demand the first lane
670 // only.
671 if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
672 vputils::onlyFirstLaneUsed(WidenNewIV)) {
673      // We are replacing a wide canonical IV with a suitable wide induction.
674      // This is used to compute the header mask, hence all lanes will be used
675      // and we need to drop wrap flags only applying to lanes guaranteed to
676      // execute in the original scalar loop.
677 WidenOriginalIV->dropPoisonGeneratingFlags();
678 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
679 WidenNewIV->eraseFromParent();
680 return;
681 }
682 }
683}
684
685/// Returns true if \p R is dead and can be removed.
686static bool isDeadRecipe(VPRecipeBase &R) {
687 // Do remove conditional assume instructions as their conditions may be
688 // flattened.
689 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
690 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
692 if (IsConditionalAssume)
693 return true;
694
695 if (R.mayHaveSideEffects())
696 return false;
697
698 // Recipe is dead if no user keeps the recipe alive.
699 return all_of(R.definedValues(),
700 [](VPValue *V) { return V->getNumUsers() == 0; });
701}
702
703void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
704  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
705           vp_post_order_deep(Plan.getEntry()))) {
706 // The recipes in the block are processed in reverse order, to catch chains
707 // of dead recipes.
708 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
709 if (isDeadRecipe(R)) {
710 R.eraseFromParent();
711 continue;
712 }
713
714 // Check if R is a dead VPPhi <-> update cycle and remove it.
715 auto *PhiR = dyn_cast<VPPhi>(&R);
716 if (!PhiR || PhiR->getNumOperands() != 2)
717 continue;
718 VPUser *PhiUser = PhiR->getSingleUser();
719 if (!PhiUser)
720 continue;
721 VPValue *Incoming = PhiR->getOperand(1);
722 if (PhiUser != Incoming->getDefiningRecipe() ||
723 Incoming->getNumUsers() != 1)
724 continue;
725 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
726 PhiR->eraseFromParent();
727 Incoming->getDefiningRecipe()->eraseFromParent();
728 }
729 }
730}
731
732static VPValue *createScalarIVSteps(VPlan &Plan,
733                                    InductionDescriptor::InductionKind Kind,
734                                    Instruction::BinaryOps InductionOpcode,
735 FPMathOperator *FPBinOp, Instruction *TruncI,
736 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
737 VPBuilder &Builder) {
738 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
739 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
740 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
741 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
742 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
743
744 // Truncate base induction if needed.
745 VPTypeAnalysis TypeInfo(Plan);
746 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
747 if (TruncI) {
748 Type *TruncTy = TruncI->getType();
749 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
750 "Not truncating.");
751 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
752 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
753 ResultTy = TruncTy;
754 }
755
756 // Truncate step if needed.
757 Type *StepTy = TypeInfo.inferScalarType(Step);
758 if (ResultTy != StepTy) {
759 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
760 "Not truncating.");
761 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
762    auto *VecPreheader =
763        cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor());
764    VPBuilder::InsertPointGuard Guard(Builder);
765 Builder.setInsertPoint(VecPreheader);
766 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
767 }
768 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
769 &Plan.getVF(), DL);
770}
771
772static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
773  SetVector<VPUser *> Users(V->user_begin(), V->user_end());
774  for (unsigned I = 0; I != Users.size(); ++I) {
775    VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]);
776    if (isa<VPHeaderPHIRecipe>(Cur))
777 continue;
778 for (VPValue *V : Cur->definedValues())
779 Users.insert_range(V->users());
780 }
781 return Users.takeVector();
782}
783
784/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
785/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
786/// generates scalar values.
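/// Roughly, for a pointer IV with start %ptr and step 4, this emits
///   %steps    = scalar-steps 0, 4        ; 0, 4, 8, ... per lane
///   %next.gep = ptradd %ptr, %steps
/// so only the scalar lane values are materialized.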
787static VPValue *
788scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV,
789                                 VPlan &Plan, VPBuilder &Builder) {
790  const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
791  VPIRValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0);
792 VPValue *StepV = PtrIV->getOperand(1);
793  VPValue *Steps = createScalarIVSteps(
794      Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
795 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
796
797 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
798 PtrIV->getDebugLoc(), "next.gep");
799}
800
801/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
802/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
803/// VPWidenPointerInductionRecipe will generate vectors only. If some users
804/// require vectors while others require scalars, the scalar uses need to extract
805/// the scalars from the generated vectors (Note that this is different from how
806/// int/fp inductions are handled). Legalize extract-from-ends using uniform
807/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
808/// the correct end value is available. Also optimize
809/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
810/// providing them scalar steps built on the canonical scalar IV and update the
811/// original IV's users. This is an optional optimization to reduce the need for
812/// vector extracts.
813void VPlanTransforms::legalizeAndOptimizeInductions(VPlan &Plan) {
814  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
815  bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
816 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
817 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
818 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
819 if (!PhiR)
820 continue;
821
822 // Try to narrow wide and replicating recipes to uniform recipes, based on
823 // VPlan analysis.
824 // TODO: Apply to all recipes in the future, to replace legacy uniformity
825 // analysis.
826 auto Users = collectUsersRecursively(PhiR);
827 for (VPUser *U : reverse(Users)) {
828 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
829 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
830 // Skip recipes that shouldn't be narrowed.
831 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
832 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
833 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
834 continue;
835
836      // Skip recipes that may have lanes other than their first one used.
838 continue;
839
840 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
841 Def->operands(), /*IsUniform*/ true,
842 /*Mask*/ nullptr, /*Flags*/ *Def);
843 Clone->insertAfter(Def);
844 Def->replaceAllUsesWith(Clone);
845 }
846
847 // Replace wide pointer inductions which have only their scalars used by
848 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
849 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
850 if (!Plan.hasScalarVFOnly() &&
851 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
852 continue;
853
854 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
855 PtrIV->replaceAllUsesWith(PtrAdd);
856 continue;
857 }
858
859 // Replace widened induction with scalar steps for users that only use
860 // scalars.
861 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
862 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
863 return U->usesScalars(WideIV);
864 }))
865 continue;
866
867 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
868    VPValue *Steps = createScalarIVSteps(
869        Plan, ID.getKind(), ID.getInductionOpcode(),
870 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
871 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
872 WideIV->getDebugLoc(), Builder);
873
874    // Update scalar users of the IV to use Steps instead.
875 if (!HasOnlyVectorVFs) {
876 assert(!Plan.hasScalableVF() &&
877 "plans containing a scalar VF cannot also include scalable VFs");
878 WideIV->replaceAllUsesWith(Steps);
879 } else {
880 bool HasScalableVF = Plan.hasScalableVF();
881 WideIV->replaceUsesWithIf(Steps,
882 [WideIV, HasScalableVF](VPUser &U, unsigned) {
883 if (HasScalableVF)
884 return U.usesFirstLaneOnly(WideIV);
885 return U.usesScalars(WideIV);
886 });
887 }
888 }
889}
890
891/// Check if \p VPV is an untruncated wide induction, either before or after the
892/// increment. If so return the header IV (before the increment), otherwise
893/// return null.
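/// For example, given
///   %iv      = wide induction phi
///   %iv.next = add %iv, %step
/// both %iv and %iv.next are optimizable, and in both cases the recipe for %iv
/// is returned.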
894static VPWidenInductionRecipe *
895getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE) {
896  auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
897 if (WideIV) {
898 // VPV itself is a wide induction, separately compute the end value for exit
899 // users if it is not a truncated IV.
900 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
901 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
902 }
903
904 // Check if VPV is an optimizable induction increment.
905 VPRecipeBase *Def = VPV->getDefiningRecipe();
906 if (!Def || Def->getNumOperands() != 2)
907 return nullptr;
908 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
909 if (!WideIV)
910 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
911 if (!WideIV)
912 return nullptr;
913
914 auto IsWideIVInc = [&]() {
915 auto &ID = WideIV->getInductionDescriptor();
916
917 // Check if VPV increments the induction by the induction step.
918 VPValue *IVStep = WideIV->getStepValue();
919 switch (ID.getInductionOpcode()) {
920 case Instruction::Add:
921 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
922 case Instruction::FAdd:
924 m_Specific(IVStep)));
925 case Instruction::FSub:
926 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
927 m_Specific(IVStep)));
928 case Instruction::Sub: {
929 // IVStep will be the negated step of the subtraction. Check if Step == -1
930 // * IVStep.
931 VPValue *Step;
932 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
933 return false;
934 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
935 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
936 ScalarEvolution &SE = *PSE.getSE();
937 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
938 !isa<SCEVCouldNotCompute>(StepSCEV) &&
939 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
940 }
941 default:
942 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
943 match(VPV, m_GetElementPtr(m_Specific(WideIV),
944 m_Specific(WideIV->getStepValue())));
945 }
946 llvm_unreachable("should have been covered by switch above");
947 };
948 return IsWideIVInc() ? WideIV : nullptr;
949}
950
951/// Attempts to optimize the induction variable exit values for users in the
952/// early exit block.
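/// The exit value is recomputed from the canonical IV as
///   CanonicalIV + FirstActiveLane(Mask)
/// (plus one if the exit uses the incremented IV), and is transformed back into
/// the original induction via a derived IV when the IV is not canonical.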
953static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
954                                               VPTypeAnalysis &TypeInfo,
955 VPBlockBase *PredVPBB,
956                                               VPValue *Op,
957                                               PredicatedScalarEvolution &PSE) {
958 VPValue *Incoming, *Mask;
961 return nullptr;
962
963 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
964 if (!WideIV)
965 return nullptr;
966
967 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
968 if (WideIntOrFp && WideIntOrFp->getTruncInst())
969 return nullptr;
970
971 // Calculate the final index.
972 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
973 auto *CanonicalIV = LoopRegion->getCanonicalIV();
974 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
975 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
976
977 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
978 VPValue *FirstActiveLane =
979 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
980 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
981 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
982 FirstActiveLaneType, DL);
983 VPValue *EndValue =
984 B.createNaryOp(Instruction::Add, {CanonicalIV, FirstActiveLane}, DL);
985
986  // `getOptimizableIVOf()` always returns the pre-incremented IV, so if
987  // Incoming is not the IV itself, the exit is using the incremented value,
988  // and we need to add the step.
989 if (Incoming != WideIV) {
990 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
991 EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
992 }
993
994 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
995 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
996 VPIRValue *Start = WideIV->getStartValue();
997 VPValue *Step = WideIV->getStepValue();
998 EndValue = B.createDerivedIV(
999 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1000 Start, EndValue, Step);
1001 }
1002
1003 return EndValue;
1004}
1005
1006/// Attempts to optimize the induction variable exit values for users in the
1007/// exit block coming from the latch in the original scalar loop.
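/// Reuses the end value pre-computed for the middle block; when the exit uses
/// the pre-incremented IV, one step is subtracted (or the equivalent ptradd
/// with a negated step, or FAdd/FSub, for pointer and floating-point
/// inductions).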
1008static VPValue *optimizeLatchExitInductionUser(
1009    VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
1010    DenseMap<VPValue *, VPValue *> &EndValues, PredicatedScalarEvolution &PSE) {
1013 return nullptr;
1014
1015 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
1016 if (!WideIV)
1017 return nullptr;
1018
1019 VPValue *EndValue = EndValues.lookup(WideIV);
1020 assert(EndValue && "end value must have been pre-computed");
1021
1022  // `getOptimizableIVOf()` always returns the pre-incremented IV, so if
1023  // Incoming is not the IV itself, the exit is using the incremented value
1024  // and we don't need to subtract the step.
1025 if (Incoming != WideIV)
1026 return EndValue;
1027
1028 // Otherwise, subtract the step from the EndValue.
1029 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
1030 VPValue *Step = WideIV->getStepValue();
1031 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
1032 if (ScalarTy->isIntegerTy())
1033 return B.createNaryOp(Instruction::Sub, {EndValue, Step},
1034 DebugLoc::getUnknown(), "ind.escape");
1035 if (ScalarTy->isPointerTy()) {
1036 Type *StepTy = TypeInfo.inferScalarType(Step);
1037 auto *Zero = Plan.getConstantInt(StepTy, 0);
1038 return B.createPtrAdd(EndValue,
1039 B.createNaryOp(Instruction::Sub, {Zero, Step}),
1040 DebugLoc::getUnknown(), "ind.escape");
1041 }
1042 if (ScalarTy->isFloatingPointTy()) {
1043 const auto &ID = WideIV->getInductionDescriptor();
1044 return B.createNaryOp(
1045 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1046 ? Instruction::FSub
1047 : Instruction::FAdd,
1048 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1049 }
1050 llvm_unreachable("all possible induction types must be handled");
1051 return nullptr;
1052}
1053
1054void VPlanTransforms::optimizeInductionExitUsers(
1055    VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
1056    PredicatedScalarEvolution &PSE) {
1057  VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
1058 VPTypeAnalysis TypeInfo(Plan);
1059 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1060 for (VPRecipeBase &R : ExitVPBB->phis()) {
1061 auto *ExitIRI = cast<VPIRPhi>(&R);
1062
1063 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1064 VPValue *Escape = nullptr;
1065 if (PredVPBB == MiddleVPBB)
1066 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
1067 ExitIRI->getOperand(Idx),
1068 EndValues, PSE);
1069 else
1070          Escape = optimizeEarlyExitInductionUser(
1071              Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), PSE);
1072 if (Escape)
1073 ExitIRI->setOperand(Idx, Escape);
1074 }
1075 }
1076 }
1077}
1078
1079/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1080/// them with already existing recipes expanding the same SCEV expression.
1081static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
1082  DenseMap<const SCEV *, VPValue *> SCEV2VPV;
1083
1084  for (VPRecipeBase &R :
1085       make_early_inc_range(*Plan.getEntry())) {
1086 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1087 if (!ExpR)
1088 continue;
1089
1090 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1091 if (Inserted)
1092 continue;
1093 ExpR->replaceAllUsesWith(V->second);
1094 ExpR->eraseFromParent();
1095 }
1096}
1097
1099 SmallVector<VPValue *> WorkList;
1101 WorkList.push_back(V);
1102
1103 while (!WorkList.empty()) {
1104 VPValue *Cur = WorkList.pop_back_val();
1105 if (!Seen.insert(Cur).second)
1106 continue;
1107 VPRecipeBase *R = Cur->getDefiningRecipe();
1108 if (!R)
1109 continue;
1110 if (!isDeadRecipe(*R))
1111 continue;
1112 append_range(WorkList, R->operands());
1113 R->eraseFromParent();
1114 }
1115}
1116
1117/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1118/// Returns an optional pair, where the first element indicates whether it is
1119/// an intrinsic ID.
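/// For example, a widened add maps to {false, Instruction::Add}, while a
/// VPWidenIntrinsicRecipe for llvm.smax maps to {true, Intrinsic::smax}.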
1120static std::optional<std::pair<bool, unsigned>>
1121getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
1122  return TypeSwitch<const VPSingleDefRecipe *,
1123 std::optional<std::pair<bool, unsigned>>>(R)
1126 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1127 .Case<VPWidenIntrinsicRecipe>([](auto *I) {
1128 return std::make_pair(true, I->getVectorIntrinsicID());
1129 })
1130 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
1131 // For recipes that do not directly map to LLVM IR instructions,
1132 // assign opcodes after the last VPInstruction opcode (which is also
1133 // after the last IR Instruction opcode), based on the VPDefID.
1134 return std::make_pair(false,
1135 VPInstruction::OpsEnd + 1 + I->getVPDefID());
1136 })
1137 .Default([](auto *) { return std::nullopt; });
1138}
1139
1140/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1141/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1142/// Operands are foldable live-ins.
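/// For example, an add of two constant live-ins folds to a new constant
/// live-in, allowing the recipe to be removed once its users are rewired.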
1143static VPValue *tryToFoldLiveIns(const VPSingleDefRecipe &R,
1144                                 ArrayRef<VPValue *> Operands,
1145 const DataLayout &DL,
1146 VPTypeAnalysis &TypeInfo) {
1147 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1148 if (!OpcodeOrIID)
1149 return nullptr;
1150
1152 for (VPValue *Op : Operands) {
1154 return nullptr;
1155 Value *V = Op->getUnderlyingValue();
1156 if (!V)
1157 return nullptr;
1158 Ops.push_back(V);
1159 }
1160
1161 auto FoldToIRValue = [&]() -> Value * {
1162 InstSimplifyFolder Folder(DL);
1163 if (OpcodeOrIID->first) {
1164 if (R.getNumOperands() != 2)
1165 return nullptr;
1166 unsigned ID = OpcodeOrIID->second;
1167 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1168 TypeInfo.inferScalarType(&R));
1169 }
1170 unsigned Opcode = OpcodeOrIID->second;
1171 if (Instruction::isBinaryOp(Opcode))
1172 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1173 Ops[0], Ops[1]);
1174 if (Instruction::isCast(Opcode))
1175 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1176 TypeInfo.inferScalarType(R.getVPSingleValue()));
1177 switch (Opcode) {
1179 return Folder.FoldSelect(Ops[0], Ops[1],
1181 case VPInstruction::Not:
1182 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1184 case Instruction::Select:
1185 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1186 case Instruction::ICmp:
1187 case Instruction::FCmp:
1188 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1189 Ops[1]);
1190 case Instruction::GetElementPtr: {
1191 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1192 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1193 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1194 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1195 }
1198 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1199 Ops[0], Ops[1],
1200 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1201 // An extract of a live-in is an extract of a broadcast, so return the
1202 // broadcasted element.
1203 case Instruction::ExtractElement:
1204 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1205 return Ops[0];
1206 }
1207 return nullptr;
1208 };
1209
1210 if (Value *V = FoldToIRValue())
1211 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1212 return nullptr;
1213}
1214
1215/// Try to simplify VPSingleDefRecipe \p Def.
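/// This covers constant folding of live-in operands via InstSimplifyFolder,
/// algebraic identities (x | 0, x & 0, x && false, redundant casts and
/// selects), and VPlan-specific rewrites such as looking through broadcasts
/// and extracts of build-vectors.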
1216static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
1217  VPlan *Plan = Def->getParent()->getPlan();
1218
1219 // Simplification of live-in IR values for SingleDef recipes using
1220 // InstSimplifyFolder.
1221 const DataLayout &DL =
1223 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1224 return Def->replaceAllUsesWith(V);
1225
1226 // Fold PredPHI LiveIn -> LiveIn.
1227 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1228 VPValue *Op = PredPHI->getOperand(0);
1229 if (isa<VPIRValue>(Op))
1230 PredPHI->replaceAllUsesWith(Op);
1231 }
1232
1233 VPBuilder Builder(Def);
1234 VPValue *A;
1235 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1236 Type *TruncTy = TypeInfo.inferScalarType(Def);
1237 Type *ATy = TypeInfo.inferScalarType(A);
1238 if (TruncTy == ATy) {
1239 Def->replaceAllUsesWith(A);
1240 } else {
1241 // Don't replace a scalarizing recipe with a widened cast.
1242 if (isa<VPReplicateRecipe>(Def))
1243 return;
1244 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1245
1246 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1247 ? Instruction::SExt
1248 : Instruction::ZExt;
1249 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1250 TruncTy);
1251 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1252 // UnderlyingExt has distinct return type, used to retain legacy cost.
1253 Ext->setUnderlyingValue(UnderlyingExt);
1254 }
1255 Def->replaceAllUsesWith(Ext);
1256 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1257 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1258 Def->replaceAllUsesWith(Trunc);
1259 }
1260 }
1261#ifndef NDEBUG
1262  // Verify that the cached type info for both A and its users is still
1263 // accurate by comparing it to freshly computed types.
1264 VPTypeAnalysis TypeInfo2(*Plan);
1265 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1266 for (VPUser *U : A->users()) {
1267 auto *R = cast<VPRecipeBase>(U);
1268 for (VPValue *VPV : R->definedValues())
1269 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1270 }
1271#endif
1272 }
1273
1274 // Simplify (X && Y) || (X && !Y) -> X.
1275 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1276 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1277 // recipes to be visited during simplification.
1278 VPValue *X, *Y, *Z;
1279 if (match(Def,
1282 Def->replaceAllUsesWith(X);
1283 Def->eraseFromParent();
1284 return;
1285 }
1286
1287 // x | 1 -> 1
1288 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1289 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1290
1291 // x | 0 -> x
1292 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1293 return Def->replaceAllUsesWith(X);
1294
1295 // x & 0 -> 0
1296 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1297 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1298
1299 // x && false -> false
1300 if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
1301 return Def->replaceAllUsesWith(Def->getOperand(1));
1302
1303 // (x && y) || (x && z) -> x && (y || z)
1306 // Simplify only if one of the operands has one use to avoid creating an
1307 // extra recipe.
1308 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1309 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1310 return Def->replaceAllUsesWith(
1311 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1312
1313 // x && !x -> 0
1315 return Def->replaceAllUsesWith(Plan->getFalse());
1316
1317 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1318 return Def->replaceAllUsesWith(X);
1319
1320 // select !c, x, y -> select c, y, x
1321 VPValue *C;
1322 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1323 Def->setOperand(0, C);
1324 Def->setOperand(1, Y);
1325 Def->setOperand(2, X);
1326 return;
1327 }
1328
1329 // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
1330 // tail folding it is likely that x is a header mask and can be simplified
1331 // further.
1333 m_VPValue(Z))) &&
1334 X->hasMoreThanOneUniqueUser())
1335 return Def->replaceAllUsesWith(
1336 Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
1337
1338 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1339 return Def->replaceAllUsesWith(A);
1340
1341 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1342 return Def->replaceAllUsesWith(A);
1343
1344 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1345 return Def->replaceAllUsesWith(
1346 Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0));
1347
1348 const APInt *APC;
1349 if (match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) && APC->isPowerOf2())
1350 return Def->replaceAllUsesWith(Builder.createNaryOp(
1351 Instruction::Shl,
1352 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1353 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1354
1355 // Don't convert udiv to lshr inside a replicate region, as VPInstructions are
1356 // not allowed in them.
1357 const VPRegionBlock *ParentRegion = Def->getParent()->getParent();
1358 bool IsInReplicateRegion = ParentRegion && ParentRegion->isReplicator();
1359 if (!IsInReplicateRegion && match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) &&
1360 APC->isPowerOf2())
1361 return Def->replaceAllUsesWith(Builder.createNaryOp(
1362 Instruction::LShr,
1363 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())}, {},
1364 Def->getDebugLoc()));
1365
1366 if (match(Def, m_Not(m_VPValue(A)))) {
1367 if (match(A, m_Not(m_VPValue(A))))
1368 return Def->replaceAllUsesWith(A);
1369
1370 // Try to fold Not into compares by adjusting the predicate in-place.
1371 CmpPredicate Pred;
1372 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1373 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1374 if (all_of(Cmp->users(),
1376 m_Not(m_Specific(Cmp)),
1377 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1378 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1379 for (VPUser *U : to_vector(Cmp->users())) {
1380 auto *R = cast<VPSingleDefRecipe>(U);
1381 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1382 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1383 R->setOperand(1, Y);
1384 R->setOperand(2, X);
1385 } else {
1386 // not (cmp pred) -> cmp inv_pred
1387 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1388 R->replaceAllUsesWith(Cmp);
1389 }
1390 }
1391 // If Cmp doesn't have a debug location, use the one from the negation,
1392 // to preserve the location.
1393 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1394 Cmp->setDebugLoc(Def->getDebugLoc());
1395 }
1396 }
1397 }
1398
1399 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1400 // any-of (fcmp uno %A, %B), ...
1401  if (match(Def, m_AnyOf())) {
1402    SmallVector<VPValue *> NewOps;
1403    VPRecipeBase *UnpairedCmp = nullptr;
1404 for (VPValue *Op : Def->operands()) {
1405 VPValue *X;
1406 if (Op->getNumUsers() > 1 ||
1408 m_Deferred(X)))) {
1409 NewOps.push_back(Op);
1410 } else if (!UnpairedCmp) {
1411 UnpairedCmp = Op->getDefiningRecipe();
1412 } else {
1413 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1414 UnpairedCmp->getOperand(0), X));
1415 UnpairedCmp = nullptr;
1416 }
1417 }
1418
1419 if (UnpairedCmp)
1420 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1421
1422 if (NewOps.size() < Def->getNumOperands()) {
1423 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1424 return Def->replaceAllUsesWith(NewAnyOf);
1425 }
1426 }
1427
1428 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1429 // This is useful for fmax/fmin without fast-math flags, where we need to
1430 // check if any operand is NaN.
1432 m_Deferred(X)),
1434 m_Deferred(Y))))) {
1435 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1436 return Def->replaceAllUsesWith(NewCmp);
1437 }
1438
1439  // Remove redundant DerivedIVs; that is, 0 + A * 1 -> A and 0 + 0 * x -> 0.
1440 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1441 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1442 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1443 TypeInfo.inferScalarType(Def))
1444 return Def->replaceAllUsesWith(Def->getOperand(1));
1445
1447 m_One()))) {
1448 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1449 if (TypeInfo.inferScalarType(X) != WideStepTy)
1450 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1451 Def->replaceAllUsesWith(X);
1452 return;
1453 }
1454
1455 // For i1 vp.merges produced by AnyOf reductions:
1456 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1458 m_VPValue(X), m_VPValue())) &&
1460 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1461 Def->setOperand(1, Def->getOperand(0));
1462 Def->setOperand(0, Y);
1463 return;
1464 }
1465
1466 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1467 if (Phi->getOperand(0) == Phi->getOperand(1))
1468 Phi->replaceAllUsesWith(Phi->getOperand(0));
1469 return;
1470 }
1471
1472 // Look through ExtractLastLane.
1473 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
1474 if (match(A, m_BuildVector())) {
1475 auto *BuildVector = cast<VPInstruction>(A);
1476 Def->replaceAllUsesWith(
1477 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1478 return;
1479 }
1480 if (Plan->hasScalarVFOnly())
1481 return Def->replaceAllUsesWith(A);
1482 }
1483
1484 // Look through ExtractPenultimateElement (BuildVector ....).
1486 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1487 Def->replaceAllUsesWith(
1488 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1489 return;
1490 }
1491
1492 uint64_t Idx;
1494 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1495 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1496 return;
1497 }
1498
1499 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1500 Def->replaceAllUsesWith(
1501 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1502 return;
1503 }
1504
1505  // Look through a broadcast of a single scalar when used as a select
1506  // condition; in that case the scalar condition can be used directly.
1507 if (match(Def,
1510 "broadcast operand must be single-scalar");
1511 Def->setOperand(0, C);
1512 return;
1513 }
1514
1515 if (auto *Phi = dyn_cast<VPPhi>(Def)) {
1516 if (Phi->getNumOperands() == 1)
1517 Phi->replaceAllUsesWith(Phi->getOperand(0));
1518 return;
1519 }
1520
1521 // Some simplifications can only be applied after unrolling. Perform them
1522 // below.
1523 if (!Plan->isUnrolled())
1524 return;
1525
1526 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1527 // After unrolling, extract-lane may be used to extract values from multiple
1528 // scalar sources. Only simplify when extracting from a single scalar source.
1529 if (match(Def, m_ExtractLane(m_VPValue(), m_VPValue(A))) &&
1531 return Def->replaceAllUsesWith(A);
1532 }
1533
1534 // Hoist an invariant increment Y of a phi X, by having X start at Y.
1535 if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && isa<VPIRValue>(Y) &&
1536 isa<VPPhi>(X)) {
1537 auto *Phi = cast<VPPhi>(X);
1538 if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
1539 Phi->getSingleUser() == Def) {
1540 Phi->setOperand(0, Y);
1541 Def->replaceAllUsesWith(Phi);
1542 return;
1543 }
1544 }
1545
1546 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1547 // just the pointer operand.
1548 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1549 if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
1550 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1551
1552 // VPScalarIVSteps for part 0 can be replaced by their start value, if only
1553 // the first lane is demanded.
1554 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1555 if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
1556 Steps->replaceAllUsesWith(Steps->getOperand(0));
1557 return;
1558 }
1559 }
1560 // Simplify redundant ReductionStartVector recipes after unrolling.
1561 VPValue *StartV;
1563 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1564 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1565 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1566 return PhiR && PhiR->isInLoop();
1567 });
1568 return;
1569 }
1570
1572 Def->replaceAllUsesWith(A);
1573 return;
1574 }
1575
1576 if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
1579 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1580 all_of(A->users(),
1581 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1582 return Def->replaceAllUsesWith(A);
1583 }
1584
1585 if (Plan->getUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1586 return Def->replaceAllUsesWith(A);
1587}
1588
1589void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
1590  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
1591      Plan.getEntry());
1592 VPTypeAnalysis TypeInfo(Plan);
1593  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
1594    for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1595 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1596 simplifyRecipe(Def, TypeInfo);
1597 }
1598}
1599
1600void VPlanTransforms::narrowToSingleScalarRecipes(VPlan &Plan) {
1601  if (Plan.hasScalarVFOnly())
1602 return;
1603
1604 // Try to narrow wide and replicating recipes to single scalar recipes,
1605 // based on VPlan analysis. Only process blocks in the loop region for now,
1606 // without traversing into nested regions, as recipes in replicate regions
1607 // cannot be converted yet.
1608  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1609           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
1610    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1612 VPWidenStoreRecipe>(&R))
1613 continue;
1614 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1615 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1616 continue;
1617
1618      // Convert an unmasked scatter with a uniform address into
1619 // extract-last-lane + scalar store.
1620 // TODO: Add a profitability check comparing the cost of a scatter vs.
1621 // extract + scalar store.
1622 auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
1623 if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
1624 !WidenStoreR->isConsecutive()) {
1625 assert(!WidenStoreR->isReverse() &&
1626 "Not consecutive memory recipes shouldn't be reversed");
1627 VPValue *Mask = WidenStoreR->getMask();
1628
1629 // Only convert the scatter to a scalar store if it is unmasked.
1630 // TODO: Support converting scatter masked by the header mask to scalar
1631 // store.
1632 if (Mask)
1633 continue;
1634
1636 {WidenStoreR->getOperand(1)});
1637 Extract->insertBefore(WidenStoreR);
1638
1639 // TODO: Sink the scalar store recipe to middle block if possible.
1640 auto *ScalarStore = new VPReplicateRecipe(
1641 &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
1642 true /*IsSingleScalar*/, nullptr /*Mask*/, {},
1643 *WidenStoreR /*Metadata*/);
1644 ScalarStore->insertBefore(WidenStoreR);
1645 WidenStoreR->eraseFromParent();
1646 continue;
1647 }
1648
1649 auto *RepOrWidenR = dyn_cast<VPRecipeWithIRFlags>(&R);
1650 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1651 vputils::isSingleScalar(RepR->getOperand(1))) {
1652 auto *Clone = new VPReplicateRecipe(
1653 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1654 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1655 *RepR /*Metadata*/, RepR->getDebugLoc());
1656 Clone->insertBefore(RepOrWidenR);
1657 VPBuilder Builder(Clone);
1658 VPValue *ExtractOp = Clone->getOperand(0);
1659 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1660 ExtractOp =
1661 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1662 ExtractOp =
1663 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1664 Clone->setOperand(0, ExtractOp);
1665 RepR->eraseFromParent();
1666 continue;
1667 }
1668
1669 // Skip recipes that aren't single scalars.
1670 if (!RepOrWidenR || !vputils::isSingleScalar(RepOrWidenR))
1671 continue;
1672
1673       // Skip recipes for which conversion to single-scalar would introduce
1674       // additional broadcasts. No extra broadcasts are needed if either only
1675 // the scalars of the recipe are used, or at least one of the operands
1676 // would require a broadcast. In the latter case, the single-scalar may
1677 // need to be broadcasted, but another broadcast is removed.
1678 if (!all_of(RepOrWidenR->users(),
1679 [RepOrWidenR](const VPUser *U) {
1680 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1681 unsigned Opcode = VPI->getOpcode();
1682 if (Opcode == VPInstruction::ExtractLastLane ||
1683 Opcode == VPInstruction::ExtractLastPart ||
1684 Opcode == VPInstruction::ExtractPenultimateElement)
1685 return true;
1686 }
1687
1688 return U->usesScalars(RepOrWidenR);
1689 }) &&
1690 none_of(RepOrWidenR->operands(), [RepOrWidenR](VPValue *Op) {
1691 if (Op->getSingleUser() != RepOrWidenR)
1692 return false;
1693 // Non-constant live-ins require broadcasts, while constants do not
1694 // need explicit broadcasts.
1695 auto *IRV = dyn_cast<VPIRValue>(Op);
1696 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1697 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1698 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1699 }))
1700 continue;
1701
1702 auto *Clone = new VPReplicateRecipe(
1703 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1704 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1705 Clone->insertBefore(RepOrWidenR);
1706 RepOrWidenR->replaceAllUsesWith(Clone);
1707 if (isDeadRecipe(*RepOrWidenR))
1708 RepOrWidenR->eraseFromParent();
1709 }
1710 }
1711}
1712
1713/// Try to find a common value that is logically and'ed into all of \p Blend's
1714/// masks, and remove it from the masks.
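/// For example (sketch): if every mask has the form
/// logical-and(%common, %m_i), each mask is replaced by its %m_i part.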
1716 if (Blend->isNormalized())
1717 return;
1718 VPValue *CommonEdgeMask;
1719 if (!match(Blend->getMask(0),
1720 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1721 return;
1722 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1723 if (!match(Blend->getMask(I),
1724 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1725 return;
1726 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1727 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1728}
1729
1730/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1731/// to make sure the masks are simplified.
1732static void simplifyBlends(VPlan &Plan) {
1735 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1736 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1737 if (!Blend)
1738 continue;
1739
1740 removeCommonBlendMask(Blend);
1741
1742 // Try to remove redundant blend recipes.
1743 SmallPtrSet<VPValue *, 4> UniqueValues;
1744 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1745 UniqueValues.insert(Blend->getIncomingValue(0));
1746 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1747 if (!match(Blend->getMask(I), m_False()))
1748 UniqueValues.insert(Blend->getIncomingValue(I));
1749
1750 if (UniqueValues.size() == 1) {
1751 Blend->replaceAllUsesWith(*UniqueValues.begin());
1752 Blend->eraseFromParent();
1753 continue;
1754 }
1755
1756 if (Blend->isNormalized())
1757 continue;
1758
1759 // Normalize the blend so its first incoming value is used as the initial
1760 // value with the others blended into it.
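      // Illustrative sketch: BLEND %a/%ma, %b/%mb, %c/%mc becomes
      // BLEND %a, %b/%mb, %c/%mc once %a is chosen as the start value, so %ma
      // becomes dead and can be removed.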
1761
1762 unsigned StartIndex = 0;
1763 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1764       // If a value's mask is used only by the blend then it can be deadcoded.
1765 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1766 // that's used by multiple blends where it can be removed from them all.
1767 VPValue *Mask = Blend->getMask(I);
1768 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1769 StartIndex = I;
1770 break;
1771 }
1772 }
1773
1774 SmallVector<VPValue *, 4> OperandsWithMask;
1775 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1776
1777 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1778 if (I == StartIndex)
1779 continue;
1780 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1781 OperandsWithMask.push_back(Blend->getMask(I));
1782 }
1783
1784 auto *NewBlend =
1785 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1786 OperandsWithMask, Blend->getDebugLoc());
1787 NewBlend->insertBefore(&R);
1788
1789 VPValue *DeadMask = Blend->getMask(StartIndex);
1790 Blend->replaceAllUsesWith(NewBlend);
1791 Blend->eraseFromParent();
1793
1794 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1795 VPValue *NewMask;
1796 if (NewBlend->getNumOperands() == 3 &&
1797 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1798 VPValue *Inc0 = NewBlend->getOperand(0);
1799 VPValue *Inc1 = NewBlend->getOperand(1);
1800 VPValue *OldMask = NewBlend->getOperand(2);
1801 NewBlend->setOperand(0, Inc1);
1802 NewBlend->setOperand(1, Inc0);
1803 NewBlend->setOperand(2, NewMask);
1804 if (OldMask->getNumUsers() == 0)
1805 cast<VPInstruction>(OldMask)->eraseFromParent();
1806 }
1807 }
1808 }
1809}
1810
1811/// Optimize the width of vector induction variables in \p Plan based on a
1812/// known constant trip count, \p BestVF and \p BestUF.
1814 ElementCount BestVF,
1815 unsigned BestUF) {
1816 // Only proceed if we have not completely removed the vector region.
1817 if (!Plan.getVectorLoopRegion())
1818 return false;
1819
1820 const APInt *TC;
1821 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
1822 return false;
1823
1824 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1825 // and UF. Returns at least 8.
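  // For example (sketch): TC = 1000 with VF * UF = 8 gives AlignedTC = 1000
  // (already a multiple of 8), MaxVal = 999, which has 10 active bits, so the
  // result is the next power of two: 16.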
1826 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1827 APInt AlignedTC =
1830 APInt MaxVal = AlignedTC - 1;
1831 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1832 };
1833 unsigned NewBitWidth =
1834 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1835
1836 LLVMContext &Ctx = Plan.getContext();
1837 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1838
1839 bool MadeChange = false;
1840
1841 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1842 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1843 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1844
1845     // Currently we only handle canonical IVs, as it is trivial to replace their
1846     // start and stop values, and we only perform the optimization when the
1847     // IV has a single user.
1848 if (!WideIV || !WideIV->isCanonical() ||
1849 WideIV->hasMoreThanOneUniqueUser() ||
1850 NewIVTy == WideIV->getScalarType())
1851 continue;
1852
1853 // Currently only handle cases where the single user is a header-mask
1854 // comparison with the backedge-taken-count.
1855 VPUser *SingleUser = WideIV->getSingleUser();
1856 if (!SingleUser ||
1857 !match(SingleUser, m_ICmp(m_Specific(WideIV),
1860 continue;
1861
1862 // Update IV operands and comparison bound to use new narrower type.
1863 auto *NewStart = Plan.getConstantInt(NewIVTy, 0);
1864 WideIV->setStartValue(NewStart);
1865 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
1866 WideIV->setStepValue(NewStep);
1867
1868 auto *NewBTC = new VPWidenCastRecipe(
1869 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1870 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1871 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
1872 Cmp->setOperand(1, NewBTC);
1873
1874 MadeChange = true;
1875 }
1876
1877 return MadeChange;
1878}
1879
1880/// Return true if \p Cond is known to be true for given \p BestVF and \p
1881/// BestUF.
1883 ElementCount BestVF, unsigned BestUF,
1886 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1887 &PSE](VPValue *C) {
1888 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
1889 });
1890
1891 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
1893 m_Specific(CanIV->getBackedgeValue()),
1894 m_Specific(&Plan.getVectorTripCount()))))
1895 return false;
1896
1897 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1898 // count is not conveniently available as SCEV so far, so we compare directly
1899 // against the original trip count. This is stricter than necessary, as we
1900 // will only return true if the trip count == vector trip count.
1901 const SCEV *VectorTripCount =
1903 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1904 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
1905 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1906 "Trip count SCEV must be computable");
1907 ScalarEvolution &SE = *PSE.getSE();
1908 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1909 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1910 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1911}
1912
1913/// Try to replace multiple active lane masks used for control flow with
1914/// a single, wide active lane mask instruction followed by multiple
1915/// extract subvector intrinsics. This applies to the active lane mask
1916/// instructions both in the loop and in the preheader.
1917/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1918/// new extracts from the first active lane mask, which has its last
1919/// operand (multiplier) set to UF.
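/// Illustrative sketch for UF = 2 (names simplified): the per-part
/// active-lane-mask computations are replaced by
///   %wide.alm = active-lane-mask %index, %tc, UF    ; VF x UF lanes
///   %alm.0    = vector.extract %wide.alm, 0
///   %alm.1    = vector.extract %wide.alm, VF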
1921 unsigned UF) {
1922 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1923 return false;
1924
1925 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1926 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1927 auto *Term = &ExitingVPBB->back();
1928
1929 using namespace llvm::VPlanPatternMatch;
1931 m_VPValue(), m_VPValue(), m_VPValue())))))
1932 return false;
1933
1934 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1935 LLVMContext &Ctx = Plan.getContext();
1936
1937 auto ExtractFromALM = [&](VPInstruction *ALM,
1938 SmallVectorImpl<VPValue *> &Extracts) {
1939 DebugLoc DL = ALM->getDebugLoc();
1940 for (unsigned Part = 0; Part < UF; ++Part) {
1942 Ops.append({ALM, Plan.getOrAddLiveIn(
1943 ConstantInt::get(IntegerType::getInt64Ty(Ctx),
1944 VF.getKnownMinValue() * Part))});
1945 auto *Ext =
1946 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1947 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
1948 Extracts[Part] = Ext;
1949 Ext->insertAfter(ALM);
1950 }
1951 };
1952
1953 // Create a list of each active lane mask phi, ordered by unroll part.
1955 for (VPRecipeBase &R : Header->phis()) {
1957 if (!Phi)
1958 continue;
1959 VPValue *Index = nullptr;
1960 match(Phi->getBackedgeValue(),
1962 assert(Index && "Expected index from ActiveLaneMask instruction");
1963
1964 uint64_t Part;
1965 if (match(Index,
1967 m_VPValue(), m_ConstantInt(Part))))
1968 Phis[Part] = Phi;
1969 else
1970 // Anything other than a CanonicalIVIncrementForPart is part 0
1971 Phis[0] = Phi;
1972 }
1973
1974 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1975 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
1976
1977 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
1978 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
1979
1980 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
1981 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
1982 "Expected incoming values of Phi to be ActiveLaneMasks");
1983
1984   // When using wide lane masks, the get.active.lane.mask intrinsic returns a
1985   // mask with VF x UF elements; its multiplier (last operand) is set to UF.
1986 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
1987 EntryALM->setOperand(2, ALMMultiplier);
1988 LoopALM->setOperand(2, ALMMultiplier);
1989
1990 // Create UF x extract vectors and insert into preheader.
1991 SmallVector<VPValue *> EntryExtracts(UF);
1992 ExtractFromALM(EntryALM, EntryExtracts);
1993
1994 // Create UF x extract vectors and insert before the loop compare & branch,
1995 // updating the compare to use the first extract.
1996 SmallVector<VPValue *> LoopExtracts(UF);
1997 ExtractFromALM(LoopALM, LoopExtracts);
1998 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
1999 Not->setOperand(0, LoopExtracts[0]);
2000
2001 // Update the incoming values of active lane mask phis.
2002 for (unsigned Part = 0; Part < UF; ++Part) {
2003 Phis[Part]->setStartValue(EntryExtracts[Part]);
2004 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2005 }
2006
2007 return true;
2008}
2009
2010/// Try to simplify the branch condition of \p Plan. This may restrict the
2011/// resulting plan to \p BestVF and \p BestUF.
2013 unsigned BestUF,
2015 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2016 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2017 auto *Term = &ExitingVPBB->back();
2018 VPValue *Cond;
2019 if (match(Term, m_BranchOnCount()) ||
2021 m_VPValue(), m_VPValue(), m_VPValue()))))) {
2022 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2023 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2024 const SCEV *VectorTripCount =
2026 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2027 VectorTripCount =
2029 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2030 "Trip count SCEV must be computable");
2031 ScalarEvolution &SE = *PSE.getSE();
2032 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2033 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2034 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2035 return false;
2036 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2038 // For BranchOnCond, check if we can prove the condition to be true using VF
2039 // and UF.
2040 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2041 return false;
2042 } else {
2043 return false;
2044 }
2045
2046 // The vector loop region only executes once. If possible, completely remove
2047 // the region, otherwise replace the terminator controlling the latch with
2048 // (BranchOnCond true).
2049 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
2050 // support for other non-canonical widen induction recipes (e.g.,
2051 // VPWidenPointerInductionRecipe).
2052 // TODO: fold branch-on-constant after dissolving region.
2053 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2054 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
2055 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
2056 return R->isCanonical();
2057 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
2058 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
2059 })) {
2060 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
2061 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
2062 VPBuilder Builder(Plan.getVectorPreheader());
2063 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
2064 R->getScalarType());
2065 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
2066 HeaderR.eraseFromParent();
2067 continue;
2068 }
2069 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
2070 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
2071 HeaderR.eraseFromParent();
2072 }
2073
2074 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
2075 SmallVector<VPBlockBase *> Exits = to_vector(VectorRegion->getSuccessors());
2076 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
2077 for (VPBlockBase *Exit : Exits)
2078 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
2079
2080 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
2081 B->setParent(nullptr);
2082
2083 VPBlockUtils::connectBlocks(Preheader, Header);
2084
2085 for (VPBlockBase *Exit : Exits)
2086 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
2087
2088 // Replace terminating branch-on-two-conds with branch-on-cond to early
2089 // exit.
2090 if (Exits.size() != 1) {
2091 assert(match(Term, m_BranchOnTwoConds()) && Exits.size() == 2 &&
2092 "BranchOnTwoConds needs 2 remaining exits");
2094 Term->getOperand(0));
2095 }
2097 } else {
2098 // The vector region contains header phis for which we cannot remove the
2099 // loop region yet.
2100
2101 // For BranchOnTwoConds, set the latch exit condition to true directly.
2102 if (match(Term, m_BranchOnTwoConds())) {
2103 Term->setOperand(1, Plan.getTrue());
2104 return true;
2105 }
2106
2107 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
2108 {}, {}, Term->getDebugLoc());
2109 ExitingVPBB->appendRecipe(BOC);
2110 }
2111
2112 Term->eraseFromParent();
2113
2114 return true;
2115}
2116
2117/// From the definition of llvm.experimental.get.vector.length,
2118/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
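/// For example (sketch): with VF = 4 and a trip count known to be at most 4,
/// the AVL never exceeds VF, so the EVL computation can be replaced by the
/// (zero-extended or truncated) AVL itself.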
2122 vp_depth_first_deep(Plan.getEntry()))) {
2123 for (VPRecipeBase &R : *VPBB) {
2124 VPValue *AVL;
2125 if (!match(&R, m_EVL(m_VPValue(AVL))))
2126 continue;
2127
2128 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2129 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2130 continue;
2131 ScalarEvolution &SE = *PSE.getSE();
2132 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2133 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2134 continue;
2135
2137 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2138 R.getDebugLoc());
2139 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2140 return true;
2141 }
2142 }
2143 return false;
2144}
2145
2147 unsigned BestUF,
2149 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2150 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2151
2152 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2153 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2154 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2155 MadeChange |= simplifyKnownEVL(Plan, BestVF, PSE);
2156
2157 if (MadeChange) {
2158 Plan.setVF(BestVF);
2159 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
2160 }
2161}
2162
2163/// Sink users of \p FOR after the recipe defining the previous value \p
2164/// Previous of the recurrence. \returns true if all users of \p FOR could be
2165/// re-arranged as needed or false if it is not possible.
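/// Illustrative sketch (simplified): for
///   %for  = first-order-recurrence-phi [ %init ], [ %prev ]
///   %use  = add %for, 1
///   %prev = ...
/// %use is moved after the recipe defining %prev, so the recurrence can later
/// be lowered to a splice of %for and %prev.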
2166static bool
2168 VPRecipeBase *Previous,
2169 VPDominatorTree &VPDT) {
2170 // Collect recipes that need sinking.
2173 Seen.insert(Previous);
2174 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
2175     // The previous value must not depend on the users of the recurrence phi;
2176     // otherwise FOR would not be a fixed-order recurrence.
2177 if (SinkCandidate == Previous)
2178 return false;
2179
2180 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
2181 !Seen.insert(SinkCandidate).second ||
2182 VPDT.properlyDominates(Previous, SinkCandidate))
2183 return true;
2184
2185 if (cannotHoistOrSinkRecipe(*SinkCandidate))
2186 return false;
2187
2188 WorkList.push_back(SinkCandidate);
2189 return true;
2190 };
2191
2192 // Recursively sink users of FOR after Previous.
2193 WorkList.push_back(FOR);
2194 for (unsigned I = 0; I != WorkList.size(); ++I) {
2195 VPRecipeBase *Current = WorkList[I];
2196 assert(Current->getNumDefinedValues() == 1 &&
2197 "only recipes with a single defined value expected");
2198
2199 for (VPUser *User : Current->getVPSingleValue()->users()) {
2200 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
2201 return false;
2202 }
2203 }
2204
2205 // Keep recipes to sink ordered by dominance so earlier instructions are
2206 // processed first.
2207 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2208 return VPDT.properlyDominates(A, B);
2209 });
2210
2211 for (VPRecipeBase *SinkCandidate : WorkList) {
2212 if (SinkCandidate == FOR)
2213 continue;
2214
2215 SinkCandidate->moveAfter(Previous);
2216 Previous = SinkCandidate;
2217 }
2218 return true;
2219}
2220
2221/// Try to hoist \p Previous and its operands before all users of \p FOR.
2223 VPRecipeBase *Previous,
2224 VPDominatorTree &VPDT) {
2225 if (cannotHoistOrSinkRecipe(*Previous))
2226 return false;
2227
2228 // Collect recipes that need hoisting.
2229 SmallVector<VPRecipeBase *> HoistCandidates;
2231 VPRecipeBase *HoistPoint = nullptr;
2232 // Find the closest hoist point by looking at all users of FOR and selecting
2233 // the recipe dominating all other users.
2234 for (VPUser *U : FOR->users()) {
2235 auto *R = cast<VPRecipeBase>(U);
2236 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
2237 HoistPoint = R;
2238 }
2239 assert(all_of(FOR->users(),
2240 [&VPDT, HoistPoint](VPUser *U) {
2241 auto *R = cast<VPRecipeBase>(U);
2242 return HoistPoint == R ||
2243 VPDT.properlyDominates(HoistPoint, R);
2244 }) &&
2245 "HoistPoint must dominate all users of FOR");
2246
2247 auto NeedsHoisting = [HoistPoint, &VPDT,
2248 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2249 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2250 if (!HoistCandidate)
2251 return nullptr;
2252 VPRegionBlock *EnclosingLoopRegion =
2253 HoistCandidate->getParent()->getEnclosingLoopRegion();
2254 assert((!HoistCandidate->getRegion() ||
2255 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2256 "CFG in VPlan should still be flat, without replicate regions");
2257 // Hoist candidate was already visited, no need to hoist.
2258 if (!Visited.insert(HoistCandidate).second)
2259 return nullptr;
2260
2261     // The candidate is outside the loop region or is a header phi; it dominates
2262     // FOR's users without hoisting.
2263 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
2264 return nullptr;
2265
2266 // If we reached a recipe that dominates HoistPoint, we don't need to
2267 // hoist the recipe.
2268 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
2269 return nullptr;
2270 return HoistCandidate;
2271 };
2272
2273 if (!NeedsHoisting(Previous->getVPSingleValue()))
2274 return true;
2275
2276 // Recursively try to hoist Previous and its operands before all users of FOR.
2277 HoistCandidates.push_back(Previous);
2278
2279 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2280 VPRecipeBase *Current = HoistCandidates[I];
2281 assert(Current->getNumDefinedValues() == 1 &&
2282 "only recipes with a single defined value expected");
2283 if (cannotHoistOrSinkRecipe(*Current))
2284 return false;
2285
2286 for (VPValue *Op : Current->operands()) {
2287 // If we reach FOR, it means the original Previous depends on some other
2288 // recurrence that in turn depends on FOR. If that is the case, we would
2289 // also need to hoist recipes involving the other FOR, which may break
2290 // dependencies.
2291 if (Op == FOR)
2292 return false;
2293
2294 if (auto *R = NeedsHoisting(Op)) {
2295 // Bail out if the recipe defines multiple values.
2296 // TODO: Hoisting such recipes requires additional handling.
2297 if (R->getNumDefinedValues() != 1)
2298 return false;
2299 HoistCandidates.push_back(R);
2300 }
2301 }
2302 }
2303
2304 // Order recipes to hoist by dominance so earlier instructions are processed
2305 // first.
2306 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2307 return VPDT.properlyDominates(A, B);
2308 });
2309
2310 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2311 HoistCandidate->moveBefore(*HoistPoint->getParent(),
2312 HoistPoint->getIterator());
2313 }
2314
2315 return true;
2316}
2317
2319 VPBuilder &LoopBuilder) {
2320 VPDominatorTree VPDT(Plan);
2321
2323 for (VPRecipeBase &R :
2326 RecurrencePhis.push_back(FOR);
2327
2328 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2330 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2331 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2332 // to terminate.
2333 while (auto *PrevPhi =
2335 assert(PrevPhi->getParent() == FOR->getParent());
2336 assert(SeenPhis.insert(PrevPhi).second);
2337 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2338 }
2339
2340 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2341 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2342 return false;
2343
2344 // Introduce a recipe to combine the incoming and previous values of a
2345 // fixed-order recurrence.
2346 VPBasicBlock *InsertBlock = Previous->getParent();
2347 if (isa<VPHeaderPHIRecipe>(Previous))
2348 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2349 else
2350 LoopBuilder.setInsertPoint(InsertBlock,
2351 std::next(Previous->getIterator()));
2352
2353 auto *RecurSplice =
2355 {FOR, FOR->getBackedgeValue()});
2356
2357 FOR->replaceAllUsesWith(RecurSplice);
2358 // Set the first operand of RecurSplice to FOR again, after replacing
2359 // all users.
2360 RecurSplice->setOperand(0, FOR);
2361
2362 // Check for users extracting at the penultimate active lane of the FOR.
2363 // If only a single lane is active in the current iteration, we need to
2364 // select the last element from the previous iteration (from the FOR phi
2365 // directly).
2366 for (VPUser *U : RecurSplice->users()) {
2368 m_Specific(RecurSplice))))
2369 continue;
2370
2372 VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
2373 Type *I64Ty = Type::getInt64Ty(Plan.getContext());
2374 VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 0));
2375 VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 1));
2376 VPValue *PenultimateIndex =
2377 B.createNaryOp(Instruction::Sub, {LastActiveLane, One});
2378 VPValue *PenultimateLastIter =
2379 B.createNaryOp(VPInstruction::ExtractLane,
2380 {PenultimateIndex, FOR->getBackedgeValue()});
2381 VPValue *LastPrevIter =
2382 B.createNaryOp(VPInstruction::ExtractLastLane, FOR);
2383
2384 VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero);
2385 VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter);
2386 cast<VPInstruction>(U)->replaceAllUsesWith(Sel);
2387 }
2388 }
2389 return true;
2390}
2391
2393 for (VPRecipeBase &R :
2395 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2396 if (!PhiR)
2397 continue;
2398 RecurKind RK = PhiR->getRecurrenceKind();
2399 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2401 continue;
2402
2403 for (VPUser *U : collectUsersRecursively(PhiR))
2404 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2405 RecWithFlags->dropPoisonGeneratingFlags();
2406 }
2407 }
2408}
2409
2410namespace {
2411struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2412 static bool isSentinel(const VPSingleDefRecipe *Def) {
2413 return Def == getEmptyKey() || Def == getTombstoneKey();
2414 }
2415
2416 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2417 /// return that source element type.
2418 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2419 // All VPInstructions that lower to GEPs must have the i8 source element
2420 // type (as they are PtrAdds), so we omit it.
2422 .Case<VPReplicateRecipe>([](auto *I) -> Type * {
2423 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2424 return GEP->getSourceElementType();
2425 return nullptr;
2426 })
2427 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2428 [](auto *I) { return I->getSourceElementType(); })
2429 .Default([](auto *) { return nullptr; });
2430 }
2431
2432   /// Returns true if recipe \p Def can be safely handled by CSE.
2433 static bool canHandle(const VPSingleDefRecipe *Def) {
2434 // We can extend the list of handled recipes in the future,
2435 // provided we account for the data embedded in them while checking for
2436 // equality or hashing.
2437 auto C = getOpcodeOrIntrinsicID(Def);
2438
2439 // The issue with (Insert|Extract)Value is that the index of the
2440 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2441 // VPlan.
2442 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2443 C->second == Instruction::ExtractValue)))
2444 return false;
2445
2446 // During CSE, we can only handle recipes that don't read from memory: if
2447 // they read from memory, there could be an intervening write to memory
2448 // before the next instance is CSE'd, leading to an incorrect result.
2449 return !Def->mayReadFromMemory();
2450 }
2451
2452 /// Hash the underlying data of \p Def.
2453 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2454 const VPlan *Plan = Def->getParent()->getPlan();
2455 VPTypeAnalysis TypeInfo(*Plan);
2456 hash_code Result = hash_combine(
2457 Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
2458 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2460 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2461 if (RFlags->hasPredicate())
2462 return hash_combine(Result, RFlags->getPredicate());
2463 return Result;
2464 }
2465
2466 /// Check equality of underlying data of \p L and \p R.
2467 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2468 if (isSentinel(L) || isSentinel(R))
2469 return L == R;
2470 if (L->getVPDefID() != R->getVPDefID() ||
2472 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2474 !equal(L->operands(), R->operands()))
2475 return false;
2477 "must have valid opcode info for both recipes");
2478 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2479 if (LFlags->hasPredicate() &&
2480 LFlags->getPredicate() !=
2481 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2482 return false;
2483 // Recipes in replicate regions implicitly depend on predicate. If either
2484 // recipe is in a replicate region, only consider them equal if both have
2485 // the same parent.
2486 const VPRegionBlock *RegionL = L->getRegion();
2487 const VPRegionBlock *RegionR = R->getRegion();
2488 if (((RegionL && RegionL->isReplicator()) ||
2489 (RegionR && RegionR->isReplicator())) &&
2490 L->getParent() != R->getParent())
2491 return false;
2492 const VPlan *Plan = L->getParent()->getPlan();
2493 VPTypeAnalysis TypeInfo(*Plan);
2494 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2495 }
2496};
2497} // end anonymous namespace
2498
2499/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2500/// Plan.
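/// For example (sketch): two identical ptradd recipes computing the same
/// address are collapsed into one, provided the surviving recipe dominates the
/// other; the flags of the kept recipe are intersected with the removed one's.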
2502 VPDominatorTree VPDT(Plan);
2504
2506 vp_depth_first_deep(Plan.getEntry()))) {
2507 for (VPRecipeBase &R : *VPBB) {
2508 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2509 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2510 continue;
2511 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2512 // V must dominate Def for a valid replacement.
2513 if (!VPDT.dominates(V->getParent(), VPBB))
2514 continue;
2515 // Only keep flags present on both V and Def.
2516 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2517 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2518 Def->replaceAllUsesWith(V);
2519 continue;
2520 }
2521 CSEMap[Def] = Def;
2522 }
2523 }
2524}
2525
2526/// Move loop-invariant recipes out of the vector loop region in \p Plan.
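/// For example (sketch): a recipe such as %inv = mul %live.in.a, %live.in.b,
/// whose operands are all defined outside the loop region, is moved to the
/// vector preheader.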
2527static void licm(VPlan &Plan) {
2528 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2529
2530 // Hoist any loop invariant recipes from the vector loop region to the
2531   // preheader. Perform a shallow traversal of the vector loop region, to
2532 // exclude recipes in replicate regions. Since the top-level blocks in the
2533 // vector loop region are guaranteed to execute if the vector pre-header is,
2534 // we don't need to check speculation safety.
2535 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2536 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2537          "Expected vector preheader's successor to be the vector loop region");
2539 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2540 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2542 continue;
2543 if (any_of(R.operands(), [](VPValue *Op) {
2544 return !Op->isDefinedOutsideLoopRegions();
2545 }))
2546 continue;
2547 R.moveBefore(*Preheader, Preheader->end());
2548 }
2549 }
2550}
2551
2553 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2554 if (Plan.hasScalarVFOnly())
2555 return;
2556 // Keep track of created truncates, so they can be re-used. Note that we
2557   // cannot use RAUW after creating a new truncate, as this could make
2558 // other uses have different types for their operands, making them invalidly
2559 // typed.
2561 VPTypeAnalysis TypeInfo(Plan);
2562 VPBasicBlock *PH = Plan.getVectorPreheader();
2565 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2568 continue;
2569
2570 VPValue *ResultVPV = R.getVPSingleValue();
2571 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2572 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2573 if (!NewResSizeInBits)
2574 continue;
2575
2576 // If the value wasn't vectorized, we must maintain the original scalar
2577 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2578 // skip casts which do not need to be handled explicitly here, as
2579 // redundant casts will be removed during recipe simplification.
2581 continue;
2582
2583 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2584 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2585 assert(OldResTy->isIntegerTy() && "only integer types supported");
2586 (void)OldResSizeInBits;
2587
2588 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2589
2590 // Any wrapping introduced by shrinking this operation shouldn't be
2591 // considered undefined behavior. So, we can't unconditionally copy
2592 // arithmetic wrapping flags to VPW.
2593 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2594 VPW->dropPoisonGeneratingFlags();
2595
2596 if (OldResSizeInBits != NewResSizeInBits &&
2597 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2598 // Extend result to original width.
2599 auto *Ext =
2600 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
2601 Ext->insertAfter(&R);
2602 ResultVPV->replaceAllUsesWith(Ext);
2603 Ext->setOperand(0, ResultVPV);
2604 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2605 } else {
2606 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2607 "Only ICmps should not need extending the result.");
2608 }
2609
2610 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2612 continue;
2613
2614 // Shrink operands by introducing truncates as needed.
2615 unsigned StartIdx =
2616 match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2617 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2618 auto *Op = R.getOperand(Idx);
2619 unsigned OpSizeInBits =
2621 if (OpSizeInBits == NewResSizeInBits)
2622 continue;
2623 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2624 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2625 if (!IterIsEmpty) {
2626 R.setOperand(Idx, ProcessedIter->second);
2627 continue;
2628 }
2629
2630 VPBuilder Builder;
2631 if (isa<VPIRValue>(Op))
2632 Builder.setInsertPoint(PH);
2633 else
2634 Builder.setInsertPoint(&R);
2635 VPWidenCastRecipe *NewOp =
2636 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2637 ProcessedIter->second = NewOp;
2638 R.setOperand(Idx, NewOp);
2639 }
2640
2641 }
2642 }
2643}
2644
2648 VPValue *Cond;
2649 // Skip blocks that are not terminated by BranchOnCond.
2650 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2651 continue;
2652
2653 assert(VPBB->getNumSuccessors() == 2 &&
2654 "Two successors expected for BranchOnCond");
2655 unsigned RemovedIdx;
2656 if (match(Cond, m_True()))
2657 RemovedIdx = 1;
2658 else if (match(Cond, m_False()))
2659 RemovedIdx = 0;
2660 else
2661 continue;
2662
2663 VPBasicBlock *RemovedSucc =
2664 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2665 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2666 "There must be a single edge between VPBB and its successor");
2667     // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2668 // these recipes.
2669 for (VPRecipeBase &R : RemovedSucc->phis())
2670 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2671
2672 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2673 // automatically on VPlan destruction if it becomes unreachable.
2674 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2675 VPBB->back().eraseFromParent();
2676 }
2677}
2678
2698
2699// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2700// the loop terminator with a branch-on-cond recipe with the negated
2701// active-lane-mask as operand. Note that this turns the loop into an
2702// uncountable one. Only the existing terminator is replaced; all other existing
2703// recipes/users remain unchanged, except for poison-generating flags being
2704// dropped from the canonical IV increment. Return the created
2705// VPActiveLaneMaskPHIRecipe.
2706//
2707// The function uses the following definitions:
2708//
2709// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
2710// calculate-trip-count-minus-VF (original TC) : original TC
2711// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
2712// CanonicalIVPhi : CanonicalIVIncrement
2713// %StartV is the canonical induction start value.
2714//
2715// The function adds the following recipes:
2716//
2717// vector.ph:
2718// %TripCount = calculate-trip-count-minus-VF (original TC)
2719// [if DataWithControlFlowWithoutRuntimeCheck]
2720// %EntryInc = canonical-iv-increment-for-part %StartV
2721// %EntryALM = active-lane-mask %EntryInc, %TripCount
2722//
2723// vector.body:
2724// ...
2725// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2726// ...
2727// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2728// %ALM = active-lane-mask %InLoopInc, TripCount
2729// %Negated = Not %ALM
2730// branch-on-cond %Negated
2731//
2734 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2735 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2736 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2737 VPValue *StartV = CanonicalIVPHI->getStartValue();
2738
2739 auto *CanonicalIVIncrement =
2740 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2741 // TODO: Check if dropping the flags is needed if
2742 // !DataAndControlFlowWithoutRuntimeCheck.
2743 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2744 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2745 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2746 // we have to take unrolling into account. Each part needs to start at
2747 // Part * VF
2748 auto *VecPreheader = Plan.getVectorPreheader();
2749 VPBuilder Builder(VecPreheader);
2750
2751 // Create the ActiveLaneMask instruction using the correct start values.
2752 VPValue *TC = Plan.getTripCount();
2753
2754 VPValue *TripCount, *IncrementValue;
2756 // When the loop is guarded by a runtime overflow check for the loop
2757 // induction variable increment by VF, we can increment the value before
2758     // the get.active.lane.mask intrinsic and use the unmodified trip count.
2759 IncrementValue = CanonicalIVIncrement;
2760 TripCount = TC;
2761 } else {
2762 // When avoiding a runtime check, the active.lane.mask inside the loop
2763 // uses a modified trip count and the induction variable increment is
2764 // done after the active.lane.mask intrinsic is called.
2765 IncrementValue = CanonicalIVPHI;
2766 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2767 {TC}, DL);
2768 }
2769 auto *EntryIncrement = Builder.createOverflowingOp(
2770 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2771 "index.part.next");
2772
2773 // Create the active lane mask instruction in the VPlan preheader.
2774 VPValue *ALMMultiplier =
2775 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2776 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2777 {EntryIncrement, TC, ALMMultiplier}, DL,
2778 "active.lane.mask.entry");
2779
2780 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2781 // preheader ActiveLaneMask instruction.
2782 auto *LaneMaskPhi =
2784 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2785
2786 // Create the active lane mask for the next iteration of the loop before the
2787 // original terminator.
2788 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2789 Builder.setInsertPoint(OriginalTerminator);
2790 auto *InLoopIncrement =
2791 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2792 {IncrementValue}, {false, false}, DL);
2793 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2794 {InLoopIncrement, TripCount, ALMMultiplier},
2795 DL, "active.lane.mask.next");
2796 LaneMaskPhi->addOperand(ALM);
2797
2798 // Replace the original terminator with BranchOnCond. We have to invert the
2799 // mask here because a true condition means jumping to the exit block.
2800 auto *NotMask = Builder.createNot(ALM, DL);
2801 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2802 OriginalTerminator->eraseFromParent();
2803 return LaneMaskPhi;
2804}
2805
2806/// Collect the header mask with the pattern:
2807/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2808/// TODO: Introduce explicit recipe for header-mask instead of searching
2809/// for the header-mask pattern manually.
2811 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2812 SmallVector<VPValue *> WideCanonicalIVs;
2813 auto *FoundWidenCanonicalIVUser = find_if(
2815 assert(count_if(LoopRegion->getCanonicalIV()->users(),
2817 "Must have at most one VPWideCanonicalIVRecipe");
2818 if (FoundWidenCanonicalIVUser !=
2819 LoopRegion->getCanonicalIV()->users().end()) {
2820 auto *WideCanonicalIV =
2821 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2822 WideCanonicalIVs.push_back(WideCanonicalIV);
2823 }
2824
2825 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2826 // version of the canonical induction.
2827 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
2828 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2829 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2830 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2831 WideCanonicalIVs.push_back(WidenOriginalIV);
2832 }
2833
2834 // Walk users of wide canonical IVs and find the single compare of the form
2835 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2836 VPSingleDefRecipe *HeaderMask = nullptr;
2837 for (auto *Wide : WideCanonicalIVs) {
2838 for (VPUser *U : Wide->users()) {
2839 auto *VPI = dyn_cast<VPInstruction>(U);
2840 if (!VPI || !vputils::isHeaderMask(VPI, Plan))
2841 continue;
2842
2843 assert(VPI->getOperand(0) == Wide &&
2844 "WidenCanonicalIV must be the first operand of the compare");
2845 assert(!HeaderMask && "Multiple header masks found?");
2846 HeaderMask = VPI;
2847 }
2848 }
2849 return HeaderMask;
2850}
2851
2853 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2856 UseActiveLaneMaskForControlFlow) &&
2857 "DataAndControlFlowWithoutRuntimeCheck implies "
2858 "UseActiveLaneMaskForControlFlow");
2859
2860 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2861 auto *FoundWidenCanonicalIVUser = find_if(
2863 assert(FoundWidenCanonicalIVUser &&
2864 "Must have widened canonical IV when tail folding!");
2865 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2866 auto *WideCanonicalIV =
2867 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2868 VPSingleDefRecipe *LaneMask;
2869 if (UseActiveLaneMaskForControlFlow) {
2872 } else {
2873 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2874 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2875 ConstantInt::get(LoopRegion->getCanonicalIVType(), 1));
2876 LaneMask =
2877 B.createNaryOp(VPInstruction::ActiveLaneMask,
2878 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2879 nullptr, "active.lane.mask");
2880 }
2881
2882 // Walk users of WideCanonicalIV and replace the header mask of the form
2883 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2884 // removing the old one to ensure there is always only a single header mask.
2885 HeaderMask->replaceAllUsesWith(LaneMask);
2886 HeaderMask->eraseFromParent();
2887}
2888
2889template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2890 Op0_t In;
2892
2893 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2894
2895 template <typename OpTy> bool match(OpTy *V) const {
2896 if (m_Specific(In).match(V)) {
2897 Out = nullptr;
2898 return true;
2899 }
2900 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
2901 }
2902};
2903
2904/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
2905/// Returns the remaining part \p Out if so, or nullptr otherwise.
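/// For example (sketch): matching m_RemoveMask(%hm, Out) against
/// logical-and(%hm, %m) succeeds with Out = %m, while matching it against %hm
/// itself succeeds with Out = nullptr.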
2906template <typename Op0_t, typename Op1_t>
2907static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2908 Op1_t &Out) {
2909 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2910}
2911
2912/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2913/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2914/// recipe could be created.
2915/// \p HeaderMask Header Mask.
2916/// \p CurRecipe Recipe to be transformed.
2917/// \p TypeInfo VPlan-based type analysis.
2918/// \p EVL The explicit vector length parameter of vector-predication
2919/// intrinsics.
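/// Illustrative sketch (names simplified): a load masked by the header mask
///   %v = WIDEN load %addr, logical-and(%header.mask, %m)
/// becomes an EVL-based load that only keeps the remaining mask
///   %v = WIDEN vp.load %addr, %m, %evl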
2921 VPRecipeBase &CurRecipe,
2922 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
2923 VPlan *Plan = CurRecipe.getParent()->getPlan();
2924 DebugLoc DL = CurRecipe.getDebugLoc();
2925 VPValue *Addr, *Mask, *EndPtr;
2926
2927   /// Adjust any end pointers so that they point to the end of the EVL lanes, not VF.
2928 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2929 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2930 EVLEndPtr->insertBefore(&CurRecipe);
2931 EVLEndPtr->setOperand(1, &EVL);
2932 return EVLEndPtr;
2933 };
2934
2935 if (match(&CurRecipe,
2936 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
2937 !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2938 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
2939 EVL, Mask);
2940
2941 VPValue *ReversedVal;
2942 if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
2943 match(ReversedVal,
2944 m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
2945 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2946 cast<VPWidenLoadRecipe>(ReversedVal)->isReverse()) {
2947 auto *LoadR = new VPWidenLoadEVLRecipe(
2948 *cast<VPWidenLoadRecipe>(ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask);
2949 LoadR->insertBefore(&CurRecipe);
2950 return new VPWidenIntrinsicRecipe(
2951 Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
2952 TypeInfo.inferScalarType(LoadR), {}, {}, DL);
2953 }
2954
2955 VPValue *StoredVal;
2956 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
2957 m_RemoveMask(HeaderMask, Mask))) &&
2958 !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2959 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
2960 StoredVal, EVL, Mask);
2961
2962 if (match(&CurRecipe,
2963 m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
2964 m_RemoveMask(HeaderMask, Mask))) &&
2965 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2966 cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) {
2967 auto *NewReverse = new VPWidenIntrinsicRecipe(
2968 Intrinsic::experimental_vp_reverse,
2969 {ReversedVal, Plan->getTrue(), &EVL},
2970 TypeInfo.inferScalarType(ReversedVal), {}, {}, DL);
2971 NewReverse->insertBefore(&CurRecipe);
2972 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
2973 AdjustEndPtr(EndPtr), NewReverse, EVL,
2974 Mask);
2975 }
2976
2977 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
2978 if (Rdx->isConditional() &&
2979 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
2980 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
2981
2982 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
2983 if (Interleave->getMask() &&
2984 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
2985 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
2986
2987 VPValue *LHS, *RHS;
2988 if (match(&CurRecipe,
2989 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
2990 return new VPWidenIntrinsicRecipe(
2991 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
2992 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2993
2994 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
2995 m_VPValue(RHS))))
2996 return new VPWidenIntrinsicRecipe(
2997 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
2998 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2999
3000 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
3001 Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
3002 VPValue *ZExt =
3003 VPBuilder(&CurRecipe).createScalarCast(Instruction::ZExt, &EVL, Ty, DL);
3004 return new VPInstruction(Instruction::Sub,
3005 {ZExt, Plan->getConstantInt(Ty, 1)}, {}, {}, DL);
3006 }
3007
3008 return nullptr;
3009}
3010
3011/// Replace recipes with their EVL variants.
3013 VPTypeAnalysis TypeInfo(Plan);
3014 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3015 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3016
3017 assert(all_of(Plan.getVF().users(),
3020 "User of VF that we can't transform to EVL.");
3021 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3023 });
3024
3025 assert(all_of(Plan.getVFxUF().users(),
3026 [&LoopRegion, &Plan](VPUser *U) {
3027 return match(U,
3028 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3029 m_Specific(&Plan.getVFxUF()))) ||
3030 isa<VPWidenPointerInductionRecipe>(U);
3031 }) &&
3032 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3033 "increment of the canonical induction.");
3034 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3035   // Only replace uses in VPWidenPointerInductionRecipe; the increment of the
3036 // canonical induction must not be updated.
3038 });
3039
3040 // Defer erasing recipes till the end so that we don't invalidate the
3041 // VPTypeAnalysis cache.
3043
3044   // Create a scalar phi to track the previous EVL if the plan contains a
3045   // fixed-order recurrence.
3046 bool ContainsFORs =
3048 if (ContainsFORs) {
3049 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3050 VPValue *MaxEVL = &Plan.getVF();
3051     // Emit VPScalarCastRecipe in preheader if VF is not a 32-bit integer.
3052 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3053 MaxEVL = Builder.createScalarZExtOrTrunc(
3054 MaxEVL, Type::getInt32Ty(Plan.getContext()),
3055 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
3056
3057 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3058 VPValue *PrevEVL = Builder.createScalarPhi(
3059 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3060
3063 for (VPRecipeBase &R : *VPBB) {
3064 VPValue *V1, *V2;
3065 if (!match(&R,
3067 m_VPValue(V1), m_VPValue(V2))))
3068 continue;
3069 VPValue *Imm = Plan.getOrAddLiveIn(
3072 Intrinsic::experimental_vp_splice,
3073 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3074 TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
3075 R.getDebugLoc());
3076 VPSplice->insertBefore(&R);
3077 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3078 ToErase.push_back(&R);
3079 }
3080 }
3081 }
3082
3083 VPValue *HeaderMask = findHeaderMask(Plan);
3084 if (!HeaderMask)
3085 return;
3086
3087 // Replace header masks with a mask equivalent to predicating by EVL:
3088 //
3089 // icmp ule widen-canonical-iv backedge-taken-count
3090 // ->
3091 // icmp ult step-vector, EVL
3092 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3093 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3094 Type *EVLType = TypeInfo.inferScalarType(&EVL);
3095 VPValue *EVLMask = Builder.createICmp(
3097 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3098 HeaderMask->replaceAllUsesWith(EVLMask);
3099 ToErase.push_back(HeaderMask->getDefiningRecipe());
3100
3101 // Try to optimize header mask recipes away to their EVL variants.
3102 // TODO: Split optimizeMaskToEVL out and move into
3103 // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
3104 // tryToBuildVPlanWithVPRecipes beforehand.
3105 for (VPUser *U : collectUsersRecursively(EVLMask)) {
3106 auto *CurRecipe = cast<VPRecipeBase>(U);
3107 VPRecipeBase *EVLRecipe =
3108 optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, EVL);
3109 if (!EVLRecipe)
3110 continue;
3111
3112 unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
3113 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
3114 "New recipe must define the same number of values as the "
3115 "original.");
3116 EVLRecipe->insertBefore(CurRecipe);
3118 EVLRecipe)) {
3119 for (unsigned I = 0; I < NumDefVal; ++I) {
3120 VPValue *CurVPV = CurRecipe->getVPValue(I);
3121 CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
3122 }
3123 }
3124 ToErase.push_back(CurRecipe);
3125 }
3126 // Remove dead EVL mask.
3127 if (EVLMask->getNumUsers() == 0)
3128 ToErase.push_back(EVLMask->getDefiningRecipe());
3129
3130 for (VPRecipeBase *R : reverse(ToErase)) {
3131 SmallVector<VPValue *> PossiblyDead(R->operands());
3132 R->eraseFromParent();
3133 for (VPValue *Op : PossiblyDead)
3135 }
3136}
3137
3138/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
3139/// replaces all uses except the canonical IV increment of
3140/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe
3141/// is used only for counting loop iterations after this transformation.
3142///
3143/// The function uses the following definitions:
3144/// %StartV is the canonical induction start value.
3145///
3146/// The function adds the following recipes:
3147///
3148/// vector.ph:
3149/// ...
3150///
3151/// vector.body:
3152/// ...
3153/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
3154/// [ %NextEVLIV, %vector.body ]
3155/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3156/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3157/// ...
3158/// %OpEVL = cast i32 %VPEVL to IVSize
3159/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
3160/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3161/// ...
3162///
3163/// If MaxSafeElements is provided, the function adds the following recipes:
3164/// vector.ph:
3165/// ...
3166///
3167/// vector.body:
3168/// ...
3169/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
3170/// [ %NextEVLIV, %vector.body ]
3171/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3172/// %cmp = cmp ult %AVL, MaxSafeElements
3173/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3174/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3175/// ...
3176/// %OpEVL = cast i32 %VPEVL to IVSize
3177/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
3178/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3179/// ...
3180///
3182 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3183 if (Plan.hasScalarVFOnly())
3184 return;
3185 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3186 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3187
3188 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
3189 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3190 VPValue *StartV = CanonicalIVPHI->getStartValue();
3191
3192 // Create the ExplicitVectorLengthPhi recipe in the main loop.
3193 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
3194 EVLPhi->insertAfter(CanonicalIVPHI);
3195 VPBuilder Builder(Header, Header->getFirstNonPhi());
3196 // Create the AVL (application vector length), starting from TC -> 0 in steps
3197 // of EVL.
3198 VPPhi *AVLPhi = Builder.createScalarPhi(
3199 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3200 VPValue *AVL = AVLPhi;
3201
3202 if (MaxSafeElements) {
3203 // Clamp the AVL to MaxSafeElements (the maximum safe dependence distance).
3204 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3205 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3206 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3207 "safe_avl");
3208 }
3209 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3211
3212 auto *CanonicalIVIncrement =
3213 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
3214 Builder.setInsertPoint(CanonicalIVIncrement);
3215 VPValue *OpVPEVL = VPEVL;
3216
3217 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3218 OpVPEVL = Builder.createScalarZExtOrTrunc(
3219 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3220
3221 auto *NextEVLIV = Builder.createOverflowingOp(
3222 Instruction::Add, {OpVPEVL, EVLPhi},
3223 {CanonicalIVIncrement->hasNoUnsignedWrap(),
3224 CanonicalIVIncrement->hasNoSignedWrap()},
3225 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
3226 EVLPhi->addOperand(NextEVLIV);
3227
3228 VPValue *NextAVL = Builder.createOverflowingOp(
3229 Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},
3230 DebugLoc::getCompilerGenerated(), "avl.next");
3231 AVLPhi->addOperand(NextAVL);
3232
3233 transformRecipestoEVLRecipes(Plan, *VPEVL);
3234
3235 // Replace all uses of VPCanonicalIVPHIRecipe by
3236 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
3237 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
3238 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
3239 // TODO: support unroll factor > 1.
3240 Plan.setUF(1);
3241}
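// For illustration, assuming the target computes the EVL as min(AVL, VF) and
// VF = 4 with a trip count of 10, the recipes added above produce:
//
//   iteration 0: %avl = 10, %evl = 4, %avl.next = 6
//   iteration 1: %avl =  6, %evl = 4, %avl.next = 2
//   iteration 2: %avl =  2, %evl = 2, %avl.next = 0
//
// %index.evl.next accumulates the EVLs (4, 8, 10), so the EVL-based IV always
// equals the number of elements processed so far.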
3242
3244 // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
3245 // There should be only one EVL PHI in the entire plan.
3246 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
3247
3250 for (VPRecipeBase &R : VPBB->phis())
3251 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
3252 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
3253 EVLPhi = PhiR;
3254 }
3255
3256 // Early return if no EVL PHI is found.
3257 if (!EVLPhi)
3258 return;
3259
3260 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
3261 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
3262 VPValue *AVL;
3263 [[maybe_unused]] bool FoundAVL =
3264 match(EVLIncrement,
3265 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
3266 assert(FoundAVL && "Didn't find AVL?");
3267
3268 // The AVL may be capped to a safe distance.
3269 VPValue *SafeAVL;
3270 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
3271 AVL = SafeAVL;
3272
3273 VPValue *AVLNext;
3274 [[maybe_unused]] bool FoundAVLNext =
3276 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3277 assert(FoundAVLNext && "Didn't find AVL backedge?");
3278
3279 // Convert EVLPhi to concrete recipe.
3280 auto *ScalarR =
3281 VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
3282 EVLPhi->getDebugLoc(), "evl.based.iv");
3283 EVLPhi->replaceAllUsesWith(ScalarR);
3284 EVLPhi->eraseFromParent();
3285
3286 // Replace CanonicalIVInc with EVL-PHI increment.
3287 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3288 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
3289 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
3290 m_Specific(&Plan.getVFxUF()))) &&
3291 "Unexpected canonical iv");
3292 Backedge->replaceAllUsesWith(EVLIncrement);
3293
3294 // Remove unused phi and increment.
3295 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
3296 CanonicalIVIncrement->eraseFromParent();
3297 CanonicalIV->eraseFromParent();
3298
3299 // Replace the use of VectorTripCount in the latch-exiting block.
3300 // Before: (branch-on-cond (icmp eq EVLIVInc, VectorTripCount))
3301 // After: (branch-on-cond (icmp eq AVLNext, 0))
3302 VPBasicBlock *LatchExiting =
3303 HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
3304 auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
3305 if (match(LatchExitingBr, m_BranchOnCond(m_True())))
3306 return;
3307
3308 assert(match(LatchExitingBr, m_BranchOnCond(m_SpecificCmp(
3309 CmpInst::ICMP_EQ, m_VPValue(EVLIncrement),
3310 m_Specific(&Plan.getVectorTripCount())))) &&
3311 "Expected BranchOnCond with ICmp comparing EVL increment with vector "
3312 "trip count");
3313
3314 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3315 VPBuilder Builder(LatchExitingBr);
3316 LatchExitingBr->setOperand(0,
3317 Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
3318 Plan.getConstantInt(AVLTy, 0)));
3319}
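// For illustration, the rewrite above turns the latch exit of an EVL loop
// from a comparison against the vector trip count into a comparison of the
// remaining AVL against zero:
//
//   Before: EMIT branch-on-cond (icmp eq %index.evl.next, %vector.trip.count)
//   After:  EMIT branch-on-cond (icmp eq %avl.next, 0)
//
// which removes the last use of the canonical IV and its increment.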
3320
3322 VPlan &Plan, PredicatedScalarEvolution &PSE,
3323 const DenseMap<Value *, const SCEV *> &StridesMap) {
3324 // Replace VPValues for known constant strides guaranteed by predicated
3325 // scalar evolution.
3326 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3327 auto *R = cast<VPRecipeBase>(&U);
3328 return R->getRegion() ||
3329 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3330 };
3331 ValueToSCEVMapTy RewriteMap;
3332 for (const SCEV *Stride : StridesMap.values()) {
3333 using namespace SCEVPatternMatch;
3334 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3335 const APInt *StrideConst;
3336 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3337 // Only handle constant strides for now.
3338 continue;
3339
3340 auto *CI = Plan.getConstantInt(*StrideConst);
3341 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3342 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3343
3344 // The versioned value may not be used in the loop directly but through a
3345 // sext/zext. Add new live-ins in those cases.
3346 for (Value *U : StrideV->users()) {
3348 continue;
3349 VPValue *StrideVPV = Plan.getLiveIn(U);
3350 if (!StrideVPV)
3351 continue;
3352 unsigned BW = U->getType()->getScalarSizeInBits();
3353 APInt C =
3354 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3355 VPValue *CI = Plan.getConstantInt(C);
3356 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3357 }
3358 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3359 }
3360
3361 for (VPRecipeBase &R : *Plan.getEntry()) {
3362 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3363 if (!ExpSCEV)
3364 continue;
3365 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3366 auto *NewSCEV =
3367 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3368 if (NewSCEV != ScevExpr) {
3369 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3370 ExpSCEV->replaceAllUsesWith(NewExp);
3371 if (Plan.getTripCount() == ExpSCEV)
3372 Plan.resetTripCount(NewExp);
3373 }
3374 }
3375}
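// For illustration, with a loop versioned on "%stride == 1" the transform
// above rewrites uses such as
//
//   Before: vp<%offset> = mul vp<%iv>, ir<%stride>
//   After:  vp<%offset> = mul vp<%iv>, ir<1>
//
// Live-ins for sext/zext users of the stride are replaced by suitably
// extended constants, and expand-scev recipes in the entry block are
// rewritten with the constant stride so derived expressions (including the
// trip count) can fold.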
3376
3378 VPlan &Plan,
3379 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3380 // Collect recipes in the backward slice of `Root` that may generate a poison
3381 // value that is used after vectorization.
3383 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3385 Worklist.push_back(Root);
3386
3387 // Traverse the backward slice of Root through its use-def chain.
3388 while (!Worklist.empty()) {
3389 VPRecipeBase *CurRec = Worklist.pop_back_val();
3390
3391 if (!Visited.insert(CurRec).second)
3392 continue;
3393
3394 // Prune search if we find another recipe generating a widen memory
3395 // instruction. Widen memory instructions involved in address computation
3396 // will lead to gather/scatter instructions, which don't need to be
3397 // handled.
3399 VPHeaderPHIRecipe>(CurRec))
3400 continue;
3401
3402 // This recipe contributes to the address computation of a widen
3403 // load/store. If the underlying instruction has poison-generating flags,
3404 // drop them directly.
3405 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3406 VPValue *A, *B;
3407 // Dropping disjoint from an OR may yield incorrect results, as some
3408 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3409 // for dependence analysis). Instead, replace it with an equivalent Add.
3410 // This is possible as all users of the disjoint OR only access lanes
3411 // where the operands are disjoint or poison otherwise.
3412 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3413 RecWithFlags->isDisjoint()) {
3414 VPBuilder Builder(RecWithFlags);
3415 VPInstruction *New = Builder.createOverflowingOp(
3416 Instruction::Add, {A, B}, {false, false},
3417 RecWithFlags->getDebugLoc());
3418 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3419 RecWithFlags->replaceAllUsesWith(New);
3420 RecWithFlags->eraseFromParent();
3421 CurRec = New;
3422 } else
3423 RecWithFlags->dropPoisonGeneratingFlags();
3424 } else {
3427 (void)Instr;
3428 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3429 "found instruction with poison generating flags not covered by "
3430 "VPRecipeWithIRFlags");
3431 }
3432
3433 // Add new definitions to the worklist.
3434 for (VPValue *Operand : CurRec->operands())
3435 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3436 Worklist.push_back(OpDef);
3437 }
3438 });
3439
3440 // Traverse all the recipes in the VPlan and collect the poison-generating
3441 // recipes in the backward slice starting at the address of a
3442 // VPWidenMemoryRecipe or VPInterleaveRecipe.
3443 auto Iter = vp_depth_first_deep(Plan.getEntry());
3445 for (VPRecipeBase &Recipe : *VPBB) {
3446 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3447 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3448 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3449 if (AddrDef && WidenRec->isConsecutive() &&
3450 BlockNeedsPredication(UnderlyingInstr.getParent()))
3451 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3452 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3453 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3454 if (AddrDef) {
3455 // Check if any member of the interleave group needs predication.
3456 const InterleaveGroup<Instruction> *InterGroup =
3457 InterleaveRec->getInterleaveGroup();
3458 bool NeedPredication = false;
3459 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3460 I < NumMembers; ++I) {
3461 Instruction *Member = InterGroup->getMember(I);
3462 if (Member)
3463 NeedPredication |= BlockNeedsPredication(Member->getParent());
3464 }
3465
3466 if (NeedPredication)
3467 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3468 }
3469 }
3470 }
3471 }
3472}
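// For illustration, if the address of a consecutive widened load in a
// predicated block is computed as
//
//   vp<%off>  = or disjoint vp<%a>, vp<%b>
//   vp<%addr> = getelementptr inbounds ir<%base>, vp<%off>
//
// the backward-slice walk above replaces the disjoint or with a plain
// "add vp<%a>, vp<%b>" and drops poison-generating flags such as inbounds
// from the GEP recipe, so masked-off lanes cannot feed poison into the wide
// memory address.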
3473
3475 VPlan &Plan,
3477 &InterleaveGroups,
3478 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3479 if (InterleaveGroups.empty())
3480 return;
3481
3482 // Interleave memory: for each Interleave Group we marked earlier as relevant
3483 // for this VPlan, replace the Recipes widening its memory instructions with a
3484 // single VPInterleaveRecipe at its insertion point.
3485 VPDominatorTree VPDT(Plan);
3486 for (const auto *IG : InterleaveGroups) {
3487 auto *Start =
3488 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3489 VPIRMetadata InterleaveMD(*Start);
3490 SmallVector<VPValue *, 4> StoredValues;
3491 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3492 StoredValues.push_back(StoreR->getStoredValue());
3493 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3494 Instruction *MemberI = IG->getMember(I);
3495 if (!MemberI)
3496 continue;
3497 VPWidenMemoryRecipe *MemoryR =
3498 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3499 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3500 StoredValues.push_back(StoreR->getStoredValue());
3501 InterleaveMD.intersect(*MemoryR);
3502 }
3503
3504 bool NeedsMaskForGaps =
3505 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3506 (!StoredValues.empty() && !IG->isFull());
3507
3508 Instruction *IRInsertPos = IG->getInsertPos();
3509 auto *InsertPos =
3510 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3511
3513 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3514 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3515 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3516
3517 // Get or create the start address for the interleave group.
3518 VPValue *Addr = Start->getAddr();
3519 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3520 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3521 // We cannot re-use the address of member zero because it does not
3522 // dominate the insert position. Instead, use the address of the insert
3523 // position and create a PtrAdd adjusting it to the address of member
3524 // zero.
3525 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3526 // InsertPos or sink loads above zero members to join it.
3527 assert(IG->getIndex(IRInsertPos) != 0 &&
3528 "index of insert position shouldn't be zero");
3529 auto &DL = IRInsertPos->getDataLayout();
3530 APInt Offset(32,
3531 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3532 IG->getIndex(IRInsertPos),
3533 /*IsSigned=*/true);
3534 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3535 VPBuilder B(InsertPos);
3536 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3537 }
3538 // If the group is reverse, adjust the index to refer to the last vector
3539 // lane instead of the first. We adjust the index from the first vector
3540 // lane, rather than directly getting the pointer for lane VF - 1, because
3541 // the pointer operand of the interleaved access is supposed to be uniform.
3542 if (IG->isReverse()) {
3543 auto *ReversePtr = new VPVectorEndPointerRecipe(
3544 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3545 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3546 ReversePtr->insertBefore(InsertPos);
3547 Addr = ReversePtr;
3548 }
3549 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3550 InsertPos->getMask(), NeedsMaskForGaps,
3551 InterleaveMD, InsertPos->getDebugLoc());
3552 VPIG->insertBefore(InsertPos);
3553
3554 unsigned J = 0;
3555 for (unsigned i = 0; i < IG->getFactor(); ++i)
3556 if (Instruction *Member = IG->getMember(i)) {
3557 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3558 if (!Member->getType()->isVoidTy()) {
3559 VPValue *OriginalV = MemberR->getVPSingleValue();
3560 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3561 J++;
3562 }
3563 MemberR->eraseFromParent();
3564 }
3565 }
3566}
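// For illustration, consider an interleave group of i32 loads with factor 2
// whose insert position is the member at index 1 and whose member-0 address
// does not dominate the insert position. The group address is then derived
// from the insert position's address with a negative byte offset of
// index * size = 1 * 4:
//
//   vp<%ig.addr> = ptradd vp<%insert.pos.addr>, ir<-4>
//   INTERLEAVE-GROUP load ... from vp<%ig.addr>
//
// For reverse groups, a vector-end-pointer recipe additionally adjusts the
// address to point at the last vector lane.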
3567
3568/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3569/// value, phi and backedge value. In the following example:
3570///
3571/// vector.ph:
3572/// Successor(s): vector loop
3573///
3574/// <x1> vector loop: {
3575/// vector.body:
3576/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3577/// ...
3578/// EMIT branch-on-count ...
3579/// No successors
3580/// }
3581///
3582/// WIDEN-INDUCTION will get expanded to:
3583///
3584/// vector.ph:
3585/// ...
3586/// vp<%induction.start> = ...
3587/// vp<%induction.increment> = ...
3588///
3589/// Successor(s): vector loop
3590///
3591/// <x1> vector loop: {
3592/// vector.body:
3593/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3594/// ...
3595/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3596/// EMIT branch-on-count ...
3597/// No successors
3598/// }
3599static void
3601 VPTypeAnalysis &TypeInfo) {
3602 VPlan *Plan = WidenIVR->getParent()->getPlan();
3603 VPValue *Start = WidenIVR->getStartValue();
3604 VPValue *Step = WidenIVR->getStepValue();
3605 VPValue *VF = WidenIVR->getVFValue();
3606 DebugLoc DL = WidenIVR->getDebugLoc();
3607
3608 // The value from the original loop to which we are mapping the new induction
3609 // variable.
3610 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3611
3612 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3615 VPIRFlags Flags = *WidenIVR;
3616 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3617 AddOp = Instruction::Add;
3618 MulOp = Instruction::Mul;
3619 } else {
3620 AddOp = ID.getInductionOpcode();
3621 MulOp = Instruction::FMul;
3622 }
3623
3624 // If the phi is truncated, truncate the start and step values.
3625 VPBuilder Builder(Plan->getVectorPreheader());
3626 Type *StepTy = TypeInfo.inferScalarType(Step);
3627 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3628 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3629 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3630 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3631 // Truncation doesn't preserve WrapFlags.
3632 Flags.dropPoisonGeneratingFlags();
3633 StepTy = Ty;
3634 }
3635
3636 // Construct the initial value of the vector IV in the vector loop preheader.
3637 Type *IVIntTy =
3639 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3640 if (StepTy->isFloatingPointTy())
3641 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3642
3643 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3644 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3645
3646 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3647 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3648 DebugLoc::getUnknown(), "induction");
3649
3650 // Create the widened phi of the vector IV.
3651 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3652 WidenIVR->getDebugLoc(), "vec.ind");
3653 WidePHI->insertBefore(WidenIVR);
3654
3655 // Create the backedge value for the vector IV.
3656 VPValue *Inc;
3657 VPValue *Prev;
3658 // If unrolled, use the increment and prev value from the operands.
3659 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3660 Inc = SplatVF;
3661 Prev = WidenIVR->getLastUnrolledPartOperand();
3662 } else {
3663 if (VPRecipeBase *R = VF->getDefiningRecipe())
3664 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3665 // Multiply the vectorization factor by the step using integer or
3666 // floating-point arithmetic as appropriate.
3667 if (StepTy->isFloatingPointTy())
3668 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3669 DL);
3670 else
3671 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3672 TypeInfo.inferScalarType(VF), DL);
3673
3674 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3675 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3676 Prev = WidePHI;
3677 }
3678
3680 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3681 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3682 WidenIVR->getDebugLoc(), "vec.ind.next");
3683
3684 WidePHI->addOperand(Next);
3685
3686 WidenIVR->replaceAllUsesWith(WidePHI);
3687}
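// For illustration, for an integer IV with start 0, step 2 and a fixed VF of
// 4, the expansion above produces:
//
//   vector.ph:
//     vp<%induction> = <0, 2, 4, 6>          ; step-vector * step + start
//     vp<%inc>       = broadcast (2 * 4 = 8) ; step * VF
//   vector.body:
//     WIDEN-PHI ir<%i> = phi vp<%induction>, vp<%vec.ind.next>
//     ...
//     vp<%vec.ind.next> = add ir<%i>, vp<%inc>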
3688
3689/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3690/// initial value, phi and backedge value. In the following example:
3691///
3692/// <x1> vector loop: {
3693/// vector.body:
3694/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3695/// ...
3696/// EMIT branch-on-count ...
3697/// }
3698///
3699/// WIDEN-POINTER-INDUCTION will get expanded to:
3700///
3701/// <x1> vector loop: {
3702/// vector.body:
3703/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3704/// EMIT %mul = mul %stepvector, %step
3705/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3706/// ...
3707/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3708/// EMIT branch-on-count ...
3709/// }
3711 VPTypeAnalysis &TypeInfo) {
3712 VPlan *Plan = R->getParent()->getPlan();
3713 VPValue *Start = R->getStartValue();
3714 VPValue *Step = R->getStepValue();
3715 VPValue *VF = R->getVFValue();
3716
3717 assert(R->getInductionDescriptor().getKind() ==
3719 "Not a pointer induction according to InductionDescriptor!");
3720 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3721 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3722 "Recipe should have been replaced");
3723
3724 VPBuilder Builder(R);
3725 DebugLoc DL = R->getDebugLoc();
3726
3727 // Build a scalar pointer phi.
3728 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3729
3730 // Create actual address geps that use the pointer phi as base and a
3731 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3732 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3733 Type *StepTy = TypeInfo.inferScalarType(Step);
3734 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3735 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3736 VPValue *PtrAdd = Builder.createNaryOp(
3737 VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
3738 R->replaceAllUsesWith(PtrAdd);
3739
3740 // Create the backedge value for the scalar pointer phi.
3742 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3743 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3744 DL);
3745 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3746
3747 VPValue *InductionGEP =
3748 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3749 ScalarPtrPhi->addOperand(InductionGEP);
3750}
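// For illustration, for a pointer IV whose descriptor step is 4 bytes (e.g.
// walking an i32 array) and a fixed VF of 4, the expansion above produces:
//
//   EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
//   EMIT %mul        = mul step-vector, 4          ; <0, 4, 8, 12>
//   EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
//   ...
//   EMIT %ptr.ind    = ptradd %pointer.phi, 16     ; step * VF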
3751
3753 // Replace loop regions with explicit CFG.
3754 SmallVector<VPRegionBlock *> LoopRegions;
3756 vp_depth_first_deep(Plan.getEntry()))) {
3757 if (!R->isReplicator())
3758 LoopRegions.push_back(R);
3759 }
3760 for (VPRegionBlock *R : LoopRegions)
3761 R->dissolveToCFGLoop();
3762}
3763
3766 // The transform runs after dissolving loop regions, so all VPBasicBlocks
3767 // terminated with BranchOnTwoConds are reached via a shallow traversal.
3770 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
3771 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
3772 }
3773
3774 // Expand BranchOnTwoConds instructions into explicit CFG with
3775 // single-condition branches, by introducing a new branch in VPBB that jumps
3776 // to a new intermediate block if either condition is true and to the
3777 // third successor otherwise. The intermediate block jumps to the first or
3778 // second successor, depending on the first condition.
3779 for (VPInstruction *Br : WorkList) {
3780 assert(Br->getNumOperands() == 2 &&
3781 "BranchOnTwoConds must have exactly 2 conditions");
3782 DebugLoc DL = Br->getDebugLoc();
3783 VPBasicBlock *Latch = Br->getParent();
3784 const auto Successors = to_vector(Latch->getSuccessors());
3785 assert(Successors.size() == 3 &&
3786 "BranchOnTwoConds must have exactly 3 successors");
3787
3788 for (VPBlockBase *Succ : Successors)
3789 VPBlockUtils::disconnectBlocks(Latch, Succ);
3790
3791 VPValue *EarlyExitingCond = Br->getOperand(0);
3792 VPValue *LateExitingCond = Br->getOperand(1);
3793 VPBlockBase *EarlyExitBB = Successors[0];
3794 VPBlockBase *LateExitBB = Successors[1];
3795 VPBlockBase *Header = Successors[2];
3796
3797 VPBasicBlock *MiddleSplit = Plan.createVPBasicBlock("middle.split");
3798 MiddleSplit->setParent(LateExitBB->getParent());
3799
3800 VPBuilder Builder(Latch);
3801 VPValue *AnyExitTaken = Builder.createNaryOp(
3802 Instruction::Or, {EarlyExitingCond, LateExitingCond}, DL);
3803 Builder.createNaryOp(VPInstruction::BranchOnCond, {AnyExitTaken}, DL);
3804 VPBlockUtils::connectBlocks(Latch, MiddleSplit);
3805 VPBlockUtils::connectBlocks(Latch, Header);
3806
3807 VPBuilder(MiddleSplit)
3808 .createNaryOp(VPInstruction::BranchOnCond, {EarlyExitingCond}, DL);
3809 VPBlockUtils::connectBlocks(MiddleSplit, EarlyExitBB);
3810 VPBlockUtils::connectBlocks(MiddleSplit, LateExitBB);
3811
3812 Br->eraseFromParent();
3813 }
3814}
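// For illustration, a latch terminated by
//
//   EMIT branch-on-two-conds %early.cond, %latch.cond
//     successors: [0] early exit, [1] middle block, [2] header
//
// is expanded by the loop above into
//
//   latch:        EMIT branch-on-cond (or %early.cond, %latch.cond)
//                   successors: middle.split, header
//   middle.split: EMIT branch-on-cond %early.cond
//                   successors: early exit, middle block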
3815
3817 VPTypeAnalysis TypeInfo(Plan);
3820 vp_depth_first_deep(Plan.getEntry()))) {
3821 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3822 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3823 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3824 ToRemove.push_back(WidenIVR);
3825 continue;
3826 }
3827
3828 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3829 // If the recipe only generates scalars, scalarize it instead of
3830 // expanding it.
3831 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
3832 VPBuilder Builder(WidenIVR);
3833 VPValue *PtrAdd =
3834 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
3835 WidenIVR->replaceAllUsesWith(PtrAdd);
3836 ToRemove.push_back(WidenIVR);
3837 continue;
3838 }
3839 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3840 ToRemove.push_back(WidenIVR);
3841 continue;
3842 }
3843
3844 // Expand VPBlendRecipe into VPInstruction::Select.
3845 VPBuilder Builder(&R);
3846 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3847 VPValue *Select = Blend->getIncomingValue(0);
3848 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3849 Select = Builder.createSelect(Blend->getMask(I),
3850 Blend->getIncomingValue(I), Select,
3851 R.getDebugLoc(), "predphi");
3852 Blend->replaceAllUsesWith(Select);
3853 ToRemove.push_back(Blend);
3854 }
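// For illustration, a blend
//   BLEND ir<%phi> = ir<%a> ir<%b>/vp<%m1> ir<%c>/vp<%m2>
// becomes the select chain
//   vp<%s1>      = select vp<%m1>, ir<%b>, ir<%a>
//   vp<%predphi> = select vp<%m2>, ir<%c>, vp<%s1>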
3855
3856 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3857 Expr->decompose();
3858 ToRemove.push_back(Expr);
3859 }
3860
3861 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
3862 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
3863 if (LastActiveL &&
3864 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
3865 // Create Not(Mask) for all operands.
3867 for (VPValue *Op : LastActiveL->operands()) {
3868 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
3869 NotMasks.push_back(NotMask);
3870 }
3871
3872 // Create FirstActiveLane on the inverted masks.
3873 VPValue *FirstInactiveLane = Builder.createNaryOp(
3875 LastActiveL->getDebugLoc(), "first.inactive.lane");
3876
3877 // Subtract 1 to get the last active lane.
3878 VPValue *One = Plan.getOrAddLiveIn(
3879 ConstantInt::get(Type::getInt64Ty(Plan.getContext()), 1));
3880 VPValue *LastLane = Builder.createNaryOp(
3881 Instruction::Sub, {FirstInactiveLane, One},
3882 LastActiveL->getDebugLoc(), "last.active.lane");
3883
3884 LastActiveL->replaceAllUsesWith(LastLane);
3885 ToRemove.push_back(LastActiveL);
3886 continue;
3887 }
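// For illustration, with a single mask operand <1, 1, 1, 0> the expansion
// above computes not(mask) = <0, 0, 0, 1>, first-active-lane = 3 and
// last-active-lane = 3 - 1 = 2, the index of the last lane whose mask bit
// is set.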
3888
3889 // Lower BranchOnCount to ICmp + BranchOnCond.
3890 VPValue *IV, *TC;
3891 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
3892 auto *BranchOnCountInst = cast<VPInstruction>(&R);
3893 DebugLoc DL = BranchOnCountInst->getDebugLoc();
3894 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
3895 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
3896 ToRemove.push_back(BranchOnCountInst);
3897 continue;
3898 }
3899
3900 VPValue *VectorStep;
3901 VPValue *ScalarStep;
3903 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
3904 continue;
3905
3906 // Expand WideIVStep.
3907 auto *VPI = cast<VPInstruction>(&R);
3908 Type *IVTy = TypeInfo.inferScalarType(VPI);
3909 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
3911 ? Instruction::UIToFP
3912 : Instruction::Trunc;
3913 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
3914 }
3915
3916 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
3917 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
3918 ScalarStep =
3919 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
3920 }
3921
3922 VPIRFlags Flags;
3923 if (IVTy->isFloatingPointTy())
3924 Flags = {VPI->getFastMathFlags()};
3925
3926 unsigned MulOpc =
3927 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
3928 VPInstruction *Mul = Builder.createNaryOp(
3929 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
3930 VectorStep = Mul;
3931 VPI->replaceAllUsesWith(VectorStep);
3932 ToRemove.push_back(VPI);
3933 }
3934 }
3935
3936 for (VPRecipeBase *R : ToRemove)
3937 R->eraseFromParent();
3938}
3939
3941 VPBasicBlock *EarlyExitVPBB,
3942 VPlan &Plan,
3943 VPBasicBlock *HeaderVPBB,
3944 VPBasicBlock *LatchVPBB) {
3945 auto *MiddleVPBB = cast<VPBasicBlock>(LatchVPBB->getSuccessors()[0]);
3946 if (!EarlyExitVPBB->getSinglePredecessor() &&
3947 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
3948 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
3949 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
3950 "unsupported early exit VPBB");
3951 // Early exit operand should always be last phi operand. If EarlyExitVPBB
3952 // has two predecessors and EarlyExitingVPBB is the first, swap the operands
3953 // of the phis.
3954 for (VPRecipeBase &R : EarlyExitVPBB->phis())
3955 cast<VPIRPhi>(&R)->swapOperands();
3956 }
3957
3958 VPBuilder Builder(LatchVPBB->getTerminator());
3959 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
3960 assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
3961 "Terminator must be be BranchOnCond");
3962 VPValue *CondOfEarlyExitingVPBB =
3963 EarlyExitingVPBB->getTerminator()->getOperand(0);
3964 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
3965 ? CondOfEarlyExitingVPBB
3966 : Builder.createNot(CondOfEarlyExitingVPBB);
3967
3968 // Create a BranchOnTwoConds in the latch that branches to:
3969 // [0] vector.early.exit, [1] middle block, [2] header (continue looping).
3970 VPValue *IsEarlyExitTaken =
3971 Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
3972 VPBasicBlock *VectorEarlyExitVPBB =
3973 Plan.createVPBasicBlock("vector.early.exit");
3974 VectorEarlyExitVPBB->setParent(EarlyExitVPBB->getParent());
3975
3976 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
3977
3978 // Update the exit phis in the early exit block.
3979 VPBuilder MiddleBuilder(MiddleVPBB);
3980 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
3981 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
3982 auto *ExitIRI = cast<VPIRPhi>(&R);
3983 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
3984 // a single predecessor and 1 if it has two.
3985 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
3986 if (ExitIRI->getNumOperands() != 1) {
3987 // The first of two operands corresponds to the latch exit, via MiddleVPBB
3988 // predecessor. Extract its final lane.
3989 ExitIRI->extractLastLaneOfLastPartOfFirstOperand(MiddleBuilder);
3990 }
3991
3992 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
3993 if (!isa<VPIRValue>(IncomingFromEarlyExit)) {
3994 // Update the incoming value from the early exit.
3995 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
3996 VPInstruction::FirstActiveLane, {CondToEarlyExit},
3997 DebugLoc::getUnknown(), "first.active.lane");
3998 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
3999 VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
4000 DebugLoc::getUnknown(), "early.exit.value");
4001 ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
4002 }
4003 }
4004
4005 // Replace the conditional branch controlling the latch exit from the vector
4006 // loop with a multi-conditional branch exiting to vector early exit if the
4007 // early exit has been taken, exiting to middle block if the original
4008 // condition of the vector latch is true, otherwise continuing back to header.
4009 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4010 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
4011 "Unexpected terminator");
4012 auto *IsLatchExitTaken =
4013 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
4014 LatchExitingBranch->getOperand(1));
4015
4016 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4017 LatchExitingBranch->eraseFromParent();
4018
4019 Builder.setInsertPoint(LatchVPBB);
4020 Builder.createNaryOp(VPInstruction::BranchOnTwoConds,
4021 {IsEarlyExitTaken, IsLatchExitTaken}, LatchDL);
4022 LatchVPBB->clearSuccessors();
4023 LatchVPBB->setSuccessors({VectorEarlyExitVPBB, MiddleVPBB, HeaderVPBB});
4024 VectorEarlyExitVPBB->setPredecessors({LatchVPBB});
4025}
4026
4027/// This function tries to convert extended in-loop reductions to
4028/// VPExpressionRecipe and clamps the \p Range if doing so is beneficial and
4029/// valid. The created recipe must be decomposed to its constituent
4030/// recipes before execution.
4031static VPExpressionRecipe *
4033 VFRange &Range) {
4034 Type *RedTy = Ctx.Types.inferScalarType(Red);
4035 VPValue *VecOp = Red->getVecOp();
4036
4037 // Clamp the range if using extended-reduction is profitable.
4038 auto IsExtendedRedValidAndClampRange =
4039 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4041 [&](ElementCount VF) {
4042 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4044
4045 InstructionCost ExtRedCost;
4046 InstructionCost ExtCost =
4047 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4048 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4049
4050 if (Red->isPartialReduction()) {
4053 // FIXME: Move partial reduction creation, costing and clamping
4054 // here from LoopVectorize.cpp.
4055 ExtRedCost = Ctx.TTI.getPartialReductionCost(
4056 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
4057 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind);
4058 } else {
4059 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4060 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4061 Red->getFastMathFlags(), CostKind);
4062 }
4063 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4064 },
4065 Range);
4066 };
4067
4068 VPValue *A;
4069 // Match reduce(ext(A)).
4070 if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
4071 IsExtendedRedValidAndClampRange(
4072 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4073 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
4074 Ctx.Types.inferScalarType(A)))
4075 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4076
4077 return nullptr;
4078}
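// For illustration, given
//
//   vp<%ext> = zext <N x i8> ir<%x> to <N x i32>
//   vp<%red> = reduce.add(vp<%ext>)
//
// the match above bundles the cast and the reduction into one
// VPExpressionRecipe whenever TTI reports the combined extended (or partial)
// reduction as cheaper than costing the extend and the reduction separately.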
4079
4080/// This function tries to convert in-loop reductions into multiply-accumulate
4081/// VPExpressionRecipes and clamps the \p Range if doing so is beneficial
4082/// and valid. The created VPExpressionRecipe must be decomposed to its
4083/// constituent recipes before execution. Patterns of the
4084/// VPExpressionRecipe:
4085/// reduce.add(mul(...)),
4086/// reduce.add(mul(ext(A), ext(B))),
4087/// reduce.add(ext(mul(ext(A), ext(B)))).
4088static VPExpressionRecipe *
4090 VPCostContext &Ctx, VFRange &Range) {
4091 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4092 if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
4093 return nullptr;
4094
4095 Type *RedTy = Ctx.Types.inferScalarType(Red);
4096
4097 // Clamp the range if using multiply-accumulate-reduction is profitable.
4098 auto IsMulAccValidAndClampRange =
4100 VPWidenCastRecipe *OuterExt) -> bool {
4102 [&](ElementCount VF) {
4104 Type *SrcTy =
4105 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
4106 InstructionCost MulAccCost;
4107
4108 if (Red->isPartialReduction()) {
4109 Type *SrcTy2 =
4110 Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
4111 // FIXME: Move partial reduction creation, costing and clamping
4112 // here from LoopVectorize.cpp.
4113 MulAccCost = Ctx.TTI.getPartialReductionCost(
4114 Opcode, SrcTy, SrcTy2, RedTy, VF,
4116 Ext0->getOpcode())
4119 Ext1->getOpcode())
4121 Mul->getOpcode(), CostKind);
4122 } else {
4123 // Only partial reductions support mixed extends at the moment.
4124 if (Ext0 && Ext1 && Ext0->getOpcode() != Ext1->getOpcode())
4125 return false;
4126
4127 bool IsZExt =
4128 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4129 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4130 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4131 SrcVecTy, CostKind);
4132 }
4133
4134 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4135 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4136 InstructionCost ExtCost = 0;
4137 if (Ext0)
4138 ExtCost += Ext0->computeCost(VF, Ctx);
4139 if (Ext1)
4140 ExtCost += Ext1->computeCost(VF, Ctx);
4141 if (OuterExt)
4142 ExtCost += OuterExt->computeCost(VF, Ctx);
4143
4144 return MulAccCost.isValid() &&
4145 MulAccCost < ExtCost + MulCost + RedCost;
4146 },
4147 Range);
4148 };
4149
4150 VPValue *VecOp = Red->getVecOp();
4151 VPRecipeBase *Sub = nullptr;
4152 VPValue *A, *B;
4153 VPValue *Tmp = nullptr;
4154 // Sub reductions could have a sub between the add reduction and vec op.
4155 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4156 Sub = VecOp->getDefiningRecipe();
4157 VecOp = Tmp;
4158 }
4159
4160 // If ValB is a constant and can be safely extended, truncate it to the same
4161 // type as ExtA's operand, then extend it to the same type as ExtA. This
4162 // creates two uniform extends that can more easily be matched by the rest of
4163 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4164 // replaced with the new extend of the constant.
4165 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
4166 VPWidenCastRecipe *&ExtB,
4167 VPValue *&ValB, VPWidenRecipe *Mul) {
4168 if (!ExtA || ExtB || !isa<VPIRValue>(ValB))
4169 return;
4170 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
4171 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4172 const APInt *Const;
4173 if (!match(ValB, m_APInt(Const)) ||
4175 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4176 return;
4177 // The truncate ensures that the type of each extended operand is the
4178 // same, and it's been proven that the constant can be extended from
4179 // NarrowTy safely. Necessary since ExtA's extended operand would be
4180 // e.g. an i8, while the const will likely be an i32. This will be
4181 // elided by later optimisations.
4182 VPBuilder Builder(Mul);
4183 auto *Trunc =
4184 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4185 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
4186 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4187 Mul->setOperand(1, ExtB);
4188 };
4189
4190 // Try to match reduce.add(mul(...)).
4191 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4194 auto *Mul = cast<VPWidenRecipe>(VecOp);
4195
4196 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4197 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4198
4199 // Match reduce.add/sub(mul(ext, ext)).
4200 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4201 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4202 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4203 if (Sub)
4204 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4205 cast<VPWidenRecipe>(Sub), Red);
4206 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4207 }
4208 // TODO: Add an expression type for this variant with a negated mul
4209 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4210 return new VPExpressionRecipe(Mul, Red);
4211 }
4212 // TODO: Add an expression type for negated versions of other expression
4213 // variants.
4214 if (Sub)
4215 return nullptr;
4216
4217 // Match reduce.add(ext(mul(A, B))).
4218 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4219 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4220 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4223
4224 // reduce.add(ext(mul(ext, const)))
4225 // -> reduce.add(ext(mul(ext, ext(const))))
4226 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4227
4228 // reduce.add(ext(mul(ext(A), ext(B))))
4229 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4230 // The inner extends must either have the same opcode as the outer extend,
4231 // or be the same recipe; in the latter case the multiply can never result
4232 // in a negative value, so the outer extend can be folded away by doing
4233 // wider extends for the operands of the mul.
4234 if (Ext0 && Ext1 &&
4235 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4236 Ext0->getOpcode() == Ext1->getOpcode() &&
4237 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
4238 auto *NewExt0 = new VPWidenCastRecipe(
4239 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
4240 *Ext0, *Ext0, Ext0->getDebugLoc());
4241 NewExt0->insertBefore(Ext0);
4242
4243 VPWidenCastRecipe *NewExt1 = NewExt0;
4244 if (Ext0 != Ext1) {
4245 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4246 Ext->getResultType(), nullptr, *Ext1,
4247 *Ext1, Ext1->getDebugLoc());
4248 NewExt1->insertBefore(Ext1);
4249 }
4250 Mul->setOperand(0, NewExt0);
4251 Mul->setOperand(1, NewExt1);
4252 Red->setOperand(1, Mul);
4253 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4254 }
4255 }
4256 return nullptr;
4257}
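// For illustration, a dot-product style chain
//
//   vp<%a.ext> = sext <N x i8> ir<%a> to <N x i32>
//   vp<%b.ext> = sext <N x i8> ir<%b> to <N x i32>
//   vp<%mul>   = mul vp<%a.ext>, vp<%b.ext>
//   vp<%red>   = reduce.add(vp<%mul>)
//
// is bundled above into a single multiply-accumulate VPExpressionRecipe when
// TTI reports it cheaper than the separate extends, multiply and reduction.
// The reduce.add(ext(mul(ext, ext))) form is first rewritten to use wider
// extends directly on the mul operands.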
4258
4259/// This function tries to create abstract recipes from the reduction recipe
4260/// for subsequent optimizations and cost estimation.
4262 VPCostContext &Ctx,
4263 VFRange &Range) {
4264 VPExpressionRecipe *AbstractR = nullptr;
4265 auto IP = std::next(Red->getIterator());
4266 auto *VPBB = Red->getParent();
4267 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4268 AbstractR = MulAcc;
4269 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4270 AbstractR = ExtRed;
4271 // No abstract in-loop reduction recipe could be created.
4272 if (!AbstractR)
4273 return;
4274
4275 AbstractR->insertBefore(*VPBB, IP);
4276 Red->replaceAllUsesWith(AbstractR);
4277}
4278
4289
4291 if (Plan.hasScalarVFOnly())
4292 return;
4293
4294#ifndef NDEBUG
4295 VPDominatorTree VPDT(Plan);
4296#endif
4297
4298 SmallVector<VPValue *> VPValues;
4301 append_range(VPValues, Plan.getLiveIns());
4302 for (VPRecipeBase &R : *Plan.getEntry())
4303 append_range(VPValues, R.definedValues());
4304
4305 auto *VectorPreheader = Plan.getVectorPreheader();
4306 for (VPValue *VPV : VPValues) {
4308 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
4309 continue;
4310
4311 // Add explicit broadcast at the insert point that dominates all users.
4312 VPBasicBlock *HoistBlock = VectorPreheader;
4313 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4314 for (VPUser *User : VPV->users()) {
4315 if (User->usesScalars(VPV))
4316 continue;
4317 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4318 HoistPoint = HoistBlock->begin();
4319 else
4320 assert(VPDT.dominates(VectorPreheader,
4321 cast<VPRecipeBase>(User)->getParent()) &&
4322 "All users must be in the vector preheader or dominated by it");
4323 }
4324
4325 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4326 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
4327 VPV->replaceUsesWithIf(Broadcast,
4328 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4329 return Broadcast != &U && !U.usesScalars(VPV);
4330 });
4331 }
4332}
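// For illustration, a loop-invariant live-in ir<%x> with a vector user
//
//   WIDEN vp<%r> = add ir<%x>, vp<%v>
//
// is rewritten above to use an explicit broadcast materialized in the vector
// preheader:
//
//   vector.ph: vp<%x.bc> = broadcast ir<%x>
//   loop:      WIDEN vp<%r> = add vp<%x.bc>, vp<%v>
//
// while scalar users keep using ir<%x> directly.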
4333
4335 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4336
4337 // Collect candidate loads with invariant addresses and noalias scopes
4338 // metadata and memory-writing recipes with noalias metadata.
4342 vp_depth_first_shallow(LoopRegion->getEntry()))) {
4343 for (VPRecipeBase &R : *VPBB) {
4344 // Only handle single-scalar replicated loads with invariant addresses.
4345 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4346 if (RepR->isPredicated() || !RepR->isSingleScalar() ||
4347 RepR->getOpcode() != Instruction::Load)
4348 continue;
4349
4350 VPValue *Addr = RepR->getOperand(0);
4351 if (Addr->isDefinedOutsideLoopRegions()) {
4353 if (!Loc.AATags.Scope)
4354 continue;
4355 CandidateLoads.push_back({RepR, Loc});
4356 }
4357 }
4358 if (R.mayWriteToMemory()) {
4360 if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
4361 return;
4362 Stores.push_back(*Loc);
4363 }
4364 }
4365 }
4366
4367 VPBasicBlock *Preheader = Plan.getVectorPreheader();
4368 for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
4369 // Hoist the load to the preheader if it doesn't alias with any stores
4370 // according to the noalias metadata. Other loads should have been hoisted
4371 // by other passes.
4372 const AAMDNodes &LoadAA = LoadLoc.AATags;
4373 if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
4375 LoadAA.Scope, StoreLoc.AATags.NoAlias);
4376 })) {
4377 LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
4378 }
4379 }
4380}
4381
4382// Collect common metadata from a group of replicate recipes by intersecting
4383// metadata from all recipes in the group.
4385 VPIRMetadata CommonMetadata = *Recipes.front();
4386 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4387 CommonMetadata.intersect(*Recipe);
4388 return CommonMetadata;
4389}
4390
4391template <unsigned Opcode>
4395 const Loop *L) {
4396 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4397 "Only Load and Store opcodes supported");
4398 constexpr bool IsLoad = (Opcode == Instruction::Load);
4399 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4400 VPTypeAnalysis TypeInfo(Plan);
4401
4402 // Group predicated operations by their address SCEV.
4404 for (VPBlockBase *Block : vp_depth_first_shallow(LoopRegion->getEntry())) {
4405 auto *VPBB = cast<VPBasicBlock>(Block);
4406 for (VPRecipeBase &R : *VPBB) {
4407 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
4408 if (!RepR || RepR->getOpcode() != Opcode || !RepR->isPredicated())
4409 continue;
4410
4411 // For loads, operand 0 is address; for stores, operand 1 is address.
4412 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
4413 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
4414 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
4415 RecipesByAddress[AddrSCEV].push_back(RepR);
4416 }
4417 }
4418
4419 // For each address, collect operations with the same or complementary masks.
4421 auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4422 return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
4423 };
4424 for (auto &[Addr, Recipes] : RecipesByAddress) {
4425 if (Recipes.size() < 2)
4426 continue;
4427
4428 // Collect groups with the same or complementary masks.
4429 for (VPReplicateRecipe *&RecipeI : Recipes) {
4430 if (!RecipeI)
4431 continue;
4432
4433 VPValue *MaskI = RecipeI->getMask();
4434 Type *TypeI = GetLoadStoreValueType(RecipeI);
4436 Group.push_back(RecipeI);
4437 RecipeI = nullptr;
4438
4439 // Find all operations with the same or complementary masks.
4440 bool HasComplementaryMask = false;
4441 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4442 if (!RecipeJ)
4443 continue;
4444
4445 VPValue *MaskJ = RecipeJ->getMask();
4446 Type *TypeJ = GetLoadStoreValueType(RecipeJ);
4447 if (TypeI == TypeJ) {
4448 // Check if any operation in the group has a complementary mask with
4449 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4450 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4451 match(MaskJ, m_Not(m_Specific(MaskI)));
4452 Group.push_back(RecipeJ);
4453 RecipeJ = nullptr;
4454 }
4455 }
4456
4457 if (HasComplementaryMask) {
4458 assert(Group.size() >= 2 && "must have at least 2 entries");
4459 AllGroups.push_back(std::move(Group));
4460 }
4461 }
4462 }
4463
4464 return AllGroups;
4465}
4466
4467// Find the recipe with minimum alignment in the group.
4468template <typename InstType>
4469static VPReplicateRecipe *
4471 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4472 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4473 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4474 });
4475}
4476
4479 const Loop *L) {
4480 auto Groups =
4482 if (Groups.empty())
4483 return;
4484
4485 VPDominatorTree VPDT(Plan);
4486
4487 // Process each group of loads.
4488 for (auto &Group : Groups) {
4489 // Sort loads by dominance order, with earliest (most dominating) first.
4490 sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4491 return VPDT.properlyDominates(A, B);
4492 });
4493
4494 // Try to use the earliest (most dominating) load to replace all others.
4495 VPReplicateRecipe *EarliestLoad = Group[0];
4496 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4497 VPBasicBlock *LastBB = Group.back()->getParent();
4498
4499 // Check that the load doesn't alias with stores between first and last.
4500 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4501 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
4502 continue;
4503
4504 // Collect common metadata from all loads in the group.
4505 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4506
4507 // Find the load with minimum alignment to use.
4508 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4509
4510 // Create an unpredicated version of the earliest load with common
4511 // metadata.
4512 auto *UnpredicatedLoad = new VPReplicateRecipe(
4513 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4514 /*IsSingleScalar=*/false, /*Mask=*/nullptr, *EarliestLoad,
4515 CommonMetadata);
4516
4517 UnpredicatedLoad->insertBefore(EarliestLoad);
4518
4519 // Replace all loads in the group with the unpredicated load.
4520 for (VPReplicateRecipe *Load : Group) {
4521 Load->replaceAllUsesWith(UnpredicatedLoad);
4522 Load->eraseFromParent();
4523 }
4524 }
4525}
4526
4527static bool
4529 PredicatedScalarEvolution &PSE, const Loop &L,
4530 VPTypeAnalysis &TypeInfo) {
4531 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
4532 if (!StoreLoc || !StoreLoc->AATags.Scope)
4533 return false;
4534
4535 // When sinking a group of stores, all members of the group alias each other.
4536 // Skip them during the alias checks.
4537 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4538 StoresToSink.end());
4539
4540 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4541 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4542 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
4543 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
4544}
4545
4548 const Loop *L) {
4549 auto Groups =
4551 if (Groups.empty())
4552 return;
4553
4554 VPDominatorTree VPDT(Plan);
4555 VPTypeAnalysis TypeInfo(Plan);
4556
4557 for (auto &Group : Groups) {
4558 sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4559 return VPDT.properlyDominates(A, B);
4560 });
4561
4562 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L, TypeInfo))
4563 continue;
4564
4565 // Use the last (most dominated) store's location for the unconditional
4566 // store.
4567 VPReplicateRecipe *LastStore = Group.back();
4568 VPBasicBlock *InsertBB = LastStore->getParent();
4569
4570 // Collect common alias metadata from all stores in the group.
4571 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4572
4573 // Build select chain for stored values.
4574 VPValue *SelectedValue = Group[0]->getOperand(0);
4575 VPBuilder Builder(InsertBB, LastStore->getIterator());
4576
4577 for (unsigned I = 1; I < Group.size(); ++I) {
4578 VPValue *Mask = Group[I]->getMask();
4579 VPValue *Value = Group[I]->getOperand(0);
4580 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4581 Group[I]->getDebugLoc());
4582 }
4583
4584 // Find the store with minimum alignment to use.
4585 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4586
4587 // Create unconditional store with selected value and common metadata.
4588 auto *UnpredicatedStore =
4589 new VPReplicateRecipe(StoreWithMinAlign->getUnderlyingInstr(),
4590 {SelectedValue, LastStore->getOperand(1)},
4591 /*IsSingleScalar=*/false,
4592 /*Mask=*/nullptr, *LastStore, CommonMetadata);
4593 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
4594
4595 // Remove all predicated stores from the group.
4596 for (VPReplicateRecipe *Store : Group)
4597 Store->eraseFromParent();
4598 }
4599}
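// For illustration, two predicated stores to the same address with
// complementary masks
//
//   store vp<%v1> to vp<%p>, mask vp<%m>
//   store vp<%v2> to vp<%p>, mask (not vp<%m>)
//
// are replaced above by a select feeding a single unconditional store:
//
//   vp<%val> = select (not vp<%m>), vp<%v2>, vp<%v1>
//   store vp<%val> to vp<%p>
//
// using the smallest alignment and the intersected metadata of the group.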
4600
4602 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4604 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4605 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4606
4607 VPValue *TC = Plan.getTripCount();
4608 // Skip cases for which the trip count may be non-trivial to materialize.
4609 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
4610 // tail is required.
4611 if (!Plan.hasScalarTail() ||
4613 Plan.getScalarPreheader() ||
4614 !isa<VPIRValue>(TC))
4615 return;
4616
4617 // Materialize the vector trip count for constant trip counts early, if it
4618 // can simply be computed as (Original TC / (VF * UF)) * (VF * UF).
4619 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4620 // tail-folded loops.
4621 ScalarEvolution &SE = *PSE.getSE();
4622 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
4623 if (!isa<SCEVConstant>(TCScev))
4624 return;
4625 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
4626 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
4627 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
4628 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4629}
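// For illustration, with a constant trip count of 17, VF = 4 and UF = 2, the
// computation above yields VFxUF = 8 and a vector trip count of
// (17 udiv 8) * 8 = 16, materialized as a constant so later simplifications
// can use it directly.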
4630
4632 VPBasicBlock *VectorPH) {
4634 if (BTC->getNumUsers() == 0)
4635 return;
4636
4637 VPBuilder Builder(VectorPH, VectorPH->begin());
4638 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4639 auto *TCMO = Builder.createNaryOp(
4640 Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)},
4641 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
4642 BTC->replaceAllUsesWith(TCMO);
4643}
4644
4646 if (Plan.hasScalarVFOnly())
4647 return;
4648
4649 VPTypeAnalysis TypeInfo(Plan);
4650 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4651 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4653 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4654 vp_depth_first_shallow(LoopRegion->getEntry()));
4655 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
4656 // VPInstructions, excluding ones in replicate regions. The latter are not
4657 // materialized explicitly yet; their vector users are still handled in
4658 // VPReplicateRegion::execute(), via shouldPack().
4659 // TODO: materialize build vectors for replicating recipes in replicating
4660 // regions.
4661 for (VPBasicBlock *VPBB :
4662 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
4663 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4665 continue;
4666 auto *DefR = cast<VPRecipeWithIRFlags>(&R);
4667 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
4668 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4669 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
4670 };
4671 if ((isa<VPReplicateRecipe>(DefR) &&
4672 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
4673 (isa<VPInstruction>(DefR) &&
4675 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
4676 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
4677 continue;
4678
4679 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
4680 unsigned Opcode = ScalarTy->isStructTy()
4683 auto *BuildVector = new VPInstruction(Opcode, {DefR});
4684 BuildVector->insertAfter(DefR);
4685
4686 DefR->replaceUsesWithIf(
4687 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
4688 VPUser &U, unsigned) {
4689 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
4690 });
4691 }
4692 }
4693
4694 // Create explicit VPInstructions to convert vectors to scalars. The current
4695 // implementation is conservative - it may skip some defs that may or may not
4696 // turn out to be vector values. TODO: introduce Unpacks speculatively and
4697 // remove them later if they are known to operate on scalar values.
4698 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
4699 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4702 continue;
4703 for (VPValue *Def : R.definedValues()) {
4704 // Skip recipes that are single-scalar or only have their first lane
4705 // used.
4706 // TODO: The Defs skipped here may or may not be vector values.
4707 // Introduce Unpacks, and remove them later, if they are guaranteed to
4708 // produce scalar values.
4710 continue;
4711
4712 // At the moment, we create unpacks only for scalar users outside
4713 // replicate regions. Recipes inside replicate regions still extract the
4714 // required lanes implicitly.
4715 // TODO: Remove once replicate regions are unrolled completely.
4716 auto IsCandidateUnpackUser = [Def](VPUser *U) {
4717 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4718 return U->usesScalars(Def) &&
4719 (!ParentRegion || !ParentRegion->isReplicator());
4720 };
4721 if (none_of(Def->users(), IsCandidateUnpackUser))
4722 continue;
4723
4724 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
4725 if (R.isPhi())
4726 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
4727 else
4728 Unpack->insertAfter(&R);
4729 Def->replaceUsesWithIf(Unpack,
4730 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
4731 return IsCandidateUnpackUser(&U);
4732 });
4733 }
4734 }
4735 }
4736}
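For intuition, a small standalone sketch (plain C++ over std::array, not VPlan code; names are made up) of what the materialized BuildVector and Unpack instructions correspond to for a fixed VF:

#include <array>
#include <cstddef>

// BuildVector packs the per-lane scalar results of a replicating recipe into
// a single vector value; Unpack extracts every lane of a vector value for
// scalar users.
template <typename T, std::size_t VF>
std::array<T, VF> buildVector(const T (&LaneResults)[VF]) {
  std::array<T, VF> Vec{};
  for (std::size_t Lane = 0; Lane != VF; ++Lane)
    Vec[Lane] = LaneResults[Lane]; // conceptually one insertelement per lane
  return Vec;
}

template <typename T, std::size_t VF>
void unpack(const std::array<T, VF> &Vec, T (&Lanes)[VF]) {
  for (std::size_t Lane = 0; Lane != VF; ++Lane)
    Lanes[Lane] = Vec[Lane]; // conceptually one extractelement per lane
}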
4737
4739 VPBasicBlock *VectorPHVPBB,
4740 bool TailByMasking,
4741 bool RequiresScalarEpilogue) {
4742 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
4743 // There's nothing to do if there are no users of the vector trip count or its
4744 // IR value has already been set.
4745 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
4746 return;
4747
4748 VPValue *TC = Plan.getTripCount();
4749 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
4750 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
4751 VPValue *Step = &Plan.getVFxUF();
4752
4753 // If the tail is to be folded by masking, round the number of iterations N
4754 // up to a multiple of Step instead of rounding down. This is done by first
4755 // adding Step-1 and then rounding down. Note that it's ok if this addition
4756 // overflows: the vector induction variable will eventually wrap to zero given
4757 // that it starts at zero and its Step is a power of two; the loop will then
4758 // exit, with the last early-exit vector comparison also producing all-true.
4759 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
4760 // is accounted for in emitIterationCountCheck that adds an overflow check.
4761 if (TailByMasking) {
4762 TC = Builder.createNaryOp(
4763 Instruction::Add,
4764 {TC, Builder.createNaryOp(Instruction::Sub,
4765 {Step, Plan.getConstantInt(TCTy, 1)})},
4766 DebugLoc::getCompilerGenerated(), "n.rnd.up");
4767 }
4768
4769 // Now we need to generate the expression for the part of the loop that the
4770 // vectorized body will execute. This is equal to N - (N % Step) if scalar
4771 // iterations are not required for correctness, or N - Step, otherwise. Step
4772 // is equal to the vectorization factor (number of SIMD elements) times the
4773 // unroll factor (number of SIMD instructions).
4774 VPValue *R =
4775 Builder.createNaryOp(Instruction::URem, {TC, Step},
4776 DebugLoc::getCompilerGenerated(), "n.mod.vf");
4777
4778 // There are cases where we *must* run at least one iteration in the remainder
4779 // loop. See the cost model for when this can happen. If the step evenly
4780 // divides the trip count, we set the remainder to be equal to the step. If
4781 // the step does not evenly divide the trip count, no adjustment is necessary
4782 // since there will already be scalar iterations. Note that the minimum
4783 // iterations check ensures that N >= Step.
4784 if (RequiresScalarEpilogue) {
4785 assert(!TailByMasking &&
4786 "requiring scalar epilogue is not supported with fail folding");
4787 VPValue *IsZero =
4788 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0));
4789 R = Builder.createSelect(IsZero, Step, R);
4790 }
4791
4792 VPValue *Res = Builder.createNaryOp(
4793 Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
4794 VectorTC.replaceAllUsesWith(Res);
4795}
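A minimal standalone sketch (plain C++, not the VPlan API; the helper name is made up) of the scalar arithmetic the recipes above materialize for the vector trip count:

#include <cassert>
#include <cstdint>

// TC is the scalar trip count, Step = VF * UF.
uint64_t computeVectorTripCount(uint64_t TC, uint64_t Step, bool TailByMasking,
                                bool RequiresScalarEpilogue) {
  assert(!(TailByMasking && RequiresScalarEpilogue));
  if (TailByMasking)
    TC += Step - 1;       // n.rnd.up: round N up to a multiple of Step
  uint64_t R = TC % Step; // n.mod.vf
  if (RequiresScalarEpilogue && R == 0)
    R = Step;             // force at least one scalar remainder iteration
  return TC - R;          // n.vec
}
// E.g. TC = 10, Step = 4: plain -> 8, tail-folded -> 12,
// with required scalar epilogue -> 8 (two scalar iterations remain).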
4796
4798 ElementCount VFEC) {
4799 VPBuilder Builder(VectorPH, VectorPH->begin());
4800 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4801 VPValue &VF = Plan.getVF();
4802 VPValue &VFxUF = Plan.getVFxUF();
4803 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
4804 // used.
4805 // TODO: Assert that they aren't used.
4806
4807 // If there are no users of the runtime VF, compute VFxUF by constant folding
4808 // the multiplication of VF and UF.
4809 if (VF.getNumUsers() == 0) {
4810 VPValue *RuntimeVFxUF =
4811 Builder.createElementCount(TCTy, VFEC * Plan.getUF());
4812 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
4813 return;
4814 }
4815
4816 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
4817 // vscale) * UF.
4818 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
4820 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
4822 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
4823 }
4824 VF.replaceAllUsesWith(RuntimeVF);
4825
4826 VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF());
4827 VPValue *MulByUF = Builder.createOverflowingOp(
4828 Instruction::Mul, {RuntimeVF, UF}, {true, false});
4829 VFxUF.replaceAllUsesWith(MulByUF);
4830}
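As a rough illustration (assumed values, plain C++ rather than VPlan recipes) of the quantities materialized above for a scalable VF of vscale x MinVF and unroll factor UF:

#include <cstdint>

// 'VScale' stands in for the runtime vscale value.
uint64_t runtimeVF(uint64_t MinVF, uint64_t VScale) { return MinVF * VScale; }

uint64_t runtimeVFxUF(uint64_t MinVF, uint64_t UF, uint64_t VScale) {
  // When VF itself has no users, this is emitted directly as the element
  // count (MinVF * UF) x vscale instead of a separate multiply.
  return runtimeVF(MinVF, VScale) * UF;
}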
4831
4834 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
4835
4836 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
4837 BasicBlock *EntryBB = Entry->getIRBasicBlock();
4838 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
4839 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
4841 continue;
4842 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
4843 if (!ExpSCEV)
4844 break;
4845 const SCEV *Expr = ExpSCEV->getSCEV();
4846 Value *Res =
4847 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
4848 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
4849 VPValue *Exp = Plan.getOrAddLiveIn(Res);
4850 ExpSCEV->replaceAllUsesWith(Exp);
4851 if (Plan.getTripCount() == ExpSCEV)
4852 Plan.resetTripCount(Exp);
4853 ExpSCEV->eraseFromParent();
4854 }
4856 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
4857 "after any VPIRInstructions");
4858 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
4859 // to the VPIRBasicBlock.
4860 auto EI = Entry->begin();
4861 for (Instruction &I : drop_end(*EntryBB)) {
4862 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
4863 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
4864 EI++;
4865 continue;
4866 }
4868 }
4869
4870 return ExpandedSCEVs;
4871}
4872
4873/// Returns true if \p OpV is defined by a VPWidenLoadRecipe or
4874/// VPInterleaveRecipe that can be converted to a narrower recipe. \p OpV is
4875/// used by a wide recipe that feeds a store interleave group at index \p Idx;
4876/// \p WideMember0 is the recipe feeding the same interleave group at index 0.
4877/// A VPWidenLoadRecipe can be narrowed to an index-independent load if it
4878/// feeds all wide ops at all indices, i.e. \p OpV must also be the operand at
4879/// index \p OpIdx of \p WideMember0. A VPInterleaveRecipe can be narrowed to a
4880/// wide load if \p OpV is defined at index \p Idx of a load interleave group.
4881static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
4882 VPValue *OpV, unsigned Idx) {
4883 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
4884 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
4885 if (!Member0OpR)
4886 return Member0Op == OpV;
4887 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
4888 return !W->getMask() && Member0Op == OpV;
4889 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
4890 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
4891 return false;
4892}
4893
4894/// Returns true if \p InterleaveR is a full interleave group with factor and
4895/// number of members both equal to \p VF. The interleave group must also
4896/// access the full vector width \p VectorRegWidth.
4898 ElementCount VF,
4899 VPTypeAnalysis &TypeInfo,
4900 TypeSize VectorRegWidth) {
4901 if (!InterleaveR || InterleaveR->getMask())
4902 return false;
4903
4904 Type *GroupElementTy = nullptr;
4905 if (InterleaveR->getStoredValues().empty()) {
4906 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
4907 if (!all_of(InterleaveR->definedValues(),
4908 [&TypeInfo, GroupElementTy](VPValue *Op) {
4909 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4910 }))
4911 return false;
4912 } else {
4913 GroupElementTy =
4914 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
4915 if (!all_of(InterleaveR->getStoredValues(),
4916 [&TypeInfo, GroupElementTy](VPValue *Op) {
4917 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4918 }))
4919 return false;
4920 }
4921
4922 unsigned VFMin = VF.getKnownMinValue();
4923 TypeSize GroupSize = TypeSize::get(
4924 GroupElementTy->getScalarSizeInBits() * VFMin, VF.isScalable());
4925 const auto *IG = InterleaveR->getInterleaveGroup();
4926 return IG->getFactor() == VFMin && IG->getNumMembers() == VFMin &&
4927 GroupSize == VectorRegWidth;
4928}
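As a concrete instance of the size check above (assumed numbers, plain C++): for i32 members and VF = 4, the group size is 32 * 4 = 128 bits, so a factor-4 group with 4 members qualifies exactly when the vector register is 128 bits wide:

// A group qualifies when its factor and member count both equal the known
// minimum VF and the group covers exactly one vector register.
bool groupFillsRegister(unsigned ElemBits, unsigned VFMin, unsigned Factor,
                        unsigned NumMembers, unsigned RegWidthBits) {
  return Factor == VFMin && NumMembers == VFMin &&
         ElemBits * VFMin == RegWidthBits;
}
// groupFillsRegister(/*i32*/ 32, /*VFMin*/ 4, /*Factor*/ 4, /*Members*/ 4,
//                    /*register*/ 128) == true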
4929
4930/// Returns true if \p VPV is a narrow VPValue.
4931static bool isAlreadyNarrow(VPValue *VPV) {
4932 if (isa<VPIRValue>(VPV))
4933 return true;
4934 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
4935 return RepR && RepR->isSingleScalar();
4936}
4937
4938// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
4939// a narrow variant.
4940static VPValue *
4942 auto *R = V->getDefiningRecipe();
4943 if (!R || NarrowedOps.contains(V))
4944 return V;
4945
4946 if (isAlreadyNarrow(V))
4947 return V;
4948
4949 if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(R)) {
4950 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
4951 WideMember0->setOperand(
4952 Idx,
4953 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
4954 return V;
4955 }
4956
4957 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
4958 // Narrow interleave group to wide load, as the transformed VPlan will only
4959 // process one original iteration.
4960 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
4961 auto *L = new VPWidenLoadRecipe(
4962 *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
4963 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
4964 L->insertBefore(LoadGroup);
4965 NarrowedOps.insert(L);
4966 return L;
4967 }
4968
4969 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
4970 assert(RepR->isSingleScalar() &&
4971 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
4972 "must be a single scalar load");
4973 NarrowedOps.insert(RepR);
4974 return RepR;
4975 }
4976
4977 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
4978 VPValue *PtrOp = WideLoad->getAddr();
4979 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
4980 PtrOp = VecPtr->getOperand(0);
4981 // Narrow wide load to uniform scalar load, as the transformed VPlan will only
4982 // process one original iteration.
4983 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
4984 /*IsUniform*/ true,
4985 /*Mask*/ nullptr, {}, *WideLoad);
4986 N->insertBefore(WideLoad);
4987 NarrowedOps.insert(N);
4988 return N;
4989}
4990
4992 TypeSize VectorRegWidth) {
4993 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
4994 if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0)
4995 return;
4996
4997 VPTypeAnalysis TypeInfo(Plan);
4998
5000 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
5002 continue;
5003
5006 continue;
5007
5008 // Bail out on recipes not supported at the moment:
5009 // * phi recipes other than the canonical induction
5010 // * recipes writing to memory except interleave groups
5011 // Only support plans with a canonical induction phi.
5012 if (R.isPhi())
5013 return;
5014
5015 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
5016 if (R.mayWriteToMemory() && !InterleaveR)
5017 return;
5018
5019 // Do not narrow interleave groups if there are VectorPointer recipes and
5020 // the plan was unrolled. The recipe implicitly uses VF from
5021 // VPTransformState.
5022 // TODO: Remove restriction once the VF for the VectorPointer offset is
5023 // modeled explicitly as operand.
5024 if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
5025 return;
5026
5027 // All other ops are allowed, but we reject uses that cannot be converted
5028 // when checking all allowed consumers (store interleave groups) below.
5029 if (!InterleaveR)
5030 continue;
5031
5032 // Bail out on non-consecutive interleave groups.
5033 if (!isConsecutiveInterleaveGroup(InterleaveR, VF, TypeInfo,
5034 VectorRegWidth))
5035 return;
5036
5037 // Skip read interleave groups.
5038 if (InterleaveR->getStoredValues().empty())
5039 continue;
5040
5041 // Narrow interleave groups, if all operands are already matching narrow
5042 // ops.
5043 auto *Member0 = InterleaveR->getStoredValues()[0];
5044 if (isAlreadyNarrow(Member0) &&
5045 all_of(InterleaveR->getStoredValues(),
5046 [Member0](VPValue *VPV) { return Member0 == VPV; })) {
5047 StoreGroups.push_back(InterleaveR);
5048 continue;
5049 }
5050
5051 // For now, we only support full interleave groups storing load interleave
5052 // groups.
5053 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
5054 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
5055 if (!DefR)
5056 return false;
5057 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
5058 return IR && IR->getInterleaveGroup()->isFull() &&
5059 IR->getVPValue(Op.index()) == Op.value();
5060 })) {
5061 StoreGroups.push_back(InterleaveR);
5062 continue;
5063 }
5064
5065 // Check if all values feeding InterleaveR are matching wide recipes whose
5066 // operands can be narrowed.
5067 auto *WideMember0 =
5068 dyn_cast_or_null<VPWidenRecipe>(InterleaveR->getStoredValues()[0]);
5069 if (!WideMember0)
5070 return;
5071 for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
5073 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
5074 R->getNumOperands() > 2)
5075 return;
5076 if (any_of(enumerate(R->operands()),
5077 [WideMember0, Idx = I](const auto &P) {
5078 const auto &[OpIdx, OpV] = P;
5079 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
5080 }))
5081 return;
5082 }
5083 StoreGroups.push_back(InterleaveR);
5084 }
5085
5086 if (StoreGroups.empty())
5087 return;
5088
5089 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
5090 SmallPtrSet<VPValue *, 4> NarrowedOps;
5091 // Narrow operation tree rooted at store groups.
5092 for (auto *StoreGroup : StoreGroups) {
5093 VPValue *Res =
5094 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
5095 auto *SI =
5096 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
5097 auto *S = new VPWidenStoreRecipe(
5098 *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
5099 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
5100 S->insertBefore(StoreGroup);
5101 StoreGroup->eraseFromParent();
5102 }
5103
5104 // Adjust induction to reflect that the transformed plan only processes one
5105 // original iteration.
5106 auto *CanIV = VectorLoop->getCanonicalIV();
5107 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
5108 VPBuilder PHBuilder(Plan.getVectorPreheader());
5109
5110 VPValue *UF = Plan.getOrAddLiveIn(
5111 ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF()));
5112 if (VF.isScalable()) {
5113 VPValue *VScale = PHBuilder.createElementCount(
5115 VPValue *VScaleUF = PHBuilder.createOverflowingOp(
5116 Instruction::Mul, {VScale, UF}, {true, false});
5117 Inc->setOperand(1, VScaleUF);
5118 Plan.getVF().replaceAllUsesWith(VScale);
5119 } else {
5120 Inc->setOperand(1, UF);
5121 Plan.getVF().replaceAllUsesWith(
5122 Plan.getConstantInt(CanIV->getScalarType(), 1));
5123 }
5124 removeDeadRecipes(Plan);
5125}
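A hypothetical input loop (not taken from the LLVM sources) with the shape this transform targets: matching load and store interleave groups whose factor equals the VF, which can then be rewritten into consecutive wide loads/stores that each process one original iteration:

// With VF = 2 and 2 x double filling a 128-bit vector register, both the
// loads and the stores form full, consecutive interleave groups of factor 2.
// The __restrict qualifiers keep the example free of aliasing concerns.
void copyComplex(double *__restrict Dst, const double *__restrict Src, int N) {
  for (int I = 0; I != N; ++I) {
    Dst[2 * I] = Src[2 * I];         // real part
    Dst[2 * I + 1] = Src[2 * I + 1]; // imaginary part
  }
}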
5126
5127/// Add branch weight metadata if \p Plan's middle block is terminated by a
5128/// BranchOnCond recipe.
5130 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5131 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5132 auto *MiddleTerm =
5134 // Only add branch metadata if there is a (conditional) terminator.
5135 if (!MiddleTerm)
5136 return;
5137
5138 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5139 "must have a BranchOnCond");
5140 // Assume that `TripCount % VectorStep` is uniformly distributed.
5141 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
5142 if (VF.isScalable() && VScaleForTuning.has_value())
5143 VectorStep *= *VScaleForTuning;
5144 assert(VectorStep > 0 && "trip count should not be zero");
5145 MDBuilder MDB(Plan.getContext());
5146 MDNode *BranchWeights =
5147 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
5148 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
5149}
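For example (assumed values, plain C++): with UF = 2, VF = 4 and no vscale tuning, VectorStep = 8, so the middle-block terminator gets weights {1, 7}:

#include <utility>

// Assuming TripCount % VectorStep is uniformly distributed, one out of
// VectorStep cases needs no scalar remainder, giving weights
// {1, VectorStep - 1}.
std::pair<unsigned, unsigned> middleBranchWeights(unsigned VF, unsigned UF,
                                                  unsigned VScale = 1) {
  unsigned VectorStep = UF * VF * VScale;
  return {1u, VectorStep - 1u};
}
// middleBranchWeights(4, 2) == {1, 7}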
5150
5151/// Compute and return the end value for \p WideIV, unless it is truncated. If
5152/// the induction recipe is not canonical, creates a VPDerivedIVRecipe to
5153/// compute the end value of the induction.
5155 VPBuilder &VectorPHBuilder,
5156 VPTypeAnalysis &TypeInfo,
5157 VPValue *VectorTC) {
5158 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
5159 // Truncated wide inductions resume from the last lane of their vector value
5160 // in the last vector iteration which is handled elsewhere.
5161 if (WideIntOrFp && WideIntOrFp->getTruncInst())
5162 return nullptr;
5163
5164 VPIRValue *Start = WideIV->getStartValue();
5165 VPValue *Step = WideIV->getStepValue();
5167 VPValue *EndValue = VectorTC;
5168 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
5169 EndValue = VectorPHBuilder.createDerivedIV(
5170 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
5171 Start, VectorTC, Step);
5172 }
5173
5174 // EndValue is derived from the vector trip count (which has the same type as
5175 // the widest induction) and thus may be wider than the induction here.
5176 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
5177 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
5178 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
5179 ScalarTypeOfWideIV,
5180 WideIV->getDebugLoc());
5181 }
5182
5183 return EndValue;
5184}
5185
5187 VPlan &Plan, DenseMap<VPValue *, VPValue *> &IVEndValues) {
5188 VPTypeAnalysis TypeInfo(Plan);
5189 auto *ScalarPH = Plan.getScalarPreheader();
5190 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
5191 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5192 VPBuilder VectorPHBuilder(
5193 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
5194 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5195 for (VPRecipeBase &PhiR : Plan.getScalarPreheader()->phis()) {
5196 auto *ResumePhiR = cast<VPPhi>(&PhiR);
5197
5198 // TODO: Extract final value from induction recipe initially, optimize to
5199 // pre-computed end value together in optimizeInductionExitUsers.
5200 auto *VectorPhiR = cast<VPHeaderPHIRecipe>(ResumePhiR->getOperand(0));
5201 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
5203 WideIVR, VectorPHBuilder, TypeInfo, &Plan.getVectorTripCount())) {
5204 IVEndValues[WideIVR] = EndValue;
5205 ResumePhiR->setOperand(0, EndValue);
5206 ResumePhiR->setName("bc.resume.val");
5207 continue;
5208 }
5209 // TODO: Also handle truncated inductions here. Computing end-values
5210 // separately should be done as VPlan-to-VPlan optimization, after
5211 // legalizing all resume values to use the last lane from the loop.
5212 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
5213 "should only skip truncated wide inductions");
5214 continue;
5215 }
5216
5217 // The backedge value provides the value to resume coming out of a loop,
5218 // which for FORs is a vector whose last element needs to be extracted. The
5219 // start value provides the value if the loop is bypassed.
5220 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
5221 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
5222 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5223 "Cannot handle loops with uncountable early exits");
5224 if (IsFOR) {
5225 auto *ExtractPart = MiddleBuilder.createNaryOp(
5226 VPInstruction::ExtractLastPart, ResumeFromVectorLoop);
5227 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
5229 "vector.recur.extract");
5230 }
5231 ResumePhiR->setName(IsFOR ? "scalar.recur.init" : "bc.merge.rdx");
5232 ResumePhiR->setOperand(0, ResumeFromVectorLoop);
5233 }
5234}
5235
5237 VFRange &Range) {
5238 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5239 auto *ScalarPHVPBB = Plan.getScalarPreheader();
5240 auto *MiddleVPBB = Plan.getMiddleBlock();
5241 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
5242 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5243
5244 auto IsScalableOne = [](ElementCount VF) -> bool {
5245 return VF == ElementCount::getScalable(1);
5246 };
5247
5248 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5249 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5250 if (!FOR)
5251 continue;
5252
5253 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5254 "Cannot handle loops with uncountable early exits");
5255
5256 // This is the second phase of vectorizing first-order recurrences, creating
5257 // extracts for users outside the loop. An overview of the transformation is
5258 // described below. Suppose we have the following loop, with a use after the
5259 // loop of the last value of a[i - 1]:
5260 //
5261 // for (int i = 0; i < n; ++i) {
5262 // t = a[i - 1];
5263 // b[i] = a[i] - t;
5264 // }
5265 // use t;
5266 //
5267 // There is a first-order recurrence on "a". For this loop, the shorthand
5268 // scalar IR looks like:
5269 //
5270 // scalar.ph:
5271 // s.init = a[-1]
5272 // br scalar.body
5273 //
5274 // scalar.body:
5275 // i = phi [0, scalar.ph], [i+1, scalar.body]
5276 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5277 // s2 = a[i]
5278 // b[i] = s2 - s1
5279 // br cond, scalar.body, exit.block
5280 //
5281 // exit.block:
5282 // use = lcssa.phi [s1, scalar.body]
5283 //
5284 // In this example, s1 is a recurrence because its value depends on the
5285 // previous iteration. In the first phase of vectorization, we created a
5286 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5287 // for users in the scalar preheader and exit block.
5288 //
5289 // vector.ph:
5290 // v_init = vector(..., ..., ..., a[-1])
5291 // br vector.body
5292 //
5293 // vector.body
5294 // i = phi [0, vector.ph], [i+4, vector.body]
5295 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5296 // v2 = a[i, i+1, i+2, i+3]
5297 // b[i] = v2 - v1
5298 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
5299 // b[i, i+1, i+2, i+3] = v2 - v1
5300 // br cond, vector.body, middle.block
5301 //
5302 // middle.block:
5303 // vector.recur.extract.for.phi = v2(2)
5304 // vector.recur.extract = v2(3)
5305 // br cond, scalar.ph, exit.block
5306 //
5307 // scalar.ph:
5308 // scalar.recur.init = phi [vector.recur.extract, middle.block],
5309 // [s.init, otherwise]
5310 // br scalar.body
5311 //
5312 // scalar.body:
5313 // i = phi [0, scalar.ph], [i+1, scalar.body]
5314 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5315 // s2 = a[i]
5316 // b[i] = s2 - s1
5317 // br cond, scalar.body, exit.block
5318 //
5319 // exit.block:
5320 // lo = lcssa.phi [s1, scalar.body],
5321 // [vector.recur.extract.for.phi, middle.block]
5322 //
5323 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
5324 // Extract the penultimate value of the recurrence and use it as operand for
5325 // the VPIRInstruction modeling the phi.
5327 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
5329 continue;
5330
5331 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5332 // penultimate value of the recurrence. Instead we rely on the existing
5333 // extract of the last element from the result of
5334 // VPInstruction::FirstOrderRecurrenceSplice.
5335 // TODO: Consider vscale_range info and UF.
5337 Range))
5338 return;
5339 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5340 VPInstruction::ExtractPenultimateElement, FOR->getBackedgeValue(), {},
5341 "vector.recur.extract.for.phi");
5342 cast<VPInstruction>(&R)->replaceAllUsesWith(PenultimateElement);
5343 }
5344 }
5345}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:80
static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, ScalarEvolution &SE)
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool sinkScalarOperands(VPlan &Plan)
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R)
Return true if we do not know how to (mechanically) hoist or sink R out of a loop region.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck)
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL)
Replace recipes with their EVL variants.
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static VPValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute and return the end value for WideIV, unless it is truncated.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
From the definition of llvm.experimental.get.vector.length, VPInstruction::ExplicitVectorLength(AVL) ...
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static void recursivelyDeleteDeadRecipes(VPValue *V)
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations in blocks between FirstBB and LastBB...
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ElementCount VF, VPTypeAnalysis &TypeInfo, TypeSize VectorRegWidth)
Returns true if IR is a full interleave group with factor and number of members both equal to VF.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void narrowToSingleScalarRecipes(VPlan &Plan)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1513
APInt abs() const
Get the absolute value.
Definition APInt.h:1796
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1489
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1222
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
iterator begin() const
Definition ArrayRef.h:130
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1549
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1078
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize get(ScalarTy Quantity, bool Scalable)
Definition TypeSize.h:340
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
op_range operands()
Definition User.h:293
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3590
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3947
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4022
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:3974
iterator end()
Definition VPlan.h:3984
iterator begin()
Recipe iterator methods.
Definition VPlan.h:3982
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4035
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:228
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:589
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:561
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:635
const VPRecipeBase & back() const
Definition VPlan.h:3996
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2497
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2531
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2521
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2537
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2517
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:81
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccss as successor of this VPBlockBase.
Definition VPlan.h:300
VPRegionBlock * getParent()
Definition VPlan.h:173
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:198
size_t getNumSuccessors() const
Definition VPlan.h:219
size_t getNumPredecessors() const
Definition VPlan.h:220
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:291
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:204
VPlan * getPlan()
Definition VPlan.cpp:173
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:310
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:215
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:178
void setParent(VPRegionBlock *P)
Definition VPlan.h:184
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:264
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:209
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:198
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:221
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:242
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:154
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:173
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:191
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3001
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags={}, const VPIRMetadata &Metadata={})
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3533
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:453
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:426
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:438
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:448
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3701
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition VPlan.h:3622
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3046
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2018
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2061
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2050
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4100
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:4124
Class to record and manage LLVM IR flags.
Definition VPlan.h:608
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \I , if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:980
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1034
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1136
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1078
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1073
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1070
@ CanonicalIVIncrementForPart
Definition VPlan.h:1054
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2639
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2631
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2660
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2713
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2671
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3188
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:387
VPRegionBlock * getRegion()
Definition VPlan.h:4252
VPBasicBlock * getParent()
Definition VPlan.h:408
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:479
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:2875
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:2764
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4135
const VPBlockBase * getEntry() const
Definition VPlan.h:4171
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4246
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4203
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4188
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4233
const VPBlockBase * getExiting() const
Definition VPlan.h:4183
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4196
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2920
bool isSingleScalar() const
Definition VPlan.h:2961
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:2985
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3769
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:531
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:594
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:229
operand_range operands()
Definition VPlanValue.h:297
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:273
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:268
void addOperand(VPValue *Operand)
Definition VPlanValue.h:262
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:45
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:133
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1382
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:119
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:72
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:173
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1385
unsigned getNumUsers() const
Definition VPlanValue.h:105
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1389
user_range users()
Definition VPlanValue.h:126
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1877
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3664
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1568
Instruction::CastOps getOpcode() const
Definition VPlan.h:1604
A recipe for handling GEP instructions.
Definition VPlan.h:1814
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2085
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2113
PHINode * getPHINode() const
Definition VPlan.h:2130
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2116
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2133
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2164
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2211
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2215
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2242
A recipe for widening vector intrinsics.
Definition VPlan.h:1618
A common base class for widening memory operations.
Definition VPlan.h:3231
A recipe for widened phis.
Definition VPlan.h:2300
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1520
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4265
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4544
bool hasVF(ElementCount VF) const
Definition VPlan.h:4462
LLVMContext & getContext() const
Definition VPlan.h:4450
VPBasicBlock * getEntry()
Definition VPlan.h:4354
bool hasScalableVF() const
Definition VPlan.h:4463
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4448
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4444
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4412
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4433
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:4523
unsigned getUF() const
Definition VPlan.h:4482
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4592
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4547
bool hasUF(unsigned UF) const
Definition VPlan.h:4480
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4402
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4441
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4504
void setVF(ElementCount VF)
Definition VPlan.h:4456
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4495
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1022
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4426
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4379
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4570
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:4520
bool hasScalarVFOnly() const
Definition VPlan.h:4473
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4393
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4398
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4359
void setUF(unsigned UF)
Definition VPlan.h:4487
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4624
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4526
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
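A small sketch of how these FixedOrScalableQuantity accessors behave, using ElementCount from llvm/Support/TypeSize.h; the function name is illustrative only:
#include "llvm/Support/TypeSize.h"
using namespace llvm;

static void elementCountSketch() {
  ElementCount Fixed = ElementCount::getFixed(4);     // exactly 4 elements
  ElementCount Scal = ElementCount::getScalable(2);   // 2 x vscale elements
  (void)Fixed.isFixed();                              // true
  (void)Scal.isScalable();                            // true
  (void)Scal.getKnownMinValue();                      // 2, the known minimum
  (void)Fixed.getFixedValue();                        // 4, valid only when isFixed()
  ElementCount Wider = Scal.multiplyCoefficientBy(4); // 8 x vscale elements
  (void)Wider;
  (void)ElementCount::isKnownLT(ElementCount::getFixed(2), Fixed); // true
}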
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A divided by B using unsigned division, rounded by the given rounding mode.
Definition APInt.cpp:2763
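A worked example for the declaration above; the values are chosen for illustration:
#include "llvm/ADT/APInt.h"
using namespace llvm;

static void roundingUDivSketch() {
  APInt A(32, 7), B(32, 2);
  APInt Up = APIntOps::RoundingUDiv(A, B, APInt::Rounding::UP);     // 7/2 rounded up -> 4
  APInt Down = APIntOps::RoundingUDiv(A, B, APInt::Rounding::DOWN); // 7/2 rounded down -> 3
  (void)Up;
  (void)Down;
}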
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
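A minimal IR-level pattern-matching sketch using the binder above; the helper name is hypothetical, and the matchers come from llvm/IR/PatternMatch.h:
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

static bool matchAddOfConstant(Value *V, Value *&X, const APInt *&C) {
  // Match "X + C" where C is a ConstantInt or splatted constant vector,
  // binding X to the variable operand and C to the constant's APInt.
  return match(V, m_Add(m_Value(X), m_APInt(C)));
}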
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
bool match(const SCEV *S, const Pattern &P)
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Opcode, Op0_t, Op1_t > m_c_Binary(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
specific_intval< 1 > m_True()
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, without matching its operands, as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPIRFlags getFlagsFromIndDesc(const InductionDescriptor &ID)
Extracts and returns NoWrap and FastMath flags from the induction binop in ID.
Definition VPlanUtils.h:93
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2068
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
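A usage sketch of the range-based wrapper above; the names are illustrative:
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

static bool allPositive(const SmallVectorImpl<int> &Vals) {
  // True if every element satisfies the predicate (and for an empty range).
  return all_of(Vals, [](int V) { return V > 0; });
}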
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2530
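A usage sketch of enumerate, pairing each element with its running index; names are illustrative:
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static void printIndexed(const SmallVectorImpl<StringRef> &Names) {
  // Structured bindings yield the index and the element for each iteration.
  for (auto [Idx, Name] : enumerate(Names))
    errs() << Idx << ": " << Name << "\n";
}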
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
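A usage sketch of the checked-cast helper above (isa and cast, listed further below, follow the same family); the function name is illustrative:
#include "llvm/IR/Instructions.h"
using namespace llvm;

static const Value *getStoredPointer(const Instruction *I) {
  // dyn_cast returns nullptr when I is not a StoreInst, so it combines
  // naturally with an if-initializer; cast<> would assert instead.
  if (const auto *SI = dyn_cast<StoreInst>(I))
    return SI->getPointerOperand();
  return nullptr;
}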
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2184
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
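A usage sketch of the early-increment adaptor: the iterator advances before the body runs, so the body may erase the current element; the function name is illustrative:
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

static void eraseSideEffectFreeUnusedInsts(BasicBlock &BB) {
  for (Instruction &I : make_early_inc_range(BB))
    // Safe to erase the current instruction while iterating.
    if (I.use_empty() && !I.isTerminator() && !I.mayHaveSideEffects())
      I.eraseFromParent();
}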
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
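Worked values for the helper above:
#include "llvm/Support/MathExtras.h"
#include <cassert>
using namespace llvm;

static void powerOf2CeilExamples() {
  assert(PowerOf2Ceil(1) == 1);   // already a power of two
  assert(PowerOf2Ceil(5) == 8);   // rounded up to the next power of two
  assert(PowerOf2Ceil(64) == 64); // unchanged
}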
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:236
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:550
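A usage sketch of make_filter_range, viewing a subset of a container without copying it; names are illustrative:
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

static int sumOfEvens(const SmallVectorImpl<int> &Vals) {
  int Sum = 0;
  // Only elements for which the predicate returns true are visited.
  for (int V : make_filter_range(Vals, [](int V) { return V % 2 == 0; }))
    Sum += V;
  return Sum;
}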
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1726
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
RecurKind
These are the kinds of recurrences that we support.
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2002
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2009
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2156
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
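A usage sketch of hash_combine, folding several fields into one hash_code; the key layout is hypothetical:
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/StringRef.h"
using namespace llvm;

static hash_code hashRecipeKey(StringRef Name, unsigned Opcode, bool IsVector) {
  // Each argument is hashed via its hash_value overload and mixed together.
  return hash_combine(Name, Opcode, IsVector);
}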
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2136
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:761
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:784
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:787
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2342
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:184
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:137
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:202
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3364
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3322
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3448
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3405
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, TypeSize VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first order reductions in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB by intro...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.
static void updateScalarResumePhis(VPlan &Plan, DenseMap< VPValue *, VPValue * > &IVEndValues)
Update the resume phis in the scalar preheader after creating wide recipes for first-order recurrence...