1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
44
45using namespace llvm;
46using namespace VPlanPatternMatch;
47using namespace SCEVPatternMatch;
48
49bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
50 VPlan &Plan, const TargetLibraryInfo &TLI) {
51
52 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
53 Plan.getVectorLoopRegion());
54 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
55 // Skip blocks outside region
56 if (!VPBB->getParent())
57 break;
58 VPRecipeBase *Term = VPBB->getTerminator();
59 auto EndIter = Term ? Term->getIterator() : VPBB->end();
60 // Introduce each ingredient into VPlan.
61 for (VPRecipeBase &Ingredient :
62 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
63
64 VPValue *VPV = Ingredient.getVPSingleValue();
65 if (!VPV->getUnderlyingValue())
66 continue;
67
68 Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
69
70 VPRecipeBase *NewRecipe = nullptr;
71 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
72 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
73 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
74 for (VPValue *Op : PhiR->operands())
75 NewRecipe->addOperand(Op);
76 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
77 assert(!isa<PHINode>(Inst) && "phis should be handled above");
78 // Create VPWidenMemoryRecipe for loads and stores.
79 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
80 NewRecipe = new VPWidenLoadRecipe(
81 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
82 false /*Consecutive*/, false /*Reverse*/, *VPI,
83 Ingredient.getDebugLoc());
84 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
85 NewRecipe = new VPWidenStoreRecipe(
86 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
87 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
88 Ingredient.getDebugLoc());
89 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
90 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
91 Ingredient.getDebugLoc());
92 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
93 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
94 if (VectorID == Intrinsic::not_intrinsic)
95 return false;
96 NewRecipe = new VPWidenIntrinsicRecipe(
97 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
98 drop_end(Ingredient.operands()), CI->getType(), VPIRFlags(*CI),
99 *VPI, CI->getDebugLoc());
100 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
101 NewRecipe = new VPWidenCastRecipe(
102 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
103 VPIRFlags(*CI), VPIRMetadata(*CI));
104 } else {
105 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
106 *VPI, Ingredient.getDebugLoc());
107 }
108 } else {
110 "inductions must be created earlier");
111 continue;
112 }
113
114 NewRecipe->insertBefore(&Ingredient);
115 if (NewRecipe->getNumDefinedValues() == 1)
116 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
117 else
118 assert(NewRecipe->getNumDefinedValues() == 0 &&
119 "Only recpies with zero or one defined values expected");
120 Ingredient.eraseFromParent();
121 }
122 }
123 return true;
124}
125
126/// Helper for extra no-alias checks via known-safe recipe and SCEV.
127class SinkStoreInfo {
128 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
129 VPReplicateRecipe &GroupLeader;
130 PredicatedScalarEvolution &PSE;
131 const Loop &L;
132 VPTypeAnalysis &TypeInfo;
133
134 // Return true if \p A and \p B are known to not alias for all VFs in the
135 // plan, checked via the distance between the accesses
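  // (i.e. the accesses cannot overlap if the constant distance between their
  // addresses is at least MaxVF * max(store size of A, store size of B) bytes).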
136 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
137 if (A->getOpcode() != Instruction::Store ||
138 B->getOpcode() != Instruction::Store)
139 return false;
140
141 VPValue *AddrA = A->getOperand(1);
142 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
143 VPValue *AddrB = B->getOperand(1);
144 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
145 if (isa<SCEVCouldNotCompute>(SCEVA) || isa<SCEVCouldNotCompute>(SCEVB))
146 return false;
147
148 const APInt *Distance;
149 ScalarEvolution &SE = *PSE.getSE();
150 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
151 return false;
152
153 const DataLayout &DL = SE.getDataLayout();
154 Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
155 uint64_t SizeA = DL.getTypeStoreSize(TyA);
156 Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
157 uint64_t SizeB = DL.getTypeStoreSize(TyB);
158
159 // Use the maximum store size to ensure no overlap from either direction.
160 // Currently only handles fixed sizes, as it is only used for
161 // replicating VPReplicateRecipes.
162 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
163
164 auto VFs = B->getParent()->getPlan()->vectorFactors();
165 ElementCount MaxVF = *max_element(VFs, ElementCount::isKnownLT);
166 if (MaxVF.isScalable())
167 return false;
168 return Distance->abs().uge(
169 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
170 }
171
172public:
173 SinkStoreInfo(const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes,
174 VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE,
175 const Loop &L, VPTypeAnalysis &TypeInfo)
176 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
177 L(L), TypeInfo(TypeInfo) {}
178
179 /// Return true if \p R should be skipped during alias checking, either
180 /// because it's in the exclude set or because no-alias can be proven via
181 /// SCEV.
182 bool shouldSkip(VPRecipeBase &R) const {
183 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
184 return ExcludeRecipes.contains(&R) ||
185 (Store && isNoAliasViaDistance(Store, &GroupLeader));
186 }
187};
188
189/// Check if a memory operation doesn't alias with memory operations in blocks
190/// between \p FirstBB and \p LastBB using scoped noalias metadata. If
191/// \p SinkInfo is std::nullopt, only recipes that may write to memory are
192/// checked (for load hoisting). Otherwise recipes that both read and write
193/// memory are checked, and SCEV is used to prove no-alias between the group
194/// leader and other replicate recipes (for store sinking).
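/// For example, with scoped-noalias metadata a recipe whose location carries
/// !alias.scope !{!A} is known not to alias another recipe whose !noalias list
/// contains !A; the walk below applies that check (and the reverse direction
/// for reads when sinking stores) to every recipe between \p FirstBB and
/// \p LastBB.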
195static bool
197 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
198 std::optional<SinkStoreInfo> SinkInfo = {}) {
199 bool CheckReads = SinkInfo.has_value();
200 if (!MemLoc.AATags.Scope)
201 return false;
202
203 const AAMDNodes &MemAA = MemLoc.AATags;
204
205 for (VPBlockBase *Block = FirstBB; Block;
206 Block = Block->getSingleSuccessor()) {
207 assert(Block->getNumSuccessors() <= 1 &&
208 "Expected at most one successor in block chain");
209 auto *VPBB = cast<VPBasicBlock>(Block);
210 for (VPRecipeBase &R : *VPBB) {
211 if (SinkInfo && SinkInfo->shouldSkip(R))
212 continue;
213
214 // Skip recipes that don't need checking.
215 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
216 continue;
217
219 if (!Loc)
220 // Conservatively assume aliasing for memory operations without
221 // location.
222 return false;
223
224 // For reads, check if they don't alias in the reverse direction and
225 // skip if so.
226 if (CheckReads && R.mayReadFromMemory() &&
228 MemAA.NoAlias))
229 continue;
230
231 // Check if the memory operations may alias in the forward direction.
233 Loc->AATags.NoAlias))
234 return false;
235 }
236
237 if (Block == LastBB)
238 break;
239 }
240 return true;
241}
242
243/// Return true if we do not know how to (mechanically) hoist or sink \p R out
244/// of a loop region.
245static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) {
246 // Assumes don't alias anything or throw; as long as they're guaranteed to
247 // execute, they're safe to hoist.
248 if (match(&R, m_Intrinsic<Intrinsic::assume>()))
249 return false;
250
251 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
252 // memory location is not modified in the vector loop.
253 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
254 return true;
255
256 // Allocas cannot be hoisted.
257 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
258 return RepR && RepR->getOpcode() == Instruction::Alloca;
259}
260
261static bool sinkScalarOperands(VPlan &Plan) {
262 auto Iter = vp_depth_first_deep(Plan.getEntry());
263 bool ScalarVFOnly = Plan.hasScalarVFOnly();
264 bool Changed = false;
265
266 SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
267 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
268 VPBasicBlock *SinkTo, VPValue *Op) {
269 auto *Candidate =
270 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
271 if (!Candidate)
272 return;
273
274 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
275 // for now.
276 if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate))
277 return;
278
279 if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
280 return;
281
282 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
283 if (!ScalarVFOnly && RepR->isSingleScalar())
284 return;
285
286 WorkList.insert({SinkTo, Candidate});
287 };
288
289 // First, collect the operands of all recipes in replicate blocks as seeds for
290 // sinking.
291 for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
292 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
293 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
294 continue;
295 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
296 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
297 continue;
298 for (auto &Recipe : *VPBB)
299 for (VPValue *Op : Recipe.operands())
300 InsertIfValidSinkCandidate(VPBB, Op);
301 }
302
303 // Try to sink each replicate or scalar IV steps recipe in the worklist.
304 for (unsigned I = 0; I != WorkList.size(); ++I) {
305 VPBasicBlock *SinkTo;
306 VPSingleDefRecipe *SinkCandidate;
307 std::tie(SinkTo, SinkCandidate) = WorkList[I];
308
309 // All recipe users of SinkCandidate must be in the same block SinkTo or all
310 // users outside of SinkTo must only use the first lane of SinkCandidate. In
311 // the latter case, we need to duplicate SinkCandidate.
312 auto UsersOutsideSinkTo =
313 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
314 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
315 });
316 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
317 return !U->usesFirstLaneOnly(SinkCandidate);
318 }))
319 continue;
320 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
321
322 if (NeedsDuplicating) {
323 if (ScalarVFOnly)
324 continue;
325 VPSingleDefRecipe *Clone;
326 if (auto *SinkCandidateRepR =
327 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
328 // TODO: Handle converting to uniform recipes as separate transform,
329 // then cloning should be sufficient here.
330 Instruction *I = SinkCandidate->getUnderlyingInstr();
331 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
332 nullptr /*Mask*/, *SinkCandidateRepR,
333 *SinkCandidateRepR);
334 // TODO: add ".cloned" suffix to name of Clone's VPValue.
335 } else {
336 Clone = SinkCandidate->clone();
337 }
338
339 Clone->insertBefore(SinkCandidate);
340 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
341 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
342 });
343 }
344 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
345 for (VPValue *Op : SinkCandidate->operands())
346 InsertIfValidSinkCandidate(SinkTo, Op);
347 Changed = true;
348 }
349 return Changed;
350}
351
352/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
353/// the mask.
354static VPValue *getPredicatedMask(VPRegionBlock *R) {
355 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
356 if (!EntryBB || EntryBB->size() != 1 ||
357 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
358 return nullptr;
359
360 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
361}
362
363/// If \p R is a triangle region, return the 'then' block of the triangle.
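/// A triangle region looks like
///      Entry
///      |    \
///      |    Then
///      |    /
///      Merge
/// i.e. the entry block has two successors and one of them (the 'then' block)
/// falls through to the other.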
364static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
365 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
366 if (EntryBB->getNumSuccessors() != 2)
367 return nullptr;
368
369 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
370 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
371 if (!Succ0 || !Succ1)
372 return nullptr;
373
374 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
375 return nullptr;
376 if (Succ0->getSingleSuccessor() == Succ1)
377 return Succ0;
378 if (Succ1->getSingleSuccessor() == Succ0)
379 return Succ1;
380 return nullptr;
381}
382
383// Merge replicate regions into their successor region, if a replicate region
384// is connected to a successor replicate region with the same predicate by a
385// single, empty VPBasicBlock.
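//
// Roughly:
//   region1 (mask M) -> empty VPBB -> region2 (mask M) -> ...
// becomes
//   empty VPBB -> region2 (mask M, now also containing region1's recipes) -> ...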
386static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
387 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
388
389 // Collect replicate regions followed by an empty block, followed by another
390 // replicate region with matching masks, and process them front to back. This
391 // avoids iterator invalidation issues while merging regions.
392 SmallVector<VPRegionBlock *, 8> WorkList;
393 for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
394 vp_depth_first_deep(Plan.getEntry()))) {
395 if (!Region1->isReplicator())
396 continue;
397 auto *MiddleBasicBlock =
398 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
399 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
400 continue;
401
402 auto *Region2 =
403 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
404 if (!Region2 || !Region2->isReplicator())
405 continue;
406
407 VPValue *Mask1 = getPredicatedMask(Region1);
408 VPValue *Mask2 = getPredicatedMask(Region2);
409 if (!Mask1 || Mask1 != Mask2)
410 continue;
411
412 assert(Mask1 && Mask2 && "both regions must have conditions");
413 WorkList.push_back(Region1);
414 }
415
416 // Move recipes from Region1 to its successor region, if both are triangles.
417 for (VPRegionBlock *Region1 : WorkList) {
418 if (TransformedRegions.contains(Region1))
419 continue;
420 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
421 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
422
423 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
424 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
425 if (!Then1 || !Then2)
426 continue;
427
428 // Note: No fusion-preventing memory dependencies are expected in either
429 // region. Such dependencies should be rejected during earlier dependence
430 // checks, which guarantee accesses can be re-ordered for vectorization.
431 //
432 // Move recipes to the successor region.
433 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
434 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
435
436 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
437 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
438
439 // Move VPPredInstPHIRecipes from the merge block to the successor region's
440 // merge block. Update all users inside the successor region to use the
441 // original values.
442 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
443 VPValue *PredInst1 =
444 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
445 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
446 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
447 return cast<VPRecipeBase>(&U)->getParent() == Then2;
448 });
449
450 // Remove phi recipes that are unused after merging the regions.
451 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
452 Phi1ToMove.eraseFromParent();
453 continue;
454 }
455 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
456 }
457
458 // Remove the dead recipes in Region1's entry block.
459 for (VPRecipeBase &R :
460 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
461 R.eraseFromParent();
462
463 // Finally, remove the first region.
464 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
465 VPBlockUtils::disconnectBlocks(Pred, Region1);
466 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
467 }
468 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
469 TransformedRegions.insert(Region1);
470 }
471
472 return !TransformedRegions.empty();
473}
474
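/// Wrap the predicated, replicating recipe \p PredRecipe in an if-then
/// replicate region: a branch-on-mask entry block, an unmasked clone of the
/// recipe in the "if" block, and (if the result is used) a VPPredInstPHIRecipe
/// in the "continue" block.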
475static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
476 VPlan &Plan) {
477 Instruction *Instr = PredRecipe->getUnderlyingInstr();
478 // Build the triangular if-then region.
479 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
480 assert(Instr->getParent() && "Predicated instruction not in any basic block");
481 auto *BlockInMask = PredRecipe->getMask();
482 auto *MaskDef = BlockInMask->getDefiningRecipe();
483 auto *BOMRecipe = new VPBranchOnMaskRecipe(
484 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
485 auto *Entry =
486 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
487
488 // Replace predicated replicate recipe with a replicate recipe without a
489 // mask but in the replicate region.
490 auto *RecipeWithoutMask = new VPReplicateRecipe(
491 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
492 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
493 PredRecipe->getDebugLoc());
494 auto *Pred =
495 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
496
497 VPPredInstPHIRecipe *PHIRecipe = nullptr;
498 if (PredRecipe->getNumUsers() != 0) {
499 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
500 RecipeWithoutMask->getDebugLoc());
501 PredRecipe->replaceAllUsesWith(PHIRecipe);
502 PHIRecipe->setOperand(0, RecipeWithoutMask);
503 }
504 PredRecipe->eraseFromParent();
505 auto *Exiting =
506 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
507 VPRegionBlock *Region =
508 Plan.createReplicateRegion(Entry, Exiting, RegionName);
509
510 // Note: first set Entry as region entry and then connect successors starting
511 // from it in order, to propagate the "parent" of each VPBasicBlock.
512 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
513 VPBlockUtils::connectBlocks(Pred, Exiting);
514
515 return Region;
516}
517
518static void addReplicateRegions(VPlan &Plan) {
519 SmallVector<VPReplicateRecipe *> WorkList;
520 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
521 vp_depth_first_deep(Plan.getEntry()))) {
522 for (VPRecipeBase &R : *VPBB)
523 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
524 if (RepR->isPredicated())
525 WorkList.push_back(RepR);
526 }
527 }
528
529 unsigned BBNum = 0;
530 for (VPReplicateRecipe *RepR : WorkList) {
531 VPBasicBlock *CurrentBlock = RepR->getParent();
532 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
533
534 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
535 SplitBlock->setName(
536 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
537 // Record predicated instructions for above packing optimizations.
538 VPRegionBlock *Region = createReplicateRegion(RepR, Plan);
539 Region->setParent(CurrentBlock->getParent());
540 VPBlockUtils::insertOnEdge(CurrentBlock, SplitBlock, Region);
541
542 VPRegionBlock *ParentRegion = Region->getParent();
543 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
544 ParentRegion->setExiting(SplitBlock);
545 }
546}
547
548/// Remove redundant VPBasicBlocks by merging them into their predecessor if
549/// the predecessor has a single successor.
550static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
551 SmallVector<VPBasicBlock *> WorkList;
552 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
553 vp_depth_first_deep(Plan.getEntry()))) {
554 // Don't fold the blocks in the skeleton of the Plan into their single
555 // predecessors for now.
556 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
557 if (!VPBB->getParent())
558 continue;
559 auto *PredVPBB =
560 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
561 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
562 isa<VPIRBasicBlock>(PredVPBB))
563 continue;
564 WorkList.push_back(VPBB);
565 }
566
567 for (VPBasicBlock *VPBB : WorkList) {
568 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
569 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
570 R.moveBefore(*PredVPBB, PredVPBB->end());
571 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
572 auto *ParentRegion = VPBB->getParent();
573 if (ParentRegion && ParentRegion->getExiting() == VPBB)
574 ParentRegion->setExiting(PredVPBB);
575 for (auto *Succ : to_vector(VPBB->successors())) {
576 VPBlockUtils::disconnectBlocks(VPBB, Succ);
577 VPBlockUtils::connectBlocks(PredVPBB, Succ);
578 }
579 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
580 }
581 return !WorkList.empty();
582}
583
584void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
585 // Convert masked VPReplicateRecipes to if-then region blocks.
586 addReplicateRegions(Plan);
587
588 bool ShouldSimplify = true;
589 while (ShouldSimplify) {
590 ShouldSimplify = sinkScalarOperands(Plan);
591 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
592 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
593 }
594}
595
596/// Remove redundant casts of inductions.
597///
598/// Such redundant casts are casts of induction variables that can be ignored,
599/// because we already proved that the casted phi is equal to the uncasted phi
600/// in the vectorized loop. There is no need to vectorize the cast - the same
601/// value can be used for both the phi and casts in the vector loop.
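///
/// E.g. if the induction descriptor recorded a cast chain such as
///   %iv.cast = sext i32 %iv to i64
/// users of %iv.cast are redirected to the widened IV itself; the dead cast
/// recipes are removed by later cleanup.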
602static void removeRedundantInductionCasts(VPlan &Plan) {
603 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
604 auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
605 if (!IV || IV->getTruncInst())
606 continue;
607
608 // A sequence of IR Casts has potentially been recorded for IV, which
609 // *must be bypassed* when the IV is vectorized, because the vectorized IV
610 // will produce the desired casted value. This sequence forms a def-use
611 // chain and is provided in reverse order, ending with the cast that uses
612 // the IV phi. Search for the recipe of the last cast in the chain and
613 // replace it with the original IV. Note that only the final cast is
614 // expected to have users outside the cast-chain and the dead casts left
615 // over will be cleaned up later.
616 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
617 VPValue *FindMyCast = IV;
618 for (Instruction *IRCast : reverse(Casts)) {
619 VPSingleDefRecipe *FoundUserCast = nullptr;
620 for (auto *U : FindMyCast->users()) {
621 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
622 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
623 FoundUserCast = UserCast;
624 break;
625 }
626 }
627 FindMyCast = FoundUserCast;
628 }
629 FindMyCast->replaceAllUsesWith(IV);
630 }
631}
632
633/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
634/// recipe, if it exists.
635static void removeRedundantCanonicalIVs(VPlan &Plan) {
636 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
637 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
638 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
639 for (VPUser *U : CanonicalIV->users()) {
640 WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
641 if (WidenNewIV)
642 break;
643 }
644
645 if (!WidenNewIV)
646 return;
647
648 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
649 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
650 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
651
652 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
653 continue;
654
655 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
656 // everything WidenNewIV's users need. That is, WidenOriginalIV will
657 // generate a vector phi or all users of WidenNewIV demand the first lane
658 // only.
659 if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
660 vputils::onlyFirstLaneUsed(WidenNewIV)) {
661 // We are replacing a wide canonical iv with a suitable wide induction.
662 // This is used to compute header mask, hence all lanes will be used and
663 // we need to drop wrap flags only applying to lanes guaranteed to execute
664 // in the original scalar loop.
665 WidenOriginalIV->dropPoisonGeneratingFlags();
666 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
667 WidenNewIV->eraseFromParent();
668 return;
669 }
670 }
671}
672
673/// Returns true if \p R is dead and can be removed.
674static bool isDeadRecipe(VPRecipeBase &R) {
675 // Do remove conditional assume instructions as their conditions may be
676 // flattened.
677 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
678 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
679 match(RepR, m_Intrinsic<Intrinsic::assume>());
680 if (IsConditionalAssume)
681 return true;
682
683 if (R.mayHaveSideEffects())
684 return false;
685
686 // Recipe is dead if no user keeps the recipe alive.
687 return all_of(R.definedValues(),
688 [](VPValue *V) { return V->getNumUsers() == 0; });
689}
690
691void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
692 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
693 vp_post_order_deep(Plan.getEntry()))) {
694 // The recipes in the block are processed in reverse order, to catch chains
695 // of dead recipes.
696 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
697 if (isDeadRecipe(R)) {
698 R.eraseFromParent();
699 continue;
700 }
701
702 // Check if R is a dead VPPhi <-> update cycle and remove it.
703 auto *PhiR = dyn_cast<VPPhi>(&R);
704 if (!PhiR || PhiR->getNumOperands() != 2)
705 continue;
706 VPUser *PhiUser = PhiR->getSingleUser();
707 if (!PhiUser)
708 continue;
709 VPValue *Incoming = PhiR->getOperand(1);
710 if (PhiUser != Incoming->getDefiningRecipe() ||
711 Incoming->getNumUsers() != 1)
712 continue;
713 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
714 PhiR->eraseFromParent();
715 Incoming->getDefiningRecipe()->eraseFromParent();
716 }
717 }
718}
719
720static VPScalarIVStepsRecipe *
721createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
722 Instruction::BinaryOps InductionOpcode,
723 FPMathOperator *FPBinOp, Instruction *TruncI,
724 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
725 VPBuilder &Builder) {
726 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
727 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
728 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
729 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
730 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
731
732 // Truncate base induction if needed.
733 VPTypeAnalysis TypeInfo(Plan);
734 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
735 if (TruncI) {
736 Type *TruncTy = TruncI->getType();
737 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
738 "Not truncating.");
739 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
740 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
741 ResultTy = TruncTy;
742 }
743
744 // Truncate step if needed.
745 Type *StepTy = TypeInfo.inferScalarType(Step);
746 if (ResultTy != StepTy) {
747 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
748 "Not truncating.");
749 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
750 auto *VecPreheader =
752 VPBuilder::InsertPointGuard Guard(Builder);
753 Builder.setInsertPoint(VecPreheader);
754 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
755 }
756 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
757 &Plan.getVF(), DL);
758}
759
760static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
761 SetVector<VPUser *> Users(V->user_begin(), V->user_end());
762 for (unsigned I = 0; I != Users.size(); ++I) {
763 VPUser *Cur = Users[I];
764 if (isa<VPHeaderPHIRecipe>(Cur))
765 continue;
766 for (VPValue *V : Cur->definedValues())
767 Users.insert_range(V->users());
768 }
769 return Users.takeVector();
770}
771
772/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
773/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
774/// generates scalar values.
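/// Roughly:
///   %steps = scalar-iv-steps 0, %step      ; integer offsets per lane
///   %gep   = ptradd %iv.start, %steps      ; named "next.gep" below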
775static VPValue *
776scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV,
777 VPlan &Plan, VPBuilder &Builder) {
778 const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
779 VPIRValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0);
780 VPValue *StepV = PtrIV->getOperand(1);
781 VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
782 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
783 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
784
785 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
786 PtrIV->getDebugLoc(), "next.gep");
787}
788
789/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
790/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
791/// VPWidenPointerInductionRecipe will generate vectors only. If some users
792/// require vectors while others require scalars, the scalar uses need to extract
793/// the scalars from the generated vectors (Note that this is different to how
794/// int/fp inductions are handled). Legalize extract-from-ends using uniform
795/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
796/// the correct end value is available. Also optimize
797/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
798/// providing them scalar steps built on the canonical scalar IV and update the
799/// original IV's users. This is an optional optimization to reduce the needs of
800/// vector extracts.
801static void legalizeAndOptimizeInductions(VPlan &Plan) {
802 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
803 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
804 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
805 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
806 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
807 if (!PhiR)
808 continue;
809
810 // Try to narrow wide and replicating recipes to uniform recipes, based on
811 // VPlan analysis.
812 // TODO: Apply to all recipes in the future, to replace legacy uniformity
813 // analysis.
814 auto Users = collectUsersRecursively(PhiR);
815 for (VPUser *U : reverse(Users)) {
816 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
817 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
818 // Skip recipes that shouldn't be narrowed.
819 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
820 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
821 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
822 continue;
823
824 // Skip recipes that may have lanes other than their first one used.
825 if (!vputils::isSingleScalar(Def) && !vputils::onlyFirstLaneUsed(Def))
826 continue;
827
828 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
829 Def->operands(), /*IsUniform*/ true,
830 /*Mask*/ nullptr, /*Flags*/ *Def);
831 Clone->insertAfter(Def);
832 Def->replaceAllUsesWith(Clone);
833 }
834
835 // Replace wide pointer inductions which have only their scalars used by
836 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
837 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
838 if (!Plan.hasScalarVFOnly() &&
839 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
840 continue;
841
842 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
843 PtrIV->replaceAllUsesWith(PtrAdd);
844 continue;
845 }
846
847 // Replace widened induction with scalar steps for users that only use
848 // scalars.
849 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
850 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
851 return U->usesScalars(WideIV);
852 }))
853 continue;
854
855 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
856 VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
857 Plan, ID.getKind(), ID.getInductionOpcode(),
858 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
859 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
860 WideIV->getDebugLoc(), Builder);
861
862 // Update scalar users of IV to use Step instead.
863 if (!HasOnlyVectorVFs) {
864 assert(!Plan.hasScalableVF() &&
865 "plans containing a scalar VF cannot also include scalable VFs");
866 WideIV->replaceAllUsesWith(Steps);
867 } else {
868 bool HasScalableVF = Plan.hasScalableVF();
869 WideIV->replaceUsesWithIf(Steps,
870 [WideIV, HasScalableVF](VPUser &U, unsigned) {
871 if (HasScalableVF)
872 return U.usesFirstLaneOnly(WideIV);
873 return U.usesScalars(WideIV);
874 });
875 }
876 }
877}
878
879/// Check if \p VPV is an untruncated wide induction, either before or after the
880/// increment. If so return the header IV (before the increment), otherwise
881/// return null.
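/// E.g. both %wide.iv itself and an increment such as
///   %wide.iv.next = add %wide.iv, %step
/// map back to %wide.iv here.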
882static VPWidenInductionRecipe *
883getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE) {
884 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
885 if (WideIV) {
886 // VPV itself is a wide induction, separately compute the end value for exit
887 // users if it is not a truncated IV.
888 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
889 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
890 }
891
892 // Check if VPV is an optimizable induction increment.
893 VPRecipeBase *Def = VPV->getDefiningRecipe();
894 if (!Def || Def->getNumOperands() != 2)
895 return nullptr;
896 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
897 if (!WideIV)
898 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
899 if (!WideIV)
900 return nullptr;
901
902 auto IsWideIVInc = [&]() {
903 auto &ID = WideIV->getInductionDescriptor();
904
905 // Check if VPV increments the induction by the induction step.
906 VPValue *IVStep = WideIV->getStepValue();
907 switch (ID.getInductionOpcode()) {
908 case Instruction::Add:
909 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
910 case Instruction::FAdd:
911 return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
912 m_Specific(IVStep)));
913 case Instruction::FSub:
914 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
915 m_Specific(IVStep)));
916 case Instruction::Sub: {
917 // IVStep will be the negated step of the subtraction. Check if Step == -1
918 // * IVStep.
919 VPValue *Step;
920 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
921 return false;
922 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
923 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
924 ScalarEvolution &SE = *PSE.getSE();
925 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
926 !isa<SCEVCouldNotCompute>(StepSCEV) &&
927 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
928 }
929 default:
930 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
931 match(VPV, m_GetElementPtr(m_Specific(WideIV),
932 m_Specific(WideIV->getStepValue())));
933 }
934 llvm_unreachable("should have been covered by switch above");
935 };
936 return IsWideIVInc() ? WideIV : nullptr;
937}
938
939/// Attempts to optimize the induction variable exit values for users in the
940/// early exit block.
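/// The exit value is rebuilt as CanonicalIV + FirstActiveLane(early-exit mask),
/// plus one if the user consumed the incremented IV, and is translated back
/// into the original IV's start/step via a derived IV when the IV is not the
/// canonical one.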
941static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
942 VPTypeAnalysis &TypeInfo,
943 VPBlockBase *PredVPBB,
944 VPValue *Op,
945 PredicatedScalarEvolution &PSE) {
946 VPValue *Incoming, *Mask;
947 if (!match(Op, m_ExtractLane(m_FirstActiveLane(m_VPValue(Mask)),
948 m_VPValue(Incoming))))
949 return nullptr;
950
951 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
952 if (!WideIV)
953 return nullptr;
954
955 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
956 if (WideIntOrFp && WideIntOrFp->getTruncInst())
957 return nullptr;
958
959 // Calculate the final index.
960 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
961 auto *CanonicalIV = LoopRegion->getCanonicalIV();
962 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
963 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
964
965 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
966 VPValue *FirstActiveLane =
967 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
968 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
969 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
970 FirstActiveLaneType, DL);
971 VPValue *EndValue =
972 B.createNaryOp(Instruction::Add, {CanonicalIV, FirstActiveLane}, DL);
973
974 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
975 // changed it means the exit is using the incremented value, so we need to
976 // add the step.
977 if (Incoming != WideIV) {
978 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
979 EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
980 }
981
982 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
983 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
984 VPIRValue *Start = WideIV->getStartValue();
985 VPValue *Step = WideIV->getStepValue();
986 EndValue = B.createDerivedIV(
987 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
988 Start, EndValue, Step);
989 }
990
991 return EndValue;
992}
993
994/// Attempts to optimize the induction variable exit values for users in the
995/// exit block coming from the latch in the original scalar loop.
996static VPValue *optimizeLatchExitInductionUser(
997 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
998 DenseMap<VPValue *, VPValue *> &EndValues, PredicatedScalarEvolution &PSE) {
999 VPValue *Incoming;
1000 if (!match(Op, m_ExtractLastLane(m_VPValue(Incoming))))
1001 return nullptr;
1002
1003 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
1004 if (!WideIV)
1005 return nullptr;
1006
1007 VPValue *EndValue = EndValues.lookup(WideIV);
1008 assert(EndValue && "end value must have been pre-computed");
1009
1010 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1011 // changed it means the exit is using the incremented value, so we don't
1012 // need to subtract the step.
1013 if (Incoming != WideIV)
1014 return EndValue;
1015
1016 // Otherwise, subtract the step from the EndValue.
1017 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
1018 VPValue *Step = WideIV->getStepValue();
1019 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
1020 if (ScalarTy->isIntegerTy())
1021 return B.createNaryOp(Instruction::Sub, {EndValue, Step},
1022 DebugLoc::getUnknown(), "ind.escape");
1023 if (ScalarTy->isPointerTy()) {
1024 Type *StepTy = TypeInfo.inferScalarType(Step);
1025 auto *Zero = Plan.getConstantInt(StepTy, 0);
1026 return B.createPtrAdd(EndValue,
1027 B.createNaryOp(Instruction::Sub, {Zero, Step}),
1028 DebugLoc::getUnknown(), "ind.escape");
1029 }
1030 if (ScalarTy->isFloatingPointTy()) {
1031 const auto &ID = WideIV->getInductionDescriptor();
1032 return B.createNaryOp(
1033 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1034 ? Instruction::FSub
1035 : Instruction::FAdd,
1036 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1037 }
1038 llvm_unreachable("all possible induction types must be handled");
1039 return nullptr;
1040}
1041
1042void VPlanTransforms::optimizeInductionExitUsers(
1043 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
1044 PredicatedScalarEvolution &PSE) {
1045 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
1046 VPTypeAnalysis TypeInfo(Plan);
1047 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1048 for (VPRecipeBase &R : ExitVPBB->phis()) {
1049 auto *ExitIRI = cast<VPIRPhi>(&R);
1050
1051 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1052 VPValue *Escape = nullptr;
1053 if (PredVPBB == MiddleVPBB)
1054 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
1055 ExitIRI->getOperand(Idx),
1056 EndValues, PSE);
1057 else
1058 Escape = optimizeEarlyExitInductionUser(
1059 Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), PSE);
1060 if (Escape)
1061 ExitIRI->setOperand(Idx, Escape);
1062 }
1063 }
1064 }
1065}
1066
1067/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1068/// them with already existing recipes expanding the same SCEV expression.
1069static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
1070 DenseMap<const SCEV *, VPValue *> SCEV2VPV;
1071
1072 for (VPRecipeBase &R :
1073 make_early_inc_range(*Plan.getEntry())) {
1074 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1075 if (!ExpR)
1076 continue;
1077
1078 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1079 if (Inserted)
1080 continue;
1081 ExpR->replaceAllUsesWith(V->second);
1082 ExpR->eraseFromParent();
1083 }
1084}
1085
1086static void recursivelyDeleteDeadRecipes(VPValue *V) {
1087 SmallVector<VPValue *> WorkList;
1088 SmallPtrSet<VPValue *, 8> Seen;
1089 WorkList.push_back(V);
1090
1091 while (!WorkList.empty()) {
1092 VPValue *Cur = WorkList.pop_back_val();
1093 if (!Seen.insert(Cur).second)
1094 continue;
1095 VPRecipeBase *R = Cur->getDefiningRecipe();
1096 if (!R)
1097 continue;
1098 if (!isDeadRecipe(*R))
1099 continue;
1100 append_range(WorkList, R->operands());
1101 R->eraseFromParent();
1102 }
1103}
1104
1105/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1106/// Returns an optional pair, where the first element indicates whether it is
1107/// an intrinsic ID.
1108static std::optional<std::pair<bool, unsigned>>
1109getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
1110 return TypeSwitch<const VPSingleDefRecipe *,
1111 std::optional<std::pair<bool, unsigned>>>(R)
1114 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1115 .Case<VPWidenIntrinsicRecipe>([](auto *I) {
1116 return std::make_pair(true, I->getVectorIntrinsicID());
1117 })
1118 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
1119 // For recipes that do not directly map to LLVM IR instructions,
1120 // assign opcodes after the last VPInstruction opcode (which is also
1121 // after the last IR Instruction opcode), based on the VPDefID.
1122 return std::make_pair(false,
1123 VPInstruction::OpsEnd + 1 + I->getVPDefID());
1124 })
1125 .Default([](auto *) { return std::nullopt; });
1126}
1127
1128/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1129/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1130/// Operands are foldable live-ins.
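/// E.g. an add of the live-in constants 2 and 3 folds to a live-in 5, which is
/// then registered on the plan via getOrAddLiveIn.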
1131static VPValue *tryToFoldLiveIns(const VPRecipeBase &R,
1132 ArrayRef<VPValue *> Operands,
1133 const DataLayout &DL,
1134 VPTypeAnalysis &TypeInfo) {
1135 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1136 if (!OpcodeOrIID)
1137 return nullptr;
1138
1140 for (VPValue *Op : Operands) {
1142 return nullptr;
1143 Value *V = Op->getUnderlyingValue();
1144 if (!V)
1145 return nullptr;
1146 Ops.push_back(V);
1147 }
1148
1149 auto FoldToIRValue = [&]() -> Value * {
1150 InstSimplifyFolder Folder(DL);
1151 if (OpcodeOrIID->first) {
1152 if (R.getNumOperands() != 2)
1153 return nullptr;
1154 unsigned ID = OpcodeOrIID->second;
1155 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1156 TypeInfo.inferScalarType(&R));
1157 }
1158 unsigned Opcode = OpcodeOrIID->second;
1159 if (Instruction::isBinaryOp(Opcode))
1160 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1161 Ops[0], Ops[1]);
1162 if (Instruction::isCast(Opcode))
1163 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1164 TypeInfo.inferScalarType(R.getVPSingleValue()));
1165 switch (Opcode) {
1166 case VPInstruction::LogicalAnd:
1167 return Folder.FoldSelect(Ops[0], Ops[1],
1168 ConstantInt::getFalse(TypeInfo.getContext()));
1169 case VPInstruction::Not:
1170 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1171 Constant::getAllOnesValue(Ops[0]->getType()));
1172 case Instruction::Select:
1173 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1174 case Instruction::ICmp:
1175 case Instruction::FCmp:
1176 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1177 Ops[1]);
1178 case Instruction::GetElementPtr: {
1179 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1180 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1181 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1182 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1183 }
1184 case VPInstruction::PtrAdd:
1185 case VPInstruction::WidePtrAdd:
1186 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1187 Ops[0], Ops[1],
1188 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1189 // An extract of a live-in is an extract of a broadcast, so return the
1190 // broadcasted element.
1191 case Instruction::ExtractElement:
1192 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1193 return Ops[0];
1194 }
1195 return nullptr;
1196 };
1197
1198 if (Value *V = FoldToIRValue())
1199 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1200 return nullptr;
1201}
1202
1203/// Try to simplify VPSingleDefRecipe \p Def.
1204static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
1205 VPlan *Plan = Def->getParent()->getPlan();
1206
1207 // Simplification of live-in IR values for SingleDef recipes using
1208 // InstSimplifyFolder.
1209 const DataLayout &DL =
1210 Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout();
1211 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1212 return Def->replaceAllUsesWith(V);
1213
1214 // Fold PredPHI LiveIn -> LiveIn.
1215 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1216 VPValue *Op = PredPHI->getOperand(0);
1217 if (isa<VPIRValue>(Op))
1218 PredPHI->replaceAllUsesWith(Op);
1219 }
1220
1221 VPBuilder Builder(Def);
1222 VPValue *A;
1223 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1224 Type *TruncTy = TypeInfo.inferScalarType(Def);
1225 Type *ATy = TypeInfo.inferScalarType(A);
1226 if (TruncTy == ATy) {
1227 Def->replaceAllUsesWith(A);
1228 } else {
1229 // Don't replace a scalarizing recipe with a widened cast.
1230 if (isa<VPReplicateRecipe>(Def))
1231 return;
1232 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1233
1234 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1235 ? Instruction::SExt
1236 : Instruction::ZExt;
1237 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1238 TruncTy);
1239 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1240 // UnderlyingExt has distinct return type, used to retain legacy cost.
1241 Ext->setUnderlyingValue(UnderlyingExt);
1242 }
1243 Def->replaceAllUsesWith(Ext);
1244 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1245 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1246 Def->replaceAllUsesWith(Trunc);
1247 }
1248 }
1249#ifndef NDEBUG
1250 // Verify that the cached type info for both A and its users is still
1251 // accurate by comparing it to freshly computed types.
1252 VPTypeAnalysis TypeInfo2(*Plan);
1253 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1254 for (VPUser *U : A->users()) {
1255 auto *R = cast<VPRecipeBase>(U);
1256 for (VPValue *VPV : R->definedValues())
1257 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1258 }
1259#endif
1260 }
1261
1262 // Simplify (X && Y) || (X && !Y) -> X.
1263 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1264 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1265 // recipes to be visited during simplification.
1266 VPValue *X, *Y, *Z;
1267 if (match(Def,
1268 m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
1269 m_LogicalAnd(m_Deferred(X), m_Not(m_Deferred(Y)))))) {
1270 Def->replaceAllUsesWith(X);
1271 Def->eraseFromParent();
1272 return;
1273 }
1274
1275 // x | 1 -> 1
1276 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1277 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1278
1279 // x | 0 -> x
1280 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1281 return Def->replaceAllUsesWith(X);
1282
1283 // x & 0 -> 0
1284 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1285 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1286
1287 // x && false -> false
1288 if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
1289 return Def->replaceAllUsesWith(Def->getOperand(1));
1290
1291 // (x && y) || (x && z) -> x && (y || z)
1292 if (match(Def, m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
1293 m_LogicalAnd(m_Deferred(X), m_VPValue(Z)))) &&
1294 // Simplify only if one of the operands has one use to avoid creating an
1295 // extra recipe.
1296 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1297 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1298 return Def->replaceAllUsesWith(
1299 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1300
1301 // x && !x -> 0
1302 if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X)))))
1303 return Def->replaceAllUsesWith(Plan->getFalse());
1304
1305 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1306 return Def->replaceAllUsesWith(X);
1307
1308 // select c, false, true -> not c
1309 VPValue *C;
1310 if (match(Def, m_Select(m_VPValue(C), m_False(), m_True())))
1311 return Def->replaceAllUsesWith(Builder.createNot(C));
1312
1313 // select !c, x, y -> select c, y, x
1314 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1315 Def->setOperand(0, C);
1316 Def->setOperand(1, Y);
1317 Def->setOperand(2, X);
1318 return;
1319 }
1320
1321 // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
1322 // tail folding it is likely that x is a header mask and can be simplified
1323 // further.
1324 if (match(Def, m_LogicalAnd(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
1325 m_VPValue(Z))) &&
1326 X->hasMoreThanOneUniqueUser())
1327 return Def->replaceAllUsesWith(
1328 Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
1329
1330 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1331 return Def->replaceAllUsesWith(A);
1332
1333 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1334 return Def->replaceAllUsesWith(A);
1335
1336 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1337 return Def->replaceAllUsesWith(
1338 Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0));
1339
1340 const APInt *APC;
1341 if (match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) && APC->isPowerOf2())
1342 return Def->replaceAllUsesWith(Builder.createNaryOp(
1343 Instruction::Shl,
1344 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1345 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1346
1347 // Don't convert udiv to lshr inside a replicate region, as VPInstructions are
1348 // not allowed in them.
1349 const VPRegionBlock *ParentRegion = Def->getParent()->getParent();
1350 bool IsInReplicateRegion = ParentRegion && ParentRegion->isReplicator();
1351 if (!IsInReplicateRegion && match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) &&
1352 APC->isPowerOf2())
1353 return Def->replaceAllUsesWith(Builder.createNaryOp(
1354 Instruction::LShr,
1355 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())}, {},
1356 Def->getDebugLoc()));
1357
1358 if (match(Def, m_Not(m_VPValue(A)))) {
1359 if (match(A, m_Not(m_VPValue(A))))
1360 return Def->replaceAllUsesWith(A);
1361
1362 // Try to fold Not into compares by adjusting the predicate in-place.
1363 CmpPredicate Pred;
1364 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1365 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1366 if (all_of(Cmp->users(),
1368 m_Not(m_Specific(Cmp)),
1369 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1370 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1371 for (VPUser *U : to_vector(Cmp->users())) {
1372 auto *R = cast<VPSingleDefRecipe>(U);
1373 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1374 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1375 R->setOperand(1, Y);
1376 R->setOperand(2, X);
1377 } else {
1378 // not (cmp pred) -> cmp inv_pred
1379 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1380 R->replaceAllUsesWith(Cmp);
1381 }
1382 }
1383 // If Cmp doesn't have a debug location, use the one from the negation,
1384 // to preserve the location.
1385 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1386 Cmp->setDebugLoc(Def->getDebugLoc());
1387 }
1388 }
1389 }
1390
1391 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1392 // any-of (fcmp uno %A, %B), ...
1393 if (match(Def, m_AnyOf())) {
1394 SmallVector<VPValue *> NewOps;
1395 VPRecipeBase *UnpairedCmp = nullptr;
1396 for (VPValue *Op : Def->operands()) {
1397 VPValue *X;
1398 if (Op->getNumUsers() > 1 ||
1400 m_Deferred(X)))) {
1401 NewOps.push_back(Op);
1402 } else if (!UnpairedCmp) {
1403 UnpairedCmp = Op->getDefiningRecipe();
1404 } else {
1405 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1406 UnpairedCmp->getOperand(0), X));
1407 UnpairedCmp = nullptr;
1408 }
1409 }
1410
1411 if (UnpairedCmp)
1412 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1413
1414 if (NewOps.size() < Def->getNumOperands()) {
1415 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1416 return Def->replaceAllUsesWith(NewAnyOf);
1417 }
1418 }
1419
1420 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1421 // This is useful for fmax/fmin without fast-math flags, where we need to
1422 // check if any operand is NaN.
1424 m_Deferred(X)),
1426 m_Deferred(Y))))) {
1427 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1428 return Def->replaceAllUsesWith(NewCmp);
1429 }
1430
1431 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1432 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1433 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1434 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1435 TypeInfo.inferScalarType(Def))
1436 return Def->replaceAllUsesWith(Def->getOperand(1));
1437
1439 m_One()))) {
1440 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1441 if (TypeInfo.inferScalarType(X) != WideStepTy)
1442 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1443 Def->replaceAllUsesWith(X);
1444 return;
1445 }
1446
1447 // For i1 vp.merges produced by AnyOf reductions:
1448 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1450 m_VPValue(X), m_VPValue())) &&
1452 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1453 Def->setOperand(1, Def->getOperand(0));
1454 Def->setOperand(0, Y);
1455 return;
1456 }
1457
1458 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1459 if (Phi->getOperand(0) == Phi->getOperand(1))
1460 Phi->replaceAllUsesWith(Phi->getOperand(0));
1461 return;
1462 }
1463
1464 // Look through ExtractLastLane.
1465 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
1466 if (match(A, m_BuildVector())) {
1467 auto *BuildVector = cast<VPInstruction>(A);
1468 Def->replaceAllUsesWith(
1469 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1470 return;
1471 }
1472 if (Plan->hasScalarVFOnly())
1473 return Def->replaceAllUsesWith(A);
1474 }
1475
1476 // Look through ExtractPenultimateElement (BuildVector ....).
1478 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1479 Def->replaceAllUsesWith(
1480 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1481 return;
1482 }
1483
1484 uint64_t Idx;
1486 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1487 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1488 return;
1489 }
1490
1491 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1492 Def->replaceAllUsesWith(
1493 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1494 return;
1495 }
1496
1497 // Look through broadcast of single-scalar when used as select conditions; in
1498 // that case the scalar condition can be used directly.
1499 if (match(Def,
1502 "broadcast operand must be single-scalar");
1503 Def->setOperand(0, C);
1504 return;
1505 }
1506
1507 if (auto *Phi = dyn_cast<VPPhi>(Def)) {
1508 if (Phi->getNumOperands() == 1)
1509 Phi->replaceAllUsesWith(Phi->getOperand(0));
1510 return;
1511 }
1512
1513 // Some simplifications can only be applied after unrolling. Perform them
1514 // below.
1515 if (!Plan->isUnrolled())
1516 return;
1517
1518 // After unrolling, extract-lane may be used to extract values from multiple
1519 // scalar sources. Only simplify when extracting from a single scalar source.
1520 VPValue *LaneToExtract;
1521 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1522 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1523 if (vputils::isSingleScalar(A))
1524 return Def->replaceAllUsesWith(A);
1525
1526 // Simplify extract-lane with single source to extract-element.
1527 Def->replaceAllUsesWith(Builder.createNaryOp(
1528 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1529 return;
1530 }
1531
1532 // Hoist an invariant increment Y of a phi X, by having X start at Y.
1533 if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && isa<VPIRValue>(Y) &&
1534 isa<VPPhi>(X)) {
1535 auto *Phi = cast<VPPhi>(X);
1536 if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
1537 Phi->getSingleUser() == Def) {
1538 Phi->setOperand(0, Y);
1539 Def->replaceAllUsesWith(Phi);
1540 return;
1541 }
1542 }
1543
1544 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1545 // just the pointer operand.
1546 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1547 if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
1548 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1549
1550 // VPScalarIVSteps for part 0 can be replaced by their start value, if only
1551 // the first lane is demanded.
1552 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1553 if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
1554 Steps->replaceAllUsesWith(Steps->getOperand(0));
1555 return;
1556 }
1557 }
1558 // Simplify redundant ReductionStartVector recipes after unrolling.
1559 VPValue *StartV;
1560 if (match(Def, m_VPInstruction<VPInstruction::ReductionStartVector>(
1561 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1562 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1563 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1564 return PhiR && PhiR->isInLoop();
1565 });
1566 return;
1567 }
1568
1570 Def->replaceAllUsesWith(A);
1571 return;
1572 }
1573
1574 if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
1577 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1578 all_of(A->users(),
1579 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1580 return Def->replaceAllUsesWith(A);
1581 }
1582
1583 if (Plan->getUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1584 return Def->replaceAllUsesWith(A);
1585}
1586
1587void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
1588 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
1589 Plan.getEntry());
1590 VPTypeAnalysis TypeInfo(Plan);
1591 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
1592 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1593 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1594 simplifyRecipe(Def, TypeInfo);
1595 }
1596}
1597
1598void VPlanTransforms::narrowToSingleScalarRecipes(VPlan &Plan) {
1599 if (Plan.hasScalarVFOnly())
1600 return;
1601
1602 // Try to narrow wide and replicating recipes to single scalar recipes,
1603 // based on VPlan analysis. Only process blocks in the loop region for now,
1604 // without traversing into nested regions, as recipes in replicate regions
1605 // cannot be converted yet.
1606 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1607 vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
1608 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1610 VPWidenStoreRecipe>(&R))
1611 continue;
1612 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1613 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1614 continue;
1615
1616 // Convert an unmasked scatter with a uniform address into
1617 // extract-last-lane + scalar store.
1618 // TODO: Add a profitability check comparing the cost of a scatter vs.
1619 // extract + scalar store.
1620 auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
1621 if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
1622 !WidenStoreR->isConsecutive()) {
1623 assert(!WidenStoreR->isReverse() &&
1624 "Not consecutive memory recipes shouldn't be reversed");
1625 VPValue *Mask = WidenStoreR->getMask();
1626
1627 // Only convert the scatter to a scalar store if it is unmasked.
1628 // TODO: Support converting scatter masked by the header mask to scalar
1629 // store.
1630 if (Mask)
1631 continue;
1632
1634 {WidenStoreR->getOperand(1)});
1635 Extract->insertBefore(WidenStoreR);
1636
1637 // TODO: Sink the scalar store recipe to middle block if possible.
1638 auto *ScalarStore = new VPReplicateRecipe(
1639 &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
1640 true /*IsSingleScalar*/, nullptr /*Mask*/, {},
1641 *WidenStoreR /*Metadata*/);
1642 ScalarStore->insertBefore(WidenStoreR);
1643 WidenStoreR->eraseFromParent();
1644 continue;
1645 }
1646
1647 auto *RepOrWidenR = dyn_cast<VPRecipeWithIRFlags>(&R);
1648 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1649 vputils::isSingleScalar(RepR->getOperand(1))) {
1650 auto *Clone = new VPReplicateRecipe(
1651 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1652 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1653 *RepR /*Metadata*/, RepR->getDebugLoc());
1654 Clone->insertBefore(RepOrWidenR);
1655 VPBuilder Builder(Clone);
1656 VPValue *ExtractOp = Clone->getOperand(0);
1657 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1658 ExtractOp =
1659 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1660 ExtractOp =
1661 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1662 Clone->setOperand(0, ExtractOp);
1663 RepR->eraseFromParent();
1664 continue;
1665 }
1666
1667 // Skip recipes that aren't single scalars.
1668 if (!RepOrWidenR || !vputils::isSingleScalar(RepOrWidenR))
1669 continue;
1670
1671 // Skip recipes for which conversion to single-scalar would introduce
1672 // additional broadcasts. No extra broadcasts are needed if either only
1673 // the scalars of the recipe are used, or at least one of the operands
1674 // would require a broadcast. In the latter case, the single-scalar may
1675 // need to be broadcast, but another broadcast is removed.
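  // E.g. (a rough sketch): if all users of a widened %d only use its scalar
  // values, or merely extract its last lane/part, %d can be narrowed to a
  // single-scalar replicate without introducing any new broadcast.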
1676 if (!all_of(RepOrWidenR->users(),
1677 [RepOrWidenR](const VPUser *U) {
1678 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1679 unsigned Opcode = VPI->getOpcode();
1680 if (Opcode == VPInstruction::ExtractLastLane ||
1681 Opcode == VPInstruction::ExtractLastPart ||
1682 Opcode == VPInstruction::ExtractPenultimateElement)
1683 return true;
1684 }
1685
1686 return U->usesScalars(RepOrWidenR);
1687 }) &&
1688 none_of(RepOrWidenR->operands(), [RepOrWidenR](VPValue *Op) {
1689 if (Op->getSingleUser() != RepOrWidenR)
1690 return false;
1691 // Non-constant live-ins require broadcasts, while constants do not
1692 // need explicit broadcasts.
1693 auto *IRV = dyn_cast<VPIRValue>(Op);
1694 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1695 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1696 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1697 }))
1698 continue;
1699
1700 auto *Clone = new VPReplicateRecipe(
1701 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1702 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1703 Clone->insertBefore(RepOrWidenR);
1704 RepOrWidenR->replaceAllUsesWith(Clone);
1705 if (isDeadRecipe(*RepOrWidenR))
1706 RepOrWidenR->eraseFromParent();
1707 }
1708 }
1709}
1710
1711/// Try to see if all of \p Blend's masks share a common value that is
1712/// logically and'ed in, and remove that value from the masks.
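/// E.g. (roughly): a blend with masks (%m && %c0) and (%m && %c1) is rewritten
/// to use masks %c0 and %c1; the common %m is dropped from both.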
1713 static void removeCommonBlendMask(VPBlendRecipe *Blend) {
1714 if (Blend->isNormalized())
1715 return;
1716 VPValue *CommonEdgeMask;
1717 if (!match(Blend->getMask(0),
1718 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1719 return;
1720 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1721 if (!match(Blend->getMask(I),
1722 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1723 return;
1724 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1725 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1726}
1727
1728/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1729/// to make sure the masks are simplified.
1730static void simplifyBlends(VPlan &Plan) {
1733 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1734 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1735 if (!Blend)
1736 continue;
1737
1738 removeCommonBlendMask(Blend);
1739
1740 // Try to remove redundant blend recipes.
1741 SmallPtrSet<VPValue *, 4> UniqueValues;
1742 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1743 UniqueValues.insert(Blend->getIncomingValue(0));
1744 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1745 if (!match(Blend->getMask(I), m_False()))
1746 UniqueValues.insert(Blend->getIncomingValue(I));
1747
1748 if (UniqueValues.size() == 1) {
1749 Blend->replaceAllUsesWith(*UniqueValues.begin());
1750 Blend->eraseFromParent();
1751 continue;
1752 }
1753
1754 if (Blend->isNormalized())
1755 continue;
1756
1757 // Normalize the blend so its first incoming value is used as the initial
1758 // value with the others blended into it.
1759
1760 unsigned StartIndex = 0;
1761 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1762 // If a value's mask is used only by the blend, then it can be dead-coded.
1763 // TODO: Find the most expensive mask that can be dead-coded, or a mask
1764 // that's used by multiple blends where it can be removed from them all.
1765 VPValue *Mask = Blend->getMask(I);
1766 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1767 StartIndex = I;
1768 break;
1769 }
1770 }
1771
1772 SmallVector<VPValue *, 4> OperandsWithMask;
1773 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1774
1775 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1776 if (I == StartIndex)
1777 continue;
1778 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1779 OperandsWithMask.push_back(Blend->getMask(I));
1780 }
1781
1782 auto *NewBlend =
1783 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1784 OperandsWithMask, Blend->getDebugLoc());
1785 NewBlend->insertBefore(&R);
1786
1787 VPValue *DeadMask = Blend->getMask(StartIndex);
1788 Blend->replaceAllUsesWith(NewBlend);
1789 Blend->eraseFromParent();
1791
1792 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1793 VPValue *NewMask;
1794 if (NewBlend->getNumOperands() == 3 &&
1795 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1796 VPValue *Inc0 = NewBlend->getOperand(0);
1797 VPValue *Inc1 = NewBlend->getOperand(1);
1798 VPValue *OldMask = NewBlend->getOperand(2);
1799 NewBlend->setOperand(0, Inc1);
1800 NewBlend->setOperand(1, Inc0);
1801 NewBlend->setOperand(2, NewMask);
1802 if (OldMask->getNumUsers() == 0)
1803 cast<VPInstruction>(OldMask)->eraseFromParent();
1804 }
1805 }
1806 }
1807}
1808
1809/// Optimize the width of vector induction variables in \p Plan based on a known
1810/// constant trip count, \p BestVF and \p BestUF.
1811 static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
1812 ElementCount BestVF,
1813 unsigned BestUF) {
1814 // Only proceed if we have not completely removed the vector region.
1815 if (!Plan.getVectorLoopRegion())
1816 return false;
1817
1818 const APInt *TC;
1819 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
1820 return false;
1821
1822 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1823 // and UF. Returns at least 8.
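  // E.g. (a rough worked example): with TC = 300 and VF * UF = 4, TC aligned
  // up is 300, whose largest IV value 299 needs 9 bits, so the IV is narrowed
  // to the next power of 2, i.e. 16 bits.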
1824 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1825 APInt AlignedTC =
1828 APInt MaxVal = AlignedTC - 1;
1829 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1830 };
1831 unsigned NewBitWidth =
1832 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1833
1834 LLVMContext &Ctx = Plan.getContext();
1835 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1836
1837 bool MadeChange = false;
1838
1839 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1840 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1841 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1842
1843 // Currently only handle canonical IVs, as it is trivial to replace the start
1844 // and stop values, and only perform the optimization when the IV has a
1845 // single use.
1846 if (!WideIV || !WideIV->isCanonical() ||
1847 WideIV->hasMoreThanOneUniqueUser() ||
1848 NewIVTy == WideIV->getScalarType())
1849 continue;
1850
1851 // Currently only handle cases where the single user is a header-mask
1852 // comparison with the backedge-taken-count.
1853 VPUser *SingleUser = WideIV->getSingleUser();
1854 if (!SingleUser ||
1855 !match(SingleUser, m_ICmp(m_Specific(WideIV),
1858 continue;
1859
1860 // Update IV operands and comparison bound to use new narrower type.
1861 auto *NewStart = Plan.getConstantInt(NewIVTy, 0);
1862 WideIV->setStartValue(NewStart);
1863 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
1864 WideIV->setStepValue(NewStep);
1865
1866 auto *NewBTC = new VPWidenCastRecipe(
1867 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1868 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1869 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
1870 Cmp->setOperand(1, NewBTC);
1871
1872 MadeChange = true;
1873 }
1874
1875 return MadeChange;
1876}
1877
1878/// Return true if \p Cond is known to be true for given \p BestVF and \p
1879/// BestUF.
1881 ElementCount BestVF, unsigned BestUF,
1884 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1885 &PSE](VPValue *C) {
1886 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
1887 });
1888
1889 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
1891 m_Specific(CanIV->getBackedgeValue()),
1892 m_Specific(&Plan.getVectorTripCount()))))
1893 return false;
1894
1895 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1896 // count is not conveniently available as SCEV so far, so we compare directly
1897 // against the original trip count. This is stricter than necessary, as we
1898 // will only return true if the trip count == vector trip count.
1899 const SCEV *VectorTripCount =
1901 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1902 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
1903 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1904 "Trip count SCEV must be computable");
1905 ScalarEvolution &SE = *PSE.getSE();
1906 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1907 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1908 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1909}
1910
1911/// Try to replace multiple active lane masks used for control flow with
1912/// a single, wide active lane mask instruction followed by multiple
1913/// extract subvector intrinsics. This applies to the active lane mask
1914/// instructions both in the loop and in the preheader.
1915/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1916/// new extracts from the first active lane mask, which has its last
1917/// operand (multiplier) set to UF.
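/// For illustration, a rough sketch for UF=2 (names are illustrative): instead of
///   %alm.0 = active.lane.mask(%index,      %tc)
///   %alm.1 = active.lane.mask(%index + VF, %tc)
/// a single mask with VF x 2 lanes is created and split:
///   %wide  = active.lane.mask(%index, %tc)        ; multiplier operand = 2
///   %alm.0 = vector.extract %wide, 0
///   %alm.1 = vector.extract %wide, VF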
1919 unsigned UF) {
1920 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1921 return false;
1922
1923 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1924 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1925 auto *Term = &ExitingVPBB->back();
1926
1927 using namespace llvm::VPlanPatternMatch;
1929 m_VPValue(), m_VPValue(), m_VPValue())))))
1930 return false;
1931
1932 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1933 LLVMContext &Ctx = Plan.getContext();
1934
1935 auto ExtractFromALM = [&](VPInstruction *ALM,
1936 SmallVectorImpl<VPValue *> &Extracts) {
1937 DebugLoc DL = ALM->getDebugLoc();
1938 for (unsigned Part = 0; Part < UF; ++Part) {
1940 Ops.append({ALM, Plan.getOrAddLiveIn(
1941 ConstantInt::get(IntegerType::getInt64Ty(Ctx),
1942 VF.getKnownMinValue() * Part))});
1943 auto *Ext =
1944 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1945 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
1946 Extracts[Part] = Ext;
1947 Ext->insertAfter(ALM);
1948 }
1949 };
1950
1951 // Create a list of each active lane mask phi, ordered by unroll part.
1953 for (VPRecipeBase &R : Header->phis()) {
1955 if (!Phi)
1956 continue;
1957 VPValue *Index = nullptr;
1958 match(Phi->getBackedgeValue(),
1960 assert(Index && "Expected index from ActiveLaneMask instruction");
1961
1962 uint64_t Part;
1963 if (match(Index,
1965 m_VPValue(), m_ConstantInt(Part))))
1966 Phis[Part] = Phi;
1967 else
1968 // Anything other than a CanonicalIVIncrementForPart is part 0
1969 Phis[0] = Phi;
1970 }
1971
1972 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1973 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
1974
1975 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
1976 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
1977
1978 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
1979 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
1980 "Expected incoming values of Phi to be ActiveLaneMasks");
1981
1982 // When using wide lane masks, the return type of the get.active.lane.mask
1983 // intrinsic is VF x UF (last operand).
1984 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
1985 EntryALM->setOperand(2, ALMMultiplier);
1986 LoopALM->setOperand(2, ALMMultiplier);
1987
1988 // Create UF x extract vectors and insert into preheader.
1989 SmallVector<VPValue *> EntryExtracts(UF);
1990 ExtractFromALM(EntryALM, EntryExtracts);
1991
1992 // Create UF x extract vectors and insert before the loop compare & branch,
1993 // updating the compare to use the first extract.
1994 SmallVector<VPValue *> LoopExtracts(UF);
1995 ExtractFromALM(LoopALM, LoopExtracts);
1996 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
1997 Not->setOperand(0, LoopExtracts[0]);
1998
1999 // Update the incoming values of active lane mask phis.
2000 for (unsigned Part = 0; Part < UF; ++Part) {
2001 Phis[Part]->setStartValue(EntryExtracts[Part]);
2002 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2003 }
2004
2005 return true;
2006}
2007
2008/// Try to simplify the branch condition of \p Plan. This may restrict the
2009/// resulting plan to \p BestVF and \p BestUF.
2011 unsigned BestUF,
2013 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2014 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2015 auto *Term = &ExitingVPBB->back();
2016 VPValue *Cond;
2017 if (match(Term, m_BranchOnCount()) ||
2019 m_VPValue(), m_VPValue(), m_VPValue()))))) {
2020 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2021 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2022 const SCEV *VectorTripCount =
2024 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2025 VectorTripCount =
2027 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2028 "Trip count SCEV must be computable");
2029 ScalarEvolution &SE = *PSE.getSE();
2030 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2031 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2032 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2033 return false;
2034 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2036 // For BranchOnCond, check if we can prove the condition to be true using VF
2037 // and UF.
2038 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2039 return false;
2040 } else {
2041 return false;
2042 }
2043
2044 // The vector loop region only executes once. If possible, completely remove
2045 // the region, otherwise replace the terminator controlling the latch with
2046 // (BranchOnCond true).
2047 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
2048 // support for other non-canonical widen induction recipes (e.g.,
2049 // VPWidenPointerInductionRecipe).
2050 // TODO: fold branch-on-constant after dissolving region.
2051 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2052 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
2053 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
2054 return R->isCanonical();
2055 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
2056 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
2057 })) {
2058 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
2059 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
2060 VPBuilder Builder(Plan.getVectorPreheader());
2061 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
2062 R->getScalarType());
2063 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
2064 HeaderR.eraseFromParent();
2065 continue;
2066 }
2067 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
2068 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
2069 HeaderR.eraseFromParent();
2070 }
2071
2072 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
2073 SmallVector<VPBlockBase *> Exits = to_vector(VectorRegion->getSuccessors());
2074 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
2075 for (VPBlockBase *Exit : Exits)
2076 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
2077
2078 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
2079 B->setParent(nullptr);
2080
2081 VPBlockUtils::connectBlocks(Preheader, Header);
2082
2083 for (VPBlockBase *Exit : Exits)
2084 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
2085
2086 // Replace terminating branch-on-two-conds with branch-on-cond to early
2087 // exit.
2088 if (Exits.size() != 1) {
2089 assert(match(Term, m_BranchOnTwoConds()) && Exits.size() == 2 &&
2090 "BranchOnTwoConds needs 2 remaining exits");
2092 Term->getOperand(0));
2093 }
2095 } else {
2096 // The vector region contains header phis for which we cannot remove the
2097 // loop region yet.
2098
2099 // For BranchOnTwoConds, set the latch exit condition to true directly.
2100 if (match(Term, m_BranchOnTwoConds())) {
2101 Term->setOperand(1, Plan.getTrue());
2102 return true;
2103 }
2104
2105 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
2106 {}, {}, Term->getDebugLoc());
2107 ExitingVPBB->appendRecipe(BOC);
2108 }
2109
2110 Term->eraseFromParent();
2111
2112 return true;
2113}
2114
2115/// From the definition of llvm.experimental.get.vector.length,
2116/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
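/// E.g. (roughly): once SCEV proves %AVL <= VF, the EXPLICIT-VECTOR-LENGTH of
/// %AVL is replaced by %AVL itself, zero-extended or truncated to i32.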
2120 vp_depth_first_deep(Plan.getEntry()))) {
2121 for (VPRecipeBase &R : *VPBB) {
2122 VPValue *AVL;
2123 if (!match(&R, m_EVL(m_VPValue(AVL))))
2124 continue;
2125
2126 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2127 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2128 continue;
2129 ScalarEvolution &SE = *PSE.getSE();
2130 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2131 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2132 continue;
2133
2135 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2136 R.getDebugLoc());
2137 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2138 return true;
2139 }
2140 }
2141 return false;
2142}
2143
2145 unsigned BestUF,
2147 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2148 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2149
2150 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2151 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2152 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2153 MadeChange |= simplifyKnownEVL(Plan, BestVF, PSE);
2154
2155 if (MadeChange) {
2156 Plan.setVF(BestVF);
2157 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
2158 }
2159}
2160
2161/// Sink users of \p FOR after the recipe defining the previous value \p
2162/// Previous of the recurrence. \returns true if all users of \p FOR could be
2163/// re-arranged as needed or false if it is not possible.
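/// For illustration, a rough sketch (names are illustrative):
///   %for  = first-order recurrence phi [ %init ], [ %prev ]
///   %use  = add %for, 1       ; user of the recurrence phi
///   %prev = ...               ; defines the previous value
/// is reordered so %use is sunk after %prev, allowing the splice of
/// (%for, %prev) to be created before any user of %for.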
2164static bool
2166 VPRecipeBase *Previous,
2167 VPDominatorTree &VPDT) {
2168 // Collect recipes that need sinking.
2171 Seen.insert(Previous);
2172 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
2173 // The previous value must not depend on the users of the recurrence phi;
2174 // otherwise, FOR is not a fixed-order recurrence.
2175 if (SinkCandidate == Previous)
2176 return false;
2177
2178 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
2179 !Seen.insert(SinkCandidate).second ||
2180 VPDT.properlyDominates(Previous, SinkCandidate))
2181 return true;
2182
2183 if (cannotHoistOrSinkRecipe(*SinkCandidate))
2184 return false;
2185
2186 WorkList.push_back(SinkCandidate);
2187 return true;
2188 };
2189
2190 // Recursively sink users of FOR after Previous.
2191 WorkList.push_back(FOR);
2192 for (unsigned I = 0; I != WorkList.size(); ++I) {
2193 VPRecipeBase *Current = WorkList[I];
2194 assert(Current->getNumDefinedValues() == 1 &&
2195 "only recipes with a single defined value expected");
2196
2197 for (VPUser *User : Current->getVPSingleValue()->users()) {
2198 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
2199 return false;
2200 }
2201 }
2202
2203 // Keep recipes to sink ordered by dominance so earlier instructions are
2204 // processed first.
2205 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2206 return VPDT.properlyDominates(A, B);
2207 });
2208
2209 for (VPRecipeBase *SinkCandidate : WorkList) {
2210 if (SinkCandidate == FOR)
2211 continue;
2212
2213 SinkCandidate->moveAfter(Previous);
2214 Previous = SinkCandidate;
2215 }
2216 return true;
2217}
2218
2219/// Try to hoist \p Previous and its operands before all users of \p FOR.
2221 VPRecipeBase *Previous,
2222 VPDominatorTree &VPDT) {
2223 if (cannotHoistOrSinkRecipe(*Previous))
2224 return false;
2225
2226 // Collect recipes that need hoisting.
2227 SmallVector<VPRecipeBase *> HoistCandidates;
2229 VPRecipeBase *HoistPoint = nullptr;
2230 // Find the closest hoist point by looking at all users of FOR and selecting
2231 // the recipe dominating all other users.
2232 for (VPUser *U : FOR->users()) {
2233 auto *R = cast<VPRecipeBase>(U);
2234 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
2235 HoistPoint = R;
2236 }
2237 assert(all_of(FOR->users(),
2238 [&VPDT, HoistPoint](VPUser *U) {
2239 auto *R = cast<VPRecipeBase>(U);
2240 return HoistPoint == R ||
2241 VPDT.properlyDominates(HoistPoint, R);
2242 }) &&
2243 "HoistPoint must dominate all users of FOR");
2244
2245 auto NeedsHoisting = [HoistPoint, &VPDT,
2246 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2247 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2248 if (!HoistCandidate)
2249 return nullptr;
2250 VPRegionBlock *EnclosingLoopRegion =
2251 HoistCandidate->getParent()->getEnclosingLoopRegion();
2252 assert((!HoistCandidate->getRegion() ||
2253 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2254 "CFG in VPlan should still be flat, without replicate regions");
2255 // Hoist candidate was already visited, no need to hoist.
2256 if (!Visited.insert(HoistCandidate).second)
2257 return nullptr;
2258
2259 // The candidate is outside the loop region or is a header phi; it already
2260 // dominates FOR's users without hoisting.
2261 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
2262 return nullptr;
2263
2264 // If we reached a recipe that dominates HoistPoint, we don't need to
2265 // hoist the recipe.
2266 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
2267 return nullptr;
2268 return HoistCandidate;
2269 };
2270
2271 if (!NeedsHoisting(Previous->getVPSingleValue()))
2272 return true;
2273
2274 // Recursively try to hoist Previous and its operands before all users of FOR.
2275 HoistCandidates.push_back(Previous);
2276
2277 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2278 VPRecipeBase *Current = HoistCandidates[I];
2279 assert(Current->getNumDefinedValues() == 1 &&
2280 "only recipes with a single defined value expected");
2281 if (cannotHoistOrSinkRecipe(*Current))
2282 return false;
2283
2284 for (VPValue *Op : Current->operands()) {
2285 // If we reach FOR, it means the original Previous depends on some other
2286 // recurrence that in turn depends on FOR. If that is the case, we would
2287 // also need to hoist recipes involving the other FOR, which may break
2288 // dependencies.
2289 if (Op == FOR)
2290 return false;
2291
2292 if (auto *R = NeedsHoisting(Op)) {
2293 // Bail out if the recipe defines multiple values.
2294 // TODO: Hoisting such recipes requires additional handling.
2295 if (R->getNumDefinedValues() != 1)
2296 return false;
2297 HoistCandidates.push_back(R);
2298 }
2299 }
2300 }
2301
2302 // Order recipes to hoist by dominance so earlier instructions are processed
2303 // first.
2304 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2305 return VPDT.properlyDominates(A, B);
2306 });
2307
2308 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2309 HoistCandidate->moveBefore(*HoistPoint->getParent(),
2310 HoistPoint->getIterator());
2311 }
2312
2313 return true;
2314}
2315
2317 VPBuilder &LoopBuilder) {
2318 VPDominatorTree VPDT(Plan);
2319
2321 for (VPRecipeBase &R :
2324 RecurrencePhis.push_back(FOR);
2325
2326 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2328 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2329 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2330 // to terminate.
2331 while (auto *PrevPhi =
2333 assert(PrevPhi->getParent() == FOR->getParent());
2334 assert(SeenPhis.insert(PrevPhi).second);
2335 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2336 }
2337
2338 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2339 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2340 return false;
2341
2342 // Introduce a recipe to combine the incoming and previous values of a
2343 // fixed-order recurrence.
2344 VPBasicBlock *InsertBlock = Previous->getParent();
2345 if (isa<VPHeaderPHIRecipe>(Previous))
2346 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2347 else
2348 LoopBuilder.setInsertPoint(InsertBlock,
2349 std::next(Previous->getIterator()));
2350
2351 auto *RecurSplice =
2353 {FOR, FOR->getBackedgeValue()});
2354
2355 FOR->replaceAllUsesWith(RecurSplice);
2356 // Set the first operand of RecurSplice to FOR again, after replacing
2357 // all users.
2358 RecurSplice->setOperand(0, FOR);
2359
2360 // Check for users extracting at the penultimate active lane of the FOR.
2361 // If only a single lane is active in the current iteration, we need to
2362 // select the last element from the previous iteration (from the FOR phi
2363 // directly).
2364 for (VPUser *U : RecurSplice->users()) {
2366 m_Specific(RecurSplice))))
2367 continue;
2368
2370 VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
2371 Type *I64Ty = Type::getInt64Ty(Plan.getContext());
2372 VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 0));
2373 VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 1));
2374 VPValue *PenultimateIndex =
2375 B.createNaryOp(Instruction::Sub, {LastActiveLane, One});
2376 VPValue *PenultimateLastIter =
2377 B.createNaryOp(VPInstruction::ExtractLane,
2378 {PenultimateIndex, FOR->getBackedgeValue()});
2379 VPValue *LastPrevIter =
2380 B.createNaryOp(VPInstruction::ExtractLastLane, FOR);
2381
2382 VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero);
2383 VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter);
2384 cast<VPInstruction>(U)->replaceAllUsesWith(Sel);
2385 }
2386 }
2387 return true;
2388}
2389
2391 for (VPRecipeBase &R :
2393 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2394 if (!PhiR)
2395 continue;
2396 RecurKind RK = PhiR->getRecurrenceKind();
2397 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2399 continue;
2400
2401 for (VPUser *U : collectUsersRecursively(PhiR))
2402 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2403 RecWithFlags->dropPoisonGeneratingFlags();
2404 }
2405 }
2406}
2407
2408namespace {
2409struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2410 static bool isSentinel(const VPSingleDefRecipe *Def) {
2411 return Def == getEmptyKey() || Def == getTombstoneKey();
2412 }
2413
2414 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2415 /// return that source element type.
2416 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2417 // All VPInstructions that lower to GEPs must have the i8 source element
2418 // type (as they are PtrAdds), so we omit it.
2420 .Case<VPReplicateRecipe>([](auto *I) -> Type * {
2421 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2422 return GEP->getSourceElementType();
2423 return nullptr;
2424 })
2425 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2426 [](auto *I) { return I->getSourceElementType(); })
2427 .Default([](auto *) { return nullptr; });
2428 }
2429
2430 /// Returns true if recipe \p Def can be safely handled by CSE.
2431 static bool canHandle(const VPSingleDefRecipe *Def) {
2432 // We can extend the list of handled recipes in the future,
2433 // provided we account for the data embedded in them while checking for
2434 // equality or hashing.
2435 auto C = getOpcodeOrIntrinsicID(Def);
2436
2437 // The issue with (Insert|Extract)Value is that the index of the
2438 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2439 // VPlan.
2440 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2441 C->second == Instruction::ExtractValue)))
2442 return false;
2443
2444 // During CSE, we can only handle recipes that don't read from memory: if
2445 // they read from memory, there could be an intervening write to memory
2446 // before the next instance is CSE'd, leading to an incorrect result.
2447 return !Def->mayReadFromMemory();
2448 }
2449
2450 /// Hash the underlying data of \p Def.
2451 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2452 const VPlan *Plan = Def->getParent()->getPlan();
2453 VPTypeAnalysis TypeInfo(*Plan);
2454 hash_code Result = hash_combine(
2455 Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
2456 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2458 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2459 if (RFlags->hasPredicate())
2460 return hash_combine(Result, RFlags->getPredicate());
2461 return Result;
2462 }
2463
2464 /// Check equality of underlying data of \p L and \p R.
2465 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2466 if (isSentinel(L) || isSentinel(R))
2467 return L == R;
2468 if (L->getVPDefID() != R->getVPDefID() ||
2470 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2472 !equal(L->operands(), R->operands()))
2473 return false;
2475 "must have valid opcode info for both recipes");
2476 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2477 if (LFlags->hasPredicate() &&
2478 LFlags->getPredicate() !=
2479 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2480 return false;
2481 // Recipes in replicate regions implicitly depend on the predicate. If either
2482 // recipe is in a replicate region, only consider them equal if both have
2483 // the same parent.
2484 const VPRegionBlock *RegionL = L->getRegion();
2485 const VPRegionBlock *RegionR = R->getRegion();
2486 if (((RegionL && RegionL->isReplicator()) ||
2487 (RegionR && RegionR->isReplicator())) &&
2488 L->getParent() != R->getParent())
2489 return false;
2490 const VPlan *Plan = L->getParent()->getPlan();
2491 VPTypeAnalysis TypeInfo(*Plan);
2492 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2493 }
2494};
2495} // end anonymous namespace
2496
2497/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2498/// Plan.
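/// E.g. (roughly): two recipes computing `add %x, %y` with the same type are
/// merged; the later one is replaced by the earlier, dominating one, keeping
/// only the IR flags common to both.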
2500 VPDominatorTree VPDT(Plan);
2502
2504 vp_depth_first_deep(Plan.getEntry()))) {
2505 for (VPRecipeBase &R : *VPBB) {
2506 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2507 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2508 continue;
2509 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2510 // V must dominate Def for a valid replacement.
2511 if (!VPDT.dominates(V->getParent(), VPBB))
2512 continue;
2513 // Only keep flags present on both V and Def.
2514 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2515 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2516 Def->replaceAllUsesWith(V);
2517 continue;
2518 }
2519 CSEMap[Def] = Def;
2520 }
2521 }
2522}
2523
2524/// Move loop-invariant recipes out of the vector loop region in \p Plan.
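/// E.g. (roughly): a recipe computing `mul %inv.a, %inv.b` inside the vector
/// loop region, with both operands defined outside the region, is moved to
/// the vector preheader.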
2525static void licm(VPlan &Plan) {
2526 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2527
2528 // Hoist any loop invariant recipes from the vector loop region to the
2529 // preheader. Perform a shallow traversal of the vector loop region, to
2530 // exclude recipes in replicate regions. Since the top-level blocks in the
2531 // vector loop region are guaranteed to execute if the vector pre-header is,
2532 // we don't need to check speculation safety.
2533 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2534 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2535 "Expected vector preheader's successor to be the vector loop region");
2537 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2538 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2540 continue;
2541 if (any_of(R.operands(), [](VPValue *Op) {
2542 return !Op->isDefinedOutsideLoopRegions();
2543 }))
2544 continue;
2545 R.moveBefore(*Preheader, Preheader->end());
2546 }
2547 }
2548}
2549
2551 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2552 if (Plan.hasScalarVFOnly())
2553 return;
2554 // Keep track of created truncates, so they can be re-used. Note that we
2555 // cannot use RAUW after creating a new truncate, as this could make
2556 // other uses have different types for their operands, making them invalidly
2557 // typed.
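  // E.g. (a rough sketch): if MinBWs records that an i32 add only needs 16
  // bits, its operands are truncated to i16, the add is performed in i16, and
  // the result is zero-extended back to i32 for the original users.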
2559 VPTypeAnalysis TypeInfo(Plan);
2560 VPBasicBlock *PH = Plan.getVectorPreheader();
2563 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2566 continue;
2567
2568 VPValue *ResultVPV = R.getVPSingleValue();
2569 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2570 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2571 if (!NewResSizeInBits)
2572 continue;
2573
2574 // If the value wasn't vectorized, we must maintain the original scalar
2575 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2576 // skip casts which do not need to be handled explicitly here, as
2577 // redundant casts will be removed during recipe simplification.
2579 continue;
2580
2581 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2582 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2583 assert(OldResTy->isIntegerTy() && "only integer types supported");
2584 (void)OldResSizeInBits;
2585
2586 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2587
2588 // Any wrapping introduced by shrinking this operation shouldn't be
2589 // considered undefined behavior. So, we can't unconditionally copy
2590 // arithmetic wrapping flags to VPW.
2591 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2592 VPW->dropPoisonGeneratingFlags();
2593
2594 if (OldResSizeInBits != NewResSizeInBits &&
2595 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2596 // Extend result to original width.
2597 auto *Ext =
2598 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
2599 Ext->insertAfter(&R);
2600 ResultVPV->replaceAllUsesWith(Ext);
2601 Ext->setOperand(0, ResultVPV);
2602 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2603 } else {
2604 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2605 "Only ICmps should not need extending the result.");
2606 }
2607
2608 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2610 continue;
2611
2612 // Shrink operands by introducing truncates as needed.
2613 unsigned StartIdx =
2614 match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2615 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2616 auto *Op = R.getOperand(Idx);
2617 unsigned OpSizeInBits =
2619 if (OpSizeInBits == NewResSizeInBits)
2620 continue;
2621 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2622 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2623 if (!IterIsEmpty) {
2624 R.setOperand(Idx, ProcessedIter->second);
2625 continue;
2626 }
2627
2628 VPBuilder Builder;
2629 if (isa<VPIRValue>(Op))
2630 Builder.setInsertPoint(PH);
2631 else
2632 Builder.setInsertPoint(&R);
2633 VPWidenCastRecipe *NewOp =
2634 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2635 ProcessedIter->second = NewOp;
2636 R.setOperand(Idx, NewOp);
2637 }
2638
2639 }
2640 }
2641}
2642
2646 VPValue *Cond;
2647 // Skip blocks that are not terminated by BranchOnCond.
2648 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2649 continue;
2650
2651 assert(VPBB->getNumSuccessors() == 2 &&
2652 "Two successors expected for BranchOnCond");
2653 unsigned RemovedIdx;
2654 if (match(Cond, m_True()))
2655 RemovedIdx = 1;
2656 else if (match(Cond, m_False()))
2657 RemovedIdx = 0;
2658 else
2659 continue;
2660
2661 VPBasicBlock *RemovedSucc =
2662 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2663 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2664 "There must be a single edge between VPBB and its successor");
2665 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2666 // these recipes.
2667 for (VPRecipeBase &R : RemovedSucc->phis())
2668 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2669
2670 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2671 // automatically on VPlan destruction if it becomes unreachable.
2672 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2673 VPBB->back().eraseFromParent();
2674 }
2675}
2676
2696
2697// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2698// the loop terminator with a branch-on-cond recipe with the negated
2699// active-lane-mask as operand. Note that this turns the loop into an
2700// uncountable one. Only the existing terminator is replaced, all other existing
2701// recipes/users remain unchanged, except for poison-generating flags being
2702// dropped from the canonical IV increment. Return the created
2703// VPActiveLaneMaskPHIRecipe.
2704//
2705// The function uses the following definitions:
2706//
2707// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
2708// calculate-trip-count-minus-VF (original TC) : original TC
2709// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
2710// CanonicalIVPhi : CanonicalIVIncrement
2711// %StartV is the canonical induction start value.
2712//
2713// The function adds the following recipes:
2714//
2715// vector.ph:
2716// %TripCount = calculate-trip-count-minus-VF (original TC)
2717// [if DataWithControlFlowWithoutRuntimeCheck]
2718// %EntryInc = canonical-iv-increment-for-part %StartV
2719// %EntryALM = active-lane-mask %EntryInc, %TripCount
2720//
2721// vector.body:
2722// ...
2723// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2724// ...
2725// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2726// %ALM = active-lane-mask %InLoopInc, TripCount
2727// %Negated = Not %ALM
2728// branch-on-cond %Negated
2729//
2732 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2733 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2734 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2735 VPValue *StartV = CanonicalIVPHI->getStartValue();
2736
2737 auto *CanonicalIVIncrement =
2738 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2739 // TODO: Check if dropping the flags is needed if
2740 // !DataAndControlFlowWithoutRuntimeCheck.
2741 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2742 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2743 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2744 // we have to take unrolling into account. Each part needs to start at
2745 // Part * VF
2746 auto *VecPreheader = Plan.getVectorPreheader();
2747 VPBuilder Builder(VecPreheader);
2748
2749 // Create the ActiveLaneMask instruction using the correct start values.
2750 VPValue *TC = Plan.getTripCount();
2751
2752 VPValue *TripCount, *IncrementValue;
2754 // When the loop is guarded by a runtime overflow check for the loop
2755 // induction variable increment by VF, we can increment the value before
2756 // the get.active.lane mask and use the unmodified tripcount.
2757 IncrementValue = CanonicalIVIncrement;
2758 TripCount = TC;
2759 } else {
2760 // When avoiding a runtime check, the active.lane.mask inside the loop
2761 // uses a modified trip count and the induction variable increment is
2762 // done after the active.lane.mask intrinsic is called.
2763 IncrementValue = CanonicalIVPHI;
2764 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2765 {TC}, DL);
2766 }
2767 auto *EntryIncrement = Builder.createOverflowingOp(
2768 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2769 "index.part.next");
2770
2771 // Create the active lane mask instruction in the VPlan preheader.
2772 VPValue *ALMMultiplier =
2773 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2774 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2775 {EntryIncrement, TC, ALMMultiplier}, DL,
2776 "active.lane.mask.entry");
2777
2778 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2779 // preheader ActiveLaneMask instruction.
2780 auto *LaneMaskPhi =
2782 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2783
2784 // Create the active lane mask for the next iteration of the loop before the
2785 // original terminator.
2786 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2787 Builder.setInsertPoint(OriginalTerminator);
2788 auto *InLoopIncrement =
2789 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2790 {IncrementValue}, {false, false}, DL);
2791 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2792 {InLoopIncrement, TripCount, ALMMultiplier},
2793 DL, "active.lane.mask.next");
2794 LaneMaskPhi->addOperand(ALM);
2795
2796 // Replace the original terminator with BranchOnCond. We have to invert the
2797 // mask here because a true condition means jumping to the exit block.
2798 auto *NotMask = Builder.createNot(ALM, DL);
2799 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2800 OriginalTerminator->eraseFromParent();
2801 return LaneMaskPhi;
2802}
2803
2804/// Collect the header mask with the pattern:
2805/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2806/// TODO: Introduce explicit recipe for header-mask instead of searching
2807/// for the header-mask pattern manually.
2809 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2810 SmallVector<VPValue *> WideCanonicalIVs;
2811 auto *FoundWidenCanonicalIVUser = find_if(
2813 assert(count_if(LoopRegion->getCanonicalIV()->users(),
2815 "Must have at most one VPWideCanonicalIVRecipe");
2816 if (FoundWidenCanonicalIVUser !=
2817 LoopRegion->getCanonicalIV()->users().end()) {
2818 auto *WideCanonicalIV =
2819 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2820 WideCanonicalIVs.push_back(WideCanonicalIV);
2821 }
2822
2823 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2824 // version of the canonical induction.
2825 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
2826 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2827 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2828 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2829 WideCanonicalIVs.push_back(WidenOriginalIV);
2830 }
2831
2832 // Walk users of wide canonical IVs and find the single compare of the form
2833 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2834 VPSingleDefRecipe *HeaderMask = nullptr;
2835 for (auto *Wide : WideCanonicalIVs) {
2836 for (VPUser *U : Wide->users()) {
2837 auto *VPI = dyn_cast<VPInstruction>(U);
2838 if (!VPI || !vputils::isHeaderMask(VPI, Plan))
2839 continue;
2840
2841 assert(VPI->getOperand(0) == Wide &&
2842 "WidenCanonicalIV must be the first operand of the compare");
2843 assert(!HeaderMask && "Multiple header masks found?");
2844 HeaderMask = VPI;
2845 }
2846 }
2847 return HeaderMask;
2848}
2849
2851 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2854 UseActiveLaneMaskForControlFlow) &&
2855 "DataAndControlFlowWithoutRuntimeCheck implies "
2856 "UseActiveLaneMaskForControlFlow");
2857
2858 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2859 auto *FoundWidenCanonicalIVUser = find_if(
2861 assert(FoundWidenCanonicalIVUser &&
2862 "Must have widened canonical IV when tail folding!");
2863 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2864 auto *WideCanonicalIV =
2865 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2866 VPSingleDefRecipe *LaneMask;
2867 if (UseActiveLaneMaskForControlFlow) {
2870 } else {
2871 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2872 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2873 ConstantInt::get(LoopRegion->getCanonicalIVType(), 1));
2874 LaneMask =
2875 B.createNaryOp(VPInstruction::ActiveLaneMask,
2876 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2877 nullptr, "active.lane.mask");
2878 }
2879
2880 // Walk users of WideCanonicalIV and replace the header mask of the form
2881 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2882 // removing the old one to ensure there is always only a single header mask.
2883 HeaderMask->replaceAllUsesWith(LaneMask);
2884 HeaderMask->eraseFromParent();
2885}
2886
2887template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2888 Op0_t In;
2890
2891 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2892
2893 template <typename OpTy> bool match(OpTy *V) const {
2894 if (m_Specific(In).match(V)) {
2895 Out = nullptr;
2896 return true;
2897 }
2898 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
2899 }
2900};
2901
2902/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
2903/// On a match, \p Out is the remaining part, or nullptr if the mask is exactly
2903/// \p In.
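/// E.g. (roughly): with In = %header.mask, matching (logical-and %header.mask,
/// %m) yields Out = %m, while matching %header.mask itself yields Out = nullptr.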
2904template <typename Op0_t, typename Op1_t>
2905static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2906 Op1_t &Out) {
2907 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2908}
2909
2910/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2911/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2912/// recipe could be created.
2913/// \p HeaderMask Header Mask.
2914/// \p CurRecipe Recipe to be transformed.
2915/// \p TypeInfo VPlan-based type analysis.
2916/// \p EVL The explicit vector length parameter of vector-predication
2917/// intrinsics.
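/// For illustration, a rough sketch of one case: a widened load masked only by
/// the header mask,
///   WIDEN %l = load %addr, mask: %header.mask
/// becomes (roughly) an EVL load with no mask,
///   WIDEN %l = vp.load %addr, %evl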
2919 VPRecipeBase &CurRecipe,
2920 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
2921 VPlan *Plan = CurRecipe.getParent()->getPlan();
2922 DebugLoc DL = CurRecipe.getDebugLoc();
2923 VPValue *Addr, *Mask, *EndPtr;
2924
2925 /// Adjust any end pointers so that they point to the end of EVL lanes, not VF.
2926 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2927 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2928 EVLEndPtr->insertBefore(&CurRecipe);
2929 EVLEndPtr->setOperand(1, &EVL);
2930 return EVLEndPtr;
2931 };
2932
2933 if (match(&CurRecipe,
2934 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
2935 !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2936 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
2937 EVL, Mask);
2938
2939 VPValue *ReversedVal;
2940 if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
2941 match(ReversedVal,
2942 m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
2943 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2944 cast<VPWidenLoadRecipe>(ReversedVal)->isReverse()) {
2945 auto *LoadR = new VPWidenLoadEVLRecipe(
2946 *cast<VPWidenLoadRecipe>(ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask);
2947 LoadR->insertBefore(&CurRecipe);
2948 return new VPWidenIntrinsicRecipe(
2949 Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
2950 TypeInfo.inferScalarType(LoadR), {}, {}, DL);
2951 }
2952
2953 VPValue *StoredVal;
2954 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
2955 m_RemoveMask(HeaderMask, Mask))) &&
2956 !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2957 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
2958 StoredVal, EVL, Mask);
2959
2960 if (match(&CurRecipe,
2961 m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
2962 m_RemoveMask(HeaderMask, Mask))) &&
2963 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2964 cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) {
2965 auto *NewReverse = new VPWidenIntrinsicRecipe(
2966 Intrinsic::experimental_vp_reverse,
2967 {ReversedVal, Plan->getTrue(), &EVL},
2968 TypeInfo.inferScalarType(ReversedVal), {}, {}, DL);
2969 NewReverse->insertBefore(&CurRecipe);
2970 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
2971 AdjustEndPtr(EndPtr), NewReverse, EVL,
2972 Mask);
2973 }
2974
2975 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
2976 if (Rdx->isConditional() &&
2977 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
2978 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
2979
2980 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
2981 if (Interleave->getMask() &&
2982 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
2983 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
2984
2985 VPValue *LHS, *RHS;
2986 if (match(&CurRecipe,
2987 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
2988 return new VPWidenIntrinsicRecipe(
2989 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
2990 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2991
2992 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
2993 m_VPValue(RHS))))
2994 return new VPWidenIntrinsicRecipe(
2995 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
2996 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2997
2998 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
2999 Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
3000 VPValue *ZExt =
3001 VPBuilder(&CurRecipe).createScalarCast(Instruction::ZExt, &EVL, Ty, DL);
3002 return new VPInstruction(Instruction::Sub,
3003 {ZExt, Plan->getConstantInt(Ty, 1)}, {}, {}, DL);
3004 }
3005
3006 return nullptr;
3007}
3008
3009/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3010/// The transforms here need to preserve the original semantics.
3012 // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3013 VPValue *HeaderMask = nullptr, *EVL = nullptr;
3016 m_VPValue(EVL))) &&
3017 match(EVL, m_EVL(m_VPValue()))) {
3018 HeaderMask = R.getVPSingleValue();
3019 break;
3020 }
3021 }
3022 if (!HeaderMask)
3023 return;
3024
3025 VPTypeAnalysis TypeInfo(Plan);
3026 SmallVector<VPRecipeBase *> OldRecipes;
3027 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3029 if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, TypeInfo, *EVL)) {
3030 NewR->insertBefore(R);
3031 for (auto [Old, New] :
3032 zip_equal(R->definedValues(), NewR->definedValues()))
3033 Old->replaceAllUsesWith(New);
3034 OldRecipes.push_back(R);
3035 }
3036 }
3037 // Erase old recipes at the end so we don't invalidate TypeInfo.
3038 for (VPRecipeBase *R : reverse(OldRecipes)) {
3039 SmallVector<VPValue *> PossiblyDead(R->operands());
3040 R->eraseFromParent();
3041 for (VPValue *Op : PossiblyDead)
3043 }
3044}
3045
3046/// After replacing the canonical IV with an EVL-based IV, fix up recipes that use
3047/// VF to use the EVL instead to avoid incorrect updates on the penultimate
3048/// iteration.
3049static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3050 VPTypeAnalysis TypeInfo(Plan);
3051 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3052 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3053
3054 assert(all_of(Plan.getVF().users(),
3057 "User of VF that we can't transform to EVL.");
3058 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3060 });
3061
3062 assert(all_of(Plan.getVFxUF().users(),
3063 [&LoopRegion, &Plan](VPUser *U) {
3064 return match(U,
3065 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3066 m_Specific(&Plan.getVFxUF()))) ||
3067 isa<VPWidenPointerInductionRecipe>(U);
3068 }) &&
3069 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3070 "increment of the canonical induction.");
3071 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3072 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
3073 // canonical induction must not be updated.
3075 });
3076
3077 // Create a scalar phi to track the previous EVL if the plan contains a
3078 // fixed-order recurrence.
3079 bool ContainsFORs =
3081 if (ContainsFORs) {
3082 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3083 VPValue *MaxEVL = &Plan.getVF();
3084 // Emit VPScalarCastRecipe in the preheader if VF is not a 32-bit integer.
3085 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3086 MaxEVL = Builder.createScalarZExtOrTrunc(
3087 MaxEVL, Type::getInt32Ty(Plan.getContext()),
3088 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
3089
3090 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3091 VPValue *PrevEVL = Builder.createScalarPhi(
3092 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3093
3096 for (VPRecipeBase &R : *VPBB) {
3097 VPValue *V1, *V2;
3098 if (!match(&R,
3100 m_VPValue(V1), m_VPValue(V2))))
3101 continue;
3102 VPValue *Imm = Plan.getOrAddLiveIn(
3105 Intrinsic::experimental_vp_splice,
3106 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3107 TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
3108 R.getDebugLoc());
3109 VPSplice->insertBefore(&R);
3110 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3111 }
3112 }
3113 }
3114
3115 VPValue *HeaderMask = findHeaderMask(Plan);
3116 if (!HeaderMask)
3117 return;
3118
3119 // Replace header masks with a mask equivalent to predicating by EVL:
3120 //
3121 // icmp ule widen-canonical-iv backedge-taken-count
3122 // ->
3123 // icmp ult step-vector, EVL
3124 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3125 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3126 Type *EVLType = TypeInfo.inferScalarType(&EVL);
3127 VPValue *EVLMask = Builder.createICmp(
3129 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3130 HeaderMask->replaceAllUsesWith(EVLMask);
3131}
3132
3133/// Converts a tail folded vector loop region to step by
3134/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3135/// iteration.
3136///
3137/// - Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
3138/// replaces all uses except the canonical IV increment of
3139/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe.
3140/// VPCanonicalIVPHIRecipe is used only for loop iterations counting after
3141/// this transformation.
3142///
3143/// - The header mask is replaced with a header mask based on the EVL.
3144///
3145/// - Plans with FORs have a new phi added to keep track of the EVL of the
3146/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3147/// @llvm.vp.splice.
3148///
3149/// The function uses the following definitions:
3150/// %StartV is the canonical induction start value.
3151///
3152/// The function adds the following recipes:
3153///
3154/// vector.ph:
3155/// ...
3156///
3157/// vector.body:
3158/// ...
3159/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
3160/// [ %NextEVLIV, %vector.body ]
3161/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3162/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3163/// ...
3164/// %OpEVL = cast i32 %VPEVL to IVSize
3165/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
3166/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3167/// ...
3168///
3169/// If MaxSafeElements is provided, the function adds the following recipes:
3170/// vector.ph:
3171/// ...
3172///
3173/// vector.body:
3174/// ...
3175/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
3176/// [ %NextEVLIV, %vector.body ]
3177/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3178/// %cmp = cmp ult %AVL, MaxSafeElements
3179/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3180/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3181/// ...
3182/// %OpEVL = cast i32 %VPEVL to IVSize
3183/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
3184/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3185/// ...
3186///
3188 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3189 if (Plan.hasScalarVFOnly())
3190 return;
3191 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3192 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3193
3194 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
3195 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3196 VPValue *StartV = CanonicalIVPHI->getStartValue();
3197
3198 // Create the ExplicitVectorLengthPhi recipe in the main loop.
3199 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
3200 EVLPhi->insertAfter(CanonicalIVPHI);
3201 VPBuilder Builder(Header, Header->getFirstNonPhi());
3202 // Create the AVL (application vector length), starting from TC -> 0 in steps
3203 // of EVL.
3204 VPPhi *AVLPhi = Builder.createScalarPhi(
3205 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3206 VPValue *AVL = AVLPhi;
3207
3208 if (MaxSafeElements) {
3209    // Cap the AVL at MaxSafeElements to honor the max safe dependence distance.
3210 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3211 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3212 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3213 "safe_avl");
3214 }
3215 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3216 DebugLoc::getUnknown(), "evl");
3217
3218 auto *CanonicalIVIncrement =
3219 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
3220 Builder.setInsertPoint(CanonicalIVIncrement);
3221 VPValue *OpVPEVL = VPEVL;
3222
3223 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3224 OpVPEVL = Builder.createScalarZExtOrTrunc(
3225 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3226
3227 auto *NextEVLIV = Builder.createOverflowingOp(
3228 Instruction::Add, {OpVPEVL, EVLPhi},
3229 {CanonicalIVIncrement->hasNoUnsignedWrap(),
3230 CanonicalIVIncrement->hasNoSignedWrap()},
3231 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
3232 EVLPhi->addOperand(NextEVLIV);
3233
3234 VPValue *NextAVL = Builder.createOverflowingOp(
3235 Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},
3236 DebugLoc::getCompilerGenerated(), "avl.next");
3237 AVLPhi->addOperand(NextAVL);
3238
3239 fixupVFUsersForEVL(Plan, *VPEVL);
3240 removeDeadRecipes(Plan);
3241
3242 // Replace all uses of VPCanonicalIVPHIRecipe by
3243 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
3244 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
3245 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
3246 // TODO: support unroll factor > 1.
3247 Plan.setUF(1);
3248}
3249
3251 // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
3252 // There should be only one EVL PHI in the entire plan.
3253 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
3254
3257 for (VPRecipeBase &R : VPBB->phis())
3258 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
3259 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
3260 EVLPhi = PhiR;
3261 }
3262
3263 // Early return if no EVL PHI is found.
3264 if (!EVLPhi)
3265 return;
3266
3267 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
3268 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
3269 VPValue *AVL;
3270 [[maybe_unused]] bool FoundAVL =
3271 match(EVLIncrement,
3272 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
3273 assert(FoundAVL && "Didn't find AVL?");
3274
3275 // The AVL may be capped to a safe distance.
3276 VPValue *SafeAVL;
3277 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
3278 AVL = SafeAVL;
3279
3280 VPValue *AVLNext;
3281 [[maybe_unused]] bool FoundAVLNext =
3283 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3284 assert(FoundAVLNext && "Didn't find AVL backedge?");
3285
3286 // Convert EVLPhi to concrete recipe.
3287 auto *ScalarR =
3288 VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
3289 EVLPhi->getDebugLoc(), "evl.based.iv");
3290 EVLPhi->replaceAllUsesWith(ScalarR);
3291 EVLPhi->eraseFromParent();
3292
3293 // Replace CanonicalIVInc with EVL-PHI increment.
3294 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3295 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
3296 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
3297 m_Specific(&Plan.getVFxUF()))) &&
3298 "Unexpected canonical iv");
3299 Backedge->replaceAllUsesWith(EVLIncrement);
3300
3301 // Remove unused phi and increment.
3302 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
3303 CanonicalIVIncrement->eraseFromParent();
3304 CanonicalIV->eraseFromParent();
3305
3306 // Replace the use of VectorTripCount in the latch-exiting block.
3307 // Before: (branch-on-cond (icmp eq EVLIVInc, VectorTripCount))
3308  // After: (branch-on-cond (icmp eq AVLNext, 0))
3309 VPBasicBlock *LatchExiting =
3310 HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
3311 auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
3312 if (match(LatchExitingBr, m_BranchOnCond(m_True())))
3313 return;
3314
3315 assert(match(LatchExitingBr, m_BranchOnCond(m_SpecificCmp(
3316 CmpInst::ICMP_EQ, m_VPValue(EVLIncrement),
3317 m_Specific(&Plan.getVectorTripCount())))) &&
3318 "Expected BranchOnCond with ICmp comparing EVL increment with vector "
3319 "trip count");
3320
3321 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3322 VPBuilder Builder(LatchExitingBr);
3323 LatchExitingBr->setOperand(0,
3324 Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
3325 Plan.getConstantInt(AVLTy, 0)));
3326}
3327
3329 VPlan &Plan, PredicatedScalarEvolution &PSE,
3330 const DenseMap<Value *, const SCEV *> &StridesMap) {
3331  // Replace VPValues for known constant strides guaranteed by predicated
3332  // scalar evolution.
3333 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3334 auto *R = cast<VPRecipeBase>(&U);
3335 return R->getRegion() ||
3336 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3337 };
3338 ValueToSCEVMapTy RewriteMap;
3339 for (const SCEV *Stride : StridesMap.values()) {
3340 using namespace SCEVPatternMatch;
3341 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3342 const APInt *StrideConst;
3343 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3344 // Only handle constant strides for now.
3345 continue;
3346
3347 auto *CI = Plan.getConstantInt(*StrideConst);
3348 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3349 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3350
3351 // The versioned value may not be used in the loop directly but through a
3352 // sext/zext. Add new live-ins in those cases.
3353 for (Value *U : StrideV->users()) {
3355 continue;
3356 VPValue *StrideVPV = Plan.getLiveIn(U);
3357 if (!StrideVPV)
3358 continue;
3359 unsigned BW = U->getType()->getScalarSizeInBits();
3360 APInt C =
3361 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3362 VPValue *CI = Plan.getConstantInt(C);
3363 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3364 }
3365 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3366 }
3367
3368 for (VPRecipeBase &R : *Plan.getEntry()) {
3369 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3370 if (!ExpSCEV)
3371 continue;
3372 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3373 auto *NewSCEV =
3374 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3375 if (NewSCEV != ScevExpr) {
3376 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3377 ExpSCEV->replaceAllUsesWith(NewExp);
3378 if (Plan.getTripCount() == ExpSCEV)
3379 Plan.resetTripCount(NewExp);
3380 }
3381 }
3382}
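// Illustrative sketch of the rewrite above (hypothetical names): if the SCEV
// predicates guarantee ir<%stride> == 1, the live-in vp<%stride> is replaced
// by the constant 1 inside the vector loop region and its preheader, a
// live-in for a user such as "%stride.ext = zext i32 %stride to i64" is
// replaced by an i64 constant 1, and VPExpandSCEVRecipes in the entry block
// are re-expanded with the stride parameter rewritten to its now-constant
// SCEV.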
3383
3385 VPlan &Plan,
3386 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3387 // Collect recipes in the backward slice of `Root` that may generate a poison
3388 // value that is used after vectorization.
3390 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3392 Worklist.push_back(Root);
3393
3394 // Traverse the backward slice of Root through its use-def chain.
3395 while (!Worklist.empty()) {
3396 VPRecipeBase *CurRec = Worklist.pop_back_val();
3397
3398 if (!Visited.insert(CurRec).second)
3399 continue;
3400
3401 // Prune search if we find another recipe generating a widen memory
3402 // instruction. Widen memory instructions involved in address computation
3403 // will lead to gather/scatter instructions, which don't need to be
3404 // handled.
3406 VPHeaderPHIRecipe>(CurRec))
3407 continue;
3408
3409 // This recipe contributes to the address computation of a widen
3410 // load/store. If the underlying instruction has poison-generating flags,
3411 // drop them directly.
3412 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3413 VPValue *A, *B;
3414 // Dropping disjoint from an OR may yield incorrect results, as some
3415 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3416 // for dependence analysis). Instead, replace it with an equivalent Add.
3417 // This is possible as all users of the disjoint OR only access lanes
3418 // where the operands are disjoint or poison otherwise.
3419 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3420 RecWithFlags->isDisjoint()) {
3421 VPBuilder Builder(RecWithFlags);
3422 VPInstruction *New = Builder.createOverflowingOp(
3423 Instruction::Add, {A, B}, {false, false},
3424 RecWithFlags->getDebugLoc());
3425 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3426 RecWithFlags->replaceAllUsesWith(New);
3427 RecWithFlags->eraseFromParent();
3428 CurRec = New;
3429 } else
3430 RecWithFlags->dropPoisonGeneratingFlags();
3431 } else {
3434 (void)Instr;
3435 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3436 "found instruction with poison generating flags not covered by "
3437 "VPRecipeWithIRFlags");
3438 }
3439
3440 // Add new definitions to the worklist.
3441 for (VPValue *Operand : CurRec->operands())
3442 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3443 Worklist.push_back(OpDef);
3444 }
3445 });
3446
3447 // Traverse all the recipes in the VPlan and collect the poison-generating
3448  // recipes in the backward slice starting at the address of a
3449  // VPWidenMemoryRecipe or VPInterleaveRecipe.
3450 auto Iter = vp_depth_first_deep(Plan.getEntry());
3452 for (VPRecipeBase &Recipe : *VPBB) {
3453 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3454 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3455 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3456 if (AddrDef && WidenRec->isConsecutive() &&
3457 BlockNeedsPredication(UnderlyingInstr.getParent()))
3458 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3459 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3460 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3461 if (AddrDef) {
3462 // Check if any member of the interleave group needs predication.
3463 const InterleaveGroup<Instruction> *InterGroup =
3464 InterleaveRec->getInterleaveGroup();
3465 bool NeedPredication = false;
3466 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3467 I < NumMembers; ++I) {
3468 Instruction *Member = InterGroup->getMember(I);
3469 if (Member)
3470 NeedPredication |= BlockNeedsPredication(Member->getParent());
3471 }
3472
3473 if (NeedPredication)
3474 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3475 }
3476 }
3477 }
3478 }
3479}
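// Illustrative sketch of the disjoint-or handling above (hypothetical
// values): an address computation "%idx = or disjoint i64 %base, 1" feeding a
// consecutive widen load in a predicated block is rewritten to
// "%idx = add i64 %base, 1" without wrap flags, instead of merely dropping
// the disjoint flag, so SCEV-based reasoning that already treated the or as
// an add stays valid.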
3480
3482 VPlan &Plan,
3484 &InterleaveGroups,
3485 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3486 if (InterleaveGroups.empty())
3487 return;
3488
3489 // Interleave memory: for each Interleave Group we marked earlier as relevant
3490 // for this VPlan, replace the Recipes widening its memory instructions with a
3491 // single VPInterleaveRecipe at its insertion point.
3492 VPDominatorTree VPDT(Plan);
3493 for (const auto *IG : InterleaveGroups) {
3494 auto *Start =
3495 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3496 VPIRMetadata InterleaveMD(*Start);
3497 SmallVector<VPValue *, 4> StoredValues;
3498 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3499 StoredValues.push_back(StoreR->getStoredValue());
3500 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3501 Instruction *MemberI = IG->getMember(I);
3502 if (!MemberI)
3503 continue;
3504 VPWidenMemoryRecipe *MemoryR =
3505 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3506 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3507 StoredValues.push_back(StoreR->getStoredValue());
3508 InterleaveMD.intersect(*MemoryR);
3509 }
3510
3511 bool NeedsMaskForGaps =
3512 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3513 (!StoredValues.empty() && !IG->isFull());
3514
3515 Instruction *IRInsertPos = IG->getInsertPos();
3516 auto *InsertPos =
3517 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3518
3520 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3521 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3522 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3523
3524 // Get or create the start address for the interleave group.
3525 VPValue *Addr = Start->getAddr();
3526 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3527 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3528 // We cannot re-use the address of member zero because it does not
3529 // dominate the insert position. Instead, use the address of the insert
3530 // position and create a PtrAdd adjusting it to the address of member
3531 // zero.
3532 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3533 // InsertPos or sink loads above zero members to join it.
3534 assert(IG->getIndex(IRInsertPos) != 0 &&
3535 "index of insert position shouldn't be zero");
3536 auto &DL = IRInsertPos->getDataLayout();
3537 APInt Offset(32,
3538 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3539 IG->getIndex(IRInsertPos),
3540 /*IsSigned=*/true);
3541 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3542 VPBuilder B(InsertPos);
3543 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3544 }
3545 // If the group is reverse, adjust the index to refer to the last vector
3546 // lane instead of the first. We adjust the index from the first vector
3547 // lane, rather than directly getting the pointer for lane VF - 1, because
3548 // the pointer operand of the interleaved access is supposed to be uniform.
3549 if (IG->isReverse()) {
3550 auto *ReversePtr = new VPVectorEndPointerRecipe(
3551 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3552 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3553 ReversePtr->insertBefore(InsertPos);
3554 Addr = ReversePtr;
3555 }
3556 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3557 InsertPos->getMask(), NeedsMaskForGaps,
3558 InterleaveMD, InsertPos->getDebugLoc());
3559 VPIG->insertBefore(InsertPos);
3560
3561 unsigned J = 0;
3562 for (unsigned i = 0; i < IG->getFactor(); ++i)
3563 if (Instruction *Member = IG->getMember(i)) {
3564 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3565 if (!Member->getType()->isVoidTy()) {
3566 VPValue *OriginalV = MemberR->getVPSingleValue();
3567 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3568 J++;
3569 }
3570 MemberR->eraseFromParent();
3571 }
3572 }
3573}
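// Illustrative sketch of the address adjustment above: for a group of i32
// members whose insert position is the member at index 2, the PtrAdd offsets
// the insert position's address by -(4 * 2) = -8 bytes to reach member 0; for
// reverse groups a VPVectorEndPointerRecipe then re-points the address at the
// last vector lane.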
3574
3575/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3576/// value, phi and backedge value. In the following example:
3577///
3578/// vector.ph:
3579/// Successor(s): vector loop
3580///
3581/// <x1> vector loop: {
3582/// vector.body:
3583/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3584/// ...
3585/// EMIT branch-on-count ...
3586/// No successors
3587/// }
3588///
3589/// WIDEN-INDUCTION will get expanded to:
3590///
3591/// vector.ph:
3592/// ...
3593/// vp<%induction.start> = ...
3594/// vp<%induction.increment> = ...
3595///
3596/// Successor(s): vector loop
3597///
3598/// <x1> vector loop: {
3599/// vector.body:
3600/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3601/// ...
3602/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3603/// EMIT branch-on-count ...
3604/// No successors
3605/// }
3606static void
3608 VPTypeAnalysis &TypeInfo) {
3609 VPlan *Plan = WidenIVR->getParent()->getPlan();
3610 VPValue *Start = WidenIVR->getStartValue();
3611 VPValue *Step = WidenIVR->getStepValue();
3612 VPValue *VF = WidenIVR->getVFValue();
3613 DebugLoc DL = WidenIVR->getDebugLoc();
3614
3615 // The value from the original loop to which we are mapping the new induction
3616 // variable.
3617 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3618
3619 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3622 VPIRFlags Flags = *WidenIVR;
3623 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3624 AddOp = Instruction::Add;
3625 MulOp = Instruction::Mul;
3626 } else {
3627 AddOp = ID.getInductionOpcode();
3628 MulOp = Instruction::FMul;
3629 }
3630
3631 // If the phi is truncated, truncate the start and step values.
3632 VPBuilder Builder(Plan->getVectorPreheader());
3633 Type *StepTy = TypeInfo.inferScalarType(Step);
3634 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3635 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3636 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3637 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3638 // Truncation doesn't preserve WrapFlags.
3639 Flags.dropPoisonGeneratingFlags();
3640 StepTy = Ty;
3641 }
3642
3643 // Construct the initial value of the vector IV in the vector loop preheader.
3644 Type *IVIntTy =
3646 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3647 if (StepTy->isFloatingPointTy())
3648 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3649
3650 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3651 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3652
3653 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3654 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3655 DebugLoc::getUnknown(), "induction");
3656
3657 // Create the widened phi of the vector IV.
3658 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3659 WidenIVR->getDebugLoc(), "vec.ind");
3660 WidePHI->insertBefore(WidenIVR);
3661
3662 // Create the backedge value for the vector IV.
3663 VPValue *Inc;
3664 VPValue *Prev;
3665 // If unrolled, use the increment and prev value from the operands.
3666 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3667 Inc = SplatVF;
3668 Prev = WidenIVR->getLastUnrolledPartOperand();
3669 } else {
3670 if (VPRecipeBase *R = VF->getDefiningRecipe())
3671 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3672 // Multiply the vectorization factor by the step using integer or
3673 // floating-point arithmetic as appropriate.
3674 if (StepTy->isFloatingPointTy())
3675 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3676 DL);
3677 else
3678 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3679 TypeInfo.inferScalarType(VF), DL);
3680
3681 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3682 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3683 Prev = WidePHI;
3684 }
3685
3687 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3688 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3689 WidenIVR->getDebugLoc(), "vec.ind.next");
3690
3691 WidePHI->addOperand(Next);
3692
3693 WidenIVR->replaceAllUsesWith(WidePHI);
3694}
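// Illustrative sketch of the expansion above for an integer IV with start 0,
// step 3 and a fixed VF of 4: the preheader computes
// induction = <0,1,2,3> * <3,3,3,3> + <0,0,0,0> = <0,3,6,9>, and each
// iteration adds broadcast(3 * 4) = <12,12,12,12> to the widened phi.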
3695
3696/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3697/// initial value, phi and backedge value. In the following example:
3698///
3699/// <x1> vector loop: {
3700/// vector.body:
3701/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3702/// ...
3703/// EMIT branch-on-count ...
3704/// }
3705///
3706/// WIDEN-POINTER-INDUCTION will get expanded to:
3707///
3708/// <x1> vector loop: {
3709/// vector.body:
3710/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3711/// EMIT %mul = mul %stepvector, %step
3712/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3713/// ...
3714/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3715/// EMIT branch-on-count ...
3716/// }
3718 VPTypeAnalysis &TypeInfo) {
3719 VPlan *Plan = R->getParent()->getPlan();
3720 VPValue *Start = R->getStartValue();
3721 VPValue *Step = R->getStepValue();
3722 VPValue *VF = R->getVFValue();
3723
3724 assert(R->getInductionDescriptor().getKind() ==
3726 "Not a pointer induction according to InductionDescriptor!");
3727 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3728 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3729 "Recipe should have been replaced");
3730
3731 VPBuilder Builder(R);
3732 DebugLoc DL = R->getDebugLoc();
3733
3734 // Build a scalar pointer phi.
3735 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3736
3737 // Create actual address geps that use the pointer phi as base and a
3738 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3739 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3740 Type *StepTy = TypeInfo.inferScalarType(Step);
3741 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3742 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3743 VPValue *PtrAdd = Builder.createNaryOp(
3744 VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
3745 R->replaceAllUsesWith(PtrAdd);
3746
3747 // Create the backedge value for the scalar pointer phi.
3749 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3750 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3751 DL);
3752 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3753
3754 VPValue *InductionGEP =
3755 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3756 ScalarPtrPhi->addOperand(InductionGEP);
3757}
3758
3760  // Replace loop regions with explicit CFG.
3761 SmallVector<VPRegionBlock *> LoopRegions;
3763 vp_depth_first_deep(Plan.getEntry()))) {
3764 if (!R->isReplicator())
3765 LoopRegions.push_back(R);
3766 }
3767 for (VPRegionBlock *R : LoopRegions)
3768 R->dissolveToCFGLoop();
3769}
3770
3773 // The transform runs after dissolving loop regions, so all VPBasicBlocks
3774 // terminated with BranchOnTwoConds are reached via a shallow traversal.
3777 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
3778 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
3779 }
3780
3781 // Expand BranchOnTwoConds instructions into explicit CFG with two new
3782 // single-condition branches:
3783 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
3784 // the first condition is true, and otherwise jumps to a new interim block.
3785 // 2. A branch that ends the interim block, jumps to the second successor if
3786 // the second condition is true, and otherwise jumps to the third
3787 // successor.
3788 for (VPInstruction *Br : WorkList) {
3789 assert(Br->getNumOperands() == 2 &&
3790 "BranchOnTwoConds must have exactly 2 conditions");
3791 DebugLoc DL = Br->getDebugLoc();
3792 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
3793 const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
3794 assert(Successors.size() == 3 &&
3795 "BranchOnTwoConds must have exactly 3 successors");
3796
3797 for (VPBlockBase *Succ : Successors)
3798 VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
3799
3800 VPValue *Cond0 = Br->getOperand(0);
3801 VPValue *Cond1 = Br->getOperand(1);
3802 VPBlockBase *Succ0 = Successors[0];
3803 VPBlockBase *Succ1 = Successors[1];
3804 VPBlockBase *Succ2 = Successors[2];
3805 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
3806 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
3807
3808 VPBasicBlock *InterimBB =
3809 Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
3810
3811 VPBuilder(BrOnTwoCondsBB)
3813 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
3814 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
3815
3817 VPBlockUtils::connectBlocks(InterimBB, Succ1);
3818 VPBlockUtils::connectBlocks(InterimBB, Succ2);
3819 Br->eraseFromParent();
3820 }
3821}
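// Illustrative sketch of the expansion above (hypothetical block names):
//   bb: ..., branch-on-two-conds %c0, %c1 -> [succ0, succ1, succ2]
// becomes
//   bb:         ..., branch-on-cond %c0 -> [succ0, bb.interim]
//   bb.interim: branch-on-cond %c1      -> [succ1, succ2]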
3822
3824 VPTypeAnalysis TypeInfo(Plan);
3827 vp_depth_first_deep(Plan.getEntry()))) {
3828 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3829 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3830 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3831 ToRemove.push_back(WidenIVR);
3832 continue;
3833 }
3834
3835 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3836 // If the recipe only generates scalars, scalarize it instead of
3837 // expanding it.
3838 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
3839 VPBuilder Builder(WidenIVR);
3840 VPValue *PtrAdd =
3841 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
3842 WidenIVR->replaceAllUsesWith(PtrAdd);
3843 ToRemove.push_back(WidenIVR);
3844 continue;
3845 }
3846 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3847 ToRemove.push_back(WidenIVR);
3848 continue;
3849 }
3850
3851 // Expand VPBlendRecipe into VPInstruction::Select.
3852 VPBuilder Builder(&R);
3853 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3854 VPValue *Select = Blend->getIncomingValue(0);
3855 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3856 Select = Builder.createSelect(Blend->getMask(I),
3857 Blend->getIncomingValue(I), Select,
3858 R.getDebugLoc(), "predphi");
3859 Blend->replaceAllUsesWith(Select);
3860 ToRemove.push_back(Blend);
3861 }
3862
3863 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3864 Expr->decompose();
3865 ToRemove.push_back(Expr);
3866 }
3867
3868 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
3869 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
3870 if (LastActiveL &&
3871 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
3872 // Create Not(Mask) for all operands.
3874 for (VPValue *Op : LastActiveL->operands()) {
3875 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
3876 NotMasks.push_back(NotMask);
3877 }
3878
3879 // Create FirstActiveLane on the inverted masks.
3880 VPValue *FirstInactiveLane = Builder.createNaryOp(
3882 LastActiveL->getDebugLoc(), "first.inactive.lane");
3883
3884 // Subtract 1 to get the last active lane.
3885 VPValue *One = Plan.getOrAddLiveIn(
3886 ConstantInt::get(Type::getInt64Ty(Plan.getContext()), 1));
3887 VPValue *LastLane = Builder.createNaryOp(
3888 Instruction::Sub, {FirstInactiveLane, One},
3889 LastActiveL->getDebugLoc(), "last.active.lane");
3890
3891 LastActiveL->replaceAllUsesWith(LastLane);
3892 ToRemove.push_back(LastActiveL);
3893 continue;
3894 }
3895
3896 // Lower BranchOnCount to ICmp + BranchOnCond.
3897 VPValue *IV, *TC;
3898 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
3899 auto *BranchOnCountInst = cast<VPInstruction>(&R);
3900 DebugLoc DL = BranchOnCountInst->getDebugLoc();
3901 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
3902 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
3903 ToRemove.push_back(BranchOnCountInst);
3904 continue;
3905 }
3906
3907 VPValue *VectorStep;
3908 VPValue *ScalarStep;
3910 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
3911 continue;
3912
3913 // Expand WideIVStep.
3914 auto *VPI = cast<VPInstruction>(&R);
3915 Type *IVTy = TypeInfo.inferScalarType(VPI);
3916 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
3918 ? Instruction::UIToFP
3919 : Instruction::Trunc;
3920 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
3921 }
3922
3923 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
3924 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
3925 ScalarStep =
3926 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
3927 }
3928
3929 VPIRFlags Flags;
3930 if (IVTy->isFloatingPointTy())
3931 Flags = {VPI->getFastMathFlags()};
3932
3933 unsigned MulOpc =
3934 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
3935 VPInstruction *Mul = Builder.createNaryOp(
3936 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
3937 VectorStep = Mul;
3938 VPI->replaceAllUsesWith(VectorStep);
3939 ToRemove.push_back(VPI);
3940 }
3941 }
3942
3943 for (VPRecipeBase *R : ToRemove)
3944 R->eraseFromParent();
3945}
3946
3948 VPBasicBlock *EarlyExitVPBB,
3949 VPlan &Plan,
3950 VPBasicBlock *HeaderVPBB,
3951 VPBasicBlock *LatchVPBB) {
3952 auto *MiddleVPBB = cast<VPBasicBlock>(LatchVPBB->getSuccessors()[0]);
3953 if (!EarlyExitVPBB->getSinglePredecessor() &&
3954 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
3955 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
3956 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
3957 "unsupported early exit VPBB");
3958    // The early exit operand should always be the last phi operand. If EarlyExitVPBB
3959 // has two predecessors and EarlyExitingVPBB is the first, swap the operands
3960 // of the phis.
3961 for (VPRecipeBase &R : EarlyExitVPBB->phis())
3962 cast<VPIRPhi>(&R)->swapOperands();
3963 }
3964
3965 VPBuilder Builder(LatchVPBB->getTerminator());
3966 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
3967 assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
3968         "Terminator must be BranchOnCond");
3969 VPValue *CondOfEarlyExitingVPBB =
3970 EarlyExitingVPBB->getTerminator()->getOperand(0);
3971 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
3972 ? CondOfEarlyExitingVPBB
3973 : Builder.createNot(CondOfEarlyExitingVPBB);
3974
3975 // Create a BranchOnTwoConds in the latch that branches to:
3976 // [0] vector.early.exit, [1] middle block, [2] header (continue looping).
3977 VPValue *IsEarlyExitTaken =
3978 Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
3979 VPBasicBlock *VectorEarlyExitVPBB =
3980 Plan.createVPBasicBlock("vector.early.exit");
3981 VectorEarlyExitVPBB->setParent(EarlyExitVPBB->getParent());
3982
3983 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
3984
3985 // Update the exit phis in the early exit block.
3986 VPBuilder MiddleBuilder(MiddleVPBB);
3987 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
3988 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
3989 auto *ExitIRI = cast<VPIRPhi>(&R);
3990 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
3991 // a single predecessor and 1 if it has two.
3992 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
3993 if (ExitIRI->getNumOperands() != 1) {
3994 // The first of two operands corresponds to the latch exit, via MiddleVPBB
3995 // predecessor. Extract its final lane.
3996 ExitIRI->extractLastLaneOfLastPartOfFirstOperand(MiddleBuilder);
3997 }
3998
3999 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
4000 if (!isa<VPIRValue>(IncomingFromEarlyExit)) {
4001 // Update the incoming value from the early exit.
4002 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
4003 VPInstruction::FirstActiveLane, {CondToEarlyExit},
4004 DebugLoc::getUnknown(), "first.active.lane");
4005 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
4006 VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
4007 DebugLoc::getUnknown(), "early.exit.value");
4008 ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
4009 }
4010 }
4011
4012 // Replace the conditional branch controlling the latch exit from the vector
4013 // loop with a multi-conditional branch exiting to vector early exit if the
4014 // early exit has been taken, exiting to middle block if the original
4015 // condition of the vector latch is true, otherwise continuing back to header.
4016 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4017 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
4018 "Unexpected terminator");
4019 auto *IsLatchExitTaken =
4020 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
4021 LatchExitingBranch->getOperand(1));
4022
4023 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4024 LatchExitingBranch->eraseFromParent();
4025
4026 Builder.setInsertPoint(LatchVPBB);
4027 Builder.createNaryOp(VPInstruction::BranchOnTwoConds,
4028 {IsEarlyExitTaken, IsLatchExitTaken}, LatchDL);
4029 LatchVPBB->clearSuccessors();
4030 LatchVPBB->setSuccessors({VectorEarlyExitVPBB, MiddleVPBB, HeaderVPBB});
4031 VectorEarlyExitVPBB->setPredecessors({LatchVPBB});
4032}
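// Illustrative sketch of the rewritten latch (hypothetical value names):
//   %any.early = any-of %cond.to.early.exit
//   %latch.cmp = icmp eq %index.next, %vector.trip.count
//   branch-on-two-conds %any.early, %latch.cmp
//     -> [vector.early.exit, middle.block, vector.body]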
4033
4034/// This function tries to convert extended in-loop reductions to
4035/// VPExpressionRecipe and clamps the \p Range if it is beneficial and
4036/// valid. The created recipe must be decomposed to its constituent
4037/// recipes before execution.
4038static VPExpressionRecipe *
4040 VFRange &Range) {
4041 Type *RedTy = Ctx.Types.inferScalarType(Red);
4042 VPValue *VecOp = Red->getVecOp();
4043
4044 // Clamp the range if using extended-reduction is profitable.
4045 auto IsExtendedRedValidAndClampRange =
4046 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4048 [&](ElementCount VF) {
4049 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4051
4052 InstructionCost ExtRedCost;
4053 InstructionCost ExtCost =
4054 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4055 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4056
4057 if (Red->isPartialReduction()) {
4060 // FIXME: Move partial reduction creation, costing and clamping
4061 // here from LoopVectorize.cpp.
4062 ExtRedCost = Ctx.TTI.getPartialReductionCost(
4063 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
4064 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind);
4065 } else {
4066 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4067 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4068 Red->getFastMathFlags(), CostKind);
4069 }
4070 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4071 },
4072 Range);
4073 };
4074
4075 VPValue *A;
4076 // Match reduce(ext)).
4077 if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
4078 IsExtendedRedValidAndClampRange(
4079 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4080 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
4081 Ctx.Types.inferScalarType(A)))
4082 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4083
4084 return nullptr;
4085}
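// Illustrative sketch (hypothetical types): reduce.add(zext <16 x i8> %a to
// <16 x i32>) is bundled into a single VPExpressionRecipe when the target
// reports a valid extended-reduction (or partial-reduction) cost below the
// cost of the separate extend plus reduction, e.g. so the backend can later
// select an extending reduction instruction.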
4086
4087/// This function tries to convert multiply-accumulate in-loop reductions to
4088/// VPExpressionRecipe and clamps the \p Range if it is beneficial
4089/// and valid. The created VPExpressionRecipe must be decomposed to its
4090/// constituent recipes before execution. Supported patterns of the
4091/// VPExpressionRecipe:
4092/// reduce.add(mul(...)),
4093/// reduce.add(mul(ext(A), ext(B))),
4094/// reduce.add(ext(mul(ext(A), ext(B)))).
4095static VPExpressionRecipe *
4097 VPCostContext &Ctx, VFRange &Range) {
4098 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4099 if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
4100 return nullptr;
4101
4102 Type *RedTy = Ctx.Types.inferScalarType(Red);
4103
4104 // Clamp the range if using multiply-accumulate-reduction is profitable.
4105 auto IsMulAccValidAndClampRange =
4107 VPWidenCastRecipe *OuterExt) -> bool {
4109 [&](ElementCount VF) {
4111 Type *SrcTy =
4112 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
4113 InstructionCost MulAccCost;
4114
4115 if (Red->isPartialReduction()) {
4116 Type *SrcTy2 =
4117 Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
4118 // FIXME: Move partial reduction creation, costing and clamping
4119 // here from LoopVectorize.cpp.
4120 MulAccCost = Ctx.TTI.getPartialReductionCost(
4121 Opcode, SrcTy, SrcTy2, RedTy, VF,
4123 Ext0->getOpcode())
4126 Ext1->getOpcode())
4128 Mul->getOpcode(), CostKind);
4129 } else {
4130 // Only partial reductions support mixed extends at the moment.
4131 if (Ext0 && Ext1 && Ext0->getOpcode() != Ext1->getOpcode())
4132 return false;
4133
4134 bool IsZExt =
4135 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4136 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4137 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4138 SrcVecTy, CostKind);
4139 }
4140
4141 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4142 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4143 InstructionCost ExtCost = 0;
4144 if (Ext0)
4145 ExtCost += Ext0->computeCost(VF, Ctx);
4146 if (Ext1)
4147 ExtCost += Ext1->computeCost(VF, Ctx);
4148 if (OuterExt)
4149 ExtCost += OuterExt->computeCost(VF, Ctx);
4150
4151 return MulAccCost.isValid() &&
4152 MulAccCost < ExtCost + MulCost + RedCost;
4153 },
4154 Range);
4155 };
4156
4157 VPValue *VecOp = Red->getVecOp();
4158 VPRecipeBase *Sub = nullptr;
4159 VPValue *A, *B;
4160 VPValue *Tmp = nullptr;
4161 // Sub reductions could have a sub between the add reduction and vec op.
4162 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4163 Sub = VecOp->getDefiningRecipe();
4164 VecOp = Tmp;
4165 }
4166
4167 // If ValB is a constant and can be safely extended, truncate it to the same
4168 // type as ExtA's operand, then extend it to the same type as ExtA. This
4169 // creates two uniform extends that can more easily be matched by the rest of
4170 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4171 // replaced with the new extend of the constant.
4172 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
4173 VPWidenCastRecipe *&ExtB,
4174 VPValue *&ValB, VPWidenRecipe *Mul) {
4175 if (!ExtA || ExtB || !isa<VPIRValue>(ValB))
4176 return;
4177 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
4178 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4179 const APInt *Const;
4180 if (!match(ValB, m_APInt(Const)) ||
4182 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4183 return;
4184 // The truncate ensures that the type of each extended operand is the
4185 // same, and it's been proven that the constant can be extended from
4186 // NarrowTy safely. Necessary since ExtA's extended operand would be
4187 // e.g. an i8, while the const will likely be an i32. This will be
4188 // elided by later optimisations.
4189 VPBuilder Builder(Mul);
4190 auto *Trunc =
4191 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4192 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
4193 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4194 Mul->setOperand(1, ExtB);
4195 };
4196
4197 // Try to match reduce.add(mul(...)).
4198 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4201 auto *Mul = cast<VPWidenRecipe>(VecOp);
4202
4203 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4204 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4205
4206 // Match reduce.add/sub(mul(ext, ext)).
4207 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4208 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4209 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4210 if (Sub)
4211 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4212 cast<VPWidenRecipe>(Sub), Red);
4213 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4214 }
4215 // TODO: Add an expression type for this variant with a negated mul
4216 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4217 return new VPExpressionRecipe(Mul, Red);
4218 }
4219 // TODO: Add an expression type for negated versions of other expression
4220 // variants.
4221 if (Sub)
4222 return nullptr;
4223
4224 // Match reduce.add(ext(mul(A, B))).
4225 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4226 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4227 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4230
4231 // reduce.add(ext(mul(ext, const)))
4232 // -> reduce.add(ext(mul(ext, ext(const))))
4233 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4234
4235 // reduce.add(ext(mul(ext(A), ext(B))))
4236 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4237 // The inner extends must either have the same opcode as the outer extend or
4238 // be the same, in which case the multiply can never result in a negative
4239 // value and the outer extend can be folded away by doing wider
4240 // extends for the operands of the mul.
4241 if (Ext0 && Ext1 &&
4242 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4243 Ext0->getOpcode() == Ext1->getOpcode() &&
4244 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
4245 auto *NewExt0 = new VPWidenCastRecipe(
4246 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
4247 *Ext0, *Ext0, Ext0->getDebugLoc());
4248 NewExt0->insertBefore(Ext0);
4249
4250 VPWidenCastRecipe *NewExt1 = NewExt0;
4251 if (Ext0 != Ext1) {
4252 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4253 Ext->getResultType(), nullptr, *Ext1,
4254 *Ext1, Ext1->getDebugLoc());
4255 NewExt1->insertBefore(Ext1);
4256 }
4257 Mul->setOperand(0, NewExt0);
4258 Mul->setOperand(1, NewExt1);
4259 Red->setOperand(1, Mul);
4260 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4261 }
4262 }
4263 return nullptr;
4264}
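// Illustrative sketch (hypothetical types):
//   reduce.add(mul(zext <16 x i8> %a to <16 x i32>,
//                  zext <16 x i8> %b to <16 x i32>))
// is bundled into one VPExpressionRecipe when the target's multiply-accumulate
// (or partial-reduction) cost beats the sum of the separate extend, multiply
// and reduction costs, e.g. so the backend can later form a dot product.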
4265
4266/// This function tries to create abstract recipes from the reduction recipe to
4267/// aid subsequent optimizations and cost estimation.
4269 VPCostContext &Ctx,
4270 VFRange &Range) {
4271 VPExpressionRecipe *AbstractR = nullptr;
4272 auto IP = std::next(Red->getIterator());
4273 auto *VPBB = Red->getParent();
4274 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4275 AbstractR = MulAcc;
4276 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4277 AbstractR = ExtRed;
4278 // Cannot create abstract inloop reduction recipes.
4279 if (!AbstractR)
4280 return;
4281
4282 AbstractR->insertBefore(*VPBB, IP);
4283 Red->replaceAllUsesWith(AbstractR);
4284}
4285
4296
4298 if (Plan.hasScalarVFOnly())
4299 return;
4300
4301#ifndef NDEBUG
4302 VPDominatorTree VPDT(Plan);
4303#endif
4304
4305 SmallVector<VPValue *> VPValues;
4308 append_range(VPValues, Plan.getLiveIns());
4309 for (VPRecipeBase &R : *Plan.getEntry())
4310 append_range(VPValues, R.definedValues());
4311
4312 auto *VectorPreheader = Plan.getVectorPreheader();
4313 for (VPValue *VPV : VPValues) {
4315 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
4316 continue;
4317
4318 // Add explicit broadcast at the insert point that dominates all users.
4319 VPBasicBlock *HoistBlock = VectorPreheader;
4320 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4321 for (VPUser *User : VPV->users()) {
4322 if (User->usesScalars(VPV))
4323 continue;
4324 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4325 HoistPoint = HoistBlock->begin();
4326 else
4327 assert(VPDT.dominates(VectorPreheader,
4328 cast<VPRecipeBase>(User)->getParent()) &&
4329 "All users must be in the vector preheader or dominated by it");
4330 }
4331
4332 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4333 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
4334 VPV->replaceUsesWithIf(Broadcast,
4335 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4336 return Broadcast != &U && !U.usesScalars(VPV);
4337 });
4338 }
4339}
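// Illustrative sketch of the broadcast materialization above (hypothetical
// names): a live-in scalar ir<%x> with both scalar and vector users inside the
// loop gets an explicit "vp<%bc> = broadcast ir<%x>" in the vector preheader;
// only the users consuming the vector form are rerouted to vp<%bc>, while
// scalar users keep using ir<%x>.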
4340
4342 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4343
4344 // Collect candidate loads with invariant addresses and noalias scopes
4345 // metadata and memory-writing recipes with noalias metadata.
4349 vp_depth_first_shallow(LoopRegion->getEntry()))) {
4350 for (VPRecipeBase &R : *VPBB) {
4351 // Only handle single-scalar replicated loads with invariant addresses.
4352 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4353 if (RepR->isPredicated() || !RepR->isSingleScalar() ||
4354 RepR->getOpcode() != Instruction::Load)
4355 continue;
4356
4357 VPValue *Addr = RepR->getOperand(0);
4358 if (Addr->isDefinedOutsideLoopRegions()) {
4360 if (!Loc.AATags.Scope)
4361 continue;
4362 CandidateLoads.push_back({RepR, Loc});
4363 }
4364 }
4365 if (R.mayWriteToMemory()) {
4367 if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
4368 return;
4369 Stores.push_back(*Loc);
4370 }
4371 }
4372 }
4373
4374 VPBasicBlock *Preheader = Plan.getVectorPreheader();
4375 for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
4376 // Hoist the load to the preheader if it doesn't alias with any stores
4377    // according to the noalias metadata. Other loads should have been hoisted
4378    // by other passes.
4379 const AAMDNodes &LoadAA = LoadLoc.AATags;
4380 if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
4382 LoadAA.Scope, StoreLoc.AATags.NoAlias);
4383 })) {
4384 LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
4385 }
4386 }
4387}
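// Illustrative sketch of the hoisting above: a single-scalar, unpredicated
// load of a loop-invariant address carrying !alias.scope metadata is moved to
// the vector preheader when the !noalias metadata on every memory-writing
// recipe in the loop rules out aliasing with the load's scopes.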
4388
4389// Collect common metadata from a group of replicate recipes by intersecting
4390// metadata from all recipes in the group.
4392 VPIRMetadata CommonMetadata = *Recipes.front();
4393 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4394 CommonMetadata.intersect(*Recipe);
4395 return CommonMetadata;
4396}
4397
4398template <unsigned Opcode>
4402 const Loop *L) {
4403 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4404 "Only Load and Store opcodes supported");
4405 constexpr bool IsLoad = (Opcode == Instruction::Load);
4406 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4407 VPTypeAnalysis TypeInfo(Plan);
4408
4409 // Group predicated operations by their address SCEV.
4411 for (VPBlockBase *Block : vp_depth_first_shallow(LoopRegion->getEntry())) {
4412 auto *VPBB = cast<VPBasicBlock>(Block);
4413 for (VPRecipeBase &R : *VPBB) {
4414 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
4415 if (!RepR || RepR->getOpcode() != Opcode || !RepR->isPredicated())
4416 continue;
4417
4418 // For loads, operand 0 is address; for stores, operand 1 is address.
4419 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
4420 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
4421 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
4422 RecipesByAddress[AddrSCEV].push_back(RepR);
4423 }
4424 }
4425
4426 // For each address, collect operations with the same or complementary masks.
4428 auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4429 return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
4430 };
4431 for (auto &[Addr, Recipes] : RecipesByAddress) {
4432 if (Recipes.size() < 2)
4433 continue;
4434
4435 // Collect groups with the same or complementary masks.
4436 for (VPReplicateRecipe *&RecipeI : Recipes) {
4437 if (!RecipeI)
4438 continue;
4439
4440 VPValue *MaskI = RecipeI->getMask();
4441 Type *TypeI = GetLoadStoreValueType(RecipeI);
4443 Group.push_back(RecipeI);
4444 RecipeI = nullptr;
4445
4446 // Find all operations with the same or complementary masks.
4447 bool HasComplementaryMask = false;
4448 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4449 if (!RecipeJ)
4450 continue;
4451
4452 VPValue *MaskJ = RecipeJ->getMask();
4453 Type *TypeJ = GetLoadStoreValueType(RecipeJ);
4454 if (TypeI == TypeJ) {
4455 // Check if any operation in the group has a complementary mask with
4456 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4457 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4458 match(MaskJ, m_Not(m_Specific(MaskI)));
4459 Group.push_back(RecipeJ);
4460 RecipeJ = nullptr;
4461 }
4462 }
4463
4464 if (HasComplementaryMask) {
4465 assert(Group.size() >= 2 && "must have at least 2 entries");
4466 AllGroups.push_back(std::move(Group));
4467 }
4468 }
4469 }
4470
4471 return AllGroups;
4472}
4473
4474// Find the recipe with minimum alignment in the group.
4475template <typename InstType>
4476static VPReplicateRecipe *
4478 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4479 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4480 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4481 });
4482}
4483
4486 const Loop *L) {
4487 auto Groups =
4489 if (Groups.empty())
4490 return;
4491
4492 VPDominatorTree VPDT(Plan);
4493
4494 // Process each group of loads.
4495 for (auto &Group : Groups) {
4496 // Sort loads by dominance order, with earliest (most dominating) first.
4497 sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4498 return VPDT.properlyDominates(A, B);
4499 });
4500
4501 // Try to use the earliest (most dominating) load to replace all others.
4502 VPReplicateRecipe *EarliestLoad = Group[0];
4503 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4504 VPBasicBlock *LastBB = Group.back()->getParent();
4505
4506 // Check that the load doesn't alias with stores between first and last.
4507 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4508 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
4509 continue;
4510
4511 // Collect common metadata from all loads in the group.
4512 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4513
4514 // Find the load with minimum alignment to use.
4515 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4516
4517 // Create an unpredicated version of the earliest load with common
4518 // metadata.
4519 auto *UnpredicatedLoad = new VPReplicateRecipe(
4520 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4521 /*IsSingleScalar=*/false, /*Mask=*/nullptr, *EarliestLoad,
4522 CommonMetadata);
4523
4524 UnpredicatedLoad->insertBefore(EarliestLoad);
4525
4526 // Replace all loads in the group with the unpredicated load.
4527 for (VPReplicateRecipe *Load : Group) {
4528 Load->replaceAllUsesWith(UnpredicatedLoad);
4529 Load->eraseFromParent();
4530 }
4531 }
4532}
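// Illustrative sketch of the merge above (hypothetical recipes): two
// predicated replicate loads of the same address with masks %m and not(%m)
// and the same type are replaced by one unpredicated load inserted at the
// earliest (dominating) member, provided no store between the first and last
// member may alias the address; the new load uses the smallest alignment
// found in the group.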
4533
4534static bool
4536 PredicatedScalarEvolution &PSE, const Loop &L,
4537 VPTypeAnalysis &TypeInfo) {
4538 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
4539 if (!StoreLoc || !StoreLoc->AATags.Scope)
4540 return false;
4541
4542 // When sinking a group of stores, all members of the group alias each other.
4543 // Skip them during the alias checks.
4544 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4545 StoresToSink.end());
4546
4547 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4548 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4549 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
4550 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
4551}
4552
4555 const Loop *L) {
4556 auto Groups =
4558 if (Groups.empty())
4559 return;
4560
4561 VPDominatorTree VPDT(Plan);
4562 VPTypeAnalysis TypeInfo(Plan);
4563
4564 for (auto &Group : Groups) {
4565 sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4566 return VPDT.properlyDominates(A, B);
4567 });
4568
4569 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L, TypeInfo))
4570 continue;
4571
4572 // Use the last (most dominated) store's location for the unconditional
4573 // store.
4574 VPReplicateRecipe *LastStore = Group.back();
4575 VPBasicBlock *InsertBB = LastStore->getParent();
4576
4577 // Collect common alias metadata from all stores in the group.
4578 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4579
4580 // Build select chain for stored values.
4581 VPValue *SelectedValue = Group[0]->getOperand(0);
4582 VPBuilder Builder(InsertBB, LastStore->getIterator());
4583
4584 for (unsigned I = 1; I < Group.size(); ++I) {
4585 VPValue *Mask = Group[I]->getMask();
4586 VPValue *Value = Group[I]->getOperand(0);
4587 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4588 Group[I]->getDebugLoc());
4589 }
4590
4591 // Find the store with minimum alignment to use.
4592 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4593
4594 // Create unconditional store with selected value and common metadata.
4595 auto *UnpredicatedStore =
4596 new VPReplicateRecipe(StoreWithMinAlign->getUnderlyingInstr(),
4597 {SelectedValue, LastStore->getOperand(1)},
4598 /*IsSingleScalar=*/false,
4599 /*Mask=*/nullptr, *LastStore, CommonMetadata);
4600 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
4601
4602 // Remove all predicated stores from the group.
4603 for (VPReplicateRecipe *Store : Group)
4604 Store->eraseFromParent();
4605 }
4606}
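// Illustrative sketch of the sinking above (hypothetical recipes): predicated
// stores of %v0 under mask %m and of %v1 under not(%m) to the same address
// become
//   %sel = select not(%m), %v1, %v0
// followed by a single unpredicated store of %sel at the position of the last
// store in the group.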
4607
4609 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4611 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4612 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4613
4614 VPValue *TC = Plan.getTripCount();
4615 // Skip cases for which the trip count may be non-trivial to materialize.
4616 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
4617 // tail is required.
4618 if (!Plan.hasScalarTail() ||
4620 Plan.getScalarPreheader() ||
4621 !isa<VPIRValue>(TC))
4622 return;
4623
4624  // Materialize the vector trip count for constant trip counts early if it can
4625  // simply be computed as (Original TC / (VF * UF)) * (VF * UF).
4626 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4627 // tail-folded loops.
4628 ScalarEvolution &SE = *PSE.getSE();
4629 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
4630 if (!isa<SCEVConstant>(TCScev))
4631 return;
4632 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
4633 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
4634 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
4635 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4636}
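// Illustrative sketch: for a constant trip count of 17 with VF = 4 and UF = 2,
// VFxUF = 8 and the vector trip count is materialized as the constant
// (17 udiv 8) * 8 = 16.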
4637
4639 VPBasicBlock *VectorPH) {
4641 if (BTC->getNumUsers() == 0)
4642 return;
4643
4644 VPBuilder Builder(VectorPH, VectorPH->begin());
4645 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4646 auto *TCMO = Builder.createNaryOp(
4647 Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)},
4648 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
4649 BTC->replaceAllUsesWith(TCMO);
4650}
4651
4653 if (Plan.hasScalarVFOnly())
4654 return;
4655
4656 VPTypeAnalysis TypeInfo(Plan);
4657 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4658 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4660 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4661 vp_depth_first_shallow(LoopRegion->getEntry()));
4662 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
4663 // VPInstructions, excluding ones in replicate regions. Those are not
4664 // materialized explicitly yet. Those vector users are still handled in
4665 // VPReplicateRegion::execute(), via shouldPack().
4666 // TODO: materialize build vectors for replicating recipes in replicating
4667 // regions.
4668 for (VPBasicBlock *VPBB :
4669 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
4670 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4672 continue;
4673 auto *DefR = cast<VPRecipeWithIRFlags>(&R);
4674 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
4675 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4676 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
4677 };
4678 if ((isa<VPReplicateRecipe>(DefR) &&
4679 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
4680 (isa<VPInstruction>(DefR) &&
4682 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
4683 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
4684 continue;
4685
4686 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
4687 unsigned Opcode = ScalarTy->isStructTy()
4690 auto *BuildVector = new VPInstruction(Opcode, {DefR});
4691 BuildVector->insertAfter(DefR);
4692
4693 DefR->replaceUsesWithIf(
4694 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
4695 VPUser &U, unsigned) {
4696 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
4697 });
4698 }
4699 }
4700
4701 // Create explicit VPInstructions to convert vectors to scalars. The current
4702 // implementation is conservative - it may miss some cases that may or may not
4703 // be vector values. TODO: introduce Unpacks speculatively - remove them later
4704 // if they are known to operate on scalar values.
4705 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
4706 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4709 continue;
4710 for (VPValue *Def : R.definedValues()) {
4711 // Skip recipes that are single-scalar or only have their first lane
4712 // used.
4713 // TODO: The Defs skipped here may or may not be vector values.
4714 // Introduce Unpacks, and remove them later, if they are guaranteed to
4715 // produce scalar values.
4717 continue;
4718
4719 // At the moment, we create unpacks only for scalar users outside
4720 // replicate regions. Recipes inside replicate regions still extract the
4721 // required lanes implicitly.
4722 // TODO: Remove once replicate regions are unrolled completely.
4723 auto IsCandidateUnpackUser = [Def](VPUser *U) {
4724 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4725 return U->usesScalars(Def) &&
4726 (!ParentRegion || !ParentRegion->isReplicator());
4727 };
4728 if (none_of(Def->users(), IsCandidateUnpackUser))
4729 continue;
4730
4731 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
4732 if (R.isPhi())
4733 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
4734 else
4735 Unpack->insertAfter(&R);
4736 Def->replaceUsesWithIf(Unpack,
4737 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
4738 return IsCandidateUnpackUser(&U);
4739 });
4740 }
4741 }
4742 }
4743}
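// Illustrative sketch (VPlan shorthand, placeholder names): a single-scalar
// definition %s with a vector user gets an explicit
//   %v = BuildVector(%s)        ; BuildStructVector for struct-typed results
// inserted right after it and its vector users rerouted to %v, while a vector
// definition %d with scalar users outside replicate regions gets
//   %lanes = Unpack(%d)
// inserted after it (after the phis, for phi definitions).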
4744
4745void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
4746 VPBasicBlock *VectorPHVPBB,
4747 bool TailByMasking,
4748 bool RequiresScalarEpilogue) {
4749 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
4750 // There's nothing to do if there are no users of the vector trip count or its
4751 // IR value has already been set.
4752 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
4753 return;
4754
4755 VPValue *TC = Plan.getTripCount();
4756 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
4757 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
4758 VPValue *Step = &Plan.getVFxUF();
4759
4760 // If the tail is to be folded by masking, round the number of iterations N
4761 // up to a multiple of Step instead of rounding down. This is done by first
4762 // adding Step-1 and then rounding down. Note that it's ok if this addition
4763 // overflows: the vector induction variable will eventually wrap to zero given
4764 // that it starts at zero and its Step is a power of two; the loop will then
4765 // exit, with the last early-exit vector comparison also producing all-true.
4766 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
4767 // is accounted for in emitIterationCountCheck that adds an overflow check.
4768 if (TailByMasking) {
4769 TC = Builder.createNaryOp(
4770 Instruction::Add,
4771 {TC, Builder.createNaryOp(Instruction::Sub,
4772 {Step, Plan.getConstantInt(TCTy, 1)})},
4773 DebugLoc::getCompilerGenerated(), "n.rnd.up");
4774 }
4775
4776 // Now we need to generate the expression for the part of the loop that the
4777 // vectorized body will execute. This is equal to N - (N % Step) if scalar
4778 // iterations are not required for correctness, or N - Step, otherwise. Step
4779 // is equal to the vectorization factor (number of SIMD elements) times the
4780 // unroll factor (number of SIMD instructions).
4781 VPValue *R =
4782 Builder.createNaryOp(Instruction::URem, {TC, Step},
4783 DebugLoc::getCompilerGenerated(), "n.mod.vf");
4784
4785 // There are cases where we *must* run at least one iteration in the remainder
4786 // loop. See the cost model for when this can happen. If the step evenly
4787 // divides the trip count, we set the remainder to be equal to the step. If
4788 // the step does not evenly divide the trip count, no adjustment is necessary
4789 // since there will already be scalar iterations. Note that the minimum
4790 // iterations check ensures that N >= Step.
4791 if (RequiresScalarEpilogue) {
4792 assert(!TailByMasking &&
4793 "requiring scalar epilogue is not supported with tail folding");
4794 VPValue *IsZero =
4795 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0));
4796 R = Builder.createSelect(IsZero, Step, R);
4797 }
4798
4799 VPValue *Res = Builder.createNaryOp(
4800 Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
4801 VectorTC.replaceAllUsesWith(Res);
4802}
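// Worked example (illustrative, with Step = VFxUF): for trip count %N the
// recipes above compute
//   %n.mod.vf = urem %N, Step
//   %n.vec    = sub  %N, %n.mod.vf
// with %N first rounded up to %n.rnd.up = %N + (Step - 1) when folding the
// tail, and a zero remainder bumped up to Step when a scalar epilogue is
// required.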
4803
4804void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
4805 ElementCount VFEC) {
4806 VPBuilder Builder(VectorPH, VectorPH->begin());
4807 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4808 VPValue &VF = Plan.getVF();
4809 VPValue &VFxUF = Plan.getVFxUF();
4810 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
4811 // used.
4812 // TODO: Assert that they aren't used.
4813
4814 // If there are no users of the runtime VF, compute VFxUF by constant folding
4815 // the multiplication of VF and UF.
4816 if (VF.getNumUsers() == 0) {
4817 VPValue *RuntimeVFxUF =
4818 Builder.createElementCount(TCTy, VFEC * Plan.getUF());
4819 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
4820 return;
4821 }
4822
4823 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
4824 // vscale) * UF.
4825 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
4827 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
4828 VF.replaceUsesWithIf(
4829 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
4830 }
4831 VF.replaceAllUsesWith(RuntimeVF);
4832
4833 VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF());
4834 VPValue *MulByUF = Builder.createOverflowingOp(
4835 Instruction::Mul, {RuntimeVF, UF}, {true, false});
4836 VFxUF.replaceAllUsesWith(MulByUF);
4837}
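// Example (illustrative): for VFEC = <vscale x 4> and UF = 2, the runtime VF
// materializes as 4 * vscale and VFxUF as (4 * vscale) * 2; if the runtime VF
// itself has no users, VFxUF is instead emitted directly as the element count
// of <vscale x 8>.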
4838
4839DenseMap<const SCEV *, Value *>
4840VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
4841 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
4842
4843 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
4844 BasicBlock *EntryBB = Entry->getIRBasicBlock();
4845 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
4846 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
4848 continue;
4849 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
4850 if (!ExpSCEV)
4851 break;
4852 const SCEV *Expr = ExpSCEV->getSCEV();
4853 Value *Res =
4854 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
4855 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
4856 VPValue *Exp = Plan.getOrAddLiveIn(Res);
4857 ExpSCEV->replaceAllUsesWith(Exp);
4858 if (Plan.getTripCount() == ExpSCEV)
4859 Plan.resetTripCount(Exp);
4860 ExpSCEV->eraseFromParent();
4861 }
4863 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
4864 "after any VPIRInstructions");
4865 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
4866 // to the VPIRBasicBlock.
4867 auto EI = Entry->begin();
4868 for (Instruction &I : drop_end(*EntryBB)) {
4869 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
4870 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
4871 EI++;
4872 continue;
4873 }
4874 Entry->appendRecipe(VPIRInstruction::create(I));
4875 }
4876
4877 return ExpandedSCEVs;
4878}
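// Illustrative example (the SCEV shown is a placeholder): a VPExpandSCEVRecipe
// for an expression such as (1 + ((-1 + %n) /u 4)) is expanded to IR before the
// entry block's terminator, the resulting Value is re-added as a live-in, all
// users of the recipe are redirected to that live-in, and the recipe is erased;
// if the expanded expression was the plan's trip count, the trip count is reset
// to the new live-in as well.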
4879
4880/// Returns true if \p OpV is defined by a VPWidenLoadRecipe or
4881/// VPInterleaveRecipe that can be converted to a narrower recipe. \p OpV is
4882/// used by a wide recipe that feeds a store interleave group at index \p Idx;
4883/// \p WideMember0 is the recipe feeding the same interleave group at index 0.
4884/// A VPWidenLoadRecipe can be narrowed to an index-independent load if it feeds
4885/// all wide ops at all indices, i.e. \p OpV is the operand at index \p OpIdx of
4886/// both the recipe being checked and \p WideMember0. A VPInterleaveRecipe can
4887/// be narrowed to a wide load if \p OpV is defined at \p Idx of a load
4888static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
4889 VPValue *OpV, unsigned Idx) {
4890 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
4891 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
4892 if (!Member0OpR)
4893 return Member0Op == OpV;
4894 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
4895 return !W->getMask() && Member0Op == OpV;
4896 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
4897 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
4898 return false;
4899}
4900
4901/// Returns true if \p IR is a full interleave group with factor and number of
4902/// members both equal to \p VF. The interleave group must also access the full
4903/// vector width \p VectorRegWidth.
4904static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
4905 ElementCount VF,
4906 VPTypeAnalysis &TypeInfo,
4907 TypeSize VectorRegWidth) {
4908 if (!InterleaveR || InterleaveR->getMask())
4909 return false;
4910
4911 Type *GroupElementTy = nullptr;
4912 if (InterleaveR->getStoredValues().empty()) {
4913 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
4914 if (!all_of(InterleaveR->definedValues(),
4915 [&TypeInfo, GroupElementTy](VPValue *Op) {
4916 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4917 }))
4918 return false;
4919 } else {
4920 GroupElementTy =
4921 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
4922 if (!all_of(InterleaveR->getStoredValues(),
4923 [&TypeInfo, GroupElementTy](VPValue *Op) {
4924 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4925 }))
4926 return false;
4927 }
4928
4929 unsigned VFMin = VF.getKnownMinValue();
4930 TypeSize GroupSize = TypeSize::get(
4931 GroupElementTy->getScalarSizeInBits() * VFMin, VF.isScalable());
4932 const auto *IG = InterleaveR->getInterleaveGroup();
4933 return IG->getFactor() == VFMin && IG->getNumMembers() == VFMin &&
4934 GroupSize == VectorRegWidth;
4935}
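// Example (illustrative): with VF = 4, i32 group members, and a 128-bit vector
// register, an unmasked group with factor 4 and 4 members covers 4 * 32 = 128
// bits and is accepted; a group with factor 2, or one with i64 members
// (4 * 64 = 256 bits), is rejected.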
4936
4937/// Returns true if \p VPV is a narrow VPValue.
4938static bool isAlreadyNarrow(VPValue *VPV) {
4939 if (isa<VPIRValue>(VPV))
4940 return true;
4941 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
4942 return RepR && RepR->isSingleScalar();
4943}
4944
4945// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
4946// a narrow variant.
4947static VPValue *
4948narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) {
4949 auto *R = V->getDefiningRecipe();
4950 if (!R || NarrowedOps.contains(V))
4951 return V;
4952
4953 if (isAlreadyNarrow(V))
4954 return V;
4955
4956 if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(R)) {
4957 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
4958 WideMember0->setOperand(
4959 Idx,
4960 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
4961 return V;
4962 }
4963
4964 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
4965 // Narrow interleave group to wide load, as transformed VPlan will only
4966 // process one original iteration.
4967 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
4968 auto *L = new VPWidenLoadRecipe(
4969 *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
4970 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
4971 L->insertBefore(LoadGroup);
4972 NarrowedOps.insert(L);
4973 return L;
4974 }
4975
4976 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
4977 assert(RepR->isSingleScalar() &&
4978 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
4979 "must be a single scalar load");
4980 NarrowedOps.insert(RepR);
4981 return RepR;
4982 }
4983
4984 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
4985 VPValue *PtrOp = WideLoad->getAddr();
4986 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
4987 PtrOp = VecPtr->getOperand(0);
4988 // Narrow wide load to uniform scalar load, as transformed VPlan will only
4989 // process one original iteration.
4990 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
4991 /*IsUniform*/ true,
4992 /*Mask*/ nullptr, {}, *WideLoad);
4993 N->insertBefore(WideLoad);
4994 NarrowedOps.insert(N);
4995 return N;
4996}
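// Illustrative effect on an operand tree (not tied to a specific test): a full
// load interleave group feeding the stores becomes a single consecutive
// VPWidenLoadRecipe, a plain wide load becomes a uniform single-scalar
// VPReplicateRecipe of its ingredient, and matching wide arithmetic recipes are
// kept with their operands narrowed recursively.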
4997
4998void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
4999 TypeSize VectorRegWidth) {
5000 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5001 if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0)
5002 return;
5003
5004 VPTypeAnalysis TypeInfo(Plan);
5005
5006 SmallVector<VPInterleaveRecipe *> StoreGroups;
5007 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
5009 continue;
5010
5013 continue;
5014
5015 // Bail out on recipes not supported at the moment:
5016 // * phi recipes other than the canonical induction
5017 // * recipes writing to memory except interleave groups
5018 // Only support plans with a canonical induction phi.
5019 if (R.isPhi())
5020 return;
5021
5022 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
5023 if (R.mayWriteToMemory() && !InterleaveR)
5024 return;
5025
5026 // Do not narrow interleave groups if there are VectorPointer recipes and
5027 // the plan was unrolled. The recipe implicitly uses VF from
5028 // VPTransformState.
5029 // TODO: Remove restriction once the VF for the VectorPointer offset is
5030 // modeled explicitly as operand.
5031 if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
5032 return;
5033
5034 // All other ops are allowed, but we reject uses that cannot be converted
5035 // when checking all allowed consumers (store interleave groups) below.
5036 if (!InterleaveR)
5037 continue;
5038
5039 // Bail out on non-consecutive interleave groups.
5040 if (!isConsecutiveInterleaveGroup(InterleaveR, VF, TypeInfo,
5041 VectorRegWidth))
5042 return;
5043
5044 // Skip read interleave groups.
5045 if (InterleaveR->getStoredValues().empty())
5046 continue;
5047
5048 // Narrow interleave groups, if all operands are already matching narrow
5049 // ops.
5050 auto *Member0 = InterleaveR->getStoredValues()[0];
5051 if (isAlreadyNarrow(Member0) &&
5052 all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
5053 StoreGroups.push_back(InterleaveR);
5054 continue;
5055 }
5056
5057 // For now, we only support full interleave groups storing load interleave
5058 // groups.
5059 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
5060 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
5061 if (!DefR)
5062 return false;
5063 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
5064 return IR && IR->getInterleaveGroup()->isFull() &&
5065 IR->getVPValue(Op.index()) == Op.value();
5066 })) {
5067 StoreGroups.push_back(InterleaveR);
5068 continue;
5069 }
5070
5071 // Check if all values feeding InterleaveR are matching wide recipes whose
5072 // operands can be narrowed.
5073 auto *WideMember0 =
5074 dyn_cast_or_null<VPWidenRecipe>(InterleaveR->getStoredValues()[0]);
5075 if (!WideMember0)
5076 return;
5077 for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
5079 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
5080 R->getNumOperands() > 2)
5081 return;
5082 if (any_of(enumerate(R->operands()),
5083 [WideMember0, Idx = I](const auto &P) {
5084 const auto &[OpIdx, OpV] = P;
5085 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
5086 }))
5087 return;
5088 }
5089 StoreGroups.push_back(InterleaveR);
5090 }
5091
5092 if (StoreGroups.empty())
5093 return;
5094
5095 // Narrow the operation tree rooted at each store group and convert the
5096 // group itself to a single VPWidenStoreRecipe.
5097 SmallPtrSet<VPValue *, 4> NarrowedOps;
5098 for (auto *StoreGroup : StoreGroups) {
5099 VPValue *Res =
5100 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
5101 auto *SI =
5102 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
5103 auto *S = new VPWidenStoreRecipe(
5104 *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
5105 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
5106 S->insertBefore(StoreGroup);
5107 StoreGroup->eraseFromParent();
5108 }
5109
5110 // Adjust induction to reflect that the transformed plan only processes one
5111 // original iteration.
5112 auto *CanIV = VectorLoop->getCanonicalIV();
5113 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
5114 VPBuilder PHBuilder(Plan.getVectorPreheader());
5115
5116 VPValue *UF = Plan.getOrAddLiveIn(
5117 ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF()));
5118 if (VF.isScalable()) {
5119 VPValue *VScale = PHBuilder.createElementCount(
5121 VPValue *VScaleUF = PHBuilder.createOverflowingOp(
5122 Instruction::Mul, {VScale, UF}, {true, false});
5123 Inc->setOperand(1, VScaleUF);
5124 Plan.getVF().replaceAllUsesWith(VScale);
5125 } else {
5126 Inc->setOperand(1, UF);
5127 Plan.getVF().replaceAllUsesWith(
5128 Plan.getConstantInt(CanIV->getScalarType(), 1));
5129 }
5130 removeDeadRecipes(Plan);
5131}
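// Rough end-to-end sketch (VPlan shorthand, assuming the groups fill the vector
// register): an interleaved copy of the form
//   %wide = load-interleave-group %src     ; full group, factor == VF
//   store-interleave-group %dst, %wide     ; full group, factor == VF
// becomes a consecutive wide load feeding a consecutive wide store, and the
// canonical IV increment is rewritten to step by UF (or vscale * UF for
// scalable VFs) so that each vector iteration covers one original iteration.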
5132
5133/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5134/// BranchOnCond recipe.
5135void VPlanTransforms::addBranchWeightToMiddleTerminator(
5136 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5137 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5138 auto *MiddleTerm =
5139 dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator());
5140 // Only add branch metadata if there is a (conditional) terminator.
5141 if (!MiddleTerm)
5142 return;
5143
5144 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5145 "must have a BranchOnCond");
5146 // Assume that `TripCount % VectorStep` is uniformly distributed.
5147 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
5148 if (VF.isScalable() && VScaleForTuning.has_value())
5149 VectorStep *= *VScaleForTuning;
5150 assert(VectorStep > 0 && "trip count should not be zero");
5151 MDBuilder MDB(Plan.getContext());
5152 MDNode *BranchWeights =
5153 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
5154 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
5155}
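// Example (illustrative): for VF = 4 and UF = 2 the vector step is 8, so the
// middle block's BranchOnCond gets branch weights {1, 7}, reflecting the
// assumption that the trip count is an exact multiple of the vector step in
// roughly 1 out of 8 cases.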
5156
5157/// Compute and return the end value for \p WideIV, unless it is truncated. If
5158/// the induction recipe is not canonical, creates a VPDerivedIVRecipe to
5159/// compute the end value of the induction.
5160static VPValue *tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV,
5161 VPBuilder &VectorPHBuilder,
5162 VPTypeAnalysis &TypeInfo,
5163 VPValue *VectorTC) {
5164 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
5165 // Truncated wide inductions resume from the last lane of their vector value
5166 // in the last vector iteration which is handled elsewhere.
5167 if (WideIntOrFp && WideIntOrFp->getTruncInst())
5168 return nullptr;
5169
5170 VPIRValue *Start = WideIV->getStartValue();
5171 VPValue *Step = WideIV->getStepValue();
5172 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
5173 VPValue *EndValue = VectorTC;
5174 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
5175 EndValue = VectorPHBuilder.createDerivedIV(
5176 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
5177 Start, VectorTC, Step);
5178 }
5179
5180 // EndValue is derived from the vector trip count (which has the same type as
5181 // the widest induction) and thus may be wider than the induction here.
5182 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
5183 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
5184 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
5185 ScalarTypeOfWideIV,
5186 WideIV->getDebugLoc());
5187 }
5188
5189 return EndValue;
5190}
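// Example (illustrative): for a non-canonical integer induction with start S
// and step St, the end value emitted in the vector preheader is the derived IV
// at the vector trip count, conceptually S + VectorTC * St, truncated when the
// induction is narrower than the trip count type.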
5191
5193 VPlan &Plan, DenseMap<VPValue *, VPValue *> &IVEndValues) {
5194 VPTypeAnalysis TypeInfo(Plan);
5195 auto *ScalarPH = Plan.getScalarPreheader();
5196 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
5197 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5198 VPBuilder VectorPHBuilder(
5199 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
5200 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5201 for (VPRecipeBase &PhiR : Plan.getScalarPreheader()->phis()) {
5202 auto *ResumePhiR = cast<VPPhi>(&PhiR);
5203
5204 // TODO: Extract final value from induction recipe initially, optimize to
5205 // pre-computed end value together in optimizeInductionExitUsers.
5206 auto *VectorPhiR = cast<VPHeaderPHIRecipe>(ResumePhiR->getOperand(0));
5207 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
5208 if (VPValue *EndValue = tryToComputeEndValueForInduction(
5209 WideIVR, VectorPHBuilder, TypeInfo, &Plan.getVectorTripCount())) {
5210 IVEndValues[WideIVR] = EndValue;
5211 ResumePhiR->setOperand(0, EndValue);
5212 ResumePhiR->setName("bc.resume.val");
5213 continue;
5214 }
5215 // TODO: Also handle truncated inductions here. Computing end-values
5216 // separately should be done as VPlan-to-VPlan optimization, after
5217 // legalizing all resume values to use the last lane from the loop.
5218 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
5219 "should only skip truncated wide inductions");
5220 continue;
5221 }
5222
5223 // The backedge value provides the value to resume coming out of a loop,
5224 // which for FORs is a vector whose last element needs to be extracted. The
5225 // start value provides the value if the loop is bypassed.
5226 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
5227 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
5228 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5229 "Cannot handle loops with uncountable early exits");
5230 if (IsFOR) {
5231 auto *ExtractPart = MiddleBuilder.createNaryOp(
5232 VPInstruction::ExtractLastPart, ResumeFromVectorLoop);
5233 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
5235 "vector.recur.extract");
5236 }
5237 ResumePhiR->setName(IsFOR ? "scalar.recur.init" : "bc.merge.rdx");
5238 ResumePhiR->setOperand(0, ResumeFromVectorLoop);
5239 }
5240}
5241
5242void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
5243 VFRange &Range) {
5244 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5245 auto *ScalarPHVPBB = Plan.getScalarPreheader();
5246 auto *MiddleVPBB = Plan.getMiddleBlock();
5247 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
5248 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5249
5250 auto IsScalableOne = [](ElementCount VF) -> bool {
5251 return VF == ElementCount::getScalable(1);
5252 };
5253
5254 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5255 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5256 if (!FOR)
5257 continue;
5258
5259 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5260 "Cannot handle loops with uncountable early exits");
5261
5262 // This is the second phase of vectorizing first-order recurrences, creating
5263 // extracts for users outside the loop. An overview of the transformation is
5264 // described below. Suppose we have the following loop with some use after
5265 // the loop of the last a[i-1],
5266 //
5267 // for (int i = 0; i < n; ++i) {
5268 // t = a[i - 1];
5269 // b[i] = a[i] - t;
5270 // }
5271 // use t;
5272 //
5273 // There is a first-order recurrence on "a". For this loop, the shorthand
5274 // scalar IR looks like:
5275 //
5276 // scalar.ph:
5277 // s.init = a[-1]
5278 // br scalar.body
5279 //
5280 // scalar.body:
5281 // i = phi [0, scalar.ph], [i+1, scalar.body]
5282 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5283 // s2 = a[i]
5284 // b[i] = s2 - s1
5285 // br cond, scalar.body, exit.block
5286 //
5287 // exit.block:
5288 // use = lcssa.phi [s1, scalar.body]
5289 //
5290 // In this example, s1 is a recurrence because its value depends on the
5291 // previous iteration. In the first phase of vectorization, we created a
5292 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5293 // for users in the scalar preheader and exit block.
5294 //
5295 // vector.ph:
5296 // v_init = vector(..., ..., ..., a[-1])
5297 // br vector.body
5298 //
5299 // vector.body
5300 // i = phi [0, vector.ph], [i+4, vector.body]
5301 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5302 // v2 = a[i, i+1, i+2, i+3]
5303 // b[i] = v2 - v1
5304 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
5305 // b[i, i+1, i+2, i+3] = v2 - v1
5306 // br cond, vector.body, middle.block
5307 //
5308 // middle.block:
5309 // vector.recur.extract.for.phi = v2(2)
5310 // vector.recur.extract = v2(3)
5311 // br cond, scalar.ph, exit.block
5312 //
5313 // scalar.ph:
5314 // scalar.recur.init = phi [vector.recur.extract, middle.block],
5315 // [s.init, otherwise]
5316 // br scalar.body
5317 //
5318 // scalar.body:
5319 // i = phi [0, scalar.ph], [i+1, scalar.body]
5320 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5321 // s2 = a[i]
5322 // b[i] = s2 - s1
5323 // br cond, scalar.body, exit.block
5324 //
5325 // exit.block:
5326 // lo = lcssa.phi [s1, scalar.body],
5327 // [vector.recur.extract.for.phi, middle.block]
5328 //
5329 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
5330 // Extract the penultimate value of the recurrence and use it as operand for
5331 // the VPIRInstruction modeling the phi.
5332 for (VPRecipeBase &R : make_early_inc_range(
5333 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
5335 continue;
5336
5337 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5338 // penultimate value of the recurrence. Instead we rely on the existing
5339 // extract of the last element from the result of
5340 // VPInstruction::FirstOrderRecurrenceSplice.
5341 // TODO: Consider vscale_range info and UF.
5342 if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne,
5343 Range))
5344 return;
5345 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5346 VPInstruction::ExtractPenultimateElement, FOR->getBackedgeValue(), {},
5347 "vector.recur.extract.for.phi");
5348 cast<VPInstruction>(&R)->replaceAllUsesWith(PenultimateElement);
5349 }
5350 }
5351}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:81
static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, ScalarEvolution &SE)
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool sinkScalarOperands(VPlan &Plan)
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R)
Return true if we do not know how to (mechanically) hoist or sink R out of a loop region.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck)
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with a EVL-based IV, fixup recipes that use VF to use the EVL instea...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static VPValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute and return the end value for WideIV, unless it is truncated.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
From the definition of llvm.experimental.get.vector.length, VPInstruction::ExplicitVectorLength(AVL) ...
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static void recursivelyDeleteDeadRecipes(VPValue *V)
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations in blocks between FirstBB and LastBB...
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ElementCount VF, VPTypeAnalysis &TypeInfo, TypeSize VectorRegWidth)
Returns true if IR is a full interleave group with factor and number of members both equal to VF.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void narrowToSingleScalarRecipes(VPlan &Plan)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1023
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1521
APInt abs() const
Get the absolute value.
Definition APInt.h:1804
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1497
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:996
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1222
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
iterator begin() const
Definition ArrayRef.h:130
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1549
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1078
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize get(ScalarTy Quantity, bool Scalable)
Definition TypeSize.h:340
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
op_range operands()
Definition User.h:267
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3652
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4009
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4084
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4036
iterator end()
Definition VPlan.h:4046
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4044
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4097
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:228
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:589
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:561
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:635
const VPRecipeBase & back() const
Definition VPlan.h:4058
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2558
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2592
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2582
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2598
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2578
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:81
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccss as successor of this VPBlockBase.
Definition VPlan.h:300
VPRegionBlock * getParent()
Definition VPlan.h:173
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:198
size_t getNumSuccessors() const
Definition VPlan.h:219
size_t getNumPredecessors() const
Definition VPlan.h:220
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:291
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:204
VPlan * getPlan()
Definition VPlan.cpp:173
const std::string & getName() const
Definition VPlan.h:164
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:310
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:215
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:178
void setParent(VPRegionBlock *P)
Definition VPlan.h:184
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:264
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:209
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:198
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:221
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:242
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:154
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:173
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:191
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3063
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags={}, const VPIRMetadata &Metadata={})
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3595
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:477
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:450
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:462
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:472
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3763
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition VPlan.h:3684
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3108
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2075
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2118
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2107
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4162
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:4186
Class to record and manage LLVM IR flags.
Definition VPlan.h:608
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \I , if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1032
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1086
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1188
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1130
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1125
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1122
@ CanonicalIVIncrementForPart
Definition VPlan.h:1106
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2701
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2693
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2722
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2775
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2733
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3250
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:387
VPRegionBlock * getRegion()
Definition VPlan.h:4314
VPBasicBlock * getParent()
Definition VPlan.h:408
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:479
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:2937
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:2826
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4197
const VPBlockBase * getEntry() const
Definition VPlan.h:4233
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4308
bool isReplicator() const
An indicator of whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4265
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4250
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4295
const VPBlockBase * getExiting() const
Definition VPlan.h:4245
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4258
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2982
bool isSingleScalar() const
Definition VPlan.h:3023
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3047
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3831
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:531
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:594
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:253
operand_range operands()
Definition VPlanValue.h:321
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:297
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:292
void addOperand(VPValue *Operand)
Definition VPlanValue.h:286
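As a small illustration of the operand accessors above (a sketch only; the helper name is made up):

#include "VPlanValue.h"
using namespace llvm;

// Illustrative only: point every operand of U that currently references OldV
// at NewV instead.
static void rewriteOperands(VPUser &U, VPValue *OldV, VPValue *NewV) {
  for (unsigned I = 0, E = U.getNumOperands(); I != E; ++I)
    if (U.getOperand(I) == OldV)
      U.setOperand(I, NewV);
}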
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:47
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:133
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1382
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:119
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:74
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:175
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1385
unsigned getNumUsers() const
Definition VPlanValue.h:107
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1389
user_range users()
Definition VPlanValue.h:128
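A hedged example of the use-rewriting helpers above; the predicate shown (skipping widened-phi users) is purely illustrative:

#include "VPlan.h"
using namespace llvm;

// Illustrative only: replace all uses of Def with New except uses inside
// VPWidenPHIRecipes, which a transform may want to patch up separately.
static void replaceNonPhiUses(VPValue *Def, VPValue *New) {
  Def->replaceUsesWithIf(New, [](VPUser &U, unsigned /*OpIdx*/) {
    return !isa<VPWidenPHIRecipe>(&U);
  });
}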
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1934
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3726
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1625
Instruction::CastOps getOpcode() const
Definition VPlan.h:1661
A recipe for handling GEP instructions.
Definition VPlan.h:1871
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2142
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2170
PHINode * getPHINode() const
Definition VPlan.h:2187
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2173
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2190
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2221
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2268
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2272
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2299
A recipe for widening vector intrinsics.
Definition VPlan.h:1675
A common base class for widening memory operations.
Definition VPlan.h:3293
A recipe for widened phis.
Definition VPlan.h:2357
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1577
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4327
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4610
bool hasVF(ElementCount VF) const
Definition VPlan.h:4524
LLVMContext & getContext() const
Definition VPlan.h:4512
VPBasicBlock * getEntry()
Definition VPlan.h:4416
bool hasScalableVF() const
Definition VPlan.h:4525
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4510
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4506
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4474
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4495
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:4589
unsigned getUF() const
Definition VPlan.h:4544
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4658
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4613
bool hasUF(unsigned UF) const
Definition VPlan.h:4542
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4464
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4503
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4566
void setVF(ElementCount VF)
Definition VPlan.h:4518
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4557
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1022
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4488
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4441
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4636
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:4586
bool hasScalarVFOnly() const
Definition VPlan.h:4535
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4455
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4460
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4421
void setUF(unsigned UF)
Definition VPlan.h:4549
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4690
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4592
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2774
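For example, the rounding division above is how a ceiling-style division can be expressed on APInts (a sketch; the helper name is made up):

#include "llvm/ADT/APInt.h"

// Illustrative only: ceil(A / B) for unsigned APInts, e.g. 10 / 4 -> 3.
static llvm::APInt ceilUDiv(const llvm::APInt &A, const llvm::APInt &B) {
  return llvm::APIntOps::RoundingUDiv(A, B, llvm::APInt::Rounding::UP);
}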
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ? R : false.
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
bool match(const SCEV *S, const Pattern &P)
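The IR- and SCEV-level matchers listed above compose in the usual way; a minimal sketch (the predicate name is made up):

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Illustrative only: recognise (X * Const) + Y, with the add commuted either way.
static bool isMulByConstPlusValue(Value *V) {
  Value *X, *Y;
  const APInt *C;
  return match(V, m_c_Add(m_Mul(m_Value(X), m_APInt(C)), m_Value(Y)));
}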
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Opcode, Op0_t, Op1_t > m_c_Binary(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
specific_intval< 1 > m_True()
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, without matching its operands, as the number of operands is not fixed.
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
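The VPlan-level matchers follow the same conventions; a hedged sketch of checking for the latch terminator (the helper name is illustrative):

#include "VPlan.h"
#include "VPlanPatternMatch.h"
using namespace llvm;

// Illustrative only: true if Term is a BranchOnCount VPInstruction, the
// terminator the vectorizer places in the latch of the vector loop region.
static bool isBranchOnCount(VPRecipeBase *Term) {
  using namespace llvm::VPlanPatternMatch;
  return Term && match(Term, m_BranchOnCount());
}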
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2068
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
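all_of and the other range-based wrappers below avoid spelling out begin()/end(); a small sketch over a recipe's operands (the helper name is made up):

#include "VPlanValue.h"
#include "llvm/ADT/STLExtras.h"
using namespace llvm;

// Illustrative only: true if every operand of R is defined outside any loop
// region, i.e. is invariant from the vector loop's point of view.
static bool allOperandsLoopInvariant(VPUser &R) {
  return all_of(R.operands(), [](VPValue *Op) {
    return Op->isDefinedOutsideLoopRegions();
  });
}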
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:839
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2544
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2198
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
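make_early_inc_range is what allows the erase-while-iterating loops used throughout the VPlan transforms; a hedged sketch (the helper name is made up, and real dead-code elimination checks more than this):

#include "VPlan.h"
#include "llvm/ADT/STLExtras.h"
using namespace llvm;

// Illustrative only: delete side-effect-free recipes in VPBB whose single
// defined value has no users. The early-inc range keeps iteration valid
// across erasure.
static void eraseUnusedRecipes(VPBasicBlock *VPBB) {
  for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
    if (R.getNumDefinedValues() != 1 || R.mayHaveSideEffects())
      continue;
    if (R.getVPSingleValue()->getNumUsers() == 0)
      R.eraseFromParent();
  }
}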
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
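A sketch of the difference between the two traversals above: the shallow variant treats each region as a single node, while the deep variant also walks the blocks nested inside regions (the counting helper is illustrative):

#include "VPlan.h"
#include "VPlanCFG.h"
using namespace llvm;

// Illustrative only: count the VPBasicBlocks reachable from the plan entry,
// including those nested inside loop and replicate regions.
static unsigned countBasicBlocks(VPlan &Plan) {
  unsigned N = 0;
  for (VPBlockBase *VPB : vp_depth_first_deep(Plan.getEntry()))
    if (isa<VPBasicBlock>(VPB))
      ++N;
  return N;
}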
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2163
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
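For instance (the wrapper is illustrative):

#include "llvm/Support/MathExtras.h"

// Illustrative only: PowerOf2Ceil(6) == 8 and PowerOf2Ceil(8) == 8.
static uint64_t roundUpToPow2(uint64_t N) { return llvm::PowerOf2Ceil(N); }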
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:236
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:550
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1726
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
RecurKind
These are the kinds of recurrences that we support.
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2002
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2009
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
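The casting utilities referenced throughout this file differ only in how they fail; a short sketch (the helper name is made up):

#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

// Illustrative only: isa<> answers yes/no, dyn_cast<> returns nullptr on a
// type mismatch, and cast<> asserts the type is already known to be correct.
static Value *getStoredValueOrNull(Instruction &I) {
  if (auto *SI = dyn_cast<StoreInst>(&I))
    return SI->getValueOperand();
  return nullptr;
}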
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2156
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2136
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
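A sketch of how the hashing helpers combine several keys into one hash_code, e.g. for a map keyed on an opcode plus an operand list (the helper name is made up):

#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// Illustrative only: fold an opcode and an operand list into a single hash.
static hash_code hashKey(unsigned Opcode,
                         const SmallVectorImpl<void *> &Ops) {
  return hash_combine(Opcode, hash_combine_range(Ops.begin(), Ops.end()));
}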
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:761
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:784
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:787
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2403
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:186
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:137
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:226
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3426
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3384
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3510
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3467
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, TypeSize VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the original exit block for first-order recurrences.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB by intro...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.
static void updateScalarResumePhis(VPlan &Plan, DenseMap< VPValue *, VPValue * > &IVEndValues)
Update the resume phis in the scalar preheader after creating wide recipes for first-order recurrence...