1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
44
45using namespace llvm;
46using namespace VPlanPatternMatch;
47using namespace SCEVPatternMatch;
48
50 VPlan &Plan, const TargetLibraryInfo &TLI) {
51
53 Plan.getVectorLoopRegion());
55 // Skip blocks outside region
56 if (!VPBB->getParent())
57 break;
58 VPRecipeBase *Term = VPBB->getTerminator();
59 auto EndIter = Term ? Term->getIterator() : VPBB->end();
60 // Introduce each ingredient into VPlan.
61 for (VPRecipeBase &Ingredient :
62 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
63
64 VPValue *VPV = Ingredient.getVPSingleValue();
65 if (!VPV->getUnderlyingValue())
66 continue;
67
 68 Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
 69
70 VPRecipeBase *NewRecipe = nullptr;
71 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
72 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
73 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
74 for (VPValue *Op : PhiR->operands())
75 NewRecipe->addOperand(Op);
76 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
77 assert(!isa<PHINode>(Inst) && "phis should be handled above");
78 // Create VPWidenMemoryRecipe for loads and stores.
79 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
80 NewRecipe = new VPWidenLoadRecipe(
81 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
82 false /*Consecutive*/, false /*Reverse*/, *VPI,
83 Ingredient.getDebugLoc());
84 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
85 NewRecipe = new VPWidenStoreRecipe(
86 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
87 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
88 Ingredient.getDebugLoc());
 89 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
 90 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
91 Ingredient.getDebugLoc());
92 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
93 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
94 if (VectorID == Intrinsic::not_intrinsic)
95 return false;
96 NewRecipe = new VPWidenIntrinsicRecipe(
97 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
98 drop_end(Ingredient.operands()), CI->getType(), VPIRFlags(*CI),
99 *VPI, CI->getDebugLoc());
100 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
101 NewRecipe = new VPWidenCastRecipe(
102 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
103 VPIRFlags(*CI), VPIRMetadata(*CI));
104 } else {
105 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
106 *VPI, Ingredient.getDebugLoc());
107 }
108 } else {
110 "inductions must be created earlier");
111 continue;
112 }
113
114 NewRecipe->insertBefore(&Ingredient);
115 if (NewRecipe->getNumDefinedValues() == 1)
116 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
117 else
118 assert(NewRecipe->getNumDefinedValues() == 0 &&
119 "Only recpies with zero or one defined values expected");
120 Ingredient.eraseFromParent();
121 }
122 }
123 return true;
124}
125
 126/// Helper for extra no-alias checks, via a set of known-safe recipes to skip and SCEV-based distance checks.
128 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
129 VPReplicateRecipe &GroupLeader;
131 const Loop &L;
132 VPTypeAnalysis &TypeInfo;
133
 134 // Return true if \p A and \p B are known to not alias for all VFs in the
 135 // plan, checked via the distance between the accesses.
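  // A worked example of the check below (illustrative values only, not taken
  // from the source): if the SCEV distance between the two store addresses is
  // 64 bytes, the largest fixed VF in the plan is 8 and the wider of the two
  // stores is 4 bytes, the maximum footprint per vector iteration is
  // 8 * 4 = 32 bytes. 32 <= 64, so the accesses cannot overlap for any VF in
  // the plan.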
136 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
137 if (A->getOpcode() != Instruction::Store ||
138 B->getOpcode() != Instruction::Store)
139 return false;
140
141 VPValue *AddrA = A->getOperand(1);
142 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
143 VPValue *AddrB = B->getOperand(1);
144 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
 145 if (isa<SCEVCouldNotCompute>(SCEVA) || isa<SCEVCouldNotCompute>(SCEVB))
 146 return false;
147
148 const APInt *Distance;
149 ScalarEvolution &SE = *PSE.getSE();
150 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
151 return false;
152
153 const DataLayout &DL = SE.getDataLayout();
154 Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
155 uint64_t SizeA = DL.getTypeStoreSize(TyA);
156 Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
157 uint64_t SizeB = DL.getTypeStoreSize(TyB);
158
159 // Use the maximum store size to ensure no overlap from either direction.
160 // Currently only handles fixed sizes, as it is only used for
161 // replicating VPReplicateRecipes.
162 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
163
164 auto VFs = B->getParent()->getPlan()->vectorFactors();
166 if (MaxVF.isScalable())
167 return false;
168 return Distance->abs().uge(
169 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
170 }
171
172public:
175 const Loop &L, VPTypeAnalysis &TypeInfo)
176 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
177 L(L), TypeInfo(TypeInfo) {}
178
179 /// Return true if \p R should be skipped during alias checking, either
180 /// because it's in the exclude set or because no-alias can be proven via
181 /// SCEV.
182 bool shouldSkip(VPRecipeBase &R) const {
183 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
184 return ExcludeRecipes.contains(&R) ||
185 (Store && isNoAliasViaDistance(Store, &GroupLeader));
186 }
187};
188
189/// Check if a memory operation doesn't alias with memory operations in blocks
190/// between \p FirstBB and \p LastBB using scoped noalias metadata. If
191/// \p SinkInfo is std::nullopt, only recipes that may write to memory are
192/// checked (for load hoisting). Otherwise recipes that both read and write
193/// memory are checked, and SCEV is used to prove no-alias between the group
194/// leader and other replicate recipes (for store sinking).
195static bool
197 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
198 std::optional<SinkStoreInfo> SinkInfo = {}) {
199 bool CheckReads = SinkInfo.has_value();
200 if (!MemLoc.AATags.Scope)
201 return false;
202
203 const AAMDNodes &MemAA = MemLoc.AATags;
204
205 for (VPBlockBase *Block = FirstBB; Block;
206 Block = Block->getSingleSuccessor()) {
207 assert(Block->getNumSuccessors() <= 1 &&
208 "Expected at most one successor in block chain");
209 auto *VPBB = cast<VPBasicBlock>(Block);
210 for (VPRecipeBase &R : *VPBB) {
211 if (SinkInfo && SinkInfo->shouldSkip(R))
212 continue;
213
214 // Skip recipes that don't need checking.
215 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
216 continue;
217
219 if (!Loc)
220 // Conservatively assume aliasing for memory operations without
221 // location.
222 return false;
223
224 // For reads, check if they don't alias in the reverse direction and
225 // skip if so.
226 if (CheckReads && R.mayReadFromMemory() &&
228 MemAA.NoAlias))
229 continue;
230
231 // Check if the memory operations may alias in the forward direction.
233 Loc->AATags.NoAlias))
234 return false;
235 }
236
237 if (Block == LastBB)
238 break;
239 }
240 return true;
241}
242
243/// Return true if we do not know how to (mechanically) hoist or sink \p R out
244/// of a loop region.
 246 // Assume intrinsics don't alias anything or throw; as long as they're
 247 // guaranteed to execute, they're safe to hoist.
249 return false;
250
251 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
252 // memory location is not modified in the vector loop.
253 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
254 return true;
255
256 // Allocas cannot be hoisted.
257 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
258 return RepR && RepR->getOpcode() == Instruction::Alloca;
259}
260
261static bool sinkScalarOperands(VPlan &Plan) {
262 auto Iter = vp_depth_first_deep(Plan.getEntry());
263 bool ScalarVFOnly = Plan.hasScalarVFOnly();
264 bool Changed = false;
265
267 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
268 VPBasicBlock *SinkTo, VPValue *Op) {
269 auto *Candidate =
270 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
271 if (!Candidate)
272 return;
273
274 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
275 // for now.
 276 if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate))
 277 return;
278
279 if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
280 return;
281
282 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
283 if (!ScalarVFOnly && RepR->isSingleScalar())
284 return;
285
286 WorkList.insert({SinkTo, Candidate});
287 };
288
289 // First, collect the operands of all recipes in replicate blocks as seeds for
290 // sinking.
292 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
293 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
294 continue;
295 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
296 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
297 continue;
298 for (auto &Recipe : *VPBB)
299 for (VPValue *Op : Recipe.operands())
300 InsertIfValidSinkCandidate(VPBB, Op);
301 }
302
303 // Try to sink each replicate or scalar IV steps recipe in the worklist.
304 for (unsigned I = 0; I != WorkList.size(); ++I) {
305 VPBasicBlock *SinkTo;
306 VPSingleDefRecipe *SinkCandidate;
307 std::tie(SinkTo, SinkCandidate) = WorkList[I];
308
309 // All recipe users of SinkCandidate must be in the same block SinkTo or all
310 // users outside of SinkTo must only use the first lane of SinkCandidate. In
311 // the latter case, we need to duplicate SinkCandidate.
312 auto UsersOutsideSinkTo =
313 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
314 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
315 });
316 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
317 return !U->usesFirstLaneOnly(SinkCandidate);
318 }))
319 continue;
320 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
321
322 if (NeedsDuplicating) {
323 if (ScalarVFOnly)
324 continue;
325 VPSingleDefRecipe *Clone;
326 if (auto *SinkCandidateRepR =
327 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
328 // TODO: Handle converting to uniform recipes as separate transform,
329 // then cloning should be sufficient here.
330 Instruction *I = SinkCandidate->getUnderlyingInstr();
331 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
332 nullptr /*Mask*/, *SinkCandidateRepR,
333 *SinkCandidateRepR);
334 // TODO: add ".cloned" suffix to name of Clone's VPValue.
335 } else {
336 Clone = SinkCandidate->clone();
337 }
338
339 Clone->insertBefore(SinkCandidate);
340 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
341 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
342 });
343 }
344 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
345 for (VPValue *Op : SinkCandidate->operands())
346 InsertIfValidSinkCandidate(SinkTo, Op);
347 Changed = true;
348 }
349 return Changed;
350}
351
352/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
353/// the mask.
355 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
356 if (!EntryBB || EntryBB->size() != 1 ||
357 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
358 return nullptr;
359
360 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
361}
362
363/// If \p R is a triangle region, return the 'then' block of the triangle.
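/// Schematic shape of such a triangle region (editorial illustration):
///
///        entry
///        /   \
///    then     |
///        \   /
///        merge
///
/// where the 'then' block has the entry block as its single predecessor and
/// the merge block as its single successor.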
365 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
366 if (EntryBB->getNumSuccessors() != 2)
367 return nullptr;
368
369 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
370 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
371 if (!Succ0 || !Succ1)
372 return nullptr;
373
374 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
375 return nullptr;
376 if (Succ0->getSingleSuccessor() == Succ1)
377 return Succ0;
378 if (Succ1->getSingleSuccessor() == Succ0)
379 return Succ1;
380 return nullptr;
381}
382
383// Merge replicate regions into their successor region, if a replicate region
384// is connected to a successor replicate region with the same predicate by a
385// single, empty VPBasicBlock.
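// A schematic example (editorial illustration, simplified notation):
//
//   region.1 (mask %m):  then.1:  %a = replicated-load %p
//   middle:              (empty VPBasicBlock)
//   region.2 (mask %m):  then.2:  replicated-store %q, %a
//
// Because both regions are guarded by the same mask %m, the recipes from
// then.1 are moved into then.2, region.1's predecessors are connected to the
// empty middle block instead, and region.1 is removed.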
387 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
388
 389 // Collect replicate regions followed by an empty block, followed by another
 390 // replicate region with matching masks, to process up front. This is to
 391 // avoid iterator invalidation issues while merging regions.
394 vp_depth_first_deep(Plan.getEntry()))) {
395 if (!Region1->isReplicator())
396 continue;
397 auto *MiddleBasicBlock =
398 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
399 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
400 continue;
401
402 auto *Region2 =
403 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
404 if (!Region2 || !Region2->isReplicator())
405 continue;
406
407 VPValue *Mask1 = getPredicatedMask(Region1);
408 VPValue *Mask2 = getPredicatedMask(Region2);
409 if (!Mask1 || Mask1 != Mask2)
410 continue;
411
 412 assert(Mask1 && Mask2 && "both regions must have conditions");
413 WorkList.push_back(Region1);
414 }
415
416 // Move recipes from Region1 to its successor region, if both are triangles.
417 for (VPRegionBlock *Region1 : WorkList) {
418 if (TransformedRegions.contains(Region1))
419 continue;
420 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
421 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
422
423 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
424 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
425 if (!Then1 || !Then2)
426 continue;
427
428 // Note: No fusion-preventing memory dependencies are expected in either
429 // region. Such dependencies should be rejected during earlier dependence
430 // checks, which guarantee accesses can be re-ordered for vectorization.
431 //
432 // Move recipes to the successor region.
433 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
434 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
435
436 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
437 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
438
439 // Move VPPredInstPHIRecipes from the merge block to the successor region's
440 // merge block. Update all users inside the successor region to use the
441 // original values.
442 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
443 VPValue *PredInst1 =
444 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
445 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
446 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
447 return cast<VPRecipeBase>(&U)->getParent() == Then2;
448 });
449
450 // Remove phi recipes that are unused after merging the regions.
451 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
452 Phi1ToMove.eraseFromParent();
453 continue;
454 }
455 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
456 }
457
458 // Remove the dead recipes in Region1's entry block.
459 for (VPRecipeBase &R :
460 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
461 R.eraseFromParent();
462
463 // Finally, remove the first region.
464 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
465 VPBlockUtils::disconnectBlocks(Pred, Region1);
466 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
467 }
468 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
469 TransformedRegions.insert(Region1);
470 }
471
472 return !TransformedRegions.empty();
473}
474
476 VPlan &Plan) {
477 Instruction *Instr = PredRecipe->getUnderlyingInstr();
478 // Build the triangular if-then region.
479 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
480 assert(Instr->getParent() && "Predicated instruction not in any basic block");
481 auto *BlockInMask = PredRecipe->getMask();
482 auto *MaskDef = BlockInMask->getDefiningRecipe();
483 auto *BOMRecipe = new VPBranchOnMaskRecipe(
484 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
485 auto *Entry =
486 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
487
488 // Replace predicated replicate recipe with a replicate recipe without a
489 // mask but in the replicate region.
490 auto *RecipeWithoutMask = new VPReplicateRecipe(
491 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
492 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
493 PredRecipe->getDebugLoc());
494 auto *Pred =
495 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
496
497 VPPredInstPHIRecipe *PHIRecipe = nullptr;
498 if (PredRecipe->getNumUsers() != 0) {
499 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
500 RecipeWithoutMask->getDebugLoc());
501 PredRecipe->replaceAllUsesWith(PHIRecipe);
502 PHIRecipe->setOperand(0, RecipeWithoutMask);
503 }
504 PredRecipe->eraseFromParent();
505 auto *Exiting =
506 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
508 Plan.createReplicateRegion(Entry, Exiting, RegionName);
509
510 // Note: first set Entry as region entry and then connect successors starting
511 // from it in order, to propagate the "parent" of each VPBasicBlock.
512 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
513 VPBlockUtils::connectBlocks(Pred, Exiting);
514
515 return Region;
516}
517
518static void addReplicateRegions(VPlan &Plan) {
521 vp_depth_first_deep(Plan.getEntry()))) {
522 for (VPRecipeBase &R : *VPBB)
523 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
524 if (RepR->isPredicated())
525 WorkList.push_back(RepR);
526 }
527 }
528
529 unsigned BBNum = 0;
530 for (VPReplicateRecipe *RepR : WorkList) {
531 VPBasicBlock *CurrentBlock = RepR->getParent();
532 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
533
534 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
535 SplitBlock->setName(
536 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
537 // Record predicated instructions for above packing optimizations.
539 Region->setParent(CurrentBlock->getParent());
541
542 VPRegionBlock *ParentRegion = Region->getParent();
543 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
544 ParentRegion->setExiting(SplitBlock);
545 }
546}
547
548/// Remove redundant VPBasicBlocks by merging them into their predecessor if
549/// the predecessor has a single successor.
553 vp_depth_first_deep(Plan.getEntry()))) {
554 // Don't fold the blocks in the skeleton of the Plan into their single
555 // predecessors for now.
556 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
557 if (!VPBB->getParent())
558 continue;
559 auto *PredVPBB =
560 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
561 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
562 isa<VPIRBasicBlock>(PredVPBB))
563 continue;
564 WorkList.push_back(VPBB);
565 }
566
567 for (VPBasicBlock *VPBB : WorkList) {
568 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
569 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
570 R.moveBefore(*PredVPBB, PredVPBB->end());
571 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
572 auto *ParentRegion = VPBB->getParent();
573 if (ParentRegion && ParentRegion->getExiting() == VPBB)
574 ParentRegion->setExiting(PredVPBB);
575 for (auto *Succ : to_vector(VPBB->successors())) {
577 VPBlockUtils::connectBlocks(PredVPBB, Succ);
578 }
579 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
580 }
581 return !WorkList.empty();
582}
583
585 // Convert masked VPReplicateRecipes to if-then region blocks.
587
588 bool ShouldSimplify = true;
589 while (ShouldSimplify) {
590 ShouldSimplify = sinkScalarOperands(Plan);
591 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
592 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
593 }
594}
595
596/// Remove redundant casts of inductions.
597///
598/// Such redundant casts are casts of induction variables that can be ignored,
599/// because we already proved that the casted phi is equal to the uncasted phi
600/// in the vectorized loop. There is no need to vectorize the cast - the same
601/// value can be used for both the phi and casts in the vector loop.
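/// An illustrative sketch (editorial, simplified notation):
///
///   %iv   = widen-induction %start, %step
///   %cast = cast %iv          ;; last cast in the recorded cast chain
///   ...   = use %cast
///
/// is rewritten so the user takes %iv directly; the now-dead cast recipes are
/// cleaned up by later dead-recipe removal.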
603 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
 604 auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
 605 if (!IV || IV->getTruncInst())
606 continue;
607
608 // A sequence of IR Casts has potentially been recorded for IV, which
609 // *must be bypassed* when the IV is vectorized, because the vectorized IV
610 // will produce the desired casted value. This sequence forms a def-use
611 // chain and is provided in reverse order, ending with the cast that uses
612 // the IV phi. Search for the recipe of the last cast in the chain and
613 // replace it with the original IV. Note that only the final cast is
614 // expected to have users outside the cast-chain and the dead casts left
615 // over will be cleaned up later.
616 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
617 VPValue *FindMyCast = IV;
618 for (Instruction *IRCast : reverse(Casts)) {
619 VPSingleDefRecipe *FoundUserCast = nullptr;
620 for (auto *U : FindMyCast->users()) {
621 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
622 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
623 FoundUserCast = UserCast;
624 break;
625 }
626 }
627 FindMyCast = FoundUserCast;
628 }
629 FindMyCast->replaceAllUsesWith(IV);
630 }
631}
632
633/// Try to replace VPWidenCanonicalIVRecipes with an existing widened
634/// canonical induction recipe, if one exists.
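/// For example (editorial sketch): a VPWidenCanonicalIVRecipe used, e.g., to
/// compute the header mask can be replaced by an existing canonical
/// VPWidenIntOrFpInductionRecipe (start 0, step 1), after dropping the
/// latter's poison-generating wrap flags, so only a single widened IV remains.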
636 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
637 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
638 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
639 for (VPUser *U : CanonicalIV->users()) {
 640 WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
 641 if (WidenNewIV)
642 break;
643 }
644
645 if (!WidenNewIV)
646 return;
647
648 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
649 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
650 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
651
652 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
653 continue;
654
655 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
656 // everything WidenNewIV's users need. That is, WidenOriginalIV will
657 // generate a vector phi or all users of WidenNewIV demand the first lane
658 // only.
659 if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
660 vputils::onlyFirstLaneUsed(WidenNewIV)) {
 661 // We are replacing a wide canonical IV with a suitable wide induction.
 662 // The result is used to compute the header mask, hence all lanes will be
 663 // used and we need to drop wrap flags that only apply to lanes guaranteed
 664 // to execute in the original scalar loop.
665 WidenOriginalIV->dropPoisonGeneratingFlags();
666 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
667 WidenNewIV->eraseFromParent();
668 return;
669 }
670 }
671}
672
673/// Returns true if \p R is dead and can be removed.
674static bool isDeadRecipe(VPRecipeBase &R) {
675 // Do remove conditional assume instructions as their conditions may be
676 // flattened.
677 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
678 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
680 if (IsConditionalAssume)
681 return true;
682
683 if (R.mayHaveSideEffects())
684 return false;
685
686 // Recipe is dead if no user keeps the recipe alive.
687 return all_of(R.definedValues(),
688 [](VPValue *V) { return V->getNumUsers() == 0; });
689}
690
693 vp_post_order_deep(Plan.getEntry()))) {
694 // The recipes in the block are processed in reverse order, to catch chains
695 // of dead recipes.
696 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
697 if (isDeadRecipe(R)) {
698 R.eraseFromParent();
699 continue;
700 }
701
702 // Check if R is a dead VPPhi <-> update cycle and remove it.
703 auto *PhiR = dyn_cast<VPPhi>(&R);
704 if (!PhiR || PhiR->getNumOperands() != 2)
705 continue;
706 VPUser *PhiUser = PhiR->getSingleUser();
707 if (!PhiUser)
708 continue;
709 VPValue *Incoming = PhiR->getOperand(1);
710 if (PhiUser != Incoming->getDefiningRecipe() ||
711 Incoming->getNumUsers() != 1)
712 continue;
713 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
714 PhiR->eraseFromParent();
715 Incoming->getDefiningRecipe()->eraseFromParent();
716 }
717 }
718}
719
722 Instruction::BinaryOps InductionOpcode,
723 FPMathOperator *FPBinOp, Instruction *TruncI,
724 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
725 VPBuilder &Builder) {
726 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
727 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
728 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
729 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
730 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
731
732 // Truncate base induction if needed.
733 VPTypeAnalysis TypeInfo(Plan);
734 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
735 if (TruncI) {
736 Type *TruncTy = TruncI->getType();
737 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
738 "Not truncating.");
739 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
740 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
741 ResultTy = TruncTy;
742 }
743
744 // Truncate step if needed.
745 Type *StepTy = TypeInfo.inferScalarType(Step);
746 if (ResultTy != StepTy) {
747 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
748 "Not truncating.");
749 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
750 auto *VecPreheader =
752 VPBuilder::InsertPointGuard Guard(Builder);
753 Builder.setInsertPoint(VecPreheader);
754 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
755 }
756 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
757 &Plan.getVF(), DL);
758}
759
762 for (unsigned I = 0; I != Users.size(); ++I) {
764 if (isa<VPHeaderPHIRecipe>(Cur))
765 continue;
766 for (VPValue *V : Cur->definedValues())
767 Users.insert_range(V->users());
768 }
769 return Users.takeVector();
770}
771
772/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
773/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
774/// generates scalar values.
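/// An illustrative result (editorial sketch), for a pointer IV with start
/// %start and step %step whose users only need scalar values:
///
///   %steps    = scalar-iv-steps 0, %step
///   %next.gep = ptradd %start, %steps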
775static VPValue *
777 VPlan &Plan, VPBuilder &Builder) {
779 VPIRValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0);
780 VPValue *StepV = PtrIV->getOperand(1);
782 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
783 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
784
785 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
786 PtrIV->getDebugLoc(), "next.gep");
787}
788
789/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
790/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
791/// VPWidenPointerInductionRecipe will generate vectors only. If some users
792/// require vectors while others require scalars, the scalar uses need to extract
793/// the scalars from the generated vectors (note that this is different from how
794/// int/fp inductions are handled). Legalize extract-from-ends using uniform
795/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
796/// the correct end value is available. Also optimize
797/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
798/// providing them scalar steps built on the canonical scalar IV and update the
799/// original IV's users. This is an optional optimization to reduce the need
800/// for vector extracts.
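/// For the int/fp case, an illustrative rewrite (editorial sketch): users of a
/// wide IV %wide.iv with start %start and step %step that only need scalar
/// values are redirected to
///
///   %scalar.steps = scalar-iv-steps %start, %step
///
/// while remaining vector users keep using %wide.iv.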
803 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
804 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
805 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
806 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
807 if (!PhiR)
808 continue;
809
810 // Try to narrow wide and replicating recipes to uniform recipes, based on
811 // VPlan analysis.
812 // TODO: Apply to all recipes in the future, to replace legacy uniformity
813 // analysis.
814 auto Users = collectUsersRecursively(PhiR);
815 for (VPUser *U : reverse(Users)) {
816 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
817 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
818 // Skip recipes that shouldn't be narrowed.
819 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
820 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
821 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
822 continue;
823
 824 // Skip recipes for which lanes other than the first may be used.
826 continue;
827
828 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
829 Def->operands(), /*IsUniform*/ true,
830 /*Mask*/ nullptr, /*Flags*/ *Def);
831 Clone->insertAfter(Def);
832 Def->replaceAllUsesWith(Clone);
833 }
834
835 // Replace wide pointer inductions which have only their scalars used by
836 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
837 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
838 if (!Plan.hasScalarVFOnly() &&
839 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
840 continue;
841
842 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
843 PtrIV->replaceAllUsesWith(PtrAdd);
844 continue;
845 }
846
847 // Replace widened induction with scalar steps for users that only use
848 // scalars.
849 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
850 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
851 return U->usesScalars(WideIV);
852 }))
853 continue;
854
855 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
857 Plan, ID.getKind(), ID.getInductionOpcode(),
858 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
859 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
860 WideIV->getDebugLoc(), Builder);
861
862 // Update scalar users of IV to use Step instead.
863 if (!HasOnlyVectorVFs) {
864 assert(!Plan.hasScalableVF() &&
865 "plans containing a scalar VF cannot also include scalable VFs");
866 WideIV->replaceAllUsesWith(Steps);
867 } else {
868 bool HasScalableVF = Plan.hasScalableVF();
869 WideIV->replaceUsesWithIf(Steps,
870 [WideIV, HasScalableVF](VPUser &U, unsigned) {
871 if (HasScalableVF)
872 return U.usesFirstLaneOnly(WideIV);
873 return U.usesScalars(WideIV);
874 });
875 }
876 }
877}
878
879/// Check if \p VPV is an untruncated wide induction, either before or after the
880/// increment. If so return the header IV (before the increment), otherwise
881/// return null.
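/// For example (editorial sketch), given
///
///   %iv     = widen-induction %start, %step
///   %iv.inc = add %iv, %step
///
/// both %iv and %iv.inc are optimizable and the header IV %iv is returned for
/// either of them.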
884 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
885 if (WideIV) {
886 // VPV itself is a wide induction, separately compute the end value for exit
887 // users if it is not a truncated IV.
888 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
889 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
890 }
891
892 // Check if VPV is an optimizable induction increment.
893 VPRecipeBase *Def = VPV->getDefiningRecipe();
894 if (!Def || Def->getNumOperands() != 2)
895 return nullptr;
896 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
897 if (!WideIV)
898 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
899 if (!WideIV)
900 return nullptr;
901
902 auto IsWideIVInc = [&]() {
903 auto &ID = WideIV->getInductionDescriptor();
904
905 // Check if VPV increments the induction by the induction step.
906 VPValue *IVStep = WideIV->getStepValue();
907 switch (ID.getInductionOpcode()) {
908 case Instruction::Add:
909 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
910 case Instruction::FAdd:
911 return match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
912 case Instruction::FSub:
913 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
914 m_Specific(IVStep)));
915 case Instruction::Sub: {
916 // IVStep will be the negated step of the subtraction. Check if Step == -1
917 // * IVStep.
918 VPValue *Step;
919 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
920 return false;
921 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
922 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
923 ScalarEvolution &SE = *PSE.getSE();
924 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
925 !isa<SCEVCouldNotCompute>(StepSCEV) &&
926 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
927 }
928 default:
929 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
930 match(VPV, m_GetElementPtr(m_Specific(WideIV),
931 m_Specific(WideIV->getStepValue())));
932 }
933 llvm_unreachable("should have been covered by switch above");
934 };
935 return IsWideIVInc() ? WideIV : nullptr;
936}
937
938/// Attempts to optimize the induction variable exit values for users in the
939/// early exit block.
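/// The end value is reconstructed roughly as follows (editorial sketch):
///
///   %first.lane = first-active-lane %mask
///   %end        = add %canonical.iv, %first.lane
///
/// with 1 added to the index when the exit uses the incremented IV, and a
/// derived IV computed from that index if the induction is not the canonical
/// one.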
941 VPTypeAnalysis &TypeInfo,
942 VPBlockBase *PredVPBB,
943 VPValue *Op,
945 VPValue *Incoming, *Mask;
948 return nullptr;
949
950 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
951 if (!WideIV)
952 return nullptr;
953
954 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
955 if (WideIntOrFp && WideIntOrFp->getTruncInst())
956 return nullptr;
957
958 // Calculate the final index.
959 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
960 auto *CanonicalIV = LoopRegion->getCanonicalIV();
961 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
962 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
963
964 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
965 VPValue *FirstActiveLane =
966 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
967 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
968 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
969 FirstActiveLaneType, DL);
970 VPValue *EndValue =
971 B.createNaryOp(Instruction::Add, {CanonicalIV, FirstActiveLane}, DL);
972
 973 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if
 974 // Incoming differs from it, the exit is using the incremented value and we
 975 // need to account for one extra step.
976 if (Incoming != WideIV) {
977 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
978 EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
979 }
980
981 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
982 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
983 VPIRValue *Start = WideIV->getStartValue();
984 VPValue *Step = WideIV->getStepValue();
985 EndValue = B.createDerivedIV(
986 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
987 Start, EndValue, Step);
988 }
989
990 return EndValue;
991}
992
993/// Attempts to optimize the induction variable exit values for users in the
994/// exit block coming from the latch in the original scalar loop.
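/// For example (editorial sketch): if the exit phi receives the
/// pre-incremented IV, the pre-computed end value is stepped back once, e.g.
/// %ind.escape = sub %end.value, %step for integer inductions.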
996 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
1000 return nullptr;
1001
1002 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
1003 if (!WideIV)
1004 return nullptr;
1005
1006 VPValue *EndValue = EndValues.lookup(WideIV);
1007 assert(EndValue && "end value must have been pre-computed");
1008
 1009 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if
 1010 // Incoming differs from it, the exit is using the incremented value and we
 1011 // don't need to subtract the step.
1012 if (Incoming != WideIV)
1013 return EndValue;
1014
1015 // Otherwise, subtract the step from the EndValue.
1016 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
1017 VPValue *Step = WideIV->getStepValue();
1018 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
1019 if (ScalarTy->isIntegerTy())
1020 return B.createNaryOp(Instruction::Sub, {EndValue, Step},
1021 DebugLoc::getUnknown(), "ind.escape");
1022 if (ScalarTy->isPointerTy()) {
1023 Type *StepTy = TypeInfo.inferScalarType(Step);
1024 auto *Zero = Plan.getConstantInt(StepTy, 0);
1025 return B.createPtrAdd(EndValue,
1026 B.createNaryOp(Instruction::Sub, {Zero, Step}),
1027 DebugLoc::getUnknown(), "ind.escape");
1028 }
1029 if (ScalarTy->isFloatingPointTy()) {
1030 const auto &ID = WideIV->getInductionDescriptor();
1031 return B.createNaryOp(
1032 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1033 ? Instruction::FSub
1034 : Instruction::FAdd,
1035 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1036 }
1037 llvm_unreachable("all possible induction types must be handled");
1038 return nullptr;
1039}
1040
1042 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
1044 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
1045 VPTypeAnalysis TypeInfo(Plan);
1046 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1047 for (VPRecipeBase &R : ExitVPBB->phis()) {
1048 auto *ExitIRI = cast<VPIRPhi>(&R);
1049
1050 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1051 VPValue *Escape = nullptr;
1052 if (PredVPBB == MiddleVPBB)
1053 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
1054 ExitIRI->getOperand(Idx),
1055 EndValues, PSE);
1056 else
1058 Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), PSE);
1059 if (Escape)
1060 ExitIRI->setOperand(Idx, Escape);
1061 }
1062 }
1063 }
1064}
1065
1066/// Remove redundant VPExpandSCEVRecipes in \p Plan's entry block by replacing
1067/// them with already existing recipes expanding the same SCEV expression.
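/// For example (editorial sketch): two VPExpandSCEVRecipes in the entry block
/// expanding the same SCEV, e.g. (4 * %n), are de-duplicated; the second one
/// is replaced by the first and erased.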
1070
1071 for (VPRecipeBase &R :
1073 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1074 if (!ExpR)
1075 continue;
1076
1077 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1078 if (Inserted)
1079 continue;
1080 ExpR->replaceAllUsesWith(V->second);
1081 ExpR->eraseFromParent();
1082 }
1083}
1084
1086 SmallVector<VPValue *> WorkList;
1088 WorkList.push_back(V);
1089
1090 while (!WorkList.empty()) {
1091 VPValue *Cur = WorkList.pop_back_val();
1092 if (!Seen.insert(Cur).second)
1093 continue;
1094 VPRecipeBase *R = Cur->getDefiningRecipe();
1095 if (!R)
1096 continue;
1097 if (!isDeadRecipe(*R))
1098 continue;
1099 append_range(WorkList, R->operands());
1100 R->eraseFromParent();
1101 }
1102}
1103
1104/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1105/// Returns an optional pair, where the first element indicates whether it is
1106/// an intrinsic ID.
1107static std::optional<std::pair<bool, unsigned>>
1109 return TypeSwitch<const VPSingleDefRecipe *,
1110 std::optional<std::pair<bool, unsigned>>>(R)
1113 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1114 .Case([](const VPWidenIntrinsicRecipe *I) {
1115 return std::make_pair(true, I->getVectorIntrinsicID());
1116 })
1117 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
1118 // For recipes that do not directly map to LLVM IR instructions,
1119 // assign opcodes after the last VPInstruction opcode (which is also
1120 // after the last IR Instruction opcode), based on the VPRecipeID.
1121 return std::make_pair(false,
1122 VPInstruction::OpsEnd + 1 + I->getVPRecipeID());
1123 })
1124 .Default([](auto *) { return std::nullopt; });
1125}
1126
1127/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1128/// non-nullptr VPValue for a handled opcode or intrinsic ID if the
1129/// corresponding \p Operands are foldable live-ins.
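/// For example (editorial illustration): a recipe computing `add i64 2, 3`
/// whose operands are both constant live-ins folds to the live-in constant 5,
/// which is returned and can then replace the recipe's uses.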
1131 ArrayRef<VPValue *> Operands,
1132 const DataLayout &DL,
1133 VPTypeAnalysis &TypeInfo) {
1134 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1135 if (!OpcodeOrIID)
1136 return nullptr;
1137
1139 for (VPValue *Op : Operands) {
1140 if (!match(Op, m_LiveIn()))
1141 return nullptr;
1142 Value *V = Op->getUnderlyingValue();
1143 if (!V)
1144 return nullptr;
1145 Ops.push_back(V);
1146 }
1147
1148 auto FoldToIRValue = [&]() -> Value * {
1149 InstSimplifyFolder Folder(DL);
1150 if (OpcodeOrIID->first) {
1151 if (R.getNumOperands() != 2)
1152 return nullptr;
1153 unsigned ID = OpcodeOrIID->second;
1154 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1155 TypeInfo.inferScalarType(&R));
1156 }
1157 unsigned Opcode = OpcodeOrIID->second;
1158 if (Instruction::isBinaryOp(Opcode))
1159 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1160 Ops[0], Ops[1]);
1161 if (Instruction::isCast(Opcode))
1162 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1163 TypeInfo.inferScalarType(R.getVPSingleValue()));
1164 switch (Opcode) {
1166 return Folder.FoldSelect(Ops[0], Ops[1],
1168 case VPInstruction::Not:
1169 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1171 case Instruction::Select:
1172 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1173 case Instruction::ICmp:
1174 case Instruction::FCmp:
1175 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1176 Ops[1]);
1177 case Instruction::GetElementPtr: {
1178 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1179 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1180 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1181 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1182 }
1185 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1186 Ops[0], Ops[1],
1187 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1188 // An extract of a live-in is an extract of a broadcast, so return the
1189 // broadcasted element.
1190 case Instruction::ExtractElement:
1191 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1192 return Ops[0];
1193 }
1194 return nullptr;
1195 };
1196
1197 if (Value *V = FoldToIRValue())
1198 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1199 return nullptr;
1200}
1201
1202/// Try to simplify VPSingleDefRecipe \p Def.
1204 VPlan *Plan = Def->getParent()->getPlan();
1205
1206 // Simplification of live-in IR values for SingleDef recipes using
1207 // InstSimplifyFolder.
1208 const DataLayout &DL =
1210 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1211 return Def->replaceAllUsesWith(V);
1212
1213 // Fold PredPHI LiveIn -> LiveIn.
1214 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1215 VPValue *Op = PredPHI->getOperand(0);
1216 if (isa<VPIRValue>(Op))
1217 PredPHI->replaceAllUsesWith(Op);
1218 }
1219
1220 VPBuilder Builder(Def);
1221 VPValue *A;
1222 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1223 Type *TruncTy = TypeInfo.inferScalarType(Def);
1224 Type *ATy = TypeInfo.inferScalarType(A);
1225 if (TruncTy == ATy) {
1226 Def->replaceAllUsesWith(A);
1227 } else {
1228 // Don't replace a scalarizing recipe with a widened cast.
1229 if (isa<VPReplicateRecipe>(Def))
1230 return;
1231 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1232
1233 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1234 ? Instruction::SExt
1235 : Instruction::ZExt;
1236 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1237 TruncTy);
1238 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1239 // UnderlyingExt has distinct return type, used to retain legacy cost.
1240 Ext->setUnderlyingValue(UnderlyingExt);
1241 }
1242 Def->replaceAllUsesWith(Ext);
1243 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1244 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1245 Def->replaceAllUsesWith(Trunc);
1246 }
1247 }
1248#ifndef NDEBUG
 1249 // Verify that the cached type info for both A and its users is still
 1250 // accurate by comparing it to freshly computed types.
1251 VPTypeAnalysis TypeInfo2(*Plan);
1252 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1253 for (VPUser *U : A->users()) {
1254 auto *R = cast<VPRecipeBase>(U);
1255 for (VPValue *VPV : R->definedValues())
1256 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1257 }
1258#endif
1259 }
1260
1261 // Simplify (X && Y) || (X && !Y) -> X.
1262 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1263 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1264 // recipes to be visited during simplification.
1265 VPValue *X, *Y, *Z;
1266 if (match(Def,
1269 Def->replaceAllUsesWith(X);
1270 Def->eraseFromParent();
1271 return;
1272 }
1273
1274 // x | 1 -> 1
1275 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1276 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1277
1278 // x | 0 -> x
1279 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1280 return Def->replaceAllUsesWith(X);
1281
1282 // x & 0 -> 0
1283 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1284 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1285
1286 // x && false -> false
1287 if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
1288 return Def->replaceAllUsesWith(Def->getOperand(1));
1289
1290 // (x && y) || (x && z) -> x && (y || z)
1293 // Simplify only if one of the operands has one use to avoid creating an
1294 // extra recipe.
1295 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1296 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1297 return Def->replaceAllUsesWith(
1298 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1299
1300 // x && !x -> 0
1302 return Def->replaceAllUsesWith(Plan->getFalse());
1303
1304 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1305 return Def->replaceAllUsesWith(X);
1306
1307 // select c, false, true -> not c
1308 VPValue *C;
1309 if (match(Def, m_Select(m_VPValue(C), m_False(), m_True())))
1310 return Def->replaceAllUsesWith(Builder.createNot(C));
1311
1312 // select !c, x, y -> select c, y, x
1313 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1314 Def->setOperand(0, C);
1315 Def->setOperand(1, Y);
1316 Def->setOperand(2, X);
1317 return;
1318 }
1319
1320 // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
1321 // tail folding it is likely that x is a header mask and can be simplified
1322 // further.
1324 m_VPValue(Z))) &&
1325 X->hasMoreThanOneUniqueUser())
1326 return Def->replaceAllUsesWith(
1327 Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
1328
1329 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1330 return Def->replaceAllUsesWith(A);
1331
1332 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1333 return Def->replaceAllUsesWith(A);
1334
1335 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1336 return Def->replaceAllUsesWith(
1337 Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0));
1338
1339 const APInt *APC;
1340 if (match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) && APC->isPowerOf2())
1341 return Def->replaceAllUsesWith(Builder.createNaryOp(
1342 Instruction::Shl,
1343 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1344 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1345
1346 // Don't convert udiv to lshr inside a replicate region, as VPInstructions are
1347 // not allowed in them.
1348 const VPRegionBlock *ParentRegion = Def->getParent()->getParent();
1349 bool IsInReplicateRegion = ParentRegion && ParentRegion->isReplicator();
1350 if (!IsInReplicateRegion && match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) &&
1351 APC->isPowerOf2())
1352 return Def->replaceAllUsesWith(Builder.createNaryOp(
1353 Instruction::LShr,
1354 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())}, {},
1355 Def->getDebugLoc()));
1356
1357 if (match(Def, m_Not(m_VPValue(A)))) {
1358 if (match(A, m_Not(m_VPValue(A))))
1359 return Def->replaceAllUsesWith(A);
1360
1361 // Try to fold Not into compares by adjusting the predicate in-place.
1362 CmpPredicate Pred;
1363 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1364 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1365 if (all_of(Cmp->users(),
1367 m_Not(m_Specific(Cmp)),
1368 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1369 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1370 for (VPUser *U : to_vector(Cmp->users())) {
1371 auto *R = cast<VPSingleDefRecipe>(U);
1372 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1373 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1374 R->setOperand(1, Y);
1375 R->setOperand(2, X);
1376 } else {
1377 // not (cmp pred) -> cmp inv_pred
1378 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1379 R->replaceAllUsesWith(Cmp);
1380 }
1381 }
1382 // If Cmp doesn't have a debug location, use the one from the negation,
1383 // to preserve the location.
1384 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1385 Cmp->setDebugLoc(Def->getDebugLoc());
1386 }
1387 }
1388 }
1389
1390 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1391 // any-of (fcmp uno %A, %B), ...
1392 if (match(Def, m_AnyOf())) {
1394 VPRecipeBase *UnpairedCmp = nullptr;
1395 for (VPValue *Op : Def->operands()) {
1396 VPValue *X;
1397 if (Op->getNumUsers() > 1 ||
1399 m_Deferred(X)))) {
1400 NewOps.push_back(Op);
1401 } else if (!UnpairedCmp) {
1402 UnpairedCmp = Op->getDefiningRecipe();
1403 } else {
1404 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1405 UnpairedCmp->getOperand(0), X));
1406 UnpairedCmp = nullptr;
1407 }
1408 }
1409
1410 if (UnpairedCmp)
1411 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1412
1413 if (NewOps.size() < Def->getNumOperands()) {
1414 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1415 return Def->replaceAllUsesWith(NewAnyOf);
1416 }
1417 }
1418
1419 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1420 // This is useful for fmax/fmin without fast-math flags, where we need to
1421 // check if any operand is NaN.
1423 m_Deferred(X)),
1425 m_Deferred(Y))))) {
1426 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1427 return Def->replaceAllUsesWith(NewCmp);
1428 }
1429
 1430 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1431 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1432 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1433 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1434 TypeInfo.inferScalarType(Def))
1435 return Def->replaceAllUsesWith(Def->getOperand(1));
1436
1438 m_One()))) {
1439 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1440 if (TypeInfo.inferScalarType(X) != WideStepTy)
1441 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1442 Def->replaceAllUsesWith(X);
1443 return;
1444 }
1445
1446 // For i1 vp.merges produced by AnyOf reductions:
1447 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1449 m_VPValue(X), m_VPValue())) &&
1451 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1452 Def->setOperand(1, Def->getOperand(0));
1453 Def->setOperand(0, Y);
1454 return;
1455 }
1456
1457 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1458 if (Phi->getOperand(0) == Phi->getOperand(1))
1459 Phi->replaceAllUsesWith(Phi->getOperand(0));
1460 return;
1461 }
1462
1463 // Look through ExtractLastLane.
1464 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
1465 if (match(A, m_BuildVector())) {
1466 auto *BuildVector = cast<VPInstruction>(A);
1467 Def->replaceAllUsesWith(
1468 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1469 return;
1470 }
1471 if (Plan->hasScalarVFOnly())
1472 return Def->replaceAllUsesWith(A);
1473 }
1474
1475 // Look through ExtractPenultimateElement (BuildVector ....).
1477 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1478 Def->replaceAllUsesWith(
1479 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1480 return;
1481 }
1482
1483 uint64_t Idx;
1485 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1486 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1487 return;
1488 }
1489
1490 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1491 Def->replaceAllUsesWith(
1492 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1493 return;
1494 }
1495
1496 // Look through broadcast of single-scalar when used as select conditions; in
1497 // that case the scalar condition can be used directly.
1498 if (match(Def,
1501 "broadcast operand must be single-scalar");
1502 Def->setOperand(0, C);
1503 return;
1504 }
1505
1506 if (auto *Phi = dyn_cast<VPPhi>(Def)) {
1507 if (Phi->getNumOperands() == 1)
1508 Phi->replaceAllUsesWith(Phi->getOperand(0));
1509 return;
1510 }
1511
1512 VPIRValue *IRV;
1513 if (Def->getNumOperands() == 1 &&
1515 return Def->replaceAllUsesWith(IRV);
1516
1517 // Some simplifications can only be applied after unrolling. Perform them
1518 // below.
1519 if (!Plan->isUnrolled())
1520 return;
1521
1522 // After unrolling, extract-lane may be used to extract values from multiple
1523 // scalar sources. Only simplify when extracting from a single scalar source.
1524 VPValue *LaneToExtract;
1525 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1526 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1528 return Def->replaceAllUsesWith(A);
1529
1530 // Simplify extract-lane with single source to extract-element.
1531 Def->replaceAllUsesWith(Builder.createNaryOp(
1532 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1533 return;
1534 }
1535
1536 // Hoist an invariant increment Y of a phi X, by having X start at Y.
1537 if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && isa<VPIRValue>(Y) &&
1538 isa<VPPhi>(X)) {
1539 auto *Phi = cast<VPPhi>(X);
1540 if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
1541 Phi->getSingleUser() == Def) {
1542 Phi->setOperand(0, Y);
1543 Def->replaceAllUsesWith(Phi);
1544 return;
1545 }
1546 }
1547
1548 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1549 // just the pointer operand.
1550 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1551 if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
1552 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1553
 1554 // VPScalarIVSteps after unrolling can be replaced by their start value, if
 1555 // the start index is zero and only lane 0 is demanded.
1556 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1557 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1558 Steps->replaceAllUsesWith(Steps->getOperand(0));
1559 return;
1560 }
1561 }
1562 // Simplify redundant ReductionStartVector recipes after unrolling.
1563 VPValue *StartV;
1565 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1566 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1567 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1568 return PhiR && PhiR->isInLoop();
1569 });
1570 return;
1571 }
1572
1574 Def->replaceAllUsesWith(A);
1575 return;
1576 }
1577
1578 if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
1581 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1582 all_of(A->users(),
1583 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1584 return Def->replaceAllUsesWith(A);
1585 }
1586
1587 if (Plan->getUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1588 return Def->replaceAllUsesWith(A);
1589}
1590
1593 Plan.getEntry());
1594 VPTypeAnalysis TypeInfo(Plan);
1596 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1597 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1598 simplifyRecipe(Def, TypeInfo);
1599 }
1600}
1601
1603 if (Plan.hasScalarVFOnly())
1604 return;
1605
1606 // Try to narrow wide and replicating recipes to single scalar recipes,
1607 // based on VPlan analysis. Only process blocks in the loop region for now,
1608 // without traversing into nested regions, as recipes in replicate regions
1609 // cannot be converted yet.
1612 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1614 VPWidenStoreRecipe>(&R))
1615 continue;
1616 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1617 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1618 continue;
1619
 1620 // Convert an unmasked scatter with a uniform address into
1621 // extract-last-lane + scalar store.
1622 // TODO: Add a profitability check comparing the cost of a scatter vs.
1623 // extract + scalar store.
1624 auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
1625 if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
1626 !WidenStoreR->isConsecutive()) {
1627 assert(!WidenStoreR->isReverse() &&
1628 "Not consecutive memory recipes shouldn't be reversed");
1629 VPValue *Mask = WidenStoreR->getMask();
1630
1631 // Only convert the scatter to a scalar store if it is unmasked.
1632 // TODO: Support converting scatter masked by the header mask to scalar
1633 // store.
1634 if (Mask)
1635 continue;
1636
1638 {WidenStoreR->getOperand(1)});
1639 Extract->insertBefore(WidenStoreR);
1640
1641 // TODO: Sink the scalar store recipe to middle block if possible.
1642 auto *ScalarStore = new VPReplicateRecipe(
1643 &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
1644 true /*IsSingleScalar*/, nullptr /*Mask*/, {},
1645 *WidenStoreR /*Metadata*/);
1646 ScalarStore->insertBefore(WidenStoreR);
1647 WidenStoreR->eraseFromParent();
1648 continue;
1649 }
1650
1651 auto *RepOrWidenR = dyn_cast<VPRecipeWithIRFlags>(&R);
1652 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1653 vputils::isSingleScalar(RepR->getOperand(1))) {
1654 auto *Clone = new VPReplicateRecipe(
1655 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1656 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1657 *RepR /*Metadata*/, RepR->getDebugLoc());
1658 Clone->insertBefore(RepOrWidenR);
1659 VPBuilder Builder(Clone);
1660 VPValue *ExtractOp = Clone->getOperand(0);
1661 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1662 ExtractOp =
1663 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1664 ExtractOp =
1665 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1666 Clone->setOperand(0, ExtractOp);
1667 RepR->eraseFromParent();
1668 continue;
1669 }
1670
1671 // Skip recipes that aren't single scalars.
1672 if (!RepOrWidenR || !vputils::isSingleScalar(RepOrWidenR))
1673 continue;
1674
1675 // Skip recipes for which conversion to single-scalar would introduce
1676 // additional broadcasts. No extra broadcasts are needed if either only
1677 // the scalars of the recipe are used, or at least one of the operands
1678 // would require a broadcast anyway. In the latter case, the single scalar
1679 // may need to be broadcast, but another broadcast is removed.
1680 if (!all_of(RepOrWidenR->users(),
1681 [RepOrWidenR](const VPUser *U) {
1682 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1683 unsigned Opcode = VPI->getOpcode();
1684 if (Opcode == VPInstruction::ExtractLastLane ||
1685 Opcode == VPInstruction::ExtractLastPart ||
1686 Opcode == VPInstruction::ExtractPenultimateElement)
1687 return true;
1688 }
1689
1690 return U->usesScalars(RepOrWidenR);
1691 }) &&
1692 none_of(RepOrWidenR->operands(), [RepOrWidenR](VPValue *Op) {
1693 if (Op->getSingleUser() != RepOrWidenR)
1694 return false;
1695 // Non-constant live-ins require broadcasts, while constants do not
1696 // need explicit broadcasts.
1697 auto *IRV = dyn_cast<VPIRValue>(Op);
1698 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1699 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1700 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1701 }))
1702 continue;
1703
1704 auto *Clone = new VPReplicateRecipe(
1705 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1706 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1707 Clone->insertBefore(RepOrWidenR);
1708 RepOrWidenR->replaceAllUsesWith(Clone);
1709 if (isDeadRecipe(*RepOrWidenR))
1710 RepOrWidenR->eraseFromParent();
1711 }
1712 }
1713}
1714
1715/// Check whether all of \p Blend's masks share a common logically and'ed
1716/// value, and if so remove it from the masks.
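/// Illustrative sketch (operand names assumed): a blend whose masks are
///   (logical-and %cm, %m0), (logical-and %cm, %m1), (logical-and %cm, %m2)
/// is rewritten to use %m0, %m1 and %m2 directly, dropping the common factor
/// %cm from every mask.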
1718 if (Blend->isNormalized())
1719 return;
1720 VPValue *CommonEdgeMask;
1721 if (!match(Blend->getMask(0),
1722 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1723 return;
1724 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1725 if (!match(Blend->getMask(I),
1726 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1727 return;
1728 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1729 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1730}
1731
1732/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1733/// to make sure the masks are simplified.
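/// For example (illustrative): a blend whose incoming values are all the same,
/// or where every mask except one is false, collapses to that single incoming
/// value; remaining blends are normalized so that the first incoming value
/// acts as the default and the others are blended into it under their masks.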
1734static void simplifyBlends(VPlan &Plan) {
1737 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1738 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1739 if (!Blend)
1740 continue;
1741
1742 removeCommonBlendMask(Blend);
1743
1744 // Try to remove redundant blend recipes.
1745 SmallPtrSet<VPValue *, 4> UniqueValues;
1746 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1747 UniqueValues.insert(Blend->getIncomingValue(0));
1748 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1749 if (!match(Blend->getMask(I), m_False()))
1750 UniqueValues.insert(Blend->getIncomingValue(I));
1751
1752 if (UniqueValues.size() == 1) {
1753 Blend->replaceAllUsesWith(*UniqueValues.begin());
1754 Blend->eraseFromParent();
1755 continue;
1756 }
1757
1758 if (Blend->isNormalized())
1759 continue;
1760
1761 // Normalize the blend so its first incoming value is used as the initial
1762 // value with the others blended into it.
1763
1764 unsigned StartIndex = 0;
1765 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1766 // If a value's mask is used only by the blend then it can be dead-coded.
1767 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1768 // that's used by multiple blends where it can be removed from them all.
1769 VPValue *Mask = Blend->getMask(I);
1770 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1771 StartIndex = I;
1772 break;
1773 }
1774 }
1775
1776 SmallVector<VPValue *, 4> OperandsWithMask;
1777 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1778
1779 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1780 if (I == StartIndex)
1781 continue;
1782 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1783 OperandsWithMask.push_back(Blend->getMask(I));
1784 }
1785
1786 auto *NewBlend =
1787 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1788 OperandsWithMask, Blend->getDebugLoc());
1789 NewBlend->insertBefore(&R);
1790
1791 VPValue *DeadMask = Blend->getMask(StartIndex);
1792 Blend->replaceAllUsesWith(NewBlend);
1793 Blend->eraseFromParent();
1795
1796 // Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1797 VPValue *NewMask;
1798 if (NewBlend->getNumOperands() == 3 &&
1799 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1800 VPValue *Inc0 = NewBlend->getOperand(0);
1801 VPValue *Inc1 = NewBlend->getOperand(1);
1802 VPValue *OldMask = NewBlend->getOperand(2);
1803 NewBlend->setOperand(0, Inc1);
1804 NewBlend->setOperand(1, Inc0);
1805 NewBlend->setOperand(2, NewMask);
1806 if (OldMask->getNumUsers() == 0)
1807 cast<VPInstruction>(OldMask)->eraseFromParent();
1808 }
1809 }
1810 }
1811}
1812
1813/// Optimize the width of vector induction variables in \p Plan based on a known
1814/// constant Trip Count, \p BestVF and \p BestUF.
1816 ElementCount BestVF,
1817 unsigned BestUF) {
1818 // Only proceed if we have not completely removed the vector region.
1819 if (!Plan.getVectorLoopRegion())
1820 return false;
1821
1822 const APInt *TC;
1823 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
1824 return false;
1825
1826 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1827 // and UF. Returns at least 8.
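// Worked example (illustrative): TC = 100 with VF * UF = 8 aligns up to 104,
// so MaxVal = 103 has 7 active bits, which rounds up to an 8-bit IV type.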
1828 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1829 APInt AlignedTC =
1832 APInt MaxVal = AlignedTC - 1;
1833 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1834 };
1835 unsigned NewBitWidth =
1836 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1837
1838 LLVMContext &Ctx = Plan.getContext();
1839 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1840
1841 bool MadeChange = false;
1842
1843 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1844 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1845 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1846
1847 // Currently only handle canonical IVs, as it is trivial to replace the
1848 // start and stop values, and only perform the optimization when the IV has
1849 // a single user.
1850 if (!WideIV || !WideIV->isCanonical() ||
1851 WideIV->hasMoreThanOneUniqueUser() ||
1852 NewIVTy == WideIV->getScalarType())
1853 continue;
1854
1855 // Currently only handle cases where the single user is a header-mask
1856 // comparison with the backedge-taken-count.
1857 VPUser *SingleUser = WideIV->getSingleUser();
1858 if (!SingleUser ||
1859 !match(SingleUser, m_ICmp(m_Specific(WideIV),
1862 continue;
1863
1864 // Update IV operands and comparison bound to use new narrower type.
1865 auto *NewStart = Plan.getConstantInt(NewIVTy, 0);
1866 WideIV->setStartValue(NewStart);
1867 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
1868 WideIV->setStepValue(NewStep);
1869
1870 auto *NewBTC = new VPWidenCastRecipe(
1871 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1872 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1873 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
1874 Cmp->setOperand(1, NewBTC);
1875
1876 MadeChange = true;
1877 }
1878
1879 return MadeChange;
1880}
1881
1882/// Return true if \p Cond is known to be true for given \p BestVF and \p
1883/// BestUF.
1885 ElementCount BestVF, unsigned BestUF,
1888 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1889 &PSE](VPValue *C) {
1890 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
1891 });
1892
1893 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
1895 m_Specific(CanIV->getBackedgeValue()),
1896 m_Specific(&Plan.getVectorTripCount()))))
1897 return false;
1898
1899 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1900 // count is not conveniently available as SCEV so far, so we compare directly
1901 // against the original trip count. This is stricter than necessary, as we
1902 // will only return true if the trip count == vector trip count.
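// For example (illustrative): with BestVF = 4 and BestUF = 2, the condition
// is only known true when SCEV can prove the (vector) trip count is exactly 8.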
1903 const SCEV *VectorTripCount =
1905 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1906 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
1907 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1908 "Trip count SCEV must be computable");
1909 ScalarEvolution &SE = *PSE.getSE();
1910 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1911 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1912 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1913}
1914
1915/// Try to replace multiple active lane masks used for control flow with
1916/// a single, wide active lane mask instruction followed by multiple
1917/// extract subvector intrinsics. This applies to the active lane mask
1918/// instructions both in the loop and in the preheader.
1919/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1920/// new extracts from the first active lane mask, which has its last
1921/// operand (multiplier) set to UF.
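/// Illustrative sketch (assuming VF = 4, UF = 2): the two per-part lane masks
/// are replaced by a single get.active.lane.mask producing 8 lanes, and lanes
/// [0,4) and [4,8) are recovered with llvm.vector.extract at offsets 0 and 4.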
1923 unsigned UF) {
1924 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1925 return false;
1926
1927 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1928 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1929 auto *Term = &ExitingVPBB->back();
1930
1931 using namespace llvm::VPlanPatternMatch;
1933 m_VPValue(), m_VPValue(), m_VPValue())))))
1934 return false;
1935
1936 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1937 LLVMContext &Ctx = Plan.getContext();
1938
1939 auto ExtractFromALM = [&](VPInstruction *ALM,
1940 SmallVectorImpl<VPValue *> &Extracts) {
1941 DebugLoc DL = ALM->getDebugLoc();
1942 for (unsigned Part = 0; Part < UF; ++Part) {
1944 Ops.append({ALM, Plan.getOrAddLiveIn(
1945 ConstantInt::get(IntegerType::getInt64Ty(Ctx),
1946 VF.getKnownMinValue() * Part))});
1947 auto *Ext =
1948 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1949 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
1950 Extracts[Part] = Ext;
1951 Ext->insertAfter(ALM);
1952 }
1953 };
1954
1955 // Create a list of each active lane mask phi, ordered by unroll part.
1957 for (VPRecipeBase &R : Header->phis()) {
1959 if (!Phi)
1960 continue;
1961 VPValue *Index = nullptr;
1962 match(Phi->getBackedgeValue(),
1964 assert(Index && "Expected index from ActiveLaneMask instruction");
1965
1966 uint64_t Part;
1967 if (match(Index,
1969 m_VPValue(), m_ConstantInt(Part))))
1970 Phis[Part] = Phi;
1971 else
1972 // Anything other than a CanonicalIVIncrementForPart is part 0
1973 Phis[0] = Phi;
1974 }
1975
1976 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1977 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
1978
1979 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
1980 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
1981
1982 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
1983 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
1984 "Expected incoming values of Phi to be ActiveLaneMasks");
1985
1986 // When using wide lane masks, the return type of the get.active.lane.mask
1987 // intrinsic is VF x UF (last operand).
1988 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
1989 EntryALM->setOperand(2, ALMMultiplier);
1990 LoopALM->setOperand(2, ALMMultiplier);
1991
1992 // Create UF x extract vectors and insert into preheader.
1993 SmallVector<VPValue *> EntryExtracts(UF);
1994 ExtractFromALM(EntryALM, EntryExtracts);
1995
1996 // Create UF x extract vectors and insert before the loop compare & branch,
1997 // updating the compare to use the first extract.
1998 SmallVector<VPValue *> LoopExtracts(UF);
1999 ExtractFromALM(LoopALM, LoopExtracts);
2000 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2001 Not->setOperand(0, LoopExtracts[0]);
2002
2003 // Update the incoming values of active lane mask phis.
2004 for (unsigned Part = 0; Part < UF; ++Part) {
2005 Phis[Part]->setStartValue(EntryExtracts[Part]);
2006 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2007 }
2008
2009 return true;
2010}
2011
2012/// Try to simplify the branch condition of \p Plan. This may restrict the
2013/// resulting plan to \p BestVF and \p BestUF.
2015 unsigned BestUF,
2017 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2018 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2019 auto *Term = &ExitingVPBB->back();
2020 VPValue *Cond;
2021 if (match(Term, m_BranchOnCount()) ||
2023 m_VPValue(), m_VPValue(), m_VPValue()))))) {
2024 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2025 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2026 const SCEV *VectorTripCount =
2028 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2029 VectorTripCount =
2031 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2032 "Trip count SCEV must be computable");
2033 ScalarEvolution &SE = *PSE.getSE();
2034 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2035 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2036 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2037 return false;
2038 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2040 // For BranchOnCond, check if we can prove the condition to be true using VF
2041 // and UF.
2042 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2043 return false;
2044 } else {
2045 return false;
2046 }
2047
2048 // The vector loop region only executes once. If possible, completely remove
2049 // the region, otherwise replace the terminator controlling the latch with
2050 // (BranchOnCond true).
2051 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
2052 // support for other non-canonical widen induction recipes (e.g.,
2053 // VPWidenPointerInductionRecipe).
2054 // TODO: fold branch-on-constant after dissolving region.
2055 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2056 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
2057 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
2058 return R->isCanonical();
2059 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
2060 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
2061 })) {
2062 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
2063 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
2064 VPBuilder Builder(Plan.getVectorPreheader());
2065 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
2066 R->getScalarType());
2067 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
2068 HeaderR.eraseFromParent();
2069 continue;
2070 }
2071 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
2072 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
2073 HeaderR.eraseFromParent();
2074 }
2075
2076 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
2077 SmallVector<VPBlockBase *> Exits = to_vector(VectorRegion->getSuccessors());
2078 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
2079 for (VPBlockBase *Exit : Exits)
2080 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
2081
2082 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
2083 B->setParent(nullptr);
2084
2085 VPBlockUtils::connectBlocks(Preheader, Header);
2086
2087 for (VPBlockBase *Exit : Exits)
2088 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
2089
2090 // Replace terminating branch-on-two-conds with branch-on-cond to early
2091 // exit.
2092 if (Exits.size() != 1) {
2093 assert(match(Term, m_BranchOnTwoConds()) && Exits.size() == 2 &&
2094 "BranchOnTwoConds needs 2 remaining exits");
2096 Term->getOperand(0));
2097 }
2099 } else {
2100 // The vector region contains header phis for which we cannot remove the
2101 // loop region yet.
2102
2103 // For BranchOnTwoConds, set the latch exit condition to true directly.
2104 if (match(Term, m_BranchOnTwoConds())) {
2105 Term->setOperand(1, Plan.getTrue());
2106 return true;
2107 }
2108
2109 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
2110 {}, {}, Term->getDebugLoc());
2111 ExitingVPBB->appendRecipe(BOC);
2112 }
2113
2114 Term->eraseFromParent();
2115
2116 return true;
2117}
2118
2119/// From the definition of llvm.experimental.get.vector.length,
2120/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
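/// For example (illustrative): with VF = 4 and an AVL that SCEV proves is
/// ule 4, the EXPLICIT-VECTOR-LENGTH recipe can be replaced by a cast of the
/// AVL to i32 (the EVL type).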
2124 vp_depth_first_deep(Plan.getEntry()))) {
2125 for (VPRecipeBase &R : *VPBB) {
2126 VPValue *AVL;
2127 if (!match(&R, m_EVL(m_VPValue(AVL))))
2128 continue;
2129
2130 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2131 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2132 continue;
2133 ScalarEvolution &SE = *PSE.getSE();
2134 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2135 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2136 continue;
2137
2139 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2140 R.getDebugLoc());
2141 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2142 return true;
2143 }
2144 }
2145 return false;
2146}
2147
2149 unsigned BestUF,
2151 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2152 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2153
2154 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2155 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2156 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2157 MadeChange |= simplifyKnownEVL(Plan, BestVF, PSE);
2158
2159 if (MadeChange) {
2160 Plan.setVF(BestVF);
2161 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
2162 }
2163}
2164
2165/// Sink users of \p FOR after the recipe defining the previous value \p
2166/// Previous of the recurrence. \returns true if all users of \p FOR could be
2167/// re-arranged as needed or false if it is not possible.
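/// For example (illustrative): if a user %u of the recurrence phi is defined
/// before the recipe computing the previous value %prev, %u (and transitively
/// its users) is moved after %prev, so the splice combining the recurrence and
/// %prev can later be emitted after %prev.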
2168static bool
2170 VPRecipeBase *Previous,
2171 VPDominatorTree &VPDT) {
2172 // Collect recipes that need sinking.
2175 Seen.insert(Previous);
2176 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
2177 // The previous value must not depend on the users of the recurrence phi. In
2178 // that case, FOR is not a fixed order recurrence.
2179 if (SinkCandidate == Previous)
2180 return false;
2181
2182 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
2183 !Seen.insert(SinkCandidate).second ||
2184 VPDT.properlyDominates(Previous, SinkCandidate))
2185 return true;
2186
2187 if (cannotHoistOrSinkRecipe(*SinkCandidate))
2188 return false;
2189
2190 WorkList.push_back(SinkCandidate);
2191 return true;
2192 };
2193
2194 // Recursively sink users of FOR after Previous.
2195 WorkList.push_back(FOR);
2196 for (unsigned I = 0; I != WorkList.size(); ++I) {
2197 VPRecipeBase *Current = WorkList[I];
2198 assert(Current->getNumDefinedValues() == 1 &&
2199 "only recipes with a single defined value expected");
2200
2201 for (VPUser *User : Current->getVPSingleValue()->users()) {
2202 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
2203 return false;
2204 }
2205 }
2206
2207 // Keep recipes to sink ordered by dominance so earlier instructions are
2208 // processed first.
2209 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2210 return VPDT.properlyDominates(A, B);
2211 });
2212
2213 for (VPRecipeBase *SinkCandidate : WorkList) {
2214 if (SinkCandidate == FOR)
2215 continue;
2216
2217 SinkCandidate->moveAfter(Previous);
2218 Previous = SinkCandidate;
2219 }
2220 return true;
2221}
2222
2223/// Try to hoist \p Previous and its operands before all users of \p FOR.
2225 VPRecipeBase *Previous,
2226 VPDominatorTree &VPDT) {
2227 if (cannotHoistOrSinkRecipe(*Previous))
2228 return false;
2229
2230 // Collect recipes that need hoisting.
2231 SmallVector<VPRecipeBase *> HoistCandidates;
2233 VPRecipeBase *HoistPoint = nullptr;
2234 // Find the closest hoist point by looking at all users of FOR and selecting
2235 // the recipe dominating all other users.
2236 for (VPUser *U : FOR->users()) {
2237 auto *R = cast<VPRecipeBase>(U);
2238 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
2239 HoistPoint = R;
2240 }
2241 assert(all_of(FOR->users(),
2242 [&VPDT, HoistPoint](VPUser *U) {
2243 auto *R = cast<VPRecipeBase>(U);
2244 return HoistPoint == R ||
2245 VPDT.properlyDominates(HoistPoint, R);
2246 }) &&
2247 "HoistPoint must dominate all users of FOR");
2248
2249 auto NeedsHoisting = [HoistPoint, &VPDT,
2250 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2251 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2252 if (!HoistCandidate)
2253 return nullptr;
2254 VPRegionBlock *EnclosingLoopRegion =
2255 HoistCandidate->getParent()->getEnclosingLoopRegion();
2256 assert((!HoistCandidate->getRegion() ||
2257 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2258 "CFG in VPlan should still be flat, without replicate regions");
2259 // Hoist candidate was already visited, no need to hoist.
2260 if (!Visited.insert(HoistCandidate).second)
2261 return nullptr;
2262
2263 // The candidate is outside the loop region or is a header phi; it already
2264 // dominates FOR's users without hoisting.
2265 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
2266 return nullptr;
2267
2268 // If we reached a recipe that dominates HoistPoint, we don't need to
2269 // hoist the recipe.
2270 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
2271 return nullptr;
2272 return HoistCandidate;
2273 };
2274
2275 if (!NeedsHoisting(Previous->getVPSingleValue()))
2276 return true;
2277
2278 // Recursively try to hoist Previous and its operands before all users of FOR.
2279 HoistCandidates.push_back(Previous);
2280
2281 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2282 VPRecipeBase *Current = HoistCandidates[I];
2283 assert(Current->getNumDefinedValues() == 1 &&
2284 "only recipes with a single defined value expected");
2285 if (cannotHoistOrSinkRecipe(*Current))
2286 return false;
2287
2288 for (VPValue *Op : Current->operands()) {
2289 // If we reach FOR, it means the original Previous depends on some other
2290 // recurrence that in turn depends on FOR. If that is the case, we would
2291 // also need to hoist recipes involving the other FOR, which may break
2292 // dependencies.
2293 if (Op == FOR)
2294 return false;
2295
2296 if (auto *R = NeedsHoisting(Op)) {
2297 // Bail out if the recipe defines multiple values.
2298 // TODO: Hoisting such recipes requires additional handling.
2299 if (R->getNumDefinedValues() != 1)
2300 return false;
2301 HoistCandidates.push_back(R);
2302 }
2303 }
2304 }
2305
2306 // Order recipes to hoist by dominance so earlier instructions are processed
2307 // first.
2308 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2309 return VPDT.properlyDominates(A, B);
2310 });
2311
2312 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2313 HoistCandidate->moveBefore(*HoistPoint->getParent(),
2314 HoistPoint->getIterator());
2315 }
2316
2317 return true;
2318}
2319
2321 VPBuilder &LoopBuilder) {
2322 VPDominatorTree VPDT(Plan);
2323
2325 for (VPRecipeBase &R :
2328 RecurrencePhis.push_back(FOR);
2329
2330 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2332 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2333 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2334 // to terminate.
2335 while (auto *PrevPhi =
2337 assert(PrevPhi->getParent() == FOR->getParent());
2338 assert(SeenPhis.insert(PrevPhi).second);
2339 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2340 }
2341
2342 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2343 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2344 return false;
2345
2346 // Introduce a recipe to combine the incoming and previous values of a
2347 // fixed-order recurrence.
2348 VPBasicBlock *InsertBlock = Previous->getParent();
2349 if (isa<VPHeaderPHIRecipe>(Previous))
2350 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2351 else
2352 LoopBuilder.setInsertPoint(InsertBlock,
2353 std::next(Previous->getIterator()));
2354
2355 auto *RecurSplice =
2357 {FOR, FOR->getBackedgeValue()});
2358
2359 FOR->replaceAllUsesWith(RecurSplice);
2360 // Set the first operand of RecurSplice to FOR again, after replacing
2361 // all users.
2362 RecurSplice->setOperand(0, FOR);
2363
2364 // Check for users extracting at the penultimate active lane of the FOR.
2365 // If only a single lane is active in the current iteration, we need to
2366 // select the last element from the previous iteration (from the FOR phi
2367 // directly).
2368 for (VPUser *U : RecurSplice->users()) {
2370 m_Specific(RecurSplice))))
2371 continue;
2372
2374 VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
2375 Type *I64Ty = Type::getInt64Ty(Plan.getContext());
2376 VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 0));
2377 VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 1));
2378 VPValue *PenultimateIndex =
2379 B.createNaryOp(Instruction::Sub, {LastActiveLane, One});
2380 VPValue *PenultimateLastIter =
2381 B.createNaryOp(VPInstruction::ExtractLane,
2382 {PenultimateIndex, FOR->getBackedgeValue()});
2383 VPValue *LastPrevIter =
2384 B.createNaryOp(VPInstruction::ExtractLastLane, FOR);
2385
2386 VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero);
2387 VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter);
2388 cast<VPInstruction>(U)->replaceAllUsesWith(Sel);
2389 }
2390 }
2391 return true;
2392}
2393
2395 for (VPRecipeBase &R :
2397 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2398 if (!PhiR)
2399 continue;
2400 RecurKind RK = PhiR->getRecurrenceKind();
2401 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2403 continue;
2404
2405 for (VPUser *U : collectUsersRecursively(PhiR))
2406 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2407 RecWithFlags->dropPoisonGeneratingFlags();
2408 }
2409 }
2410}
2411
2412namespace {
2413struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2414 static bool isSentinel(const VPSingleDefRecipe *Def) {
2415 return Def == getEmptyKey() || Def == getTombstoneKey();
2416 }
2417
2418 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2419 /// return that source element type.
2420 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2421 // All VPInstructions that lower to GEPs must have the i8 source element
2422 // type (as they are PtrAdds), so we omit it.
2424 .Case([](const VPReplicateRecipe *I) -> Type * {
2425 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2426 return GEP->getSourceElementType();
2427 return nullptr;
2428 })
2429 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2430 [](auto *I) { return I->getSourceElementType(); })
2431 .Default([](auto *) { return nullptr; });
2432 }
2433
2434 /// Returns true if recipe \p Def can be safely handled by CSE.
2435 static bool canHandle(const VPSingleDefRecipe *Def) {
2436 // We can extend the list of handled recipes in the future,
2437 // provided we account for the data embedded in them while checking for
2438 // equality or hashing.
2439 auto C = getOpcodeOrIntrinsicID(Def);
2440
2441 // The issue with (Insert|Extract)Value is that the index of the
2442 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2443 // VPlan.
2444 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2445 C->second == Instruction::ExtractValue)))
2446 return false;
2447
2448 // During CSE, we can only handle recipes that don't read from memory: if
2449 // they read from memory, there could be an intervening write to memory
2450 // before the next instance is CSE'd, leading to an incorrect result.
2451 return !Def->mayReadFromMemory();
2452 }
2453
2454 /// Hash the underlying data of \p Def.
2455 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2456 const VPlan *Plan = Def->getParent()->getPlan();
2457 VPTypeAnalysis TypeInfo(*Plan);
2458 hash_code Result = hash_combine(
2459 Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2460 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2462 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2463 if (RFlags->hasPredicate())
2464 return hash_combine(Result, RFlags->getPredicate());
2465 return Result;
2466 }
2467
2468 /// Check equality of underlying data of \p L and \p R.
2469 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2470 if (isSentinel(L) || isSentinel(R))
2471 return L == R;
2472 if (L->getVPRecipeID() != R->getVPRecipeID() ||
2474 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2476 !equal(L->operands(), R->operands()))
2477 return false;
2479 "must have valid opcode info for both recipes");
2480 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2481 if (LFlags->hasPredicate() &&
2482 LFlags->getPredicate() !=
2483 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2484 return false;
2485 // Recipes in replicate regions implicitly depend on predicate. If either
2486 // recipe is in a replicate region, only consider them equal if both have
2487 // the same parent.
2488 const VPRegionBlock *RegionL = L->getRegion();
2489 const VPRegionBlock *RegionR = R->getRegion();
2490 if (((RegionL && RegionL->isReplicator()) ||
2491 (RegionR && RegionR->isReplicator())) &&
2492 L->getParent() != R->getParent())
2493 return false;
2494 const VPlan *Plan = L->getParent()->getPlan();
2495 VPTypeAnalysis TypeInfo(*Plan);
2496 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2497 }
2498};
2499} // end anonymous namespace
2500
2501/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2502/// Plan.
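/// For example (illustrative): two identical recipes computing
/// ptradd ir<%base>, vp<%off>, where the first dominates the second, are
/// collapsed into one, with IR flags intersected so only flags valid for both
/// survive.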
2504 VPDominatorTree VPDT(Plan);
2506
2508 vp_depth_first_deep(Plan.getEntry()))) {
2509 for (VPRecipeBase &R : *VPBB) {
2510 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2511 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2512 continue;
2513 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2514 // V must dominate Def for a valid replacement.
2515 if (!VPDT.dominates(V->getParent(), VPBB))
2516 continue;
2517 // Only keep flags present on both V and Def.
2518 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2519 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2520 Def->replaceAllUsesWith(V);
2521 continue;
2522 }
2523 CSEMap[Def] = Def;
2524 }
2525 }
2526}
2527
2528/// Move loop-invariant recipes out of the vector loop region in \p Plan.
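/// For example (illustrative): a WIDEN add whose operands are both defined
/// outside the loop region is moved to the vector preheader.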
2529static void licm(VPlan &Plan) {
2530 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2531
2532 // Hoist any loop-invariant recipes from the vector loop region to the
2533 // preheader. Perform a shallow traversal of the vector loop region, to
2534 // exclude recipes in replicate regions. Since the top-level blocks in the
2535 // vector loop region are guaranteed to execute if the vector preheader is,
2536 // we don't need to check speculation safety.
2537 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2538 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2539 "Expected vector prehader's successor to be the vector loop region");
2541 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2542 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2544 continue;
2545 if (any_of(R.operands(), [](VPValue *Op) {
2546 return !Op->isDefinedOutsideLoopRegions();
2547 }))
2548 continue;
2549 R.moveBefore(*Preheader, Preheader->end());
2550 }
2551 }
2552}
2553
2555 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2556 if (Plan.hasScalarVFOnly())
2557 return;
2558 // Keep track of created truncates, so they can be re-used. Note that we
2559 // cannot use RAUW after creating a new truncate, as this could make
2560 // other uses have different types for their operands, making them invalidly
2561 // typed.
2563 VPTypeAnalysis TypeInfo(Plan);
2564 VPBasicBlock *PH = Plan.getVectorPreheader();
2567 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2570 continue;
2571
2572 VPValue *ResultVPV = R.getVPSingleValue();
2573 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2574 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2575 if (!NewResSizeInBits)
2576 continue;
2577
2578 // If the value wasn't vectorized, we must maintain the original scalar
2579 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2580 // skip casts which do not need to be handled explicitly here, as
2581 // redundant casts will be removed during recipe simplification.
2583 continue;
2584
2585 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2586 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2587 assert(OldResTy->isIntegerTy() && "only integer types supported");
2588 (void)OldResSizeInBits;
2589
2590 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2591
2592 // Any wrapping introduced by shrinking this operation shouldn't be
2593 // considered undefined behavior. So, we can't unconditionally copy
2594 // arithmetic wrapping flags to VPW.
2595 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2596 VPW->dropPoisonGeneratingFlags();
2597
2598 if (OldResSizeInBits != NewResSizeInBits &&
2599 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2600 // Extend result to original width.
2601 auto *Ext =
2602 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
2603 Ext->insertAfter(&R);
2604 ResultVPV->replaceAllUsesWith(Ext);
2605 Ext->setOperand(0, ResultVPV);
2606 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2607 } else {
2608 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2609 "Only ICmps should not need extending the result.");
2610 }
2611
2612 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2614 continue;
2615
2616 // Shrink operands by introducing truncates as needed.
2617 unsigned StartIdx =
2618 match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2619 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2620 auto *Op = R.getOperand(Idx);
2621 unsigned OpSizeInBits =
2623 if (OpSizeInBits == NewResSizeInBits)
2624 continue;
2625 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2626 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2627 if (!IterIsEmpty) {
2628 R.setOperand(Idx, ProcessedIter->second);
2629 continue;
2630 }
2631
2632 VPBuilder Builder;
2633 if (isa<VPIRValue>(Op))
2634 Builder.setInsertPoint(PH);
2635 else
2636 Builder.setInsertPoint(&R);
2637 VPWidenCastRecipe *NewOp =
2638 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2639 ProcessedIter->second = NewOp;
2640 R.setOperand(Idx, NewOp);
2641 }
2642
2643 }
2644 }
2645}
2646
2650 VPValue *Cond;
2651 // Skip blocks that are not terminated by BranchOnCond.
2652 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2653 continue;
2654
2655 assert(VPBB->getNumSuccessors() == 2 &&
2656 "Two successors expected for BranchOnCond");
2657 unsigned RemovedIdx;
2658 if (match(Cond, m_True()))
2659 RemovedIdx = 1;
2660 else if (match(Cond, m_False()))
2661 RemovedIdx = 0;
2662 else
2663 continue;
2664
2665 VPBasicBlock *RemovedSucc =
2666 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2667 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2668 "There must be a single edge between VPBB and its successor");
2669 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2670 // these recipes.
2671 for (VPRecipeBase &R : RemovedSucc->phis())
2672 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2673
2674 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2675 // automatically on VPlan destruction if it becomes unreachable.
2676 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2677 VPBB->back().eraseFromParent();
2678 }
2679}
2680
2700
2701// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2702// the loop terminator with a branch-on-cond recipe with the negated
2703// active-lane-mask as operand. Note that this turns the loop into an
2704// uncountable one. Only the existing terminator is replaced; all other existing
2705// recipes/users remain unchanged, except for poison-generating flags being
2706// dropped from the canonical IV increment. Return the created
2707// VPActiveLaneMaskPHIRecipe.
2708//
2709// The function uses the following definitions:
2710//
2711// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
2712// calculate-trip-count-minus-VF (original TC) : original TC
2713// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
2714// CanonicalIVPhi : CanonicalIVIncrement
2715// %StartV is the canonical induction start value.
2716//
2717// The function adds the following recipes:
2718//
2719// vector.ph:
2720// %TripCount = calculate-trip-count-minus-VF (original TC)
2721// [if DataWithControlFlowWithoutRuntimeCheck]
2722// %EntryInc = canonical-iv-increment-for-part %StartV
2723// %EntryALM = active-lane-mask %EntryInc, %TripCount
2724//
2725// vector.body:
2726// ...
2727// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2728// ...
2729// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2730// %ALM = active-lane-mask %InLoopInc, TripCount
2731// %Negated = Not %ALM
2732// branch-on-cond %Negated
2733//
2736 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2737 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2738 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2739 VPValue *StartV = CanonicalIVPHI->getStartValue();
2740
2741 auto *CanonicalIVIncrement =
2742 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2743 // TODO: Check if dropping the flags is needed if
2744 // !DataAndControlFlowWithoutRuntimeCheck.
2745 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2746 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2747 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2748 // we have to take unrolling into account. Each part needs to start at
2749 // Part * VF
2750 auto *VecPreheader = Plan.getVectorPreheader();
2751 VPBuilder Builder(VecPreheader);
2752
2753 // Create the ActiveLaneMask instruction using the correct start values.
2754 VPValue *TC = Plan.getTripCount();
2755
2756 VPValue *TripCount, *IncrementValue;
2758 // When the loop is guarded by a runtime overflow check for the loop
2759 // induction variable increment by VF, we can increment the value before
2760 // the get.active.lane.mask intrinsic and use the unmodified trip count.
2761 IncrementValue = CanonicalIVIncrement;
2762 TripCount = TC;
2763 } else {
2764 // When avoiding a runtime check, the active.lane.mask inside the loop
2765 // uses a modified trip count and the induction variable increment is
2766 // done after the active.lane.mask intrinsic is called.
2767 IncrementValue = CanonicalIVPHI;
2768 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2769 {TC}, DL);
2770 }
2771 auto *EntryIncrement = Builder.createOverflowingOp(
2772 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2773 "index.part.next");
2774
2775 // Create the active lane mask instruction in the VPlan preheader.
2776 VPValue *ALMMultiplier =
2777 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2778 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2779 {EntryIncrement, TC, ALMMultiplier}, DL,
2780 "active.lane.mask.entry");
2781
2782 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2783 // preheader ActiveLaneMask instruction.
2784 auto *LaneMaskPhi =
2786 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2787
2788 // Create the active lane mask for the next iteration of the loop before the
2789 // original terminator.
2790 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2791 Builder.setInsertPoint(OriginalTerminator);
2792 auto *InLoopIncrement =
2793 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2794 {IncrementValue}, {false, false}, DL);
2795 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2796 {InLoopIncrement, TripCount, ALMMultiplier},
2797 DL, "active.lane.mask.next");
2798 LaneMaskPhi->addOperand(ALM);
2799
2800 // Replace the original terminator with BranchOnCond. We have to invert the
2801 // mask here because a true condition means jumping to the exit block.
2802 auto *NotMask = Builder.createNot(ALM, DL);
2803 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2804 OriginalTerminator->eraseFromParent();
2805 return LaneMaskPhi;
2806}
2807
2808/// Collect the header mask with the pattern:
2809/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2810/// TODO: Introduce explicit recipe for header-mask instead of searching
2811/// for the header-mask pattern manually.
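/// For example (illustrative), the header mask is a recipe of the form
///   %mask = icmp ule %widen.canonical.iv, %backedge.taken.count
/// where %widen.canonical.iv is a widened canonical induction (or an
/// equivalent widened IV).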
2813 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2814 SmallVector<VPValue *> WideCanonicalIVs;
2815 auto *FoundWidenCanonicalIVUser = find_if(
2817 assert(count_if(LoopRegion->getCanonicalIV()->users(),
2819 "Must have at most one VPWideCanonicalIVRecipe");
2820 if (FoundWidenCanonicalIVUser !=
2821 LoopRegion->getCanonicalIV()->users().end()) {
2822 auto *WideCanonicalIV =
2823 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2824 WideCanonicalIVs.push_back(WideCanonicalIV);
2825 }
2826
2827 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2828 // version of the canonical induction.
2829 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
2830 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2831 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2832 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2833 WideCanonicalIVs.push_back(WidenOriginalIV);
2834 }
2835
2836 // Walk users of wide canonical IVs and find the single compare of the form
2837 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2838 VPSingleDefRecipe *HeaderMask = nullptr;
2839 for (auto *Wide : WideCanonicalIVs) {
2840 for (VPUser *U : Wide->users()) {
2841 auto *VPI = dyn_cast<VPInstruction>(U);
2842 if (!VPI || !vputils::isHeaderMask(VPI, Plan))
2843 continue;
2844
2845 assert(VPI->getOperand(0) == Wide &&
2846 "WidenCanonicalIV must be the first operand of the compare");
2847 assert(!HeaderMask && "Multiple header masks found?");
2848 HeaderMask = VPI;
2849 }
2850 }
2851 return HeaderMask;
2852}
2853
2855 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2858 UseActiveLaneMaskForControlFlow) &&
2859 "DataAndControlFlowWithoutRuntimeCheck implies "
2860 "UseActiveLaneMaskForControlFlow");
2861
2862 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2863 auto *FoundWidenCanonicalIVUser = find_if(
2865 assert(FoundWidenCanonicalIVUser &&
2866 "Must have widened canonical IV when tail folding!");
2867 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2868 auto *WideCanonicalIV =
2869 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2870 VPSingleDefRecipe *LaneMask;
2871 if (UseActiveLaneMaskForControlFlow) {
2874 } else {
2875 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2876 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2877 ConstantInt::get(LoopRegion->getCanonicalIVType(), 1));
2878 LaneMask =
2879 B.createNaryOp(VPInstruction::ActiveLaneMask,
2880 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2881 nullptr, "active.lane.mask");
2882 }
2883
2884 // Walk users of WideCanonicalIV and replace the header mask of the form
2885 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2886 // removing the old one to ensure there is always only a single header mask.
2887 HeaderMask->replaceAllUsesWith(LaneMask);
2888 HeaderMask->eraseFromParent();
2889}
2890
2891template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2892 Op0_t In;
2894
2895 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2896
2897 template <typename OpTy> bool match(OpTy *V) const {
2898 if (m_Specific(In).match(V)) {
2899 Out = nullptr;
2900 return true;
2901 }
2902 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
2903 }
2904};
2905
2906/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
2907/// On a match, \p Out is set to the remaining part, or to nullptr if the mask is exactly \p In.
2908template <typename Op0_t, typename Op1_t>
2909static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2910 Op1_t &Out) {
2911 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2912}
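// Usage sketch (illustrative): matching a recipe's mask against
// m_RemoveMask(HeaderMask, Mask) sets Mask to nullptr when the mask is exactly
// HeaderMask, and to %rem when the mask is (logical-and HeaderMask, %rem).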
2913
2914/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2915/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2916/// recipe could be created.
2917/// \p HeaderMask Header Mask.
2918/// \p CurRecipe Recipe to be transformed.
2919/// \p TypeInfo VPlan-based type analysis.
2920/// \p EVL The explicit vector length parameter of vector-predication
2921/// intrinsics.
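/// For example (illustrative): a widened load masked only by the header mask
/// becomes an EVL-predicated load, and a select on the header mask becomes a
/// call to llvm.vp.merge with an all-true mask and the EVL as operands.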
2923 VPRecipeBase &CurRecipe,
2924 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
2925 VPlan *Plan = CurRecipe.getParent()->getPlan();
2926 DebugLoc DL = CurRecipe.getDebugLoc();
2927 VPValue *Addr, *Mask, *EndPtr;
2928
2929 // Adjust any end pointers so that they point to the end of EVL lanes, not VF lanes.
2930 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2931 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2932 EVLEndPtr->insertBefore(&CurRecipe);
2933 EVLEndPtr->setOperand(1, &EVL);
2934 return EVLEndPtr;
2935 };
2936
2937 if (match(&CurRecipe,
2938 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
2939 !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2940 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
2941 EVL, Mask);
2942
2943 VPValue *ReversedVal;
2944 if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
2945 match(ReversedVal,
2946 m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
2947 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2948 cast<VPWidenLoadRecipe>(ReversedVal)->isReverse()) {
2949 auto *LoadR = new VPWidenLoadEVLRecipe(
2950 *cast<VPWidenLoadRecipe>(ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask);
2951 LoadR->insertBefore(&CurRecipe);
2952 return new VPWidenIntrinsicRecipe(
2953 Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
2954 TypeInfo.inferScalarType(LoadR), {}, {}, DL);
2955 }
2956
2957 VPValue *StoredVal;
2958 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
2959 m_RemoveMask(HeaderMask, Mask))) &&
2960 !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2961 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
2962 StoredVal, EVL, Mask);
2963
2964 if (match(&CurRecipe,
2965 m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
2966 m_RemoveMask(HeaderMask, Mask))) &&
2967 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2968 cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) {
2969 auto *NewReverse = new VPWidenIntrinsicRecipe(
2970 Intrinsic::experimental_vp_reverse,
2971 {ReversedVal, Plan->getTrue(), &EVL},
2972 TypeInfo.inferScalarType(ReversedVal), {}, {}, DL);
2973 NewReverse->insertBefore(&CurRecipe);
2974 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
2975 AdjustEndPtr(EndPtr), NewReverse, EVL,
2976 Mask);
2977 }
2978
2979 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
2980 if (Rdx->isConditional() &&
2981 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
2982 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
2983
2984 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
2985 if (Interleave->getMask() &&
2986 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
2987 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
2988
2989 VPValue *LHS, *RHS;
2990 if (match(&CurRecipe,
2991 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
2992 return new VPWidenIntrinsicRecipe(
2993 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
2994 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2995
2996 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
2997 m_VPValue(RHS))))
2998 return new VPWidenIntrinsicRecipe(
2999 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
3000 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3001
3002 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
3003 Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
3004 VPValue *ZExt =
3005 VPBuilder(&CurRecipe).createScalarCast(Instruction::ZExt, &EVL, Ty, DL);
3006 return new VPInstruction(Instruction::Sub,
3007 {ZExt, Plan->getConstantInt(Ty, 1)}, {}, {}, DL);
3008 }
3009
3010 return nullptr;
3011}
3012
3013/// Convert recipes masked by the EVL-based header mask into VP-intrinsic-based
3014/// recipes without the mask. The transforms here must preserve the original semantics.
3016 // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3017 VPValue *HeaderMask = nullptr, *EVL = nullptr;
3020 m_VPValue(EVL))) &&
3021 match(EVL, m_EVL(m_VPValue()))) {
3022 HeaderMask = R.getVPSingleValue();
3023 break;
3024 }
3025 }
3026 if (!HeaderMask)
3027 return;
3028
3029 VPTypeAnalysis TypeInfo(Plan);
3030 SmallVector<VPRecipeBase *> OldRecipes;
3031 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3033 if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, TypeInfo, *EVL)) {
3034 NewR->insertBefore(R);
3035 for (auto [Old, New] :
3036 zip_equal(R->definedValues(), NewR->definedValues()))
3037 Old->replaceAllUsesWith(New);
3038 OldRecipes.push_back(R);
3039 }
3040 }
3041 // Erase old recipes at the end so we don't invalidate TypeInfo.
3042 for (VPRecipeBase *R : reverse(OldRecipes)) {
3043 SmallVector<VPValue *> PossiblyDead(R->operands());
3044 R->eraseFromParent();
3045 for (VPValue *Op : PossiblyDead)
3047 }
3048}
3049
3050/// After replacing the canonical IV with an EVL-based IV, fix up recipes that
3051/// use VF to use the EVL instead, to avoid incorrect updates on the penultimate
3052/// iteration.
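/// For example (illustrative): a widened pointer induction that previously
/// stepped by VF (or VFxUF) now steps by the EVL of the iteration, so the
/// final, partial iteration does not advance too far.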
3053static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3054 VPTypeAnalysis TypeInfo(Plan);
3055 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3056 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3057
3058 assert(all_of(Plan.getVF().users(),
3061 "User of VF that we can't transform to EVL.");
3062 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3064 });
3065
3066 assert(all_of(Plan.getVFxUF().users(),
3067 [&LoopRegion, &Plan](VPUser *U) {
3068 return match(U,
3069 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3070 m_Specific(&Plan.getVFxUF()))) ||
3071 isa<VPWidenPointerInductionRecipe>(U);
3072 }) &&
3073 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3074 "increment of the canonical induction.");
3075 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3076 // Only replace uses in VPWidenPointerInductionRecipe; the increment of the
3077 // canonical induction must not be updated.
3079 });
3080
3081 // Create a scalar phi to track the previous EVL if the plan contains a
3082 // fixed-order recurrence.
3083 bool ContainsFORs =
3085 if (ContainsFORs) {
3086 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3087 VPValue *MaxEVL = &Plan.getVF();
3088 // Emit VPScalarCastRecipe in preheader if VF is not a 32-bit integer.
3089 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3090 MaxEVL = Builder.createScalarZExtOrTrunc(
3091 MaxEVL, Type::getInt32Ty(Plan.getContext()),
3092 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
3093
3094 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3095 VPValue *PrevEVL = Builder.createScalarPhi(
3096 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3097
3100 for (VPRecipeBase &R : *VPBB) {
3101 VPValue *V1, *V2;
3102 if (!match(&R,
3104 m_VPValue(V1), m_VPValue(V2))))
3105 continue;
3106 VPValue *Imm = Plan.getOrAddLiveIn(
3109 Intrinsic::experimental_vp_splice,
3110 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3111 TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
3112 R.getDebugLoc());
3113 VPSplice->insertBefore(&R);
3114 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3115 }
3116 }
3117 }
3118
3119 VPValue *HeaderMask = findHeaderMask(Plan);
3120 if (!HeaderMask)
3121 return;
3122
3123 // Replace header masks with a mask equivalent to predicating by EVL:
3124 //
3125 // icmp ule widen-canonical-iv backedge-taken-count
3126 // ->
3127 // icmp ult step-vector, EVL
3128 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3129 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3130 Type *EVLType = TypeInfo.inferScalarType(&EVL);
3131 VPValue *EVLMask = Builder.createICmp(
3133 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3134 HeaderMask->replaceAllUsesWith(EVLMask);
3135}
3136
3137/// Converts a tail-folded vector loop region to step by
3138/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3139/// iteration.
3140///
3141/// - Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
3142/// replaces all uses except the canonical IV increment of
3143/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe.
3144/// VPCanonicalIVPHIRecipe is used only for loop iterations counting after
3145/// this transformation.
3146///
3147/// - The header mask is replaced with a header mask based on the EVL.
3148///
3149/// - Plans with FORs have a new phi added to keep track of the EVL of the
3150/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3151/// @llvm.vp.splice.
3152///
3153/// The function uses the following definitions:
3154/// %StartV is the canonical induction start value.
3155///
3156/// The function adds the following recipes:
3157///
3158/// vector.ph:
3159/// ...
3160///
3161/// vector.body:
3162/// ...
3163/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
3164/// [ %NextEVLIV, %vector.body ]
3165/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3166/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3167/// ...
3168/// %OpEVL = cast i32 %VPEVL to IVSize
3169/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
3170/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3171/// ...
3172///
3173/// If MaxSafeElements is provided, the function adds the following recipes:
3174/// vector.ph:
3175/// ...
3176///
3177/// vector.body:
3178/// ...
3179/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
3180/// [ %NextEVLIV, %vector.body ]
3181/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3182/// %cmp = cmp ult %AVL, MaxSafeElements
3183/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3184/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3185/// ...
3186/// %OpEVL = cast i32 %VPEVL to IVSize
3187/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
3188/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3189/// ...
3190///
3191 void VPlanTransforms::addExplicitVectorLength(
3192 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3193 if (Plan.hasScalarVFOnly())
3194 return;
3195 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3196 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3197
3198 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
3199 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3200 VPValue *StartV = CanonicalIVPHI->getStartValue();
3201
3202 // Create the ExplicitVectorLengthPhi recipe in the main loop.
3203 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
3204 EVLPhi->insertAfter(CanonicalIVPHI);
3205 VPBuilder Builder(Header, Header->getFirstNonPhi());
3206 // Create the AVL (application vector length), starting from TC -> 0 in steps
3207 // of EVL.
3208 VPPhi *AVLPhi = Builder.createScalarPhi(
3209 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3210 VPValue *AVL = AVLPhi;
3211
3212 if (MaxSafeElements) {
3213 // Clamp the AVL to MaxSafeElements so the EVL never exceeds the maximum
3214 // safe dependence distance.
3214 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3215 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3216 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3217 "safe_avl");
3218 }
3219 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3220 DebugLoc::getUnknown(), "evl");
3221
3222 auto *CanonicalIVIncrement =
3223 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
3224 Builder.setInsertPoint(CanonicalIVIncrement);
3225 VPValue *OpVPEVL = VPEVL;
3226
3227 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3228 OpVPEVL = Builder.createScalarZExtOrTrunc(
3229 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3230
3231 auto *NextEVLIV = Builder.createOverflowingOp(
3232 Instruction::Add, {OpVPEVL, EVLPhi},
3233 {CanonicalIVIncrement->hasNoUnsignedWrap(),
3234 CanonicalIVIncrement->hasNoSignedWrap()},
3235 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
3236 EVLPhi->addOperand(NextEVLIV);
3237
3238 VPValue *NextAVL = Builder.createOverflowingOp(
3239 Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},
3240 DebugLoc::getCompilerGenerated(), "avl.next");
3241 AVLPhi->addOperand(NextAVL);
3242
3243 fixupVFUsersForEVL(Plan, *VPEVL);
3244 removeDeadRecipes(Plan);
3245
3246 // Replace all uses of VPCanonicalIVPHIRecipe by
3247 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
3248 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
3249 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
3250 // TODO: support unroll factor > 1.
3251 Plan.setUF(1);
3252}
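// Worked example (illustrative, not part of the source): with trip count 10
// and a target whose EXPLICIT-VECTOR-LENGTH returns min(AVL, 4), the loop sees
// (AVL, EVL) pairs (10, 4) -> (6, 4) -> (2, 2); %avl.next then reaches 0 and
// the EVL-based IV has advanced by exactly 10 elements.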
3253
3254 void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
3255 // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
3256 // There should be only one EVL PHI in the entire plan.
3257 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
3258
3261 for (VPRecipeBase &R : VPBB->phis())
3262 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
3263 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
3264 EVLPhi = PhiR;
3265 }
3266
3267 // Early return if no EVL PHI is found.
3268 if (!EVLPhi)
3269 return;
3270
3271 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
3272 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
3273 VPValue *AVL;
3274 [[maybe_unused]] bool FoundAVL =
3275 match(EVLIncrement,
3276 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
3277 assert(FoundAVL && "Didn't find AVL?");
3278
3279 // The AVL may be capped to a safe distance.
3280 VPValue *SafeAVL;
3281 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
3282 AVL = SafeAVL;
3283
3284 VPValue *AVLNext;
3285 [[maybe_unused]] bool FoundAVLNext =
3287 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3288 assert(FoundAVLNext && "Didn't find AVL backedge?");
3289
3290 // Convert EVLPhi to concrete recipe.
3291 auto *ScalarR =
3292 VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
3293 EVLPhi->getDebugLoc(), "evl.based.iv");
3294 EVLPhi->replaceAllUsesWith(ScalarR);
3295 EVLPhi->eraseFromParent();
3296
3297 // Replace CanonicalIVInc with EVL-PHI increment.
3298 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3299 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
3300 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
3301 m_Specific(&Plan.getVFxUF()))) &&
3302 "Unexpected canonical iv");
3303 Backedge->replaceAllUsesWith(EVLIncrement);
3304
3305 // Remove unused phi and increment.
3306 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
3307 CanonicalIVIncrement->eraseFromParent();
3308 CanonicalIV->eraseFromParent();
3309
3310 // Replace the use of VectorTripCount in the latch-exiting block.
3311 // Before: (branch-on-cond (icmp eq EVLIVInc, VectorTripCount))
3312 // After: (branch-on-cond icmp eq AVLNext, 0)
3313 VPBasicBlock *LatchExiting =
3314 HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
3315 auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
3316 if (match(LatchExitingBr, m_BranchOnCond(m_True())))
3317 return;
3318
3319 assert(match(LatchExitingBr, m_BranchOnCond(m_SpecificCmp(
3320 CmpInst::ICMP_EQ, m_VPValue(EVLIncrement),
3321 m_Specific(&Plan.getVectorTripCount())))) &&
3322 "Expected BranchOnCond with ICmp comparing EVL increment with vector "
3323 "trip count");
3324
3325 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3326 VPBuilder Builder(LatchExitingBr);
3327 LatchExitingBr->setOperand(0,
3328 Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
3329 Plan.getConstantInt(AVLTy, 0)));
3330}
3331
3332 void VPlanTransforms::replaceSymbolicStrides(
3333 VPlan &Plan, PredicatedScalarEvolution &PSE,
3334 const DenseMap<Value *, const SCEV *> &StridesMap) {
3335 // Replace VPValues for strides that predicated scalar evolution guarantees
3336 // to be known constants.
3337 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3338 auto *R = cast<VPRecipeBase>(&U);
3339 return R->getRegion() ||
3340 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3341 };
3342 ValueToSCEVMapTy RewriteMap;
3343 for (const SCEV *Stride : StridesMap.values()) {
3344 using namespace SCEVPatternMatch;
3345 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3346 const APInt *StrideConst;
3347 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3348 // Only handle constant strides for now.
3349 continue;
3350
3351 auto *CI = Plan.getConstantInt(*StrideConst);
3352 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3353 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3354
3355 // The versioned value may not be used in the loop directly but through a
3356 // sext/zext. Add new live-ins in those cases.
3357 for (Value *U : StrideV->users()) {
3358 if (!isa<SExtInst, ZExtInst>(U))
3359 continue;
3360 VPValue *StrideVPV = Plan.getLiveIn(U);
3361 if (!StrideVPV)
3362 continue;
3363 unsigned BW = U->getType()->getScalarSizeInBits();
3364 APInt C =
3365 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3366 VPValue *CI = Plan.getConstantInt(C);
3367 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3368 }
3369 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3370 }
3371
3372 for (VPRecipeBase &R : *Plan.getEntry()) {
3373 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3374 if (!ExpSCEV)
3375 continue;
3376 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3377 auto *NewSCEV =
3378 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3379 if (NewSCEV != ScevExpr) {
3380 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3381 ExpSCEV->replaceAllUsesWith(NewExp);
3382 if (Plan.getTripCount() == ExpSCEV)
3383 Plan.resetTripCount(NewExp);
3384 }
3385 }
3386}
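// Illustrative example (assumed values): if runtime checks versioned the loop
// on '%stride == 1', the live-in VPValue for %stride is replaced by the
// constant 1 for users inside the vector loop region, and an entry-block
// VPExpandSCEVRecipe for e.g. (4 * %stride) is rewritten to expand plain 4.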
3387
3388 void VPlanTransforms::dropPoisonGeneratingRecipes(
3389 VPlan &Plan,
3390 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3391 // Collect recipes in the backward slice of `Root` that may generate a poison
3392 // value that is used after vectorization.
3393 SmallPtrSet<VPRecipeBase *, 16> Visited;
3394 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3395 SmallVector<VPRecipeBase *, 16> Worklist;
3396 Worklist.push_back(Root);
3397
3398 // Traverse the backward slice of Root through its use-def chain.
3399 while (!Worklist.empty()) {
3400 VPRecipeBase *CurRec = Worklist.pop_back_val();
3401
3402 if (!Visited.insert(CurRec).second)
3403 continue;
3404
3405 // Prune search if we find another recipe generating a widen memory
3406 // instruction. Widen memory instructions involved in address computation
3407 // will lead to gather/scatter instructions, which don't need to be
3408 // handled.
3409 if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,
3410 VPHeaderPHIRecipe>(CurRec))
3411 continue;
3412
3413 // This recipe contributes to the address computation of a widen
3414 // load/store. If the underlying instruction has poison-generating flags,
3415 // drop them directly.
3416 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3417 VPValue *A, *B;
3418 // Dropping disjoint from an OR may yield incorrect results, as some
3419 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3420 // for dependence analysis). Instead, replace it with an equivalent Add.
3421 // This is possible as all users of the disjoint OR only access lanes
3422 // where the operands are disjoint or poison otherwise.
3423 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3424 RecWithFlags->isDisjoint()) {
3425 VPBuilder Builder(RecWithFlags);
3426 VPInstruction *New = Builder.createOverflowingOp(
3427 Instruction::Add, {A, B}, {false, false},
3428 RecWithFlags->getDebugLoc());
3429 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3430 RecWithFlags->replaceAllUsesWith(New);
3431 RecWithFlags->eraseFromParent();
3432 CurRec = New;
3433 } else
3434 RecWithFlags->dropPoisonGeneratingFlags();
3435 } else {
3436 Instruction *Instr = dyn_cast_or_null<Instruction>(
3437 CurRec->getVPSingleValue()->getUnderlyingValue());
3438 (void)Instr;
3439 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3440 "found instruction with poison generating flags not covered by "
3441 "VPRecipeWithIRFlags");
3442 }
3443
3444 // Add new definitions to the worklist.
3445 for (VPValue *Operand : CurRec->operands())
3446 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3447 Worklist.push_back(OpDef);
3448 }
3449 });
3450
3451 // Traverse all the recipes in the VPlan and collect the poison-generating
3452 // recipes in the backward slice starting at the address of a
3453 // VPWidenMemoryRecipe or VPInterleaveRecipe.
3454 auto Iter = vp_depth_first_deep(Plan.getEntry());
3455 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3456 for (VPRecipeBase &Recipe : *VPBB) {
3457 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3458 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3459 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3460 if (AddrDef && WidenRec->isConsecutive() &&
3461 BlockNeedsPredication(UnderlyingInstr.getParent()))
3462 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3463 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3464 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3465 if (AddrDef) {
3466 // Check if any member of the interleave group needs predication.
3467 const InterleaveGroup<Instruction> *InterGroup =
3468 InterleaveRec->getInterleaveGroup();
3469 bool NeedPredication = false;
3470 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3471 I < NumMembers; ++I) {
3472 Instruction *Member = InterGroup->getMember(I);
3473 if (Member)
3474 NeedPredication |= BlockNeedsPredication(Member->getParent());
3475 }
3476
3477 if (NeedPredication)
3478 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3479 }
3480 }
3481 }
3482 }
3483}
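// Example of the disjoint-or rewrite above (illustrative): an address
// computation 'or disjoint %off, 1' feeding a consecutive masked load would be
// replaced by 'add %off, 1' with no nuw/nsw, so no poison-generating flags are
// left in the backward slice of the address.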
3484
3485 void VPlanTransforms::createInterleaveGroups(
3486 VPlan &Plan,
3487 const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
3488 &InterleaveGroups,
3489 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3490 if (InterleaveGroups.empty())
3491 return;
3492
3493 // Interleave memory: for each Interleave Group we marked earlier as relevant
3494 // for this VPlan, replace the Recipes widening its memory instructions with a
3495 // single VPInterleaveRecipe at its insertion point.
3496 VPDominatorTree VPDT(Plan);
3497 for (const auto *IG : InterleaveGroups) {
3498 auto *Start =
3499 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3500 VPIRMetadata InterleaveMD(*Start);
3501 SmallVector<VPValue *, 4> StoredValues;
3502 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3503 StoredValues.push_back(StoreR->getStoredValue());
3504 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3505 Instruction *MemberI = IG->getMember(I);
3506 if (!MemberI)
3507 continue;
3508 VPWidenMemoryRecipe *MemoryR =
3509 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3510 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3511 StoredValues.push_back(StoreR->getStoredValue());
3512 InterleaveMD.intersect(*MemoryR);
3513 }
3514
3515 bool NeedsMaskForGaps =
3516 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3517 (!StoredValues.empty() && !IG->isFull());
3518
3519 Instruction *IRInsertPos = IG->getInsertPos();
3520 auto *InsertPos =
3521 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3522
3524 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3525 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3526 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3527
3528 // Get or create the start address for the interleave group.
3529 VPValue *Addr = Start->getAddr();
3530 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3531 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3532 // We cannot re-use the address of member zero because it does not
3533 // dominate the insert position. Instead, use the address of the insert
3534 // position and create a PtrAdd adjusting it to the address of member
3535 // zero.
3536 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3537 // InsertPos or sink loads above zero members to join it.
3538 assert(IG->getIndex(IRInsertPos) != 0 &&
3539 "index of insert position shouldn't be zero");
3540 auto &DL = IRInsertPos->getDataLayout();
3541 APInt Offset(32,
3542 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3543 IG->getIndex(IRInsertPos),
3544 /*IsSigned=*/true);
3545 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3546 VPBuilder B(InsertPos);
3547 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3548 }
3549 // If the group is reverse, adjust the index to refer to the last vector
3550 // lane instead of the first. We adjust the index from the first vector
3551 // lane, rather than directly getting the pointer for lane VF - 1, because
3552 // the pointer operand of the interleaved access is supposed to be uniform.
3553 if (IG->isReverse()) {
3554 auto *ReversePtr = new VPVectorEndPointerRecipe(
3555 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3556 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3557 ReversePtr->insertBefore(InsertPos);
3558 Addr = ReversePtr;
3559 }
3560 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3561 InsertPos->getMask(), NeedsMaskForGaps,
3562 InterleaveMD, InsertPos->getDebugLoc());
3563 VPIG->insertBefore(InsertPos);
3564
3565 unsigned J = 0;
3566 for (unsigned i = 0; i < IG->getFactor(); ++i)
3567 if (Instruction *Member = IG->getMember(i)) {
3568 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3569 if (!Member->getType()->isVoidTy()) {
3570 VPValue *OriginalV = MemberR->getVPSingleValue();
3571 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3572 J++;
3573 }
3574 MemberR->eraseFromParent();
3575 }
3576 }
3577}
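// Illustrative sketch (hypothetical group): two consecutive widened loads of
// a[2*i] and a[2*i+1] that form one interleave group are replaced by a single
//   INTERLEAVE-GROUP with factor 2 at ir<%a.gep>
// whose two results take over all uses of the original widened loads.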
3578
3579/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3580/// value, phi and backedge value. In the following example:
3581///
3582/// vector.ph:
3583/// Successor(s): vector loop
3584///
3585/// <x1> vector loop: {
3586/// vector.body:
3587/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3588/// ...
3589/// EMIT branch-on-count ...
3590/// No successors
3591/// }
3592///
3593/// WIDEN-INDUCTION will get expanded to:
3594///
3595/// vector.ph:
3596/// ...
3597/// vp<%induction.start> = ...
3598/// vp<%induction.increment> = ...
3599///
3600/// Successor(s): vector loop
3601///
3602/// <x1> vector loop: {
3603/// vector.body:
3604/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3605/// ...
3606/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3607/// EMIT branch-on-count ...
3608/// No successors
3609/// }
3610static void
3611 expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
3612 VPTypeAnalysis &TypeInfo) {
3613 VPlan *Plan = WidenIVR->getParent()->getPlan();
3614 VPValue *Start = WidenIVR->getStartValue();
3615 VPValue *Step = WidenIVR->getStepValue();
3616 VPValue *VF = WidenIVR->getVFValue();
3617 DebugLoc DL = WidenIVR->getDebugLoc();
3618
3619 // The value from the original loop to which we are mapping the new induction
3620 // variable.
3621 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3622
3623 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3624 Instruction::BinaryOps AddOp;
3625 Instruction::BinaryOps MulOp;
3626 VPIRFlags Flags = *WidenIVR;
3627 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3628 AddOp = Instruction::Add;
3629 MulOp = Instruction::Mul;
3630 } else {
3631 AddOp = ID.getInductionOpcode();
3632 MulOp = Instruction::FMul;
3633 }
3634
3635 // If the phi is truncated, truncate the start and step values.
3636 VPBuilder Builder(Plan->getVectorPreheader());
3637 Type *StepTy = TypeInfo.inferScalarType(Step);
3638 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3639 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3640 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3641 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3642 // Truncation doesn't preserve WrapFlags.
3643 Flags.dropPoisonGeneratingFlags();
3644 StepTy = Ty;
3645 }
3646
3647 // Construct the initial value of the vector IV in the vector loop preheader.
3648 Type *IVIntTy =
3649 IntegerType::get(Plan->getContext(), StepTy->getScalarSizeInBits());
3650 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3651 if (StepTy->isFloatingPointTy())
3652 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3653
3654 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3655 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3656
3657 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3658 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3659 DebugLoc::getUnknown(), "induction");
3660
3661 // Create the widened phi of the vector IV.
3662 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3663 WidenIVR->getDebugLoc(), "vec.ind");
3664 WidePHI->insertBefore(WidenIVR);
3665
3666 // Create the backedge value for the vector IV.
3667 VPValue *Inc;
3668 VPValue *Prev;
3669 // If unrolled, use the increment and prev value from the operands.
3670 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3671 Inc = SplatVF;
3672 Prev = WidenIVR->getLastUnrolledPartOperand();
3673 } else {
3674 if (VPRecipeBase *R = VF->getDefiningRecipe())
3675 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3676 // Multiply the vectorization factor by the step using integer or
3677 // floating-point arithmetic as appropriate.
3678 if (StepTy->isFloatingPointTy())
3679 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3680 DL);
3681 else
3682 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3683 TypeInfo.inferScalarType(VF), DL);
3684
3685 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3686 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3687 Prev = WidePHI;
3688 }
3689
3690 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3691 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3692 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3693 WidenIVR->getDebugLoc(), "vec.ind.next");
3694
3695 WidePHI->addOperand(Next);
3696
3697 WidenIVR->replaceAllUsesWith(WidePHI);
3698}
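// Worked example (illustrative): for start 0, step 2 and VF 4, the preheader
// computes vp<%induction.start> = <0, 2, 4, 6> and
// vp<%induction.increment> = broadcast(2 * 4) = <8, 8, 8, 8>, so the widened
// phi steps through <0,2,4,6>, <8,10,12,14>, ... on each vector iteration.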
3699
3700/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3701/// initial value, phi and backedge value. In the following example:
3702///
3703/// <x1> vector loop: {
3704/// vector.body:
3705/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3706/// ...
3707/// EMIT branch-on-count ...
3708/// }
3709///
3710/// WIDEN-POINTER-INDUCTION will get expanded to:
3711///
3712/// <x1> vector loop: {
3713/// vector.body:
3714/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3715/// EMIT %mul = mul %stepvector, %step
3716/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3717/// ...
3718/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3719/// EMIT branch-on-count ...
3720/// }
3721 static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
3722 VPTypeAnalysis &TypeInfo) {
3723 VPlan *Plan = R->getParent()->getPlan();
3724 VPValue *Start = R->getStartValue();
3725 VPValue *Step = R->getStepValue();
3726 VPValue *VF = R->getVFValue();
3727
3728 assert(R->getInductionDescriptor().getKind() ==
3730 "Not a pointer induction according to InductionDescriptor!");
3731 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3732 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3733 "Recipe should have been replaced");
3734
3735 VPBuilder Builder(R);
3736 DebugLoc DL = R->getDebugLoc();
3737
3738 // Build a scalar pointer phi.
3739 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3740
3741 // Create actual address geps that use the pointer phi as base and a
3742 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3743 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3744 Type *StepTy = TypeInfo.inferScalarType(Step);
3745 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3746 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3747 VPValue *PtrAdd = Builder.createNaryOp(
3748 VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
3749 R->replaceAllUsesWith(PtrAdd);
3750
3751 // Create the backedge value for the scalar pointer phi.
3752 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3753 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3754 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3755 DL);
3756 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3757
3758 VPValue *InductionGEP =
3759 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3760 ScalarPtrPhi->addOperand(InductionGEP);
3761}
3762
3763 void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
3764 // Replace loop regions with explicit CFG.
3765 SmallVector<VPRegionBlock *> LoopRegions;
3766 for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
3767 vp_depth_first_deep(Plan.getEntry()))) {
3768 if (!R->isReplicator())
3769 LoopRegions.push_back(R);
3770 }
3771 for (VPRegionBlock *R : LoopRegions)
3772 R->dissolveToCFGLoop();
3773}
3774
3777 // The transform runs after dissolving loop regions, so all VPBasicBlocks
3778 // terminated with BranchOnTwoConds are reached via a shallow traversal.
3781 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
3782 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
3783 }
3784
3785 // Expand BranchOnTwoConds instructions into explicit CFG with two new
3786 // single-condition branches:
3787 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
3788 // the first condition is true, and otherwise jumps to a new interim block.
3789 // 2. A branch that ends the interim block, jumps to the second successor if
3790 // the second condition is true, and otherwise jumps to the third
3791 // successor.
3792 for (VPInstruction *Br : WorkList) {
3793 assert(Br->getNumOperands() == 2 &&
3794 "BranchOnTwoConds must have exactly 2 conditions");
3795 DebugLoc DL = Br->getDebugLoc();
3796 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
3797 const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
3798 assert(Successors.size() == 3 &&
3799 "BranchOnTwoConds must have exactly 3 successors");
3800
3801 for (VPBlockBase *Succ : Successors)
3802 VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
3803
3804 VPValue *Cond0 = Br->getOperand(0);
3805 VPValue *Cond1 = Br->getOperand(1);
3806 VPBlockBase *Succ0 = Successors[0];
3807 VPBlockBase *Succ1 = Successors[1];
3808 VPBlockBase *Succ2 = Successors[2];
3809 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
3810 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
3811
3812 VPBasicBlock *InterimBB =
3813 Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
3814
3815 VPBuilder(BrOnTwoCondsBB)
3817 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
3818 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
3819
3821 VPBlockUtils::connectBlocks(InterimBB, Succ1);
3822 VPBlockUtils::connectBlocks(InterimBB, Succ2);
3823 Br->eraseFromParent();
3824 }
3825}
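// Resulting CFG for one expansion (illustrative, assumed block names):
//   bb:          branch-on-cond %c0 -> [succ0, bb.interim]
//   bb.interim:  branch-on-cond %c1 -> [succ1, succ2]
// replacing the original 'branch-on-two-conds %c0, %c1 -> [succ0, succ1,
// succ2]'.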
3826
3827 void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
3828 VPTypeAnalysis TypeInfo(Plan);
3829 SmallVector<VPRecipeBase *> ToRemove;
3830 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3831 vp_depth_first_deep(Plan.getEntry()))) {
3832 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3833 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3834 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3835 ToRemove.push_back(WidenIVR);
3836 continue;
3837 }
3838
3839 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3840 // If the recipe only generates scalars, scalarize it instead of
3841 // expanding it.
3842 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
3843 VPBuilder Builder(WidenIVR);
3844 VPValue *PtrAdd =
3845 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
3846 WidenIVR->replaceAllUsesWith(PtrAdd);
3847 ToRemove.push_back(WidenIVR);
3848 continue;
3849 }
3850 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3851 ToRemove.push_back(WidenIVR);
3852 continue;
3853 }
3854
3855 // Expand VPBlendRecipe into VPInstruction::Select.
3856 VPBuilder Builder(&R);
3857 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3858 VPValue *Select = Blend->getIncomingValue(0);
3859 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3860 Select = Builder.createSelect(Blend->getMask(I),
3861 Blend->getIncomingValue(I), Select,
3862 R.getDebugLoc(), "predphi");
3863 Blend->replaceAllUsesWith(Select);
3864 ToRemove.push_back(Blend);
3865 }
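// For example (illustrative), BLEND %x = %a %b/M1 %c/M2 becomes
//   %predphi = select M1, %b, %a
//   %x       = select M2, %c, %predphi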
3866
3867 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3868 Expr->decompose();
3869 ToRemove.push_back(Expr);
3870 }
3871
3872 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
3873 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
3874 if (LastActiveL &&
3875 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
3876 // Create Not(Mask) for all operands.
3877 SmallVector<VPValue *> NotMasks;
3878 for (VPValue *Op : LastActiveL->operands()) {
3879 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
3880 NotMasks.push_back(NotMask);
3881 }
3882
3883 // Create FirstActiveLane on the inverted masks.
3884 VPValue *FirstInactiveLane = Builder.createNaryOp(
3885 VPInstruction::FirstActiveLane, NotMasks,
3886 LastActiveL->getDebugLoc(), "first.inactive.lane");
3887
3888 // Subtract 1 to get the last active lane.
3889 VPValue *One = Plan.getOrAddLiveIn(
3890 ConstantInt::get(Type::getInt64Ty(Plan.getContext()), 1));
3891 VPValue *LastLane = Builder.createNaryOp(
3892 Instruction::Sub, {FirstInactiveLane, One},
3893 LastActiveL->getDebugLoc(), "last.active.lane");
3894
3895 LastActiveL->replaceAllUsesWith(LastLane);
3896 ToRemove.push_back(LastActiveL);
3897 continue;
3898 }
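// Worked example (illustrative): for a mask <1, 1, 1, 0>, the inverted mask is
// <0, 0, 0, 1>, its first active lane is 3, and 3 - 1 = 2 is the last active
// lane of the original mask.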
3899
3900 // Lower BranchOnCount to ICmp + BranchOnCond.
3901 VPValue *IV, *TC;
3902 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
3903 auto *BranchOnCountInst = cast<VPInstruction>(&R);
3904 DebugLoc DL = BranchOnCountInst->getDebugLoc();
3905 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
3906 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
3907 ToRemove.push_back(BranchOnCountInst);
3908 continue;
3909 }
3910
3911 VPValue *VectorStep;
3912 VPValue *ScalarStep;
3913 if (!match(&R, m_VPInstruction<VPInstruction::WideIVStep>(
3914 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
3915 continue;
3916
3917 // Expand WideIVStep.
3918 auto *VPI = cast<VPInstruction>(&R);
3919 Type *IVTy = TypeInfo.inferScalarType(VPI);
3920 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
3921 Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
3922 ? Instruction::UIToFP
3923 : Instruction::Trunc;
3924 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
3925 }
3926
3927 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
3928 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
3929 ScalarStep =
3930 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
3931 }
3932
3933 VPIRFlags Flags;
3934 if (IVTy->isFloatingPointTy())
3935 Flags = {VPI->getFastMathFlags()};
3936
3937 unsigned MulOpc =
3938 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
3939 VPInstruction *Mul = Builder.createNaryOp(
3940 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
3941 VectorStep = Mul;
3942 VPI->replaceAllUsesWith(VectorStep);
3943 ToRemove.push_back(VPI);
3944 }
3945 }
3946
3947 for (VPRecipeBase *R : ToRemove)
3948 R->eraseFromParent();
3949}
3950
3951 static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
3952 VPBasicBlock *EarlyExitVPBB,
3953 VPlan &Plan,
3954 VPBasicBlock *HeaderVPBB,
3955 VPBasicBlock *LatchVPBB) {
3956 auto *MiddleVPBB = cast<VPBasicBlock>(LatchVPBB->getSuccessors()[0]);
3957 if (!EarlyExitVPBB->getSinglePredecessor() &&
3958 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
3959 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
3960 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
3961 "unsupported early exit VPBB");
3962 // Early exit operand should always be last phi operand. If EarlyExitVPBB
3963 // has two predecessors and EarlyExitingVPBB is the first, swap the operands
3964 // of the phis.
3965 for (VPRecipeBase &R : EarlyExitVPBB->phis())
3966 cast<VPIRPhi>(&R)->swapOperands();
3967 }
3968
3969 VPBuilder Builder(LatchVPBB->getTerminator());
3970 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
3971 assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
3972 "Terminator must be be BranchOnCond");
3973 VPValue *CondOfEarlyExitingVPBB =
3974 EarlyExitingVPBB->getTerminator()->getOperand(0);
3975 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
3976 ? CondOfEarlyExitingVPBB
3977 : Builder.createNot(CondOfEarlyExitingVPBB);
3978
3979 // Create a BranchOnTwoConds in the latch that branches to:
3980 // [0] vector.early.exit, [1] middle block, [2] header (continue looping).
3981 VPValue *IsEarlyExitTaken =
3982 Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
3983 VPBasicBlock *VectorEarlyExitVPBB =
3984 Plan.createVPBasicBlock("vector.early.exit");
3985 VectorEarlyExitVPBB->setParent(EarlyExitVPBB->getParent());
3986
3987 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
3988
3989 // Update the exit phis in the early exit block.
3990 VPBuilder MiddleBuilder(MiddleVPBB);
3991 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
3992 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
3993 auto *ExitIRI = cast<VPIRPhi>(&R);
3994 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
3995 // a single predecessor and 1 if it has two.
3996 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
3997 if (ExitIRI->getNumOperands() != 1) {
3998 // The first of two operands corresponds to the latch exit, via MiddleVPBB
3999 // predecessor. Extract its final lane.
4000 ExitIRI->extractLastLaneOfLastPartOfFirstOperand(MiddleBuilder);
4001 }
4002
4003 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
4004 if (!isa<VPIRValue>(IncomingFromEarlyExit)) {
4005 // Update the incoming value from the early exit.
4006 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
4007 VPInstruction::FirstActiveLane, {CondToEarlyExit},
4008 DebugLoc::getUnknown(), "first.active.lane");
4009 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
4010 VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
4011 DebugLoc::getUnknown(), "early.exit.value");
4012 ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
4013 }
4014 }
4015
4016 // Replace the conditional branch controlling the latch exit from the vector
4017 // loop with a multi-conditional branch exiting to vector early exit if the
4018 // early exit has been taken, exiting to middle block if the original
4019 // condition of the vector latch is true, otherwise continuing back to header.
4020 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4021 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
4022 "Unexpected terminator");
4023 auto *IsLatchExitTaken =
4024 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
4025 LatchExitingBranch->getOperand(1));
4026
4027 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4028 LatchExitingBranch->eraseFromParent();
4029
4030 Builder.setInsertPoint(LatchVPBB);
4031 Builder.createNaryOp(VPInstruction::BranchOnTwoConds,
4032 {IsEarlyExitTaken, IsLatchExitTaken}, LatchDL);
4033 LatchVPBB->clearSuccessors();
4034 LatchVPBB->setSuccessors({VectorEarlyExitVPBB, MiddleVPBB, HeaderVPBB});
4035 VectorEarlyExitVPBB->setPredecessors({LatchVPBB});
4036}
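// Resulting latch terminator (illustrative sketch, assumed names):
//   %any.early  = any-of %cond.to.early.exit
//   %latch.done = icmp eq %iv.next, %vec.tc
//   branch-on-two-conds %any.early, %latch.done
//     -> [vector.early.exit, middle.block, vector.body]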
4037
4038 /// This function tries to convert extended in-loop reductions to
4039 /// VPExpressionRecipe and clamps the \p Range if doing so is beneficial and
4040/// valid. The created recipe must be decomposed to its constituent
4041/// recipes before execution.
4042static VPExpressionRecipe *
4044 VFRange &Range) {
4045 Type *RedTy = Ctx.Types.inferScalarType(Red);
4046 VPValue *VecOp = Red->getVecOp();
4047
4048 // Clamp the range if using extended-reduction is profitable.
4049 auto IsExtendedRedValidAndClampRange =
4050 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4052 [&](ElementCount VF) {
4053 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4055
4057 InstructionCost ExtCost =
4058 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4059 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4060
4061 if (Red->isPartialReduction()) {
4064 // FIXME: Move partial reduction creation, costing and clamping
4065 // here from LoopVectorize.cpp.
4066 ExtRedCost = Ctx.TTI.getPartialReductionCost(
4067 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
4068 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind,
4069 RedTy->isFloatingPointTy()
4070 ? std::optional{Red->getFastMathFlags()}
4071 : std::nullopt);
4072 } else if (!RedTy->isFloatingPointTy()) {
4073 // TTI::getExtendedReductionCost only supports integer types.
4074 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4075 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4076 Red->getFastMathFlags(), CostKind);
4077 }
4078 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4079 },
4080 Range);
4081 };
4082
4083 VPValue *A;
4084 // Match reduce(ext)).
4085 if (isa<VPWidenCastRecipe>(VecOp) &&
4086 (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) ||
4087 match(VecOp, m_FPExt(m_VPValue(A)))) &&
4088 IsExtendedRedValidAndClampRange(
4089 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4090 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
4091 Ctx.Types.inferScalarType(A)))
4092 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4093
4094 return nullptr;
4095}
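// Example match (illustrative): reduce.add(zext i8 %a to i32) is bundled into
// one VPExpressionRecipe when TTI::getExtendedReductionCost for the i8 source
// is cheaper than costing the zext and the add reduction separately.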
4096
4097 /// This function tries to convert extended in-loop reductions to
4098 /// VPExpressionRecipe and clamps the \p Range if doing so is beneficial
4099/// and valid. The created VPExpressionRecipe must be decomposed to its
4100/// constituent recipes before execution. Patterns of the
4101/// VPExpressionRecipe:
4102/// reduce.add(mul(...)),
4103/// reduce.add(mul(ext(A), ext(B))),
4104 /// reduce.add(ext(mul(ext(A), ext(B)))),
4105 /// reduce.fadd(fmul(ext(A), ext(B))).
4106static VPExpressionRecipe *
4108 VPCostContext &Ctx, VFRange &Range) {
4109 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4110 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4111 Opcode != Instruction::FAdd)
4112 return nullptr;
4113
4114 Type *RedTy = Ctx.Types.inferScalarType(Red);
4115
4116 // Clamp the range if using multiply-accumulate-reduction is profitable.
4117 auto IsMulAccValidAndClampRange =
4119 VPWidenCastRecipe *OuterExt) -> bool {
4121 [&](ElementCount VF) {
4123 Type *SrcTy =
4124 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
4125 InstructionCost MulAccCost;
4126
4127 if (Red->isPartialReduction()) {
4128 Type *SrcTy2 =
4129 Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
4130 // FIXME: Move partial reduction creation, costing and clamping
4131 // here from LoopVectorize.cpp.
4132 MulAccCost = Ctx.TTI.getPartialReductionCost(
4133 Opcode, SrcTy, SrcTy2, RedTy, VF,
4135 Ext0->getOpcode())
4138 Ext1->getOpcode())
4140 Mul->getOpcode(), CostKind,
4141 RedTy->isFloatingPointTy()
4142 ? std::optional{Red->getFastMathFlags()}
4143 : std::nullopt);
4144 } else {
4145 // Only partial reductions support mixed or floating-point extends
4146 // at the moment.
4147 if (Ext0 && Ext1 &&
4148 (Ext0->getOpcode() != Ext1->getOpcode() ||
4149 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4150 return false;
4151
4152 bool IsZExt =
4153 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4154 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4155 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4156 SrcVecTy, CostKind);
4157 }
4158
4159 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4160 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4161 InstructionCost ExtCost = 0;
4162 if (Ext0)
4163 ExtCost += Ext0->computeCost(VF, Ctx);
4164 if (Ext1)
4165 ExtCost += Ext1->computeCost(VF, Ctx);
4166 if (OuterExt)
4167 ExtCost += OuterExt->computeCost(VF, Ctx);
4168
4169 return MulAccCost.isValid() &&
4170 MulAccCost < ExtCost + MulCost + RedCost;
4171 },
4172 Range);
4173 };
4174
4175 VPValue *VecOp = Red->getVecOp();
4176 VPRecipeBase *Sub = nullptr;
4177 VPValue *A, *B;
4178 VPValue *Tmp = nullptr;
4179
4180 // Try to match reduce.fadd(fmul(fpext(...), fpext(...))).
4181 if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue())))) {
4182 assert(Opcode == Instruction::FAdd &&
4183 "MulAccumulateReduction from an FMul must accumulate into an FAdd "
4184 "instruction");
4185 auto *FMul = dyn_cast<VPWidenRecipe>(VecOp);
4186 if (!FMul)
4187 return nullptr;
4188
4189 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(FMul->getOperand(0));
4190 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(FMul->getOperand(1));
4191
4192 if (RecipeA && RecipeB &&
4193 IsMulAccValidAndClampRange(FMul, RecipeA, RecipeB, nullptr)) {
4194 return new VPExpressionRecipe(RecipeA, RecipeB, FMul, Red);
4195 }
4196 }
4197 if (RedTy->isFloatingPointTy())
4198 return nullptr;
4199
4200 // Sub reductions could have a sub between the add reduction and vec op.
4201 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4202 Sub = VecOp->getDefiningRecipe();
4203 VecOp = Tmp;
4204 }
4205
4206 // If ValB is a constant and can be safely extended, truncate it to the same
4207 // type as ExtA's operand, then extend it to the same type as ExtA. This
4208 // creates two uniform extends that can more easily be matched by the rest of
4209 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4210 // replaced with the new extend of the constant.
4211 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
4212 VPWidenCastRecipe *&ExtB,
4213 VPValue *&ValB, VPWidenRecipe *Mul) {
4214 if (!ExtA || ExtB || !isa<VPIRValue>(ValB))
4215 return;
4216 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
4217 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4218 const APInt *Const;
4219 if (!match(ValB, m_APInt(Const)) ||
4221 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4222 return;
4223 // The truncate ensures that the type of each extended operand is the
4224 // same, and it's been proven that the constant can be extended from
4225 // NarrowTy safely. Necessary since ExtA's extended operand would be
4226 // e.g. an i8, while the const will likely be an i32. This will be
4227 // elided by later optimisations.
4228 VPBuilder Builder(Mul);
4229 auto *Trunc =
4230 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4231 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
4232 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4233 Mul->setOperand(1, ExtB);
4234 };
4235
4236 // Try to match reduce.add(mul(...)).
4237 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4240 auto *Mul = cast<VPWidenRecipe>(VecOp);
4241
4242 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4243 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4244
4245 // Match reduce.add/sub(mul(ext, ext)).
4246 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4247 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4248 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4249 if (Sub)
4250 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4251 cast<VPWidenRecipe>(Sub), Red);
4252 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4253 }
4254 // TODO: Add an expression type for this variant with a negated mul
4255 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4256 return new VPExpressionRecipe(Mul, Red);
4257 }
4258 // TODO: Add an expression type for negated versions of other expression
4259 // variants.
4260 if (Sub)
4261 return nullptr;
4262
4263 // Match reduce.add(ext(mul(A, B))).
4264 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4265 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4266 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4269
4270 // reduce.add(ext(mul(ext, const)))
4271 // -> reduce.add(ext(mul(ext, ext(const))))
4272 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4273
4274 // reduce.add(ext(mul(ext(A), ext(B))))
4275 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4276 // The inner extends must either have the same opcode as the outer extend or
4277 // be the same, in which case the multiply can never result in a negative
4278 // value and the outer extend can be folded away by doing wider
4279 // extends for the operands of the mul.
4280 if (Ext0 && Ext1 &&
4281 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4282 Ext0->getOpcode() == Ext1->getOpcode() &&
4283 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
4284 auto *NewExt0 = new VPWidenCastRecipe(
4285 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
4286 *Ext0, *Ext0, Ext0->getDebugLoc());
4287 NewExt0->insertBefore(Ext0);
4288
4289 VPWidenCastRecipe *NewExt1 = NewExt0;
4290 if (Ext0 != Ext1) {
4291 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4292 Ext->getResultType(), nullptr, *Ext1,
4293 *Ext1, Ext1->getDebugLoc());
4294 NewExt1->insertBefore(Ext1);
4295 }
4296 Mul->setOperand(0, NewExt0);
4297 Mul->setOperand(1, NewExt1);
4298 Red->setOperand(1, Mul);
4299 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4300 }
4301 }
4302 return nullptr;
4303}
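// Example match (illustrative): reduce.add(mul(sext i8 %a, sext i8 %b)) can be
// bundled into a single multiply-accumulate expression, which targets may
// lower to dot-product style instructions, provided
// TTI::getMulAccReductionCost beats the sum of the separate ext, mul and
// reduction costs.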
4304
4305 /// This function tries to create abstract recipes from the reduction recipe
4306 /// for subsequent optimizations and cost estimation.
4308 VPCostContext &Ctx,
4309 VFRange &Range) {
4310 VPExpressionRecipe *AbstractR = nullptr;
4311 auto IP = std::next(Red->getIterator());
4312 auto *VPBB = Red->getParent();
4313 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4314 AbstractR = MulAcc;
4315 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4316 AbstractR = ExtRed;
4317 // No abstract in-loop reduction recipe could be created.
4318 if (!AbstractR)
4319 return;
4320
4321 AbstractR->insertBefore(*VPBB, IP);
4322 Red->replaceAllUsesWith(AbstractR);
4323}
4324
4335
4336 void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
4337 if (Plan.hasScalarVFOnly())
4338 return;
4339
4340#ifndef NDEBUG
4341 VPDominatorTree VPDT(Plan);
4342#endif
4343
4344 SmallVector<VPValue *> VPValues;
4347 append_range(VPValues, Plan.getLiveIns());
4348 for (VPRecipeBase &R : *Plan.getEntry())
4349 append_range(VPValues, R.definedValues());
4350
4351 auto *VectorPreheader = Plan.getVectorPreheader();
4352 for (VPValue *VPV : VPValues) {
4354 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
4355 continue;
4356
4357 // Add explicit broadcast at the insert point that dominates all users.
4358 VPBasicBlock *HoistBlock = VectorPreheader;
4359 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4360 for (VPUser *User : VPV->users()) {
4361 if (User->usesScalars(VPV))
4362 continue;
4363 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4364 HoistPoint = HoistBlock->begin();
4365 else
4366 assert(VPDT.dominates(VectorPreheader,
4367 cast<VPRecipeBase>(User)->getParent()) &&
4368 "All users must be in the vector preheader or dominated by it");
4369 }
4370
4371 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4372 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
4373 VPV->replaceUsesWithIf(Broadcast,
4374 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4375 return Broadcast != &U && !U.usesScalars(VPV);
4376 });
4377 }
4378}
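// Illustrative effect (assumed example): a loop-invariant live-in %x with a
// widened user inside the loop gets an explicit 'vp<%b> = broadcast %x' in the
// vector preheader; vector users are rewired to vp<%b> while scalar users keep
// using %x directly.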
4379
4381 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4382
4383 // Collect candidate loads with invariant addresses and noalias scopes
4384 // metadata and memory-writing recipes with noalias metadata.
4388 vp_depth_first_shallow(LoopRegion->getEntry()))) {
4389 for (VPRecipeBase &R : *VPBB) {
4390 // Only handle single-scalar replicated loads with invariant addresses.
4391 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4392 if (RepR->isPredicated() || !RepR->isSingleScalar() ||
4393 RepR->getOpcode() != Instruction::Load)
4394 continue;
4395
4396 VPValue *Addr = RepR->getOperand(0);
4397 if (Addr->isDefinedOutsideLoopRegions()) {
4399 if (!Loc.AATags.Scope)
4400 continue;
4401 CandidateLoads.push_back({RepR, Loc});
4402 }
4403 }
4404 if (R.mayWriteToMemory()) {
4405 auto Loc = vputils::getMemoryLocation(R);
4406 if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
4407 return;
4408 Stores.push_back(*Loc);
4409 }
4410 }
4411 }
4412
4413 VPBasicBlock *Preheader = Plan.getVectorPreheader();
4414 for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
4415 // Hoist the load to the preheader if it doesn't alias with any stores
4416 // according to the noalias metadata. Other loads should have been hoisted
4417 // by earlier passes.
4418 const AAMDNodes &LoadAA = LoadLoc.AATags;
4419 if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
4421 LoadAA.Scope, StoreLoc.AATags.NoAlias);
4422 })) {
4423 LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
4424 }
4425 }
4426}
4427
4428// Collect common metadata from a group of replicate recipes by intersecting
4429// metadata from all recipes in the group.
4431 VPIRMetadata CommonMetadata = *Recipes.front();
4432 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4433 CommonMetadata.intersect(*Recipe);
4434 return CommonMetadata;
4435}
4436
4437template <unsigned Opcode>
4441 const Loop *L) {
4442 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4443 "Only Load and Store opcodes supported");
4444 constexpr bool IsLoad = (Opcode == Instruction::Load);
4445 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4446 VPTypeAnalysis TypeInfo(Plan);
4447
4448 // Group predicated operations by their address SCEV.
4450 for (VPBlockBase *Block : vp_depth_first_shallow(LoopRegion->getEntry())) {
4451 auto *VPBB = cast<VPBasicBlock>(Block);
4452 for (VPRecipeBase &R : *VPBB) {
4453 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
4454 if (!RepR || RepR->getOpcode() != Opcode || !RepR->isPredicated())
4455 continue;
4456
4457 // For loads, operand 0 is address; for stores, operand 1 is address.
4458 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
4459 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
4460 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
4461 RecipesByAddress[AddrSCEV].push_back(RepR);
4462 }
4463 }
4464
4465 // For each address, collect operations with the same or complementary masks.
4467 auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4468 return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
4469 };
4470 for (auto &[Addr, Recipes] : RecipesByAddress) {
4471 if (Recipes.size() < 2)
4472 continue;
4473
4474 // Collect groups with the same or complementary masks.
4475 for (VPReplicateRecipe *&RecipeI : Recipes) {
4476 if (!RecipeI)
4477 continue;
4478
4479 VPValue *MaskI = RecipeI->getMask();
4480 Type *TypeI = GetLoadStoreValueType(RecipeI);
4481 SmallVector<VPReplicateRecipe *> Group;
4482 Group.push_back(RecipeI);
4483 RecipeI = nullptr;
4484
4485 // Find all operations with the same or complementary masks.
4486 bool HasComplementaryMask = false;
4487 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4488 if (!RecipeJ)
4489 continue;
4490
4491 VPValue *MaskJ = RecipeJ->getMask();
4492 Type *TypeJ = GetLoadStoreValueType(RecipeJ);
4493 if (TypeI == TypeJ) {
4494 // Check if any operation in the group has a complementary mask with
4495 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4496 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4497 match(MaskJ, m_Not(m_Specific(MaskI)));
4498 Group.push_back(RecipeJ);
4499 RecipeJ = nullptr;
4500 }
4501 }
4502
4503 if (HasComplementaryMask) {
4504 assert(Group.size() >= 2 && "must have at least 2 entries");
4505 AllGroups.push_back(std::move(Group));
4506 }
4507 }
4508 }
4509
4510 return AllGroups;
4511}
4512
4513// Find the recipe with minimum alignment in the group.
4514template <typename InstType>
4515static VPReplicateRecipe *
4517 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4518 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4519 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4520 });
4521}
4522
4525 const Loop *L) {
4526 auto Groups =
4528 if (Groups.empty())
4529 return;
4530
4531 VPDominatorTree VPDT(Plan);
4532
4533 // Process each group of loads.
4534 for (auto &Group : Groups) {
4535 // Sort loads by dominance order, with earliest (most dominating) first.
4536 sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4537 return VPDT.properlyDominates(A, B);
4538 });
4539
4540 // Try to use the earliest (most dominating) load to replace all others.
4541 VPReplicateRecipe *EarliestLoad = Group[0];
4542 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4543 VPBasicBlock *LastBB = Group.back()->getParent();
4544
4545 // Check that the load doesn't alias with stores between first and last.
4546 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4547 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
4548 continue;
4549
4550 // Collect common metadata from all loads in the group.
4551 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4552
4553 // Find the load with minimum alignment to use.
4554 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4555
4556 // Create an unpredicated version of the earliest load with common
4557 // metadata.
4558 auto *UnpredicatedLoad = new VPReplicateRecipe(
4559 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4560 /*IsSingleScalar=*/false, /*Mask=*/nullptr, *EarliestLoad,
4561 CommonMetadata);
4562
4563 UnpredicatedLoad->insertBefore(EarliestLoad);
4564
4565 // Replace all loads in the group with the unpredicated load.
4566 for (VPReplicateRecipe *Load : Group) {
4567 Load->replaceAllUsesWith(UnpredicatedLoad);
4568 Load->eraseFromParent();
4569 }
4570 }
4571}
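// Illustrative example (hypothetical recipes): two predicated replicate loads
// of the same address under masks M and not(M) are replaced by one
// unpredicated load that dominates both, using the smaller of the two
// alignments and only the metadata common to the group.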
4572
4573static bool
4574 canSinkStoreWithNoAliasCheck(ArrayRef<VPReplicateRecipe *> StoresToSink,
4575 PredicatedScalarEvolution &PSE, const Loop &L,
4576 VPTypeAnalysis &TypeInfo) {
4577 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
4578 if (!StoreLoc || !StoreLoc->AATags.Scope)
4579 return false;
4580
4581 // When sinking a group of stores, all members of the group alias each other.
4582 // Skip them during the alias checks.
4583 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4584 StoresToSink.end());
4585
4586 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4587 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4588 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
4589 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
4590}
4591
4594 const Loop *L) {
4595 auto Groups =
4597 if (Groups.empty())
4598 return;
4599
4600 VPDominatorTree VPDT(Plan);
4601 VPTypeAnalysis TypeInfo(Plan);
4602
4603 for (auto &Group : Groups) {
4604 sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4605 return VPDT.properlyDominates(A, B);
4606 });
4607
4608 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L, TypeInfo))
4609 continue;
4610
4611 // Use the last (most dominated) store's location for the unconditional
4612 // store.
4613 VPReplicateRecipe *LastStore = Group.back();
4614 VPBasicBlock *InsertBB = LastStore->getParent();
4615
4616 // Collect common alias metadata from all stores in the group.
4617 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4618
4619 // Build select chain for stored values.
4620 VPValue *SelectedValue = Group[0]->getOperand(0);
4621 VPBuilder Builder(InsertBB, LastStore->getIterator());
4622
4623 for (unsigned I = 1; I < Group.size(); ++I) {
4624 VPValue *Mask = Group[I]->getMask();
4625 VPValue *Value = Group[I]->getOperand(0);
4626 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4627 Group[I]->getDebugLoc());
4628 }
4629
4630 // Find the store with minimum alignment to use.
4631 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4632
4633 // Create unconditional store with selected value and common metadata.
4634 auto *UnpredicatedStore =
4635 new VPReplicateRecipe(StoreWithMinAlign->getUnderlyingInstr(),
4636 {SelectedValue, LastStore->getOperand(1)},
4637 /*IsSingleScalar=*/false,
4638 /*Mask=*/nullptr, *LastStore, CommonMetadata);
4639 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
4640
4641 // Remove all predicated stores from the group.
4642 for (VPReplicateRecipe *Store : Group)
4643 Store->eraseFromParent();
4644 }
4645}
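// Illustrative example (hypothetical recipes): stores of %v1 under mask M and
// %v2 under not(M) to the same address become '%val = select not(M), %v2, %v1'
// followed by a single unpredicated store of %val at the last store's
// position.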
4646
4647 void VPlanTransforms::materializeConstantVectorTripCount(
4648 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4649 PredicatedScalarEvolution &PSE) {
4650 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4651 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4652
4653 VPValue *TC = Plan.getTripCount();
4654 // Skip cases for which the trip count may be non-trivial to materialize.
4655 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
4656 // tail is required.
4657 if (!Plan.hasScalarTail() ||
4658 Plan.getMiddleBlock()->getSingleSuccessor() ==
4659 Plan.getScalarPreheader() ||
4660 !isa<VPIRValue>(TC))
4661 return;
4662
4663 // Materialize vector trip counts for constants early if it can simply
4664 // be computed as (Original TC / VF * UF) * VF * UF.
4665 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4666 // tail-folded loops.
4667 ScalarEvolution &SE = *PSE.getSE();
4668 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
4669 if (!isa<SCEVConstant>(TCScev))
4670 return;
4671 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
4672 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
4673 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
4674 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4675}
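// Worked example (illustrative numbers): for a constant trip count of 1003
// with VF = 4 and UF = 2, VFxUF = 8 and the vector trip count is
// (1003 udiv 8) * 8 = 1000, leaving 3 iterations for the scalar epilogue.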
4676
4677 void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
4678 VPBasicBlock *VectorPH) {
4679 auto *BTC = Plan.getOrCreateBackedgeTakenCount();
4680 if (BTC->getNumUsers() == 0)
4681 return;
4682
4683 VPBuilder Builder(VectorPH, VectorPH->begin());
4684 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4685 auto *TCMO = Builder.createNaryOp(
4686 Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)},
4687 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
4688 BTC->replaceAllUsesWith(TCMO);
4689}
4690
4692 if (Plan.hasScalarVFOnly())
4693 return;
4694
4695 VPTypeAnalysis TypeInfo(Plan);
4696 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4697 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4699 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4700 vp_depth_first_shallow(LoopRegion->getEntry()));
4701 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
4702 // VPInstructions, excluding ones in replicate regions. Those are not
4703 // materialized explicitly yet; their vector users are still handled in
4704 // VPReplicateRecipe::execute(), via shouldPack().
4705 // TODO: materialize build vectors for replicating recipes in replicating
4706 // regions.
4707 for (VPBasicBlock *VPBB :
4708 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
4709 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4711 continue;
4712 auto *DefR = cast<VPRecipeWithIRFlags>(&R);
4713 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
4714 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4715 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
4716 };
4717 if ((isa<VPReplicateRecipe>(DefR) &&
4718 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
4719 (isa<VPInstruction>(DefR) &&
4721 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
4722 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
4723 continue;
4724
4725 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
4726 unsigned Opcode = ScalarTy->isStructTy()
4729 auto *BuildVector = new VPInstruction(Opcode, {DefR});
4730 BuildVector->insertAfter(DefR);
4731
4732 DefR->replaceUsesWithIf(
4733 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
4734 VPUser &U, unsigned) {
4735 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
4736 });
4737 }
4738 }
4739
4740 // Create explicit VPInstructions to convert vectors to scalars. The current
4741 // implementation is conservative - it may miss some cases that may or may not
4742 // be vector values. TODO: introduce Unpacks speculatively - remove them later
4743 // if they are known to operate on scalar values.
4744 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
4745 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4748 continue;
4749 for (VPValue *Def : R.definedValues()) {
4750 // Skip recipes that are single-scalar or only have their first lane
4751 // used.
4752 // TODO: The Defs skipped here may or may not be vector values.
4753 // Introduce Unpacks, and remove them later, if they are guaranteed to
4754 // produce scalar values.
4756 continue;
4757
4758 // At the moment, we create unpacks only for scalar users outside
4759 // replicate regions. Recipes inside replicate regions still extract the
4760 // required lanes implicitly.
4761 // TODO: Remove once replicate regions are unrolled completely.
4762 auto IsCandidateUnpackUser = [Def](VPUser *U) {
4763 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4764 return U->usesScalars(Def) &&
4765 (!ParentRegion || !ParentRegion->isReplicator());
4766 };
4767 if (none_of(Def->users(), IsCandidateUnpackUser))
4768 continue;
4769
4770 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
4771 if (R.isPhi())
4772 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
4773 else
4774 Unpack->insertAfter(&R);
4775 Def->replaceUsesWithIf(Unpack,
4776 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
4777 return IsCandidateUnpackUser(&U);
4778 });
4779 }
4780 }
4781 }
4782}
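// Editorial note (not part of the LLVM source): an illustrative sketch of the
// packing direction above, in abbreviated pseudo-VPlan notation. Given a
// replicating recipe whose per-lane scalars feed a widened user,
//   REPLICATE %r = ...            ; one scalar result per lane
//   WIDEN     %w = add %r, %v     ; vector user
// the transform makes the packing explicit:
//   REPLICATE %r = ...
//   EMIT      %r.pack = build-vector %r
//   WIDEN     %w = add %r.pack, %v
// Unpack is the inverse direction: a vector def with scalar users gets an
// explicit per-lane extract materialized right after it (or after the phis,
// for phi recipes).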
4783
4785 VPBasicBlock *VectorPHVPBB,
4786 bool TailByMasking,
4787 bool RequiresScalarEpilogue) {
4788 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
4789 // There's nothing to do if there are no users of the vector trip count or its
4790 // IR value has already been set.
4791 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
4792 return;
4793
4794 VPValue *TC = Plan.getTripCount();
4795 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
4796 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
4797 VPValue *Step = &Plan.getVFxUF();
4798
4799 // If the tail is to be folded by masking, round the number of iterations N
4800 // up to a multiple of Step instead of rounding down. This is done by first
4801 // adding Step-1 and then rounding down. Note that it's ok if this addition
4802 // overflows: the vector induction variable will eventually wrap to zero given
4803 // that it starts at zero and its Step is a power of two; the loop will then
4804 // exit, with the last early-exit vector comparison also producing all-true.
4805 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
4806 // is accounted for in emitIterationCountCheck that adds an overflow check.
4807 if (TailByMasking) {
4808 TC = Builder.createNaryOp(
4809 Instruction::Add,
4810 {TC, Builder.createNaryOp(Instruction::Sub,
4811 {Step, Plan.getConstantInt(TCTy, 1)})},
4812 DebugLoc::getCompilerGenerated(), "n.rnd.up");
4813 }
4814
4815 // Now we need to generate the expression for the part of the loop that the
4816 // vectorized body will execute. This is equal to N - (N % Step) if scalar
4817 // iterations are not required for correctness, or N - Step, otherwise. Step
4818 // is equal to the vectorization factor (number of SIMD elements) times the
4819 // unroll factor (number of SIMD instructions).
4820 VPValue *R =
4821 Builder.createNaryOp(Instruction::URem, {TC, Step},
4822 DebugLoc::getCompilerGenerated(), "n.mod.vf");
4823
4824 // There are cases where we *must* run at least one iteration in the remainder
4825 // loop. See the cost model for when this can happen. If the step evenly
4826 // divides the trip count, we set the remainder to be equal to the step. If
4827 // the step does not evenly divide the trip count, no adjustment is necessary
4828 // since there will already be scalar iterations. Note that the minimum
4829 // iterations check ensures that N >= Step.
4830 if (RequiresScalarEpilogue) {
4831 assert(!TailByMasking &&
4832           "requiring scalar epilogue is not supported with tail folding");
4833 VPValue *IsZero =
4834 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0));
4835 R = Builder.createSelect(IsZero, Step, R);
4836 }
4837
4838 VPValue *Res = Builder.createNaryOp(
4839 Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
4840 VectorTC.replaceAllUsesWith(Res);
4841}
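// Editorial note (not part of the LLVM source): worked examples of the cases
// above, assuming Step = VFxUF = 8.
//  * Plain case, TC = 10:      n.mod.vf = 10 % 8 = 2, n.vec = 10 - 2 = 8.
//  * Tail folded, TC = 10:     n.rnd.up = 10 + 7 = 17, 17 % 8 = 1,
//                              n.vec = 17 - 1 = 16; the final vector
//                              iteration is masked down to the 2 live lanes.
//  * Scalar epilogue required, TC = 16: 16 % 8 = 0, so the remainder is
//    forced to Step and n.vec = 16 - 8 = 8, leaving 8 scalar iterations.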
4842
4844 ElementCount VFEC) {
4845 VPBuilder Builder(VectorPH, VectorPH->begin());
4846 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4847 VPValue &VF = Plan.getVF();
4848 VPValue &VFxUF = Plan.getVFxUF();
4849 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
4850 // used.
4851 // TODO: Assert that they aren't used.
4852
4853 // If there are no users of the runtime VF, compute VFxUF by constant folding
4854 // the multiplication of VF and UF.
4855 if (VF.getNumUsers() == 0) {
4856 VPValue *RuntimeVFxUF =
4857 Builder.createElementCount(TCTy, VFEC * Plan.getUF());
4858 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
4859 return;
4860 }
4861
4862 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
4863 // vscale) * UF.
4864 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
4866 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
4868 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
4869 }
4870 VF.replaceAllUsesWith(RuntimeVF);
4871
4872 VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF());
4873 VPValue *MulByUF = Builder.createOverflowingOp(
4874 Instruction::Mul, {RuntimeVF, UF}, {true, false});
4875 VFxUF.replaceAllUsesWith(MulByUF);
4876}
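// Editorial note (not part of the LLVM source): for a scalable VF, say
// VF = vscale x 4 and UF = 2, the code above emits RuntimeVF = 4 * vscale
// (broadcast for any vector users of VF) and VFxUF = RuntimeVF * 2. If VF
// itself has no users, VFxUF is instead folded directly to 8 * vscale.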
4877
4880 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
4881
4882 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
4883 BasicBlock *EntryBB = Entry->getIRBasicBlock();
4884 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
4885 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
4887 continue;
4888 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
4889 if (!ExpSCEV)
4890 break;
4891 const SCEV *Expr = ExpSCEV->getSCEV();
4892 Value *Res =
4893 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
4894 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
4895 VPValue *Exp = Plan.getOrAddLiveIn(Res);
4896 ExpSCEV->replaceAllUsesWith(Exp);
4897 if (Plan.getTripCount() == ExpSCEV)
4898 Plan.resetTripCount(Exp);
4899 ExpSCEV->eraseFromParent();
4900 }
4902 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
4903 "before any VPIRInstructions");
4904 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
4905 // to the VPIRBasicBlock.
4906 auto EI = Entry->begin();
4907 for (Instruction &I : drop_end(*EntryBB)) {
4908 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
4909 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
4910 EI++;
4911 continue;
4912 }
4914 }
4915
4916 return ExpandedSCEVs;
4917}
4918
4919/// Returns true if \p OpV is defined by a VPWidenLoadRecipe or VPInterleaveRecipe
4920/// that can be converted to a narrower recipe. \p OpV is used by a wide recipe
4921/// that feeds a store interleave group at index \p Idx; \p WideMember0 is the
4922/// recipe feeding the same interleave group at index 0. A VPWidenLoadRecipe can
4923/// be narrowed to an index-independent load if it feeds all wide ops at all
4924/// indices (\p OpV must be the operand at index \p OpIdx of the recipe at lane
4925/// 0, \p WideMember0). A VPInterleaveRecipe can be narrowed to a wide load if
4926/// \p OpV is defined at \p Idx of a load interleave group.
4927static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
4928 VPValue *OpV, unsigned Idx) {
4929 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
4930 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
4931 if (!Member0OpR)
4932 return Member0Op == OpV;
4933 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
4934 return !W->getMask() && Member0Op == OpV;
4935 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
4936 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
4937 return false;
4938}
4939
4940/// Returns true if \p InterleaveR is a full interleave group with factor and
4941/// number of members both equal to \p VF. The interleave group must also access
4942/// the full vector register width \p VectorRegWidth.
4944 ElementCount VF,
4945 VPTypeAnalysis &TypeInfo,
4946 TypeSize VectorRegWidth) {
4947 if (!InterleaveR || InterleaveR->getMask())
4948 return false;
4949
4950 Type *GroupElementTy = nullptr;
4951 if (InterleaveR->getStoredValues().empty()) {
4952 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
4953 if (!all_of(InterleaveR->definedValues(),
4954 [&TypeInfo, GroupElementTy](VPValue *Op) {
4955 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4956 }))
4957 return false;
4958 } else {
4959 GroupElementTy =
4960 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
4961 if (!all_of(InterleaveR->getStoredValues(),
4962 [&TypeInfo, GroupElementTy](VPValue *Op) {
4963 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4964 }))
4965 return false;
4966 }
4967
4968 unsigned VFMin = VF.getKnownMinValue();
4969 TypeSize GroupSize = TypeSize::get(
4970 GroupElementTy->getScalarSizeInBits() * VFMin, VF.isScalable());
4971 const auto *IG = InterleaveR->getInterleaveGroup();
4972 return IG->getFactor() == VFMin && IG->getNumMembers() == VFMin &&
4973 GroupSize == VectorRegWidth;
4974}
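// Editorial note (not part of the LLVM source): for example, with VF = 4,
// i32 group members, interleave factor 4 and 4 members, GroupSize is
// 32 * 4 = 128 bits; the check only succeeds if that also equals
// VectorRegWidth, i.e. each member's widened value fills exactly one full
// vector register.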
4975
4976/// Returns true if \p VPV is a narrow VPValue.
4977static bool isAlreadyNarrow(VPValue *VPV) {
4978 if (isa<VPIRValue>(VPV))
4979 return true;
4980 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
4981 return RepR && RepR->isSingleScalar();
4982}
4983
4984// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
4985// a narrow variant.
4986static VPValue *
4988 auto *R = V->getDefiningRecipe();
4989 if (!R || NarrowedOps.contains(V))
4990 return V;
4991
4992 if (isAlreadyNarrow(V))
4993 return V;
4994
4995 if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(R)) {
4996 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
4997 WideMember0->setOperand(
4998 Idx,
4999 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
5000 return V;
5001 }
5002
5003 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
5004 // Narrow interleave group to wide load, as transformed VPlan will only
5005 // process one original iteration.
5006 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
5007 auto *L = new VPWidenLoadRecipe(
5008 *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
5009 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
5010 L->insertBefore(LoadGroup);
5011 NarrowedOps.insert(L);
5012 return L;
5013 }
5014
5015 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
5016 assert(RepR->isSingleScalar() &&
5017 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
5018 "must be a single scalar load");
5019 NarrowedOps.insert(RepR);
5020 return RepR;
5021 }
5022
5023 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
5024 VPValue *PtrOp = WideLoad->getAddr();
5025 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
5026 PtrOp = VecPtr->getOperand(0);
5027 // Narrow wide load to uniform scalar load, as transformed VPlan will only
5028 // process one original iteration.
5029 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
5030 /*IsUniform*/ true,
5031 /*Mask*/ nullptr, {}, *WideLoad);
5032 N->insertBefore(WideLoad);
5033 NarrowedOps.insert(N);
5034 return N;
5035}
5036
5038 TypeSize VectorRegWidth) {
5039 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5040 if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0)
5041 return;
5042
5043 VPTypeAnalysis TypeInfo(Plan);
5044
5046 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
5048 continue;
5049
5052 continue;
5053
5054 // Bail out on recipes not supported at the moment:
5055 // * phi recipes other than the canonical induction
5056 // * recipes writing to memory except interleave groups
5057 // Only support plans with a canonical induction phi.
5058 if (R.isPhi())
5059 return;
5060
5061 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
5062 if (R.mayWriteToMemory() && !InterleaveR)
5063 return;
5064
5065 // Do not narrow interleave groups if there are VectorPointer recipes and
5066 // the plan was unrolled. The recipe implicitly uses VF from
5067 // VPTransformState.
5068 // TODO: Remove restriction once the VF for the VectorPointer offset is
5069 // modeled explicitly as operand.
5070 if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
5071 return;
5072
5073 // All other ops are allowed, but we reject uses that cannot be converted
5074 // when checking all allowed consumers (store interleave groups) below.
5075 if (!InterleaveR)
5076 continue;
5077
5078 // Bail out on non-consecutive interleave groups.
5079 if (!isConsecutiveInterleaveGroup(InterleaveR, VF, TypeInfo,
5080 VectorRegWidth))
5081 return;
5082
5083 // Skip read interleave groups.
5084 if (InterleaveR->getStoredValues().empty())
5085 continue;
5086
5087 // Narrow interleave groups, if all operands are already matching narrow
5088 // ops.
5089 auto *Member0 = InterleaveR->getStoredValues()[0];
5090 if (isAlreadyNarrow(Member0) &&
5091 all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
5092 StoreGroups.push_back(InterleaveR);
5093 continue;
5094 }
5095
5096 // For now, we only support full interleave groups storing load interleave
5097 // groups.
5098 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
5099 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
5100 if (!DefR)
5101 return false;
5102 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
5103 return IR && IR->getInterleaveGroup()->isFull() &&
5104 IR->getVPValue(Op.index()) == Op.value();
5105 })) {
5106 StoreGroups.push_back(InterleaveR);
5107 continue;
5108 }
5109
5110    // Check if all values feeding InterleaveR are matching wide recipes whose
5111    // operands can be narrowed.
5112 auto *WideMember0 =
5113 dyn_cast_or_null<VPWidenRecipe>(InterleaveR->getStoredValues()[0]);
5114 if (!WideMember0)
5115 return;
5116 for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
5118 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
5119 R->getNumOperands() > 2)
5120 return;
5121 if (any_of(enumerate(R->operands()),
5122 [WideMember0, Idx = I](const auto &P) {
5123 const auto &[OpIdx, OpV] = P;
5124 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
5125 }))
5126 return;
5127 }
5128 StoreGroups.push_back(InterleaveR);
5129 }
5130
5131 if (StoreGroups.empty())
5132 return;
5133
5134 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
5135 SmallPtrSet<VPValue *, 4> NarrowedOps;
5136 // Narrow operation tree rooted at store groups.
5137 for (auto *StoreGroup : StoreGroups) {
5138 VPValue *Res =
5139 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
5140 auto *SI =
5141 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
5142 auto *S = new VPWidenStoreRecipe(
5143 *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
5144 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
5145 S->insertBefore(StoreGroup);
5146 StoreGroup->eraseFromParent();
5147 }
5148
5149 // Adjust induction to reflect that the transformed plan only processes one
5150 // original iteration.
5151 auto *CanIV = VectorLoop->getCanonicalIV();
5152 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
5153 VPBuilder PHBuilder(Plan.getVectorPreheader());
5154
5155 VPValue *UF = Plan.getOrAddLiveIn(
5156 ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF()));
5157 if (VF.isScalable()) {
5158 VPValue *VScale = PHBuilder.createElementCount(
5160 VPValue *VScaleUF = PHBuilder.createOverflowingOp(
5161 Instruction::Mul, {VScale, UF}, {true, false});
5162 Inc->setOperand(1, VScaleUF);
5163 Plan.getVF().replaceAllUsesWith(VScale);
5164 } else {
5165 Inc->setOperand(1, UF);
5167 Plan.getConstantInt(CanIV->getScalarType(), 1));
5168 }
5169 removeDeadRecipes(Plan);
5170}
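// Editorial note (not part of the LLVM source): a rough before/after sketch
// of the narrowing, in pseudo-IR, for a loop copying 4 interleaved i32 fields
// with VF = 4. The interleaved form
//   %wide = load <16 x i32> ...     ; load interleave group, factor 4
//   ... de-interleave, 4 widened ops, re-interleave ...
//   store <16 x i32> ...
// becomes a plan whose vector iteration handles one original iteration:
//   %v = load <4 x i32> ...
//   store <4 x i32> ...
// with the canonical IV stepping by UF (times vscale for scalable VFs)
// instead of VF * UF, as adjusted above.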
5171
5172/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5173/// BranchOnCond recipe.
5175 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5176 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5177 auto *MiddleTerm =
5179 // Only add branch metadata if there is a (conditional) terminator.
5180 if (!MiddleTerm)
5181 return;
5182
5183 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5184 "must have a BranchOnCond");
5185  // Assume that `TripCount % VectorStep` is equally distributed.
5186 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
5187 if (VF.isScalable() && VScaleForTuning.has_value())
5188 VectorStep *= *VScaleForTuning;
5189 assert(VectorStep > 0 && "trip count should not be zero");
5190 MDBuilder MDB(Plan.getContext());
5191 MDNode *BranchWeights =
5192 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
5193 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
5194}
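// Editorial note (not part of the LLVM source): with UF = 2 and VF = 4 the
// vector step is 8, so the middle block's BranchOnCond gets branch weights
// {1, 7}: assuming TripCount % 8 is uniform, roughly one in eight trip counts
// has no remainder and skips the scalar epilogue.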
5195
5196/// Compute and return the end value for \p WideIV, unless it is truncated. If
5197/// the induction recipe is not canonical, creates a VPDerivedIVRecipe to
5198/// compute the end value of the induction.
5200 VPBuilder &VectorPHBuilder,
5201 VPTypeAnalysis &TypeInfo,
5202 VPValue *VectorTC) {
5203 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
5204 // Truncated wide inductions resume from the last lane of their vector value
5205 // in the last vector iteration which is handled elsewhere.
5206 if (WideIntOrFp && WideIntOrFp->getTruncInst())
5207 return nullptr;
5208
5209 VPIRValue *Start = WideIV->getStartValue();
5210 VPValue *Step = WideIV->getStepValue();
5212 VPValue *EndValue = VectorTC;
5213 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
5214 EndValue = VectorPHBuilder.createDerivedIV(
5215 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
5216 Start, VectorTC, Step);
5217 }
5218
5219 // EndValue is derived from the vector trip count (which has the same type as
5220 // the widest induction) and thus may be wider than the induction here.
5221 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
5222 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
5223 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
5224 ScalarTypeOfWideIV,
5225 WideIV->getDebugLoc());
5226 }
5227
5228 return EndValue;
5229}
5230
5232 VPlan &Plan, DenseMap<VPValue *, VPValue *> &IVEndValues) {
5233 VPTypeAnalysis TypeInfo(Plan);
5234 auto *ScalarPH = Plan.getScalarPreheader();
5235 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
5236 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5237 VPBuilder VectorPHBuilder(
5238 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
5239 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5240 for (VPRecipeBase &PhiR : Plan.getScalarPreheader()->phis()) {
5241 auto *ResumePhiR = cast<VPPhi>(&PhiR);
5242
5243 // TODO: Extract final value from induction recipe initially, optimize to
5244 // pre-computed end value together in optimizeInductionExitUsers.
5245 auto *VectorPhiR = cast<VPHeaderPHIRecipe>(ResumePhiR->getOperand(0));
5246 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
5248 WideIVR, VectorPHBuilder, TypeInfo, &Plan.getVectorTripCount())) {
5249 IVEndValues[WideIVR] = EndValue;
5250 ResumePhiR->setOperand(0, EndValue);
5251 ResumePhiR->setName("bc.resume.val");
5252 continue;
5253 }
5254 // TODO: Also handle truncated inductions here. Computing end-values
5255 // separately should be done as VPlan-to-VPlan optimization, after
5256 // legalizing all resume values to use the last lane from the loop.
5257 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
5258 "should only skip truncated wide inductions");
5259 continue;
5260 }
5261
5262 // The backedge value provides the value to resume coming out of a loop,
5263 // which for FORs is a vector whose last element needs to be extracted. The
5264 // start value provides the value if the loop is bypassed.
5265 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
5266 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
5267 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5268 "Cannot handle loops with uncountable early exits");
5269 if (IsFOR) {
5270 auto *ExtractPart = MiddleBuilder.createNaryOp(
5271 VPInstruction::ExtractLastPart, ResumeFromVectorLoop);
5272 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
5274 "vector.recur.extract");
5275 }
5276 ResumePhiR->setName(IsFOR ? "scalar.recur.init" : "bc.merge.rdx");
5277 ResumePhiR->setOperand(0, ResumeFromVectorLoop);
5278 }
5279}
5280
5282 VFRange &Range) {
5283 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5284 auto *ScalarPHVPBB = Plan.getScalarPreheader();
5285 auto *MiddleVPBB = Plan.getMiddleBlock();
5286 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
5287 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5288
5289 auto IsScalableOne = [](ElementCount VF) -> bool {
5290 return VF == ElementCount::getScalable(1);
5291 };
5292
5293 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5294 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5295 if (!FOR)
5296 continue;
5297
5298 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5299 "Cannot handle loops with uncountable early exits");
5300
5301    // This is the second phase of vectorizing first-order recurrences, creating
5302    // extracts for users outside the loop. An overview of the transformation is
5303 // described below. Suppose we have the following loop with some use after
5304 // the loop of the last a[i-1],
5305 //
5306 // for (int i = 0; i < n; ++i) {
5307 // t = a[i - 1];
5308 // b[i] = a[i] - t;
5309 // }
5310 // use t;
5311 //
5312 // There is a first-order recurrence on "a". For this loop, the shorthand
5313 // scalar IR looks like:
5314 //
5315 // scalar.ph:
5316 // s.init = a[-1]
5317 // br scalar.body
5318 //
5319 // scalar.body:
5320 // i = phi [0, scalar.ph], [i+1, scalar.body]
5321 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5322 // s2 = a[i]
5323 // b[i] = s2 - s1
5324 // br cond, scalar.body, exit.block
5325 //
5326 // exit.block:
5327 // use = lcssa.phi [s1, scalar.body]
5328 //
5329    // In this example, s1 is a recurrence because its value depends on the
5330 // previous iteration. In the first phase of vectorization, we created a
5331 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5332 // for users in the scalar preheader and exit block.
5333 //
5334 // vector.ph:
5335 // v_init = vector(..., ..., ..., a[-1])
5336 // br vector.body
5337 //
5338 // vector.body
5339 // i = phi [0, vector.ph], [i+4, vector.body]
5340 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5341 // v2 = a[i, i+1, i+2, i+3]
5342 // b[i] = v2 - v1
5343    //   // Next, the third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
5344 // b[i, i+1, i+2, i+3] = v2 - v1
5345 // br cond, vector.body, middle.block
5346 //
5347 // middle.block:
5348 // vector.recur.extract.for.phi = v2(2)
5349 // vector.recur.extract = v2(3)
5350 // br cond, scalar.ph, exit.block
5351 //
5352 // scalar.ph:
5353 // scalar.recur.init = phi [vector.recur.extract, middle.block],
5354 // [s.init, otherwise]
5355 // br scalar.body
5356 //
5357 // scalar.body:
5358 // i = phi [0, scalar.ph], [i+1, scalar.body]
5359 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5360 // s2 = a[i]
5361 // b[i] = s2 - s1
5362 // br cond, scalar.body, exit.block
5363 //
5364 // exit.block:
5365 // lo = lcssa.phi [s1, scalar.body],
5366 // [vector.recur.extract.for.phi, middle.block]
5367 //
5368 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
5369 // Extract the penultimate value of the recurrence and use it as operand for
5370 // the VPIRInstruction modeling the phi.
5372 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
5374 continue;
5375
5376 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5377 // penultimate value of the recurrence. Instead we rely on the existing
5378 // extract of the last element from the result of
5379 // VPInstruction::FirstOrderRecurrenceSplice.
5380 // TODO: Consider vscale_range info and UF.
5382 Range))
5383 return;
5384 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5385 VPInstruction::ExtractPenultimateElement, FOR->getBackedgeValue(), {},
5386 "vector.recur.extract.for.phi");
5387 cast<VPInstruction>(&R)->replaceAllUsesWith(PenultimateElement);
5388 }
5389 }
5390}
5391
5392namespace {
5393
5394/// A chain of recipes that form a partial reduction. Matches either
5395/// reduction_bin_op (extend (A), accumulator), or
5396/// reduction_bin_op (bin_op (extend (A), (extend (B))), accumulator).
5397struct VPPartialReductionChain {
5398 /// The top-level binary operation that forms the reduction to a scalar
5399 /// after the loop body.
5400 VPWidenRecipe *ReductionBinOp;
5401 /// The extension of each of the inner binary operation's operands.
5402 VPWidenCastRecipe *ExtendA;
5403 VPWidenCastRecipe *ExtendB;
5404 /// The user of the extends that is then reduced.
5405 VPWidenRecipe *BinOp;
5406 unsigned ScaleFactor;
5407};
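// Editorial note (not part of the LLVM source): a canonical chain matching
// the second form above is the dot-product idiom
//   acc += sext(i8 a[i]) * sext(i8 b[i])   // i32 accumulator
// where ReductionBinOp is the widened i32 add, ExtendA/ExtendB are the two
// sext recipes, BinOp is the widened mul, and ScaleFactor is 32 / 8 = 4
// (four i8 inputs are reduced into each i32 lane of the PHI).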
5408
5409// Helper to transform a partial reduction chain into a partial reduction
5410// recipe. Returns true if transformation succeeded. Checks profitability and
5411// clamps VF range.
5412static bool transformToPartialReduction(const VPPartialReductionChain &Chain,
5413 VFRange &Range, VPCostContext &CostCtx,
5414 VPlan &Plan) {
5415 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
5416 unsigned ScaleFactor = Chain.ScaleFactor;
5417 assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
5418
5419 VPValue *BinOp = WidenRecipe->getOperand(0);
5420 VPValue *Accumulator = WidenRecipe->getOperand(1);
5421
5422 // Swap if needed to ensure Accumulator is the PHI or partial reduction.
5424 std::swap(BinOp, Accumulator);
5425
5426 // For chained reductions, only transform if accumulator is already a PHI or
5427 // partial reduction. Otherwise, it needs to be transformed first.
5428 auto *AccumRecipe = Accumulator->getDefiningRecipe();
5430 return false;
5431
5432 // Check if the partial reduction is profitable for the VF range.
5433 Type *PhiType = CostCtx.Types.inferScalarType(Accumulator);
5434
5435 // Derive extend info from the stored extends.
5436 auto GetExtInfo = [&CostCtx](VPWidenCastRecipe *Ext)
5437 -> std::pair<Type *, TargetTransformInfo::PartialReductionExtendKind> {
5438 if (!Ext)
5439 return {nullptr, TargetTransformInfo::PR_None};
5440 Type *ExtOpType = CostCtx.Types.inferScalarType(Ext->getOperand(0));
5442 static_cast<Instruction::CastOps>(Ext->getOpcode()));
5443 return {ExtOpType, ExtKind};
5444 };
5445 auto ExtInfoA = GetExtInfo(Chain.ExtendA);
5446 auto ExtInfoB = GetExtInfo(Chain.ExtendB);
5447 Type *ExtOpTypeA = ExtInfoA.first;
5448 Type *ExtOpTypeB = ExtInfoB.first;
5449 auto ExtKindA = ExtInfoA.second;
5450 auto ExtKindB = ExtInfoB.second;
5451 // If ExtendB is nullptr but there's a separate BinOp, the second operand
5452 // was a constant that can use the same extend kind as the first.
5453 if (!Chain.ExtendB && Chain.BinOp && Chain.BinOp != Chain.ReductionBinOp) {
5454 // Validate that the constant can be extended to the narrow type.
5455 const APInt *Const = nullptr;
5456 for (VPValue *Op : Chain.BinOp->operands()) {
5457 if (match(Op, m_APInt(Const)))
5458 break;
5459 }
5460 if (!Const || !canConstantBeExtended(Const, ExtOpTypeA, ExtKindA))
5461 return false;
5462 ExtOpTypeB = ExtOpTypeA;
5463 ExtKindB = ExtKindA;
5464 }
5465
5466 // BinOpc is only set when there's a separate binary op (not when BinOp is
5467 // the reduction itself).
5468 std::optional<unsigned> BinOpc =
5469 (Chain.BinOp && Chain.BinOp != Chain.ReductionBinOp)
5470 ? std::make_optional(Chain.BinOp->getOpcode())
5471 : std::nullopt;
5472
5474 [&](ElementCount VF) {
5475 return CostCtx.TTI
5477 WidenRecipe->getOpcode(), ExtOpTypeA, ExtOpTypeB, PhiType,
5478 VF, ExtKindA, ExtKindB, BinOpc, CostCtx.CostKind,
5479 PhiType->isFloatingPointTy()
5480 ? std::optional{WidenRecipe->getFastMathFlags()}
5481 : std::nullopt)
5482 .isValid();
5483 },
5484 Range))
5485 return false;
5486
5487 VPValue *Cond = nullptr;
5488 VPValue *ExitValue = nullptr;
5489 if (auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(AccumRecipe)) {
5490 assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
5491 RdxPhi->setVFScaleFactor(ScaleFactor);
5492
5493 // Update ReductionStartVector instruction scale factor.
5494 VPValue *StartValue = RdxPhi->getOperand(0);
5495 auto *StartInst = cast<VPInstruction>(StartValue);
5496 assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
5497 auto *NewScaleFactor = Plan.getConstantInt(32, ScaleFactor);
5498 StartInst->setOperand(2, NewScaleFactor);
5499
5500 // Find the ComputeReductionResult that uses the WidenRecipe (the exit
5501 // value). Look through selects for predicated reductions.
5502 if (auto *RdxResult = vputils::findComputeReductionResult(RdxPhi)) {
5503 ExitValue = RdxResult->getOperand(0);
5504 match(ExitValue, m_Select(m_VPValue(Cond), m_VPValue(), m_VPValue()));
5505 }
5506 }
5507
5508 // Handle SUB by negating the operand and using ADD for the partial reduction.
5509 if (WidenRecipe->getOpcode() == Instruction::Sub) {
5510 VPBuilder Builder(WidenRecipe);
5511 Type *ElemTy = CostCtx.Types.inferScalarType(BinOp);
5512 auto *Zero = Plan.getConstantInt(ElemTy, 0);
5513 VPIRFlags Flags = WidenRecipe->getUnderlyingInstr()
5514 ? VPIRFlags(*WidenRecipe->getUnderlyingInstr())
5515 : VPIRFlags();
5516 auto *NegRecipe = new VPWidenRecipe(Instruction::Sub, {Zero, BinOp}, Flags,
5518 Builder.insert(NegRecipe);
5519 BinOp = NegRecipe;
5520 }
5521
5522 RecurKind RdxKind =
5524 auto *PartialRed = new VPReductionRecipe(
5525 RdxKind,
5526 RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlags()
5527 : FastMathFlags(),
5528 WidenRecipe->getUnderlyingInstr(), Accumulator, BinOp, Cond,
5529 RdxUnordered{/*VFScaleFactor=*/ScaleFactor});
5530 PartialRed->insertBefore(WidenRecipe);
5531
5532 if (Cond)
5533 ExitValue->replaceAllUsesWith(PartialRed);
5534 WidenRecipe->replaceAllUsesWith(PartialRed);
5535 return true;
5536}
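// Editorial note (not part of the LLVM source): for a subtracting reduction
// such as acc -= sext(a[i]) * sext(b[i]), the code above first rewrites the
// reduced operand as (0 - sext(a[i]) * sext(b[i])) so the partial reduction
// can accumulate additively; semantically acc - x == acc + (-x), so the
// result is unchanged.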
5537
5538/// Examines reduction operations to see if the target can use a cheaper
5539/// operation with a wider per-iteration input VF and narrower PHI VF.
5540/// Recursively calls itself to identify chained scaled reductions.
5541/// Returns true if this invocation added an entry to Chains, otherwise false.
5542static bool
5543getScaledReductions(VPSingleDefRecipe *RedPhiR, VPValue *PrevValue,
5545 VPTypeAnalysis &TypeInfo) {
5546 auto *UpdateR = dyn_cast<VPWidenRecipe>(PrevValue);
5547 if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
5548 return false;
5549
5550 VPValue *Op = UpdateR->getOperand(0);
5551 VPValue *PhiOp = UpdateR->getOperand(1);
5552 if (Op == RedPhiR)
5553 std::swap(Op, PhiOp);
5554
5555 // If Op is an extend, then it's still a valid partial reduction if the
5556 // extended mul fulfills the other requirements.
5557 // For example, reduce.add(ext(mul(ext(A), ext(B)))) is still a valid partial
5558 // reduction since the inner extends will be widened. We already have oneUse
5559 // checks on the inner extends so widening them is safe.
5560 std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
5563 auto *CastRecipe = dyn_cast<VPWidenCastRecipe>(Op);
5564 if (!CastRecipe)
5565 return false;
5566 auto CastOp = static_cast<Instruction::CastOps>(CastRecipe->getOpcode());
5567 OuterExtKind = TTI::getPartialReductionExtendKind(CastOp);
5568 Op = CastRecipe->getOperand(0);
5569 }
5570
5571 // Try and get a scaled reduction from the first non-phi operand.
5572 // If one is found, we use the discovered reduction instruction in
5573 // place of the accumulator for costing.
5574 if (getScaledReductions(RedPhiR, Op, Chains, TypeInfo)) {
5575 RedPhiR = Chains.rbegin()->ReductionBinOp;
5576 Op = UpdateR->getOperand(0);
5577 PhiOp = UpdateR->getOperand(1);
5578 if (Op == RedPhiR)
5579 std::swap(Op, PhiOp);
5580 }
5581 if (RedPhiR != PhiOp)
5582 return false;
5583
5584 // If the update is a binary op, check both of its operands to see if
5585 // they are extends. Otherwise, see if the update comes directly from an
5586 // extend.
5587 VPWidenCastRecipe *CastRecipes[2] = {nullptr};
5588
5589 // Match extends and populate CastRecipes. Returns false if matching fails.
5590 auto MatchExtends = [OuterExtKind,
5591 &CastRecipes](ArrayRef<VPValue *> Operands) {
5592 assert(Operands.size() <= 2 && "expected at most 2 operands");
5593
5594 for (const auto &[I, OpVal] : enumerate(Operands)) {
5595 // Allow constant as second operand - validation happens in transform.
5596 const APInt *Unused;
5597 if (I > 0 && CastRecipes[0] && match(OpVal, m_APInt(Unused)))
5598 continue;
5599
5600 VPValue *ExtInput;
5601 if (!match(OpVal, m_ZExtOrSExt(m_VPValue(ExtInput))) &&
5602 !match(OpVal, m_FPExt(m_VPValue(ExtInput))))
5603 return false;
5604
5605 CastRecipes[I] = dyn_cast<VPWidenCastRecipe>(OpVal);
5606 if (!CastRecipes[I])
5607 return false;
5608
5609 // The outer extend kind must match the inner extends for folding.
5610 if (OuterExtKind) {
5611 auto CastOp =
5612 static_cast<Instruction::CastOps>(CastRecipes[I]->getOpcode());
5613 if (*OuterExtKind != TTI::getPartialReductionExtendKind(CastOp))
5614 return false;
5615 }
5616 }
5617 return CastRecipes[0] != nullptr;
5618 };
5619
5620 // If Op is a binary operator, check both of its operands to see if they are
5621 // extends. Otherwise, see if the update comes directly from an extend.
5622 auto *BinOp = dyn_cast<VPWidenRecipe>(Op);
5623 if (BinOp && Instruction::isBinaryOp(BinOp->getOpcode())) {
5624 if (!BinOp->hasOneUse())
5625 return false;
5626
5627 // Handle neg(binop(ext, ext)) pattern.
5628 VPValue *OtherOp = nullptr;
5629 if (match(BinOp, m_Sub(m_ZeroInt(), m_VPValue(OtherOp))))
5630 BinOp = dyn_cast<VPWidenRecipe>(OtherOp);
5631
5632 if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()) ||
5633 !MatchExtends(BinOp->operands()))
5634 return false;
5635 } else if (match(UpdateR, m_Add(m_VPValue(), m_VPValue())) ||
5636 match(UpdateR, m_FAdd(m_VPValue(), m_VPValue()))) {
5637 // We already know the operands for Update are Op and PhiOp.
5638 if (!MatchExtends({Op}))
5639 return false;
5640 BinOp = UpdateR;
5641 } else {
5642 return false;
5643 }
5644
5645 Type *PhiType = TypeInfo.inferScalarType(RedPhiR);
5646 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
5647 Type *ExtOpType = TypeInfo.inferScalarType(CastRecipes[0]->getOperand(0));
5648 TypeSize ASize = ExtOpType->getPrimitiveSizeInBits();
5649 if (!PHISize.hasKnownScalarFactor(ASize))
5650 return false;
5651
5652 Chains.push_back(
5653 {UpdateR, CastRecipes[0], CastRecipes[1], BinOp,
5654 static_cast<unsigned>(PHISize.getKnownScalarFactor(ASize))});
5655 return true;
5656}
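// Editorial note (not part of the LLVM source): the recursive call above is
// what handles chained reductions such as
//   acc = acc + mul(sext(a), sext(b)) + mul(sext(c), sext(d))
// (two updates per iteration): the inner scaled reduction discovered first
// stands in for the accumulator PHI when the outer update is matched and
// costed.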
5657} // namespace
5658
5660 VPCostContext &CostCtx,
5661 VFRange &Range) {
5662 // Find all possible partial reductions, grouping chains by their PHI. This
5663 // grouping allows invalidating the whole chain, if any link is not a valid
5664 // partial reduction.
5666 ChainsByPhi;
5667 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
5668 for (VPRecipeBase &R : HeaderVPBB->phis()) {
5669 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
5670 if (!RedPhiR)
5671 continue;
5672
5673 // Get the backedge value from the reduction PHI and find the
5674 // ComputeReductionResult that uses it (directly or through a select for
5675 // predicated reductions).
5676 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
5677 VPValue *ExitValue = RdxResult->getOperand(0);
5678 match(ExitValue,
5679 m_Select(m_VPValue(), m_VPValue(ExitValue), m_VPValue()));
5680 getScaledReductions(RedPhiR, ExitValue, ChainsByPhi[RedPhiR],
5681 CostCtx.Types);
5682 }
5683 }
5684
5685 if (ChainsByPhi.empty())
5686 return;
5687
5688 // Build set of partial reduction operations for extend user validation and
5689 // a map of reduction bin ops to their scale factors for scale validation.
5690 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
5691 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
5692 for (const auto &[_, Chains] : ChainsByPhi)
5693 for (const VPPartialReductionChain &Chain : Chains) {
5694 PartialReductionOps.insert(Chain.BinOp);
5695 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
5696 }
5697
5698 // A partial reduction is invalid if any of its extends are used by
5699 // something that isn't another partial reduction. This is because the
5700 // extends are intended to be lowered along with the reduction itself.
5701 auto ExtendUsersValid = [&](VPWidenCastRecipe *Ext) {
5702 return !Ext || all_of(Ext->users(), [&](VPUser *U) {
5703 return PartialReductionOps.contains(cast<VPRecipeBase>(U));
5704 });
5705 };
5706
5707  // Validate chains: check that extends are only used by partial reductions,
5708  // and that reduction bin ops are only used by other partial reductions with
5709  // matching scale factors, by users outside the loop region, or by the select
5710  // introduced by tail-folding. Otherwise we would create users of scaled
5711  // reductions where the types of the other operands don't match.
5712 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
5713 for (const VPPartialReductionChain &Chain : Chains) {
5714 if (!ExtendUsersValid(Chain.ExtendA) ||
5715 !ExtendUsersValid(Chain.ExtendB)) {
5716 Chains.clear();
5717 break;
5718 }
5719 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
5720 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
5721 return PhiR == RedPhiR;
5722 auto *R = cast<VPSingleDefRecipe>(U);
5723 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
5725 m_Specific(Chain.ReductionBinOp))) ||
5726 match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
5727 m_Specific(RedPhiR)));
5728 };
5729 if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
5730 Chains.clear();
5731 break;
5732 }
5733
5734 // Check if the compute-reduction-result is used by a sunk store.
5735 // TODO: Also form partial reductions in those cases.
5736 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
5737 if (any_of(RdxResult->users(), [](VPUser *U) {
5738 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
5739 return RepR && isa<StoreInst>(RepR->getUnderlyingInstr());
5740 })) {
5741 Chains.clear();
5742 break;
5743 }
5744 }
5745 }
5746 }
5747
5748 for (const auto &[_, Chains] : ChainsByPhi)
5749 for (const VPPartialReductionChain &Chain : Chains)
5750 transformToPartialReduction(Chain, Range, CostCtx, Plan);
5751}
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1023
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1527
APInt abs() const
Get the absolute value.
Definition APInt.h:1810
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:996
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
iterator begin() const
Definition ArrayRef.h:130
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:215
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1558
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
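A sketch of the createBranchWeights API; the 1:127 ratio is illustrative only, not the weighting computed by addBranchWeightToMiddleTerminator:
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
// Attach !prof branch weights to an existing conditional branch.
static void addIllustrativeWeights(llvm::BranchInst *Br) {
  llvm::MDBuilder MDB(Br->getContext());
  llvm::MDNode *Weights =
      MDB.createBranchWeights(/*TrueWeight=*/1, /*FalseWeight=*/127);
  Br->setMetadata(llvm::LLVMContext::MD_prof, Weights);
}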
Metadata node.
Definition Metadata.h:1080
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
bool empty() const
Definition MapVector.h:77
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyzed scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
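A minimal sketch of how these ScalarEvolution helpers combine, assuming a trip-count IR value and an ElementCount as inputs; this is not the exact query the transforms perform:
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/Support/TypeSize.h"
// Ask SCEV whether the trip count is known to be at least one full VF step.
static bool tripCountAtLeastVF(llvm::ScalarEvolution &SE,
                               llvm::Value *TripCountV,
                               llvm::ElementCount VF) {
  const llvm::SCEV *TC = SE.getSCEV(TripCountV);
  const llvm::SCEV *Step = SE.getElementCount(TC->getType(), VF);
  return SE.isKnownPredicate(llvm::CmpInst::ICMP_UGE, TC, Step);
}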
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
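The SetVector entries above support the deterministic worklist idiom used throughout VPlan transforms. A sketch that only walks successor edges (it deliberately ignores region entry/exit structure) and assumes the in-tree VPlan.h header is visible:
#include "VPlan.h"
#include "llvm/ADT/SetVector.h"
// Count blocks reachable from Entry via successor edges, visiting each once.
static unsigned countReachable(llvm::VPBlockBase *Entry) {
  llvm::SetVector<llvm::VPBlockBase *> Worklist;
  Worklist.insert(Entry);
  // insert() is a no-op for blocks already seen; iteration and indexing follow
  // insertion order, which keeps the traversal deterministic.
  for (unsigned I = 0; I != Worklist.size(); ++I)
    for (llvm::VPBlockBase *Succ : Worklist[I]->getSuccessors())
      Worklist.insert(Succ);
  return Worklist.size();
}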
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values.
Definition Twine.h:82
static constexpr TypeSize get(ScalarTy Quantity, bool Scalable)
Definition TypeSize.h:340
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
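A sketch of the TypeSwitch dispatch pattern over recipes; the categories are illustrative and the VPlan.h visibility is an assumption, as in this file:
#include "VPlan.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/TypeSwitch.h"
// Classify a recipe by dynamic type without writing a dyn_cast chain.
static llvm::StringRef classifyRecipe(llvm::VPRecipeBase *R) {
  return llvm::TypeSwitch<llvm::VPRecipeBase *, llvm::StringRef>(R)
      .Case<llvm::VPWidenMemoryRecipe>([](auto *) { return "memory"; })
      .Case<llvm::VPHeaderPHIRecipe>([](auto *) { return "header-phi"; })
      .Default([](auto *) { return "other"; });
}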
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
op_range operands()
Definition User.h:267
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3722
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4090
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4165
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4117
iterator end()
Definition VPlan.h:4127
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4125
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4178
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:230
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:591
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:563
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:637
const VPRecipeBase & back() const
Definition VPlan.h:4139
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2626
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2660
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2650
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2666
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2646
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:81
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccs as successor of this VPBlockBase.
Definition VPlan.h:300
VPRegionBlock * getParent()
Definition VPlan.h:173
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:200
size_t getNumSuccessors() const
Definition VPlan.h:219
size_t getNumPredecessors() const
Definition VPlan.h:220
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:291
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:204
VPlan * getPlan()
Definition VPlan.cpp:175
const std::string & getName() const
Definition VPlan.h:164
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:310
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:215
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:180
void setParent(VPRegionBlock *P)
Definition VPlan.h:184
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:264
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:209
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:198
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:263
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:284
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:196
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:215
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:233
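A minimal sketch of the CFG-editing helpers above, splicing a fresh block onto an existing edge; the block name is arbitrary and this is not a transform from this file:
#include "VPlan.h"
#include "VPlanUtils.h"
static void spliceBlockOnEdge(llvm::VPlan &Plan, llvm::VPBlockBase *From,
                              llvm::VPBlockBase *To) {
  llvm::VPBasicBlock *NewVPBB = Plan.createVPBasicBlock("edge.block");
  // insertOnEdge rewires From->To into From->NewVPBB->To, updating the
  // successor list of From and the predecessor list of To.
  llvm::VPBlockUtils::insertOnEdge(From, To, NewVPBB);
}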
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3131
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags={}, const VPIRMetadata &Metadata={})
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
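A sketch of the VPBuilder API listed above; the opcode and operands are illustrative, and it assumes VPBuilder is declared by VPlanHelpers.h, matching this file's include list:
#include "VPlan.h"
#include "VPlanHelpers.h"
#include "llvm/IR/Instruction.h"
// Append a new VPInstruction computing A + B at the end of VPBB.
static llvm::VPInstruction *emitAdd(llvm::VPBasicBlock *VPBB, llvm::VPValue *A,
                                    llvm::VPValue *B) {
  llvm::VPBuilder Builder;
  Builder.setInsertPoint(VPBB);
  return Builder.createNaryOp(llvm::Instruction::Add, {A, B});
}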
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3664
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:427
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:400
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:412
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:422
A recipe for converting the input value IV to the corresponding value of an IV with different s...
Definition VPlan.h:3834
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition VPlan.h:3754
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3176
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2141
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2184
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2173
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4243
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:4267
Class to record and manage LLVM IR flags.
Definition VPlan.h:665
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for I, if it is a PHINode; otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1087
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1141
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1242
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1185
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1236
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1180
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1177
@ CanonicalIVIncrementForPart
Definition VPlan.h:1161
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2769
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2761
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2790
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2843
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2801
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3318
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:387
VPRegionBlock * getRegion()
Definition VPlan.h:4395
VPBasicBlock * getParent()
Definition VPlan.h:462
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:536
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3005
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:2894
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4278
const VPBlockBase * getEntry() const
Definition VPlan.h:4314
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4389
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4346
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4331
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4376
const VPBlockBase * getExiting() const
Definition VPlan.h:4326
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4339
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3050
bool isSingleScalar() const
Definition VPlan.h:3091
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3115
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3906
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:588
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:651
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:258
operand_range operands()
Definition VPlanValue.h:326
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:302
unsigned getNumOperands() const
Definition VPlanValue.h:296
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:297
void addOperand(VPValue *Operand)
Definition VPlanValue.h:291
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:46
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:135
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1391
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:125
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:71
bool hasOneUse() const
Definition VPlanValue.h:142
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:172
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1394
unsigned getNumUsers() const
Definition VPlanValue.h:104
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplace returns true ...
Definition VPlan.cpp:1398
user_range users()
Definition VPlanValue.h:125
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1999
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3797
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1688
Instruction::CastOps getOpcode() const
Definition VPlan.h:1724
A recipe for handling GEP instructions.
Definition VPlan.h:1936
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2208
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2236
PHINode * getPHINode() const
Definition VPlan.h:2253
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2239
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2256
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2287
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2334
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nullptr.
Definition VPlan.h:2338
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2365
A recipe for widening vector intrinsics.
Definition VPlan.h:1738
A common base class for widening memory operations.
Definition VPlan.h:3361
A recipe for widened phis.
Definition VPlan.h:2423
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1632
unsigned getOpcode() const
Definition VPlan.h:1669
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition VPlan.h:4408
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4691
bool hasVF(ElementCount VF) const
Definition VPlan.h:4605
LLVMContext & getContext() const
Definition VPlan.h:4593
VPBasicBlock * getEntry()
Definition VPlan.h:4497
bool hasScalableVF() const
Definition VPlan.h:4606
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4591
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4587
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4555
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4576
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:4670
unsigned getUF() const
Definition VPlan.h:4625
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4739
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4694
bool hasUF(unsigned UF) const
Definition VPlan.h:4623
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4545
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4584
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4647
void setVF(ElementCount VF)
Definition VPlan.h:4599
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4638
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1031
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4569
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4522
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4717
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:4667
bool hasScalarVFOnly() const
Definition VPlan.h:4616
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4536
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4541
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4502
void setUF(unsigned UF)
Definition VPlan.h:4630
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4771
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4673
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
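A small sketch of the ElementCount/TypeSize helpers above; the 128-bit register size is an arbitrary example value:
#include "llvm/Support/TypeSize.h"
#include <cstdint>
static void quantityExamples() {
  llvm::ElementCount VF = llvm::ElementCount::getScalable(4); // <vscale x 4>
  llvm::TypeSize Reg = llvm::TypeSize::get(128, /*Scalable=*/true);
  llvm::TypeSize Elt = llvm::TypeSize::get(32, /*Scalable=*/true);
  // Both quantities scale with vscale, so 128 is a known multiple of 32.
  bool Divides = Reg.hasKnownScalarFactor(Elt);          // true
  uint64_t Lanes = Reg.getKnownScalarFactor(Elt);        // 4
  llvm::ElementCount VF2 = VF.multiplyCoefficientBy(2);  // <vscale x 8>
  (void)Divides; (void)Lanes; (void)VF2;
}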
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2774
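A sketch of the rounded-division helper above, e.g. computing how many VF-sized chunks cover N elements; bit width and values are illustrative:
#include "llvm/ADT/APInt.h"
#include <cstdint>
static llvm::APInt chunksFor(uint64_t N, uint64_t VF) {
  llvm::APInt Elements(/*numBits=*/64, N), Chunk(/*numBits=*/64, VF);
  // Rounding::UP yields the ceiling of the unsigned division.
  return llvm::APIntOps::RoundingUDiv(Elements, Chunk,
                                      llvm::APInt::Rounding::UP);
}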
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ? R : false.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
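A sketch of the IR-level matchers above (m_c_Add, m_Specific, m_APInt), recognizing an add of a known value X with an integer constant in either operand order:
#include "llvm/ADT/APInt.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
static bool isAddOfValueAndConstant(llvm::Value *V, llvm::Value *X) {
  using namespace llvm::PatternMatch;
  const llvm::APInt *C;
  // m_Specific(X) only matches the exact value X; m_APInt binds the constant.
  return match(V, m_c_Add(m_Specific(X), m_APInt(C)));
}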
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
bool match(const SCEV *S, const Pattern &P)
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
bind_ty< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
specific_intval< 1 > m_True()
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
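A sketch of the VPlan-level matchers, detecting a recipe that ANDs a value with itself; it assumes the operand-binding m_VPValue(VPValue *&) overload from VPlanPatternMatch.h in addition to the wildcard form listed here:
#include "VPlan.h"
#include "VPlanPatternMatch.h"
static bool isSelfAnd(llvm::VPRecipeBase &R) {
  using namespace llvm::VPlanPatternMatch;
  llvm::VPValue *X, *Y;
  // m_c_BinaryAnd matches the AND with its operands in either order.
  return match(&R, m_c_BinaryAnd(m_VPValue(X), m_VPValue(Y))) && X == Y;
}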
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, without matching its operands, as the number of operands is not fixed.
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2068
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:839
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2544
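A small sketch of the range helpers (enumerate, all_of) over an arbitrary container; the element type is illustrative:
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
// Return the index of the first non-positive element, or -1 if none.
static int firstNonPositiveIndex(const llvm::SmallVectorImpl<int> &Vals) {
  // enumerate pairs each element with its zero-based position.
  for (auto [Idx, V] : llvm::enumerate(Vals))
    if (V <= 0)
      return static_cast<int>(Idx);
  return -1;
}
// all_of takes the range directly instead of a begin()/end() pair.
static bool allPositive(const llvm::SmallVectorImpl<int> &Vals) {
  return llvm::all_of(Vals, [](int V) { return V > 0; });
}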
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2198
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
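A sketch of the early-increment idiom: the iterator is advanced before the element is yielded, so the current recipe can be erased mid-iteration. The "dead" check here is deliberately simplified (real dead-code elimination also considers side effects via mayHaveSideEffects), and the VPlan.h visibility is an assumption:
#include "VPlan.h"
#include "llvm/ADT/STLExtras.h"
static void eraseUnusedSingleDefRecipes(llvm::VPBasicBlock &VPBB) {
  for (llvm::VPRecipeBase &R : llvm::make_early_inc_range(VPBB))
    if (R.getNumDefinedValues() == 1 &&
        R.getVPSingleValue()->getNumUsers() == 0)
      R.eraseFromParent(); // Safe: the range already holds the next iterator.
}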
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
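A sketch combining the traversals above with VPBlockUtils::blocksOnly to visit only VPBasicBlocks; the deep variant also descends into region blocks, the shallow one does not:
#include "VPlan.h"
#include "VPlanCFG.h"
#include "VPlanUtils.h"
static unsigned countBasicBlocks(llvm::VPlan &Plan) {
  unsigned Count = 0;
  for (llvm::VPBasicBlock *VPBB :
       llvm::VPBlockUtils::blocksOnly<llvm::VPBasicBlock>(
           llvm::vp_depth_first_deep(Plan.getEntry()))) {
    (void)VPBB;
    ++Count;
  }
  return Count;
}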
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2163
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:236
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
Definition Casting.h:669
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:550
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1751
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
RecurKind
These are the kinds of recurrences that we support.
@ Mul
Product of integers.
@ FMul
Product of floats.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2002
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2009
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2156
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2136
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:789
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given value type.
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2514
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
TargetTransformInfo::TargetCostKind CostKind
VPTypeAnalysis Types
const TargetTransformInfo & TTI
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2470
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:183
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:139
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:223
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3494
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3452
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3579
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3535
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, TypeSize VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first order reductions in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask into VP intrinsics.
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB by intro...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.
static void updateScalarResumePhis(VPlan &Plan, DenseMap< VPValue *, VPValue * > &IVEndValues)
Update the resume phis in the scalar preheader after creating wide recipes for first-order recurrence...