LLVM 23.0.0git
VPlanTransforms.cpp
Go to the documentation of this file.
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
45
46using namespace llvm;
47using namespace VPlanPatternMatch;
48using namespace SCEVPatternMatch;
49
// Lower the plain VPInstruction "ingredients" inside the vector-loop region
// into widened VPlan recipes: VPPhis -> VPWidenPHIRecipe, loads/stores ->
// VPWidenLoad/StoreRecipe, GEPs -> VPWidenGEPRecipe, calls -> widened
// intrinsic or replicate recipes, casts -> VPWidenCastRecipe, everything
// else -> VPWidenRecipe. Returns false when the loop cannot be vectorized
// (a call with no vector intrinsic, or an llvm.experimental.noalias.scope.decl
// that must not be replicated); true otherwise.
// NOTE(review): this extract is missing several source lines (the function
// name/signature at 50, the definition of 'Inst' at 69, the GEP dyn_cast at
// 90, the assert head at 138) — presumably this is
// VPlanTransforms::tryToConvertVPInstructionsToVPRecipes; confirm against
// the full file.
51 VPlan &Plan, const TargetLibraryInfo &TLI) {
52
54 Plan.getVectorLoopRegion());
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
75 for (VPValue *Op : PhiR->operands())
76 NewRecipe->addOperand(Op);
77 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
78 assert(!isa<PHINode>(Inst) && "phis should be handled above");
79 // Create VPWidenMemoryRecipe for loads and stores.
80 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
81 NewRecipe = new VPWidenLoadRecipe(
82 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
83 false /*Consecutive*/, false /*Reverse*/, *VPI,
84 Ingredient.getDebugLoc());
85 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
86 NewRecipe = new VPWidenStoreRecipe(
87 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
88 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
89 Ingredient.getDebugLoc());
91 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
92 Ingredient.getDebugLoc());
93 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
94 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
95 if (VectorID == Intrinsic::not_intrinsic)
96 return false;
97
98 // The noalias.scope.decl intrinsic declares a noalias scope that
99 // is valid for a single iteration. Emitting it as a single-scalar
100 // replicate would incorrectly extend the scope across multiple
101 // original iterations packed into one vector iteration.
102 // FIXME: If we want to vectorize this loop, then we have to drop
103 // all the associated !alias.scope and !noalias.
104 if (VectorID == Intrinsic::experimental_noalias_scope_decl)
105 return false;
106
107 // These intrinsics are recognized by getVectorIntrinsicIDForCall
108 // but are not widenable. Emit them as replicate instead of widening.
109 if (VectorID == Intrinsic::assume ||
110 VectorID == Intrinsic::lifetime_end ||
111 VectorID == Intrinsic::lifetime_start ||
112 VectorID == Intrinsic::sideeffect ||
113 VectorID == Intrinsic::pseudoprobe) {
114 // If the operand of llvm.assume holds before vectorization, it will
115 // also hold per lane.
116 // llvm.pseudoprobe requires to be duplicated per lane for accurate
117 // sample count.
118 const bool IsSingleScalar = VectorID != Intrinsic::assume &&
119 VectorID != Intrinsic::pseudoprobe;
120 NewRecipe = new VPReplicateRecipe(CI, Ingredient.operands(),
121 /*IsSingleScalar=*/IsSingleScalar,
122 /*Mask=*/nullptr, *VPI, *VPI,
123 Ingredient.getDebugLoc())
// The last operand of a call ingredient is the called function;
// drop_end strips it before building the intrinsic recipe.
124 } else {
125 NewRecipe = new VPWidenIntrinsicRecipe(
126 *CI, VectorID, drop_end(Ingredient.operands()), CI->getType(),
127 VPIRFlags(*CI), *VPI, CI->getDebugLoc());
128 }
129 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
130 NewRecipe = new VPWidenCastRecipe(
131 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
132 VPIRFlags(*CI), VPIRMetadata(*CI));
133 } else {
134 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
135 *VPI, Ingredient.getDebugLoc());
136 }
137 } else {
139 "inductions must be created earlier");
140 continue;
141 }
142
// Splice the widened recipe in place of the original ingredient and
// rewire all users to the new definition (if it defines a value).
143 NewRecipe->insertBefore(&Ingredient);
144 if (NewRecipe->getNumDefinedValues() == 1)
145 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
146 else
147 assert(NewRecipe->getNumDefinedValues() == 0 &&
148 "Only recpies with zero or one defined values expected");
149 Ingredient.eraseFromParent();
150 }
151 }
152 return true;
153}
154
// NOTE(review): the class declaration line (156), the PSE member (159), and
// the constructor head (202-203) are missing from this extract — this is the
// store-sinking helper class (presumably named SinkStoreInfo); confirm
// against the full file.
155/// Helper for extra no-alias checks via known-safe recipe and SCEV.
157 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
158 VPReplicateRecipe &GroupLeader;
160 const Loop &L;
161 VPTypeAnalysis &TypeInfo;
162
163 // Return true if \p A and \p B are known to not alias for all VFs in the
164 // plan, checked via the distance between the accesses
165 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
// Only store/store pairs are handled; the distance proof below relies on
// both accesses being replicated stores with an address at operand 1.
166 if (A->getOpcode() != Instruction::Store ||
167 B->getOpcode() != Instruction::Store)
168 return false;
169
170 VPValue *AddrA = A->getOperand(1);
171 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
172 VPValue *AddrB = B->getOperand(1);
173 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
175 return false;
176
// The accesses provably do not alias if the constant distance between
// their addresses is at least MaxVF * max(store size).
177 const APInt *Distance;
178 ScalarEvolution &SE = *PSE.getSE();
179 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
180 return false;
181
182 const DataLayout &DL = SE.getDataLayout();
183 Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
184 uint64_t SizeA = DL.getTypeStoreSize(TyA);
185 Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
186 uint64_t SizeB = DL.getTypeStoreSize(TyB);
187
188 // Use the maximum store size to ensure no overlap from either direction.
189 // Currently only handles fixed sizes, as it is only used for
190 // replicating VPReplicateRecipes.
191 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
192
193 auto VFs = B->getParent()->getPlan()->vectorFactors();
195 if (MaxVF.isScalable())
196 return false;
197 return Distance->abs().uge(
198 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
199 }
200
201public:
204 const Loop &L, VPTypeAnalysis &TypeInfo)
205 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
206 L(L), TypeInfo(TypeInfo) {}
207
208 /// Return true if \p R should be skipped during alias checking, either
209 /// because it's in the exclude set or because no-alias can be proven via
210 /// SCEV.
211 bool shouldSkip(VPRecipeBase &R) const {
212 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
213 return ExcludeRecipes.contains(&R) ||
214 (Store && isNoAliasViaDistance(Store, &GroupLeader));
215 }
216};
217
// NOTE(review): the function name line (225, carrying the MemoryLocation
// parameter), the chain-iteration line (233), the MemoryLocation query (242)
// and the AA check (248) are missing from this extract — confirm the exact
// name and the alias query against the full file.
218/// Check if a memory operation doesn't alias with memory operations using
219/// scoped noalias metadata, in blocks in the single-successor chain between \p
220/// FirstBB and \p LastBB. If \p SinkInfo is std::nullopt, only recipes that may
221/// write to memory are checked (for load hoisting). Otherwise recipes that both
222/// read and write memory are checked, and SCEV is used to prove no-alias
223/// between the group leader and other replicate recipes (for store sinking).
224static bool
226 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
227 std::optional<SinkStoreInfo> SinkInfo = {}) {
228 bool CheckReads = SinkInfo.has_value();
// Without a noalias scope on the candidate access there is nothing to
// reason about; conservatively report "may alias".
229 if (!MemLoc.AATags.Scope)
230 return false;
231
232 for (VPBasicBlock *VPBB :
234 for (VPRecipeBase &R : *VPBB) {
235 if (SinkInfo && SinkInfo->shouldSkip(R))
236 continue;
237
238 // Skip recipes that don't need checking.
239 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
240 continue;
241
243 if (!Loc)
244 // Conservatively assume aliasing for memory operations without
245 // location.
246 return false;
247
249 return false;
250 }
251 }
// No recipe in the chain may alias the candidate location.
252 return true;
253}
254
// NOTE(review): the return-type/name lines (258-259), the map declaration
// head (265), the block-traversal lines (267-268) and the sort lambda head
// (285) are missing from this extract; the function groups replicated
// loads/stores by address SCEV and returns the groups sorted by dominance.
// Confirm the exact name against the full file.
255/// Collect either replicated Loads or Stores grouped by their address SCEV, in
256/// a deep-traversal of the vector loop region in \p Plan.
257template <unsigned Opcode>
260 VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
261 function_ref<bool(VPReplicateRecipe *)> FilterFn) {
262 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
263 "Only Load and Store opcodes supported");
264 constexpr bool IsLoad = (Opcode == Instruction::Load);
266 RecipesByAddress;
269 for (VPRecipeBase &R : *VPBB) {
270 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
271 if (!RepR || RepR->getOpcode() != Opcode || !FilterFn(RepR))
272 continue;
273
274 // For loads, operand 0 is address; for stores, operand 1 is address.
275 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
276 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
// Only recipes whose address has a computable SCEV can be grouped.
277 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
278 RecipesByAddress[AddrSCEV].push_back(RepR);
279 }
280 }
281 auto Groups = to_vector(RecipesByAddress.values());
282 VPDominatorTree VPDT(Plan);
283 for (auto &Group : Groups) {
284 // Sort mem ops by dominance order, with earliest (most dominating) first.
286 return VPDT.properlyDominates(A, B);
287 });
288 }
289 return Groups;
290}
291
// NOTE(review): the signature line (294) and the assume-check line (297) are
// missing from this extract; call sites (e.g. sinkScalarOperands) show the
// name is cannotHoistOrSinkRecipe(const VPRecipeBase &R).
292/// Return true if we do not know how to (mechanically) hoist or sink \p R out
293/// of a loop region.
295 // Assumes don't alias anything or throw; as long as they're guaranteed to
296 // execute, they're safe to hoist.
298 return false;
299
300 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
301 // memory location is not modified in the vector loop.
302 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
303 return true;
304
305 // Allocas cannot be hoisted.
306 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
307 return RepR && RepR->getOpcode() == Instruction::Alloca;
308}
309
// Sink scalar operands of recipes inside replicate regions into the
// predicated block that uses them, so they only execute under the region's
// mask. Uses a worklist seeded from replicate-region "then" blocks and
// grows it transitively through the operands of each sunk recipe. Returns
// true if anything was moved.
// NOTE(review): the worklist declaration (315), the recipe-kind check (325)
// and the region-loop head (340) are missing from this extract.
310static bool sinkScalarOperands(VPlan &Plan) {
311 auto Iter = vp_depth_first_deep(Plan.getEntry());
312 bool ScalarVFOnly = Plan.hasScalarVFOnly();
313 bool Changed = false;
314
316 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
317 VPBasicBlock *SinkTo, VPValue *Op) {
318 auto *Candidate =
319 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
320 if (!Candidate)
321 return;
322
323 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
324 // for now.
326 return;
327
328 if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
329 return;
330
331 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
332 if (!ScalarVFOnly && RepR->isSingleScalar())
333 return;
334
335 WorkList.insert({SinkTo, Candidate});
336 };
337
338 // First, collect the operands of all recipes in replicate blocks as seeds for
339 // sinking.
341 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
342 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
343 continue;
344 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
345 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
346 continue;
347 for (auto &Recipe : *VPBB)
348 for (VPValue *Op : Recipe.operands())
349 InsertIfValidSinkCandidate(VPBB, Op);
350 }
351
352 // Try to sink each replicate or scalar IV steps recipe in the worklist.
353 for (unsigned I = 0; I != WorkList.size(); ++I) {
354 VPBasicBlock *SinkTo;
355 VPSingleDefRecipe *SinkCandidate;
356 std::tie(SinkTo, SinkCandidate) = WorkList[I];
357
358 // All recipe users of SinkCandidate must be in the same block SinkTo or all
359 // users outside of SinkTo must only use the first lane of SinkCandidate. In
360 // the latter case, we need to duplicate SinkCandidate.
361 auto UsersOutsideSinkTo =
362 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
363 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
364 });
365 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
366 return !U->usesFirstLaneOnly(SinkCandidate);
367 }))
368 continue;
369 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
370
371 if (NeedsDuplicating) {
// Keep a single-scalar clone outside the region for first-lane users,
// and sink the original into the predicated block.
372 if (ScalarVFOnly)
373 continue;
374 VPSingleDefRecipe *Clone;
375 if (auto *SinkCandidateRepR =
376 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
377 // TODO: Handle converting to uniform recipes as separate transform,
378 // then cloning should be sufficient here.
379 Instruction *I = SinkCandidate->getUnderlyingInstr();
380 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
381 nullptr /*Mask*/, *SinkCandidateRepR,
382 *SinkCandidateRepR);
383 // TODO: add ".cloned" suffix to name of Clone's VPValue.
384 } else {
385 Clone = SinkCandidate->clone();
386 }
387
388 Clone->insertBefore(SinkCandidate);
389 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
390 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
391 });
392 }
393 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
// Sinking may expose the candidate's own operands as new candidates.
394 for (VPValue *Op : SinkCandidate->operands())
395 InsertIfValidSinkCandidate(SinkTo, Op);
396 Changed = true;
397 }
398 return Changed;
399}
400
// NOTE(review): the signature line (403) is missing from this extract; call
// sites show the name is getPredicatedMask(VPRegionBlock *R).
401/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
402/// the mask.
// The entry block must contain exactly the branch-on-mask recipe; otherwise
// the region is not a simple predicated region and nullptr is returned.
404 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
405 if (!EntryBB || EntryBB->size() != 1 ||
406 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
407 return nullptr;
408
409 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
410}
411
// NOTE(review): the signature line (413) is missing from this extract; call
// sites show the name is getPredicatedThenBlock(VPRegionBlock *R).
412/// If \p R is a triangle region, return the 'then' block of the triangle.
414 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
415 if (EntryBB->getNumSuccessors() != 2)
416 return nullptr;
417
418 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
419 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
420 if (!Succ0 || !Succ1)
421 return nullptr;
422
// A triangle has exactly one successor edge between the two branches:
// whichever successor flows into the other is the 'then' block.
423 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
424 return nullptr;
425 if (Succ0->getSingleSuccessor() == Succ1)
426 return Succ0;
427 if (Succ1->getSingleSuccessor() == Succ0)
428 return Succ1;
429 return nullptr;
430}
431
// NOTE(review): the signature line (435) and the worklist declaration/loop
// head (441-442) are missing from this extract; the call site in
// createAndOptimizeReplicateRegions shows the name is
// mergeReplicateRegionsIntoSuccessors(VPlan &Plan), returning bool.
432// Merge replicate regions in their successor region, if a replicate region
433// is connected to a successor replicate region with the same predicate by a
434// single, empty VPBasicBlock.
436 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
437
438 // Collect replicate regions followed by an empty block, followed by another
439 // replicate region with matching masks to process front. This is to avoid
440 // iterator invalidation issues while merging regions.
443 vp_depth_first_deep(Plan.getEntry()))) {
444 if (!Region1->isReplicator())
445 continue;
446 auto *MiddleBasicBlock =
447 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
448 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
449 continue;
450
451 auto *Region2 =
452 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
453 if (!Region2 || !Region2->isReplicator())
454 continue;
455
// Only regions guarded by the same mask may be fused.
456 VPValue *Mask1 = getPredicatedMask(Region1);
457 VPValue *Mask2 = getPredicatedMask(Region2);
458 if (!Mask1 || Mask1 != Mask2)
459 continue;
460
461 assert(Mask1 && Mask2 && "both region must have conditions");
462 WorkList.push_back(Region1);
463 }
464
465 // Move recipes from Region1 to its successor region, if both are triangles.
466 for (VPRegionBlock *Region1 : WorkList) {
467 if (TransformedRegions.contains(Region1))
468 continue;
469 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
470 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
471
472 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
473 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
474 if (!Then1 || !Then2)
475 continue;
476
477 // Note: No fusion-preventing memory dependencies are expected in either
478 // region. Such dependencies should be rejected during earlier dependence
479 // checks, which guarantee accesses can be re-ordered for vectorization.
480 //
481 // Move recipes to the successor region.
482 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
483 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
484
485 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
486 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
487
488 // Move VPPredInstPHIRecipes from the merge block to the successor region's
489 // merge block. Update all users inside the successor region to use the
490 // original values.
491 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
492 VPValue *PredInst1 =
493 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
494 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
495 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
496 return cast<VPRecipeBase>(&U)->getParent() == Then2;
497 });
498
499 // Remove phi recipes that are unused after merging the regions.
500 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
501 Phi1ToMove.eraseFromParent();
502 continue;
503 }
504 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
505 }
506
507 // Remove the dead recipes in Region1's entry block.
508 for (VPRecipeBase &R :
509 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
510 R.eraseFromParent();
511
512 // Finally, remove the first region.
513 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
514 VPBlockUtils::disconnectBlocks(Pred, Region1);
515 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
516 }
517 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
518 TransformedRegions.insert(Region1);
519 }
520
521 return !TransformedRegions.empty();
522}
523
// Build a triangular if-then replicate region around a predicated
// VPReplicateRecipe: an entry block with a VPBranchOnMaskRecipe, an ".if"
// block holding the now-unmasked replicate recipe, and a ".continue" block
// with a VPPredInstPHIRecipe merging the result for any users. Returns the
// new region.
// NOTE(review): the signature head (524, with the PredRecipe parameter) and
// the region-creation line (556, defining 'Region') are missing from this
// extract — presumably createReplicateRegion(VPReplicateRecipe *, VPlan &);
// confirm against the full file.
525 VPlan &Plan) {
526 Instruction *Instr = PredRecipe->getUnderlyingInstr();
527 // Build the triangular if-then region.
528 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
529 assert(Instr->getParent() && "Predicated instruction not in any basic block");
530 auto *BlockInMask = PredRecipe->getMask();
531 auto *MaskDef = BlockInMask->getDefiningRecipe();
532 auto *BOMRecipe = new VPBranchOnMaskRecipe(
533 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
534 auto *Entry =
535 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
536
537 // Replace predicated replicate recipe with a replicate recipe without a
538 // mask but in the replicate region.
539 auto *RecipeWithoutMask = new VPReplicateRecipe(
540 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
541 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
542 PredRecipe->getDebugLoc());
543 auto *Pred =
544 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
545
// Only materialize the merge phi when the predicated value has users.
546 VPPredInstPHIRecipe *PHIRecipe = nullptr;
547 if (PredRecipe->getNumUsers() != 0) {
548 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
549 RecipeWithoutMask->getDebugLoc());
550 PredRecipe->replaceAllUsesWith(PHIRecipe);
551 PHIRecipe->setOperand(0, RecipeWithoutMask);
552 }
553 PredRecipe->eraseFromParent();
554 auto *Exiting =
555 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
557 Plan.createReplicateRegion(Entry, Exiting, RegionName);
558
559 // Note: first set Entry as region entry and then connect successors starting
560 // from it in order, to propagate the "parent" of each VPBasicBlock.
561 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
562 VPBlockUtils::connectBlocks(Pred, Exiting);
563
564 return Region;
565}
566
// Wrap every predicated VPReplicateRecipe in the plan in its own if-then
// replicate region: the containing block is split at the recipe, a region is
// built via createReplicateRegion, and the enclosing region's exiting block
// is updated if the split moved it.
// NOTE(review): the worklist declaration/loop head (568-569), the
// createReplicateRegion call defining 'Region' (587) and line 589 are
// missing from this extract.
567static void addReplicateRegions(VPlan &Plan) {
570 vp_depth_first_deep(Plan.getEntry()))) {
571 for (VPRecipeBase &R : *VPBB)
572 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
573 if (RepR->isPredicated())
574 WorkList.push_back(RepR);
575 }
576 }
577
578 unsigned BBNum = 0;
579 for (VPReplicateRecipe *RepR : WorkList) {
580 VPBasicBlock *CurrentBlock = RepR->getParent();
581 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
582
// Name the split continuation after the original IR block for
// readable VPlan dumps.
583 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
584 SplitBlock->setName(
585 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
586 // Record predicated instructions for above packing optimizations.
588 Region->setParent(CurrentBlock->getParent());
590
591 VPRegionBlock *ParentRegion = Region->getParent();
592 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
593 ParentRegion->setExiting(SplitBlock);
594 }
595}
596
// Fold VPBasicBlocks into their unique predecessor when that predecessor has
// a single successor (and is not a VPIRBasicBlock), moving all recipes up and
// transferring successor edges. Returns true if any block was merged.
// NOTE(review): the signature and worklist declaration (597-599) are missing
// from this extract; the call site in createAndOptimizeReplicateRegions shows
// the name is mergeBlocksIntoPredecessors(VPlan &Plan).
600 vp_depth_first_deep(Plan.getEntry()))) {
601 // Don't fold the blocks in the skeleton of the Plan into their single
602 // predecessors for now.
603 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
604 if (!VPBB->getParent())
605 continue;
606 auto *PredVPBB =
607 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
608 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
609 isa<VPIRBasicBlock>(PredVPBB))
610 continue;
611 WorkList.push_back(VPBB);
612 }
613
614 for (VPBasicBlock *VPBB : WorkList) {
615 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
616 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
617 R.moveBefore(*PredVPBB, PredVPBB->end());
618 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
// If the merged block was its region's exiting block, the predecessor
// takes over that role.
619 auto *ParentRegion = VPBB->getParent();
620 if (ParentRegion && ParentRegion->getExiting() == VPBB)
621 ParentRegion->setExiting(PredVPBB);
622 VPBlockUtils::transferSuccessors(VPBB, PredVPBB);
623 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
624 }
625 return !WorkList.empty();
626}
627
// Driver: create replicate regions for all predicated replicate recipes,
// then iterate sinking scalar operands, merging adjacent replicate regions
// and folding blocks into predecessors until a fixed point is reached.
// NOTE(review): the signature (628) and the addReplicateRegions(Plan) call
// (630) are missing from this extract — presumably
// VPlanTransforms::createAndOptimizeReplicateRegions; confirm against the
// full file.
629 // Convert masked VPReplicateRecipes to if-then region blocks.
631
632 bool ShouldSimplify = true;
633 while (ShouldSimplify) {
634 ShouldSimplify = sinkScalarOperands(Plan);
635 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
636 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
637 }
638}
639
// NOTE(review): the signature line (646) and the induction dyn_cast defining
// 'IV' (648) are missing from this extract — presumably
// removeRedundantInductionCasts(VPlan &Plan); confirm against the full file.
640/// Remove redundant casts of inductions.
641///
642/// Such redundant casts are casts of induction variables that can be ignored,
643/// because we already proved that the casted phi is equal to the uncasted phi
644/// in the vectorized loop. There is no need to vectorize the cast - the same
645/// value can be used for both the phi and casts in the vector loop.
647 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
// Truncated IVs keep their casts; only untruncated wide IVs qualify.
649 if (!IV || IV->getTruncInst())
650 continue;
651
652 // A sequence of IR Casts has potentially been recorded for IV, which
653 // *must be bypassed* when the IV is vectorized, because the vectorized IV
654 // will produce the desired casted value. This sequence forms a def-use
655 // chain and is provided in reverse order, ending with the cast that uses
656 // the IV phi. Search for the recipe of the last cast in the chain and
657 // replace it with the original IV. Note that only the final cast is
658 // expected to have users outside the cast-chain and the dead casts left
659 // over will be cleaned up later.
660 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
661 VPValue *FindMyCast = IV;
662 for (Instruction *IRCast : reverse(Casts)) {
663 VPSingleDefRecipe *FoundUserCast = nullptr;
664 for (auto *U : FindMyCast->users()) {
665 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
666 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
667 FoundUserCast = UserCast;
668 break;
669 }
670 }
671 FindMyCast = FoundUserCast;
672 }
// Redirect users of the chain's final cast to the IV itself.
673 FindMyCast->replaceAllUsesWith(IV);
674 }
675}
676
// NOTE(review): the signature line (679) and the VPWidenCanonicalIVRecipe
// dyn_cast inside the user loop (684) are missing from this extract —
// presumably removeRedundantCanonicalIVs(VPlan &Plan); confirm against the
// full file.
677/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
678/// recipe, if it exists.
680 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
681 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
682 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
// Find a widened canonical IV among the canonical IV's users, if any.
683 for (VPUser *U : CanonicalIV->users()) {
685 if (WidenNewIV)
686 break;
687 }
688
689 if (!WidenNewIV)
690 return;
691
692 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
693 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
694 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
695
696 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
697 continue;
698
699 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
700 // everything WidenNewIV's users need. That is, WidenOriginalIV will
701 // generate a vector phi or all users of WidenNewIV demand the first lane
702 // only.
703 if (Plan.hasScalarVFOnly() ||
704 !vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
705 vputils::onlyFirstLaneUsed(WidenNewIV)) {
706 // We are replacing a wide canonical iv with a suitable wide induction.
707 // This is used to compute header mask, hence all lanes will be used and
708 // we need to drop wrap flags only applying to lanes guranteed to execute
709 // in the original scalar loop.
710 WidenOriginalIV->dropPoisonGeneratingFlags();
711 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
712 WidenNewIV->eraseFromParent();
713 return;
714 }
715 }
716}
717
// NOTE(review): the tail of the IsConditionalAssume condition (line 724,
// presumably matching an assume intrinsic on the recipe) is missing from
// this extract.
718/// Returns true if \p R is dead and can be removed.
719static bool isDeadRecipe(VPRecipeBase &R) {
720 // Do remove conditional assume instructions as their conditions may be
721 // flattened.
722 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
723 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
725 if (IsConditionalAssume)
726 return true;
727
// Side-effecting recipes must be kept even when their values are unused.
728 if (R.mayHaveSideEffects())
729 return false;
730
731 // Recipe is dead if no user keeps the recipe alive.
732 return all_of(R.definedValues(),
733 [](VPValue *V) { return V->getNumUsers() == 0; });
734}
735
// Erase recipes with no live users (walking each block bottom-up so chains
// of dead recipes are caught in one pass), and remove dead two-operand
// VPPhi <-> update cycles whose only user is the update itself.
// NOTE(review): the signature (736-737) and the block-traversal loop head
// (739) are missing from this extract — presumably removeDeadRecipes;
// confirm against the full file.
738 Plan.getEntry());
740 // The recipes in the block are processed in reverse order, to catch chains
741 // of dead recipes.
742 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
743 if (isDeadRecipe(R)) {
744 R.eraseFromParent();
745 continue;
746 }
747
748 // Check if R is a dead VPPhi <-> update cycle and remove it.
749 VPValue *Start, *Incoming;
750 if (!match(&R, m_VPPhi(m_VPValue(Start), m_VPValue(Incoming))))
751 continue;
752 auto *PhiR = cast<VPPhi>(&R);
753 VPUser *PhiUser = PhiR->getSingleUser();
754 if (!PhiUser)
755 continue;
// The cycle is dead only if the phi's sole user is the incoming update
// and the update's sole user is the phi.
756 if (PhiUser != Incoming->getDefiningRecipe() ||
757 Incoming->getNumUsers() != 1)
758 continue;
759 PhiR->replaceAllUsesWith(Start);
760 PhiR->eraseFromParent();
761 Incoming->getDefiningRecipe()->eraseFromParent();
762 }
763 }
764}
765
// Build scalar IV steps for an induction: derive a base IV from the
// canonical IV (Kind/FPBinOp/StartV/Step), truncate base and step to the
// required type if needed, and return a VPScalarIVStepsRecipe built on them.
// NOTE(review): the return-type/name lines (766-767) and the preheader
// accessor (797) are missing from this extract — presumably
// createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
// ...); confirm against the full file.
768 Instruction::BinaryOps InductionOpcode,
769 FPMathOperator *FPBinOp, Instruction *TruncI,
770 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
771 VPBuilder &Builder) {
772 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
773 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
774 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
775 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
776 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
777
778 // Truncate base induction if needed.
779 VPTypeAnalysis TypeInfo(Plan);
780 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
781 if (TruncI) {
782 Type *TruncTy = TruncI->getType();
783 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
784 "Not truncating.");
785 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
786 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
787 ResultTy = TruncTy;
788 }
789
790 // Truncate step if needed.
791 Type *StepTy = TypeInfo.inferScalarType(Step);
792 if (ResultTy != StepTy) {
793 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
794 "Not truncating.");
795 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
// The step is loop-invariant, so its truncation is emitted in the
// vector preheader rather than inside the loop.
796 auto *VecPreheader =
798 VPBuilder::InsertPointGuard Guard(Builder);
799 Builder.setInsertPoint(VecPreheader);
800 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
801 }
802 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
803 &Plan.getVF(), DL);
804}
805
// Worklist walk over the transitive users of a value's definitions,
// stopping at header phis; returns the collected users as a vector.
// NOTE(review): the signature and seed-insertion lines (806-807) and the
// 'Cur' definition (809) are missing from this extract; the call site in
// legalizeAndOptimizeInductions shows the name is collectUsersRecursively.
808 for (unsigned I = 0; I != Users.size(); ++I) {
// Do not walk through header phis to avoid cycling around the loop.
810 if (isa<VPHeaderPHIRecipe>(Cur))
811 continue;
812 for (VPValue *V : Cur->definedValues())
813 Users.insert_range(V->users());
814 }
815 return Users.takeVector();
816}
817
// NOTE(review): the parameter line (822, carrying PtrIV), the induction
// descriptor accessor (824) and the createScalarIVSteps call defining
// 'Steps' (827) are missing from this extract; the call site shows the name
// is scalarizeVPWidenPointerInduction.
818/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
819/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
820/// generates scalar values.
821static VPValue *
823 VPlan &Plan, VPBuilder &Builder) {
// Build integer scalar steps starting at 0 with the pointer IV's step...
825 VPIRValue *StartV = Plan.getZero(ID.getStep()->getType());
826 VPValue *StepV = PtrIV->getOperand(1);
828 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
829 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
830
// ...and add them to the induction's start pointer.
831 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
832 PtrIV->getDebugLoc(), "next.gep");
833}
834
// NOTE(review): the signature lines (847-848, defining HeaderVPBB), the
// onlyFirstLaneUsed-style check (871) and the createScalarIVSteps call
// defining 'Steps' (902) are missing from this extract — presumably
// legalizeAndOptimizeInductions(VPlan &Plan); confirm against the full file.
835/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
836/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
837/// VPWidenPointerInductionRecipe will generate vectors only. If some users
838/// require vectors while other require scalars, the scalar uses need to extract
839/// the scalars from the generated vectors (Note that this is different to how
840/// int/fp inductions are handled). Legalize extract-from-ends using uniform
841/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
842/// the correct end value is available. Also optimize
843/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
844/// providing them scalar steps built on the canonical scalar IV and update the
845/// original IV's users. This is an optional optimization to reduce the needs of
846/// vector extracts.
849 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
850 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
851 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
852 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
853 if (!PhiR)
854 continue;
855
856 // Try to narrow wide and replicating recipes to uniform recipes, based on
857 // VPlan analysis.
858 // TODO: Apply to all recipes in the future, to replace legacy uniformity
859 // analysis.
860 auto Users = collectUsersRecursively(PhiR);
861 for (VPUser *U : reverse(Users)) {
862 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
863 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
864 // Skip recipes that shouldn't be narrowed.
865 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
866 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
867 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
868 continue;
869
870 // Skip recipes that may have other lanes than their first used.
872 continue;
873
// Replace the wide/replicating recipe with a uniform replicate clone.
874 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
875 Def->operands(), /*IsUniform*/ true,
876 /*Mask*/ nullptr, /*Flags*/ *Def);
877 Clone->insertAfter(Def);
878 Def->replaceAllUsesWith(Clone);
879 }
880
881 // Replace wide pointer inductions which have only their scalars used by
882 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
883 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
884 if (!Plan.hasScalarVFOnly() &&
885 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
886 continue;
887
888 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
889 PtrIV->replaceAllUsesWith(PtrAdd);
890 continue;
891 }
892
893 // Replace widened induction with scalar steps for users that only use
894 // scalars.
895 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
896 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
897 return U->usesScalars(WideIV);
898 }))
899 continue;
900
901 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
903 Plan, ID.getKind(), ID.getInductionOpcode(),
904 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
905 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
906 WideIV->getDebugLoc(), Builder);
907
908 // Update scalar users of IV to use Step instead.
909 if (!HasOnlyVectorVFs) {
910 assert(!Plan.hasScalableVF() &&
911 "plans containing a scalar VF cannot also include scalable VFs");
912 WideIV->replaceAllUsesWith(Steps);
913 } else {
914 bool HasScalableVF = Plan.hasScalableVF();
915 WideIV->replaceUsesWithIf(Steps,
916 [WideIV, HasScalableVF](VPUser &U, unsigned) {
917 if (HasScalableVF)
918 return U.usesFirstLaneOnly(WideIV);
919 return U.usesScalars(WideIV);
920 });
921 }
922 }
923}
924
925/// Check if \p VPV is an untruncated wide induction, either before or after the
926/// increment. If so return the header IV (before the increment), otherwise
927/// return null.
// NOTE(review): the signature (original lines 928-929) is missing from this
// extract; the body references parameters `VPV` and `PSE`
// (PredicatedScalarEvolution) — confirm against the full source.
930 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
931 if (WideIV) {
932 // VPV itself is a wide induction, separately compute the end value for exit
933 // users if it is not a truncated IV.
934 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
935 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
936 }
937
938 // Check if VPV is an optimizable induction increment.
939 VPRecipeBase *Def = VPV->getDefiningRecipe();
940 if (!Def || Def->getNumOperands() != 2)
941 return nullptr;
// The wide IV may appear as either operand of the binary increment.
942 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
943 if (!WideIV)
944 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
945 if (!WideIV)
946 return nullptr;
947
948 auto IsWideIVInc = [&]() {
949 auto &ID = WideIV->getInductionDescriptor();
950
951 // Check if VPV increments the induction by the induction step.
952 VPValue *IVStep = WideIV->getStepValue();
953 switch (ID.getInductionOpcode()) {
954 case Instruction::Add:
955 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
956 case Instruction::FAdd:
957 return match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
958 case Instruction::FSub:
959 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
960 m_Specific(IVStep)));
961 case Instruction::Sub: {
962 // IVStep will be the negated step of the subtraction. Check if Step == -1
963 // * IVStep.
964 VPValue *Step;
965 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
966 return false;
967 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
968 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
969 ScalarEvolution &SE = *PSE.getSE();
970 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
971 !isa<SCEVCouldNotCompute>(StepSCEV) &&
972 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
973 }
974 default:
// Pointer inductions are incremented via GEP of the IV by its step.
975 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
976 match(VPV, m_GetElementPtr(m_Specific(WideIV),
977 m_Specific(WideIV->getStepValue())));
978 }
979 llvm_unreachable("should have been covered by switch above");
980 };
981 return IsWideIVInc() ? WideIV : nullptr;
982}
983
984/// Attempts to optimize the induction variable exit values for users in the
985/// early exit block.
// NOTE(review): the first signature line (original line 986) and the trailing
// parameter line (original line 990, presumably including `PSE`) are missing
// from this extract — confirm against the full source.
987 VPTypeAnalysis &TypeInfo,
988 VPBlockBase *PredVPBB,
989 VPValue *Op,
991 VPValue *Incoming, *Mask;
// NOTE(review): the pattern match that binds `Incoming` and `Mask` from `Op`
// (original lines 992-993) is missing; only its early-return survives below.
994 return nullptr;
995
996 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
997 if (!WideIV)
998 return nullptr;
999
// Truncated IVs are not handled here.
1000 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1001 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1002 return nullptr;
1003
1004 // Calculate the final index.
1005 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1006 auto *CanonicalIV = LoopRegion->getCanonicalIV();
1007 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
1008 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
1009
// The exit index is the canonical IV plus the first active lane of the mask,
// zero-extended/truncated to the canonical IV's type.
1010 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
1011 VPValue *FirstActiveLane =
1012 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
1013 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
1014 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
1015 FirstActiveLaneType, DL);
1016 VPValue *EndValue = B.createAdd(CanonicalIV, FirstActiveLane, DL);
1017
1018 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1019 // changed it means the exit is using the incremented value, so we need to
1020 // add the step.
1021 if (Incoming != WideIV) {
1022 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
1023 EndValue = B.createAdd(EndValue, One, DL);
1024 }
1025
// Non-canonical inductions need a derived-IV computation from (start, step).
1026 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1027 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1028 VPIRValue *Start = WideIV->getStartValue();
1029 VPValue *Step = WideIV->getStepValue();
1030 EndValue = B.createDerivedIV(
1031 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1032 Start, EndValue, Step);
1033 }
1034
1035 return EndValue;
1036}
1037
1038/// Compute the end value for \p WideIV, unless it is truncated. Creates a
1039/// VPDerivedIVRecipe for non-canonical inductions.
// NOTE(review): the signature's first line (original line 1040, which binds
// `WideIV`) is missing from this extract — confirm against the full source.
1041 VPBuilder &VectorPHBuilder,
1042 VPTypeAnalysis &TypeInfo,
1043 VPValue *VectorTC) {
1044 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1045 // Truncated wide inductions resume from the last lane of their vector value
1046 // in the last vector iteration which is handled elsewhere.
1047 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1048 return nullptr;
1049
1050 VPIRValue *Start = WideIV->getStartValue();
1051 VPValue *Step = WideIV->getStepValue();
// NOTE(review): original line 1052 is missing; it presumably declares `ID`
// (the InductionDescriptor of WideIV) used in the derived-IV call below.
1053 VPValue *EndValue = VectorTC;
// Canonical inductions end exactly at the vector trip count; others need a
// derived-IV computation start + VectorTC * step.
1054 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1055 EndValue = VectorPHBuilder.createDerivedIV(
1056 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1057 Start, VectorTC, Step);
1058 }
1059
1060 // EndValue is derived from the vector trip count (which has the same type as
1061 // the widest induction) and thus may be wider than the induction here.
1062 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
1063 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
1064 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
1065 ScalarTypeOfWideIV,
1066 WideIV->getDebugLoc());
1067 }
1068
1069 return EndValue;
1070}
1071
1072/// Attempts to optimize the induction variable exit values for users in the
1073/// exit block coming from the latch in the original scalar loop.
// NOTE(review): the signature's first line (original line 1074) and trailing
// parameter lines (original lines 1076-1077, presumably including the
// `EndValues` map and `PSE`) are missing from this extract.
1075 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
1078 VPWidenInductionRecipe *WideIV = nullptr;
// NOTE(review): original line 1079 is missing; it presumably matches `Op` and
// binds `Incoming` before the lookup below.
1080 WideIV = getOptimizableIVOf(Incoming, PSE);
1081
1082 if (!WideIV)
1083 return nullptr;
1084
1085 VPValue *EndValue = EndValues.lookup(WideIV);
1086 assert(EndValue && "Must have computed the end value up front");
1087
1088 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1089 // changed it means the exit is using the incremented value, so we don't
1090 // need to subtract the step.
1091 if (Incoming != WideIV)
1092 return EndValue;
1093
1094 // Otherwise, subtract the step from the EndValue.
1095 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
1096 VPValue *Step = WideIV->getStepValue();
1097 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
// Integer IVs: plain subtraction.
1098 if (ScalarTy->isIntegerTy())
1099 return B.createSub(EndValue, Step, DebugLoc::getUnknown(), "ind.escape");
// Pointer IVs: step back via ptradd with the negated step.
1100 if (ScalarTy->isPointerTy()) {
1101 Type *StepTy = TypeInfo.inferScalarType(Step);
1102 auto *Zero = Plan.getZero(StepTy);
1103 return B.createPtrAdd(EndValue, B.createSub(Zero, Step),
1104 DebugLoc::getUnknown(), "ind.escape");
1105 }
// FP IVs: apply the inverse of the induction's binary op, preserving its
// fast-math flags.
1106 if (ScalarTy->isFloatingPointTy()) {
1107 const auto &ID = WideIV->getInductionDescriptor();
1108 return B.createNaryOp(
1109 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1110 ? Instruction::FSub
1111 : Instruction::FAdd,
1112 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1113 }
1114 llvm_unreachable("all possible induction types must be handled");
1115 return nullptr;
1116}
1117
// NOTE(review): the function-name line (original line 1118) is missing from
// this extract; the parameter list below suggests a VPlanTransforms entry
// point for optimizing induction exit users — confirm against the full source.
1119 VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
1120 // Compute end values for all inductions.
1121 VPTypeAnalysis TypeInfo(Plan);
1122 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1123 auto *VectorPH = cast<VPBasicBlock>(VectorRegion->getSinglePredecessor());
1124 VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
// NOTE(review): original line 1125 is missing; it presumably declares the
// `EndValues` map populated in the loop below.
1126 VPValue *ResumeTC =
1127 FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
1128 for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
1129 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&Phi);
1130 if (!WideIV)
1131 continue;
// NOTE(review): original line 1132 is missing; it presumably computes
// `EndValue` via a call whose argument list continues on the next line.
1133 WideIV, VectorPHBuilder, TypeInfo, ResumeTC))
1134 EndValues[WideIV] = EndValue;
1135 }
1136
// Replace exiting-IV-value recipes in the middle block with the precomputed
// end values and erase them.
1137 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
1138 for (VPRecipeBase &R : make_early_inc_range(*MiddleVPBB)) {
1139 VPValue *Op;
1140 if (!match(&R, m_ExitingIVValue(m_VPValue(Op))))
1141 continue;
1142 auto *WideIV = cast<VPWidenInductionRecipe>(Op);
1143 if (VPValue *EndValue = EndValues.lookup(WideIV)) {
1144 R.getVPSingleValue()->replaceAllUsesWith(EndValue);
1145 R.eraseFromParent();
1146 }
1147 }
1148
1149 // Then, optimize exit block users.
1150 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1151 for (VPRecipeBase &R : ExitVPBB->phis()) {
1152 auto *ExitIRI = cast<VPIRPhi>(&R);
1153
1154 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1155 VPValue *Escape = nullptr;
1156 if (PredVPBB == MiddleVPBB)
1157 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
1158 ExitIRI->getOperand(Idx),
1159 EndValues, PSE);
1160 else
// NOTE(review): original line 1161 is missing; it presumably assigns
// `Escape` from the early-exit induction-user optimization with the
// arguments continued below.
1162 Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), PSE);
1163 if (Escape)
1164 ExitIRI->setOperand(Idx, Escape);
1165 }
1166 }
1167 }
1168}
1169
1170/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1171/// them with already existing recipes expanding the same SCEV expression.
// NOTE(review): the signature and the SCEV-to-VPValue map declaration
// (original lines 1172-1173, presumably a DenseMap named `SCEV2VPV`) are
// missing from this extract — confirm against the full source.
1174
1175 for (VPRecipeBase &R :
// NOTE(review): the range expression (original line 1176, iterating the entry
// block's recipes) is missing here.
1177 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1178 if (!ExpR)
1179 continue;
1180
// Cache the first expansion seen for each SCEV; later duplicates are replaced
// by the cached recipe and erased.
1181 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1182 if (Inserted)
1183 continue;
1184 ExpR->replaceAllUsesWith(V->second);
1185 ExpR->eraseFromParent();
1186 }
1187}
1188
// Worklist-driven deletion: starting from a value V (bound by the elided
// signature), erase its defining recipe if dead and transitively enqueue its
// operands, which may have become dead in turn.
// NOTE(review): the signature (original line 1189) and the visited-set
// declaration (original line 1191, presumably `Seen`) are missing from this
// extract — confirm against the full source.
1190 SmallVector<VPValue *> WorkList;
1192 WorkList.push_back(V);
1193
1194 while (!WorkList.empty()) {
1195 VPValue *Cur = WorkList.pop_back_val();
// Guard against revisiting values reachable through multiple operand paths.
1196 if (!Seen.insert(Cur).second)
1197 continue;
1198 VPRecipeBase *R = Cur->getDefiningRecipe();
1199 if (!R)
1200 continue;
1201 if (!isDeadRecipe(*R))
1202 continue;
// Enqueue operands before erasing, since erasure drops the operand uses.
1203 append_range(WorkList, R->operands());
1204 R->eraseFromParent();
1205 }
1206}
1207
1208/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1209/// Returns an optional pair, where the first element indicates whether it is
1210/// an intrinsic ID.
1211static std::optional<std::pair<bool, unsigned>>
// NOTE(review): the function-name/parameter line (original line 1212) and the
// first .Case<...> recipe-type list (original lines 1215-1216, whose lambda
// body survives below) are missing from this extract.
1213 return TypeSwitch<const VPSingleDefRecipe *,
1214 std::optional<std::pair<bool, unsigned>>>(R)
1217 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
// Intrinsic recipes report (true, intrinsic ID) rather than an opcode.
1218 .Case([](const VPWidenIntrinsicRecipe *I) {
1219 return std::make_pair(true, I->getVectorIntrinsicID());
1220 })
1221 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
1222 // For recipes that do not directly map to LLVM IR instructions,
1223 // assign opcodes after the last VPInstruction opcode (which is also
1224 // after the last IR Instruction opcode), based on the VPRecipeID.
1225 return std::make_pair(false,
1226 VPInstruction::OpsEnd + 1 + I->getVPRecipeID());
1227 })
1228 .Default([](auto *) { return std::nullopt; });
1229}
1230
1231/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1232/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1233/// Operands are foldable live-ins.
// NOTE(review): the signature's first line (original line 1234, binding `R`)
// is missing from this extract — confirm against the full source.
1235 ArrayRef<VPValue *> Operands,
1236 const DataLayout &DL,
1237 VPTypeAnalysis &TypeInfo) {
1238 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1239 if (!OpcodeOrIID)
1240 return nullptr;
1241
// NOTE(review): original line 1242 is missing; it presumably declares the
// `Ops` vector of IR values populated below.
// Collect the underlying IR values; bail out unless every operand is a
// live-in with an underlying value.
1243 for (VPValue *Op : Operands) {
1244 if (!match(Op, m_LiveIn()))
1245 return nullptr;
1246 Value *V = Op->getUnderlyingValue();
1247 if (!V)
1248 return nullptr;
1249 Ops.push_back(V);
1250 }
1251
1252 auto FoldToIRValue = [&]() -> Value * {
1253 InstSimplifyFolder Folder(DL);
// Intrinsic case: only binary intrinsics are handled.
1254 if (OpcodeOrIID->first) {
1255 if (R.getNumOperands() != 2)
1256 return nullptr;
1257 unsigned ID = OpcodeOrIID->second;
1258 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1259 TypeInfo.inferScalarType(&R));
1260 }
1261 unsigned Opcode = OpcodeOrIID->second;
1262 if (Instruction::isBinaryOp(Opcode))
1263 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1264 Ops[0], Ops[1]);
1265 if (Instruction::isCast(Opcode))
1266 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1267 TypeInfo.inferScalarType(R.getVPSingleValue()));
1268 switch (Opcode) {
// NOTE(review): original line 1269 (a case label preceding this fold) and the
// second-argument continuation lines (original lines 1271 and 1274) are
// missing from this extract.
1270 return Folder.FoldSelect(Ops[0], Ops[1],
1272 case VPInstruction::Not:
1273 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1275 case Instruction::Select:
1276 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1277 case Instruction::ICmp:
1278 case Instruction::FCmp:
1279 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1280 Ops[1]);
1281 case Instruction::GetElementPtr: {
1282 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1283 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1284 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1285 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1286 }
// NOTE(review): original lines 1287-1288 (case labels for the i8-GEP fold
// below, presumably ptradd-style VPInstruction opcodes) are missing.
1289 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1290 Ops[0], Ops[1],
1291 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1292 // An extract of a live-in is an extract of a broadcast, so return the
1293 // broadcasted element.
1294 case Instruction::ExtractElement:
1295 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1296 return Ops[0];
1297 }
1298 return nullptr;
1299 };
1300
// On success, register the folded IR constant/value as a live-in of the plan.
1301 if (Value *V = FoldToIRValue())
1302 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1303 return nullptr;
1304}
1305
1306/// Try to simplify VPSingleDefRecipe \p Def.
1308 VPlan *Plan = Def->getParent()->getPlan();
1309
1310 // Simplification of live-in IR values for SingleDef recipes using
1311 // InstSimplifyFolder.
1312 const DataLayout &DL = Plan->getDataLayout();
1313 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1314 return Def->replaceAllUsesWith(V);
1315
1316 // Fold PredPHI LiveIn -> LiveIn.
1317 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1318 VPValue *Op = PredPHI->getOperand(0);
1319 if (isa<VPIRValue>(Op))
1320 PredPHI->replaceAllUsesWith(Op);
1321 }
1322
1323 VPBuilder Builder(Def);
1324
1325 // Avoid replacing VPInstructions with underlying values with new
1326 // VPInstructions, as we would fail to create widen/replicate recipes from the
1327 // new VPInstructions without an underlying value, and miss out on some
1328 // transformations that only apply to widened/replicated recipes later, by
1329 // doing so.
1330 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1331 // VPInstructions without underlying values, as those will get skipped during
1332 // cost computation.
1333 bool CanCreateNewRecipe =
1334 !isa<VPInstruction>(Def) || !Def->getUnderlyingValue();
1335
1336 VPValue *A;
1337 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1338 Type *TruncTy = TypeInfo.inferScalarType(Def);
1339 Type *ATy = TypeInfo.inferScalarType(A);
1340 if (TruncTy == ATy) {
1341 Def->replaceAllUsesWith(A);
1342 } else {
1343 // Don't replace a non-widened cast recipe with a widened cast.
1344 if (!isa<VPWidenCastRecipe>(Def))
1345 return;
1346 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1347
1348 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1349 ? Instruction::SExt
1350 : Instruction::ZExt;
1351 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1352 TruncTy);
1353 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1354 // UnderlyingExt has distinct return type, used to retain legacy cost.
1355 Ext->setUnderlyingValue(UnderlyingExt);
1356 }
1357 Def->replaceAllUsesWith(Ext);
1358 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1359 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1360 Def->replaceAllUsesWith(Trunc);
1361 }
1362 }
1363#ifndef NDEBUG
1364 // Verify that the cached type info for both A and its users is still
1365 // accurate by comparing it to freshly computed types.
1366 VPTypeAnalysis TypeInfo2(*Plan);
1367 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1368 for (VPUser *U : A->users()) {
1369 auto *R = cast<VPRecipeBase>(U);
1370 for (VPValue *VPV : R->definedValues())
1371 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1372 }
1373#endif
1374 }
1375
1376 // Simplify (X && Y) | (X && !Y) -> X.
1377 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1378 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1379 // recipes to be visited during simplification.
1380 VPValue *X, *Y, *Z;
1381 if (match(Def,
1384 Def->replaceAllUsesWith(X);
1385 Def->eraseFromParent();
1386 return;
1387 }
1388
1389 // x | AllOnes -> AllOnes
1390 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1391 return Def->replaceAllUsesWith(
1392 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1393
1394 // x | 0 -> x
1395 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1396 return Def->replaceAllUsesWith(X);
1397
1398 // x | !x -> AllOnes
1400 return Def->replaceAllUsesWith(
1401 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1402
1403 // x & 0 -> 0
1404 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1405 return Def->replaceAllUsesWith(
1406 Plan->getZero(TypeInfo.inferScalarType(Def)));
1407
1408 // x & AllOnes -> x
1409 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_AllOnes())))
1410 return Def->replaceAllUsesWith(X);
1411
1412 // x && false -> false
1413 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_False())))
1414 return Def->replaceAllUsesWith(Plan->getFalse());
1415
1416 // x && true -> x
1417 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_True())))
1418 return Def->replaceAllUsesWith(X);
1419
1420 // (x && y) | (x && z) -> x && (y | z)
1421 if (CanCreateNewRecipe &&
1424 // Simplify only if one of the operands has one use to avoid creating an
1425 // extra recipe.
1426 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1427 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1428 return Def->replaceAllUsesWith(
1429 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1430
1431 // x && (x && y) -> x && y
1432 if (match(Def, m_LogicalAnd(m_VPValue(X),
1434 return Def->replaceAllUsesWith(Def->getOperand(1));
1435
1436 // x && (y && x) -> x && y
1437 if (match(Def, m_LogicalAnd(m_VPValue(X),
1439 return Def->replaceAllUsesWith(Builder.createLogicalAnd(X, Y));
1440
1441 // x && !x -> 0
1443 return Def->replaceAllUsesWith(Plan->getFalse());
1444
1445 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1446 return Def->replaceAllUsesWith(X);
1447
1448 // select c, false, true -> not c
1449 VPValue *C;
1450 if (CanCreateNewRecipe &&
1451 match(Def, m_Select(m_VPValue(C), m_False(), m_True())))
1452 return Def->replaceAllUsesWith(Builder.createNot(C));
1453
1454 // select !c, x, y -> select c, y, x
1455 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1456 Def->setOperand(0, C);
1457 Def->setOperand(1, Y);
1458 Def->setOperand(2, X);
1459 return;
1460 }
1461
1462 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1463 return Def->replaceAllUsesWith(A);
1464
1465 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1466 return Def->replaceAllUsesWith(A);
1467
1468 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1469 return Def->replaceAllUsesWith(
1470 Plan->getZero(TypeInfo.inferScalarType(Def)));
1471
1472 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_AllOnes()))) {
1473 // Preserve nsw from the Mul on the new Sub.
1475 false, cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap()};
1476 return Def->replaceAllUsesWith(
1477 Builder.createSub(Plan->getZero(TypeInfo.inferScalarType(A)), A,
1478 Def->getDebugLoc(), "", NW));
1479 }
1480
1481 const APInt *APC;
1482 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) &&
1483 APC->isPowerOf2())
1484 return Def->replaceAllUsesWith(Builder.createNaryOp(
1485 Instruction::Shl,
1486 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1487 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1488
1489 if (CanCreateNewRecipe && match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) &&
1490 APC->isPowerOf2())
1491 return Def->replaceAllUsesWith(Builder.createNaryOp(
1492 Instruction::LShr,
1493 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1494 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1495
1496 if (match(Def, m_Not(m_VPValue(A)))) {
1497 if (match(A, m_Not(m_VPValue(A))))
1498 return Def->replaceAllUsesWith(A);
1499
1500 // Try to fold Not into compares by adjusting the predicate in-place.
1501 CmpPredicate Pred;
1502 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1503 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1504 if (all_of(Cmp->users(),
1506 m_Not(m_Specific(Cmp)),
1507 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1508 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1509 for (VPUser *U : to_vector(Cmp->users())) {
1510 auto *R = cast<VPSingleDefRecipe>(U);
1511 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1512 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1513 R->setOperand(1, Y);
1514 R->setOperand(2, X);
1515 } else {
1516 // not (cmp pred) -> cmp inv_pred
1517 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1518 R->replaceAllUsesWith(Cmp);
1519 }
1520 }
1521 // If Cmp doesn't have a debug location, use the one from the negation,
1522 // to preserve the location.
1523 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1524 Cmp->setDebugLoc(Def->getDebugLoc());
1525 }
1526 }
1527 }
1528
1529 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1530 // any-of (fcmp uno %A, %B), ...
1531 if (match(Def, m_AnyOf())) {
1533 VPRecipeBase *UnpairedCmp = nullptr;
1534 for (VPValue *Op : Def->operands()) {
1535 VPValue *X;
1536 if (Op->getNumUsers() > 1 ||
1538 m_Deferred(X)))) {
1539 NewOps.push_back(Op);
1540 } else if (!UnpairedCmp) {
1541 UnpairedCmp = Op->getDefiningRecipe();
1542 } else {
1543 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1544 UnpairedCmp->getOperand(0), X));
1545 UnpairedCmp = nullptr;
1546 }
1547 }
1548
1549 if (UnpairedCmp)
1550 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1551
1552 if (NewOps.size() < Def->getNumOperands()) {
1553 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1554 return Def->replaceAllUsesWith(NewAnyOf);
1555 }
1556 }
1557
1558 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1559 // This is useful for fmax/fmin without fast-math flags, where we need to
1560 // check if any operand is NaN.
1561 if (CanCreateNewRecipe &&
1563 m_Deferred(X)),
1565 m_Deferred(Y))))) {
1566 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1567 return Def->replaceAllUsesWith(NewCmp);
1568 }
1569
1570 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1571 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1572 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1573 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1574 TypeInfo.inferScalarType(Def))
1575 return Def->replaceAllUsesWith(Def->getOperand(1));
1576
1578 m_One()))) {
1579 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1580 if (TypeInfo.inferScalarType(X) != WideStepTy)
1581 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1582 Def->replaceAllUsesWith(X);
1583 return;
1584 }
1585
1586 // For i1 vp.merges produced by AnyOf reductions:
1587 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1589 m_VPValue(X), m_VPValue())) &&
1591 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1592 Def->setOperand(1, Def->getOperand(0));
1593 Def->setOperand(0, Y);
1594 return;
1595 }
1596
1597 // Simplify MaskedCond with no block mask to its single operand.
1599 !cast<VPInstruction>(Def)->isMasked())
1600 return Def->replaceAllUsesWith(Def->getOperand(0));
1601
1602 // Look through ExtractLastLane.
1603 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
1604 if (match(A, m_BuildVector())) {
1605 auto *BuildVector = cast<VPInstruction>(A);
1606 Def->replaceAllUsesWith(
1607 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1608 return;
1609 }
1610 if (Plan->hasScalarVFOnly())
1611 return Def->replaceAllUsesWith(A);
1612 }
1613
1614 // Look through ExtractPenultimateElement (BuildVector ....).
1616 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1617 Def->replaceAllUsesWith(
1618 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1619 return;
1620 }
1621
1622 uint64_t Idx;
1624 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1625 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1626 return;
1627 }
1628
1629 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1630 Def->replaceAllUsesWith(
1631 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1632 return;
1633 }
1634
1635 // Look through broadcast of single-scalar when used as select conditions; in
1636 // that case the scalar condition can be used directly.
1637 if (match(Def,
1640 "broadcast operand must be single-scalar");
1641 Def->setOperand(0, C);
1642 return;
1643 }
1644
1646 if (Def->getNumOperands() == 1) {
1647 Def->replaceAllUsesWith(Def->getOperand(0));
1648 return;
1649 }
1650 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1651 if (all_equal(Phi->incoming_values()))
1652 Phi->replaceAllUsesWith(Phi->getOperand(0));
1653 }
1654 return;
1655 }
1656
1657 VPIRValue *IRV;
1658 if (Def->getNumOperands() == 1 &&
1660 return Def->replaceAllUsesWith(IRV);
1661
1662 // Some simplifications can only be applied after unrolling. Perform them
1663 // below.
1664 if (!Plan->isUnrolled())
1665 return;
1666
1667 // After unrolling, extract-lane may be used to extract values from multiple
1668 // scalar sources. Only simplify when extracting from a single scalar source.
1669 VPValue *LaneToExtract;
1670 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1671 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1673 return Def->replaceAllUsesWith(A);
1674
1675 // Simplify extract-lane with single source to extract-element.
1676 Def->replaceAllUsesWith(Builder.createNaryOp(
1677 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1678 return;
1679 }
1680
1681 // Look for cycles where Def is of the form:
1682 // X = phi(0, IVInc) ; used only by IVInc, or by IVInc and Inc = X + Y
1683 // IVInc = X + Step ; used by X and Def
1684 // Def = IVInc + Y
1685 // Fold the increment Y into the phi's start value, replace Def with IVInc,
1686 // and if Inc exists, replace it with X.
1687 if (match(Def, m_Add(m_Add(m_VPValue(X), m_VPValue()), m_VPValue(Y))) &&
1689 match(X, m_VPPhi(m_ZeroInt(), m_Specific(Def->getOperand(0))))) {
1690 auto *Phi = cast<VPPhi>(X);
1691 auto *IVInc = Def->getOperand(0);
1692 if (IVInc->getNumUsers() == 2) {
1693 // If Phi has a second user (besides IVInc's defining recipe), it must
1694 // be Inc = Phi + Y for the fold to apply.
1697 if (Phi->getNumUsers() == 1 || (Phi->getNumUsers() == 2 && Inc)) {
1698 Def->replaceAllUsesWith(IVInc);
1699 if (Inc)
1700 Inc->replaceAllUsesWith(Phi);
1701 Phi->setOperand(0, Y);
1702 return;
1703 }
1704 }
1705 }
1706
1707 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1708 // just the pointer operand.
1709 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1710 if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
1711 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1712
1713 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1714 // the start index is zero and only the first lane 0 is demanded.
1715 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1716 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1717 Steps->replaceAllUsesWith(Steps->getOperand(0));
1718 return;
1719 }
1720 }
1721 // Simplify redundant ReductionStartVector recipes after unrolling.
1722 VPValue *StartV;
1724 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1725 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1726 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1727 return PhiR && PhiR->isInLoop();
1728 });
1729 return;
1730 }
1731
1733 Def->replaceAllUsesWith(A);
1734 return;
1735 }
1736
1737 if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
1740 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1741 all_of(A->users(),
1742 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1743 return Def->replaceAllUsesWith(A);
1744 }
1745
1746 if (Plan->getConcreteUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1747 return Def->replaceAllUsesWith(A);
1748}
1749
// Driver that walks the plan's blocks and applies simplifyRecipe to each
// single-def recipe.
// NOTE(review): the signature and traversal declaration (original lines
// 1750-1751, whose argument list ends below) and the per-block loop header
// binding `VPBB` (original line 1754) are missing from this extract —
// confirm against the full source.
1752 Plan.getEntry());
1753 VPTypeAnalysis TypeInfo(Plan);
1755 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1756 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1757 simplifyRecipe(Def, TypeInfo);
1758 }
1759}
1760
1761/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1762/// header mask to be simplified further when tail folding, e.g. in
1763/// optimizeEVLMasks.
1764static void reassociateHeaderMask(VPlan &Plan) {
1765 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1766 if (!HeaderMask)
1767 return;
1768
// Seed the worklist with direct logical-and users of the header mask.
1769 SmallVector<VPUser *> Worklist;
1770 for (VPUser *U : HeaderMask->users())
1771 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
// NOTE(review): the if-body (original line 1772, presumably
// `Worklist.push_back(U);`) is missing from this extract.
1773
// Iteratively rewrite ((HeaderMask && X) && Y) as (HeaderMask && (X && Y)),
// re-queuing users of each rewritten recipe so chains are fully flattened.
1774 while (!Worklist.empty()) {
1775 auto *R = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
1776 VPValue *X, *Y;
1777 if (!R || !match(R, m_LogicalAnd(
1778 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
1779 m_VPValue(Y))))
1780 continue;
1781 append_range(Worklist, R->users());
1782 VPBuilder Builder(R);
1783 R->replaceAllUsesWith(
1784 Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
1785 }
1786}
1787
1789 if (Plan.hasScalarVFOnly())
1790 return;
1791
1792 // Try to narrow wide and replicating recipes to single scalar recipes,
1793 // based on VPlan analysis. Only process blocks in the loop region for now,
1794 // without traversing into nested regions, as recipes in replicate regions
1795 // cannot be converted yet.
1798 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1800 VPWidenStoreRecipe>(&R))
1801 continue;
1802 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1803 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1804 continue;
1805
1806 // Convert an unmasked scatter with a uniform address into
1807 // extract-last-lane + scalar store.
1808 // TODO: Add a profitability check comparing the cost of a scatter vs.
1809 // extract + scalar store.
1810 auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
1811 if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
1812 !WidenStoreR->isConsecutive()) {
1813 assert(!WidenStoreR->isReverse() &&
1814 "Not consecutive memory recipes shouldn't be reversed");
1815 VPValue *Mask = WidenStoreR->getMask();
1816
1817 // Only convert the scatter to a scalar store if it is unmasked.
1818 // TODO: Support converting scatter masked by the header mask to scalar
1819 // store.
1820 if (Mask)
1821 continue;
1822
1824 {WidenStoreR->getOperand(1)});
1825 Extract->insertBefore(WidenStoreR);
1826
1827 // TODO: Sink the scalar store recipe to middle block if possible.
1828 auto *ScalarStore = new VPReplicateRecipe(
1829 &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
1830 true /*IsSingleScalar*/, nullptr /*Mask*/, {},
1831 *WidenStoreR /*Metadata*/);
1832 ScalarStore->insertBefore(WidenStoreR);
1833 WidenStoreR->eraseFromParent();
1834 continue;
1835 }
1836
1837 auto *RepOrWidenR = dyn_cast<VPRecipeWithIRFlags>(&R);
1838 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1839 vputils::isSingleScalar(RepR->getOperand(1))) {
1840 auto *Clone = new VPReplicateRecipe(
1841 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1842 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1843 *RepR /*Metadata*/, RepR->getDebugLoc());
1844 Clone->insertBefore(RepOrWidenR);
1845 VPBuilder Builder(Clone);
1846 VPValue *ExtractOp = Clone->getOperand(0);
1847 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1848 ExtractOp =
1849 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1850 ExtractOp =
1851 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1852 Clone->setOperand(0, ExtractOp);
1853 RepR->eraseFromParent();
1854 continue;
1855 }
1856
1857 // Skip recipes that aren't single scalars.
1858 if (!RepOrWidenR || !vputils::isSingleScalar(RepOrWidenR))
1859 continue;
1860
1861 // Predicate to check if a user of Op introduces extra broadcasts.
1862 auto IntroducesBCastOf = [](const VPValue *Op) {
1863 return [Op](const VPUser *U) {
1864 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1868 VPI->getOpcode()))
1869 return false;
1870 }
1871 return !U->usesScalars(Op);
1872 };
1873 };
1874
1875 if (any_of(RepOrWidenR->users(), IntroducesBCastOf(RepOrWidenR)) &&
1876 none_of(RepOrWidenR->operands(), [&](VPValue *Op) {
1877 if (any_of(
1878 make_filter_range(Op->users(), not_equal_to(RepOrWidenR)),
1879 IntroducesBCastOf(Op)))
1880 return false;
1881 // Non-constant live-ins require broadcasts, while constants do not
1882 // need explicit broadcasts.
1883 auto *IRV = dyn_cast<VPIRValue>(Op);
1884 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1885 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1886 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1887 }))
1888 continue;
1889
1890 auto *Clone = new VPReplicateRecipe(
1891 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1892 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1893 Clone->insertBefore(RepOrWidenR);
1894 RepOrWidenR->replaceAllUsesWith(Clone);
1895 if (isDeadRecipe(*RepOrWidenR))
1896 RepOrWidenR->eraseFromParent();
1897 }
1898 }
1899}
1900
1901/// Try to see if all of \p Blend's masks share a common value logically and'ed
1902/// and remove it from the masks.
// NOTE(review): the function signature line is elided in this extract; the
// body operates on a VPBlendRecipe *Blend.
1904  if (Blend->isNormalized())
1905    return;
  // The common factor is taken from the first mask's logical-and ...
1906  VPValue *CommonEdgeMask;
1907  if (!match(Blend->getMask(0),
1908             m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1909    return;
  // ... and must appear as the left operand of every mask's logical-and.
1910  for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1911    if (!match(Blend->getMask(I),
1912               m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1913      return;
  // Strip the common factor: keep only the right-hand operand of each and.
1914  for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1915    Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1916}
1917
1918/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1919/// to make sure the masks are simplified.
1920static void simplifyBlends(VPlan &Plan) {
1923    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1924      auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1925      if (!Blend)
1926        continue;
1927
1928      removeCommonBlendMask(Blend);
1929
1930      // Try to remove redundant blend recipes.
      // Collect incoming values whose mask is not provably false; if only one
      // distinct value remains, the blend is a no-op.
1931      SmallPtrSet<VPValue *, 4> UniqueValues;
1932      if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1933        UniqueValues.insert(Blend->getIncomingValue(0));
1934      for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1935        if (!match(Blend->getMask(I), m_False()))
1936          UniqueValues.insert(Blend->getIncomingValue(I));
1937
1938      if (UniqueValues.size() == 1) {
1939        Blend->replaceAllUsesWith(*UniqueValues.begin());
1940        Blend->eraseFromParent();
1941        continue;
1942      }
1943
1944      if (Blend->isNormalized())
1945        continue;
1946
1947      // Normalize the blend so its first incoming value is used as the initial
1948      // value with the others blended into it.
1949
1950      unsigned StartIndex = 0;
1951      for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1952        // If a value's mask is used only by the blend then is can be deadcoded.
1953        // TODO: Find the most expensive mask that can be deadcoded, or a mask
1954        // that's used by multiple blends where it can be removed from them all.
1955        VPValue *Mask = Blend->getMask(I);
1956        if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1957          StartIndex = I;
1958          break;
1959        }
1960      }
1961
      // Build the normalized operand list: start value first, then each
      // remaining (value, mask) pair.
1962      SmallVector<VPValue *, 4> OperandsWithMask;
1963      OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1964
1965      for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1966        if (I == StartIndex)
1967          continue;
1968        OperandsWithMask.push_back(Blend->getIncomingValue(I));
1969        OperandsWithMask.push_back(Blend->getMask(I));
1970      }
1971
1972      auto *NewBlend =
1973          new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1974                            OperandsWithMask, *Blend, Blend->getDebugLoc());
1975      NewBlend->insertBefore(&R);
1976
      // NOTE(review): DeadMask's cleanup (orig line 1980) is elided in this
      // extract; presumably the now-unused start mask is deleted there.
1977      VPValue *DeadMask = Blend->getMask(StartIndex);
1978      Blend->replaceAllUsesWith(NewBlend);
1979      Blend->eraseFromParent();
1981
1982      /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1983      VPValue *NewMask;
1984      if (NewBlend->getNumOperands() == 3 &&
1985          match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1986        VPValue *Inc0 = NewBlend->getOperand(0);
1987        VPValue *Inc1 = NewBlend->getOperand(1);
1988        VPValue *OldMask = NewBlend->getOperand(2);
1989        NewBlend->setOperand(0, Inc1);
1990        NewBlend->setOperand(1, Inc0);
1991        NewBlend->setOperand(2, NewMask);
        // Drop the Not if the swap left it unused.
1992        if (OldMask->getNumUsers() == 0)
1993          cast<VPInstruction>(OldMask)->eraseFromParent();
1994      }
1995    }
1996  }
1997}
1998
1999/// Optimize the width of vector induction variables in \p Plan based on a known
2000/// constant Trip Count, \p BestVF and \p BestUF.
// NOTE(review): the first signature line is elided in this extract; returns
// true iff any induction was narrowed.
2002                                                     ElementCount BestVF,
2003                                                     unsigned BestUF) {
2004  // Only proceed if we have not completely removed the vector region.
2005  if (!Plan.getVectorLoopRegion())
2006    return false;
2007
  // Requires a fixed VF and a compile-time-constant trip count.
2008  const APInt *TC;
2009  if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
2010    return false;
2011
2012  // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
2013  // and UF. Returns at least 8.
2014  auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
2015    APInt AlignedTC =
2018    APInt MaxVal = AlignedTC - 1;
2019    return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
2020  };
2021  unsigned NewBitWidth =
2022      ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
2023
2024  LLVMContext &Ctx = Plan.getContext();
2025  auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
2026
2027  bool MadeChange = false;
2028
2029  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2030  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2031    auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2032
2033    // Currently only handle canonical IVs as it is trivial to replace the start
2034    // and stop values, and we currently only perform the optimization when the
2035    // IV has a single use.
2036    if (!WideIV || !WideIV->isCanonical() ||
2037        WideIV->hasMoreThanOneUniqueUser() ||
2038        NewIVTy == WideIV->getScalarType())
2039      continue;
2040
2041    // Currently only handle cases where the single user is a header-mask
2042    // comparison with the backedge-taken-count.
2043    VPUser *SingleUser = WideIV->getSingleUser();
2044    if (!SingleUser ||
2045        !match(SingleUser,
2046               m_ICmp(m_Specific(WideIV),
2048      continue;
2049
2050    // Update IV operands and comparison bound to use new narrower type.
2051    auto *NewStart = Plan.getZero(NewIVTy);
2052    WideIV->setStartValue(NewStart);
2053    auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
2054    WideIV->setStepValue(NewStep);
2055
    // Truncate the backedge-taken-count to the narrow type in the preheader
    // and rewire the comparison to use it.
2056    auto *NewBTC = new VPWidenCastRecipe(
2057        Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
2058        nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
2059    Plan.getVectorPreheader()->appendRecipe(NewBTC);
2060    auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
2061    Cmp->setOperand(1, NewBTC);
2062
2063    MadeChange = true;
2064  }
2065
2066  return MadeChange;
2067}
2068
2069/// Return true if \p Cond is known to be true for given \p BestVF and \p
2070/// BestUF.
// NOTE(review): signature and the any-of guard condition (orig lines
// 2071/2073-2074) are elided in this extract; the recursion below checks
// whether any operand of the defining recipe is known true.
2072                                     ElementCount BestVF, unsigned BestUF,
2075    return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
2076                                                          &PSE](VPValue *C) {
2077      return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
2078    });
2079
  // Only handle the canonical-IV backedge-value vs. vector-trip-count compare
  // (the match pattern's head is elided in this extract).
2080  auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
2082                    m_Specific(CanIV->getBackedgeValue()),
2083                    m_Specific(&Plan.getVectorTripCount()))))
2084    return false;
2085
2086  // The compare checks CanIV + VFxUF == vector trip count. The vector trip
2087  // count is not conveniently available as SCEV so far, so we compare directly
2088  // against the original trip count. This is stricter than necessary, as we
2089  // will only return true if the trip count == vector trip count.
2090  const SCEV *VectorTripCount =
2092  if (isa<SCEVCouldNotCompute>(VectorTripCount))
2093    VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
2094  assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2095         "Trip count SCEV must be computable");
2096  ScalarEvolution &SE = *PSE.getSE();
2097  ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2098  const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2099  return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
2100}
2101
2102/// Try to replace multiple active lane masks used for control flow with
2103/// a single, wide active lane mask instruction followed by multiple
2104/// extract subvector intrinsics. This applies to the active lane mask
2105/// instructions both in the loop and in the preheader.
2106/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
2107/// new extracts from the first active lane mask, which has it's last
2108/// operand (multiplier) set to UF.
// NOTE(review): the first signature line is elided in this extract.
2110                                       unsigned UF) {
  // Only profitable/applicable for vector VFs with interleaving enabled.
2111  if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
2112    return false;
2113
2114  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2115  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2116  auto *Term = &ExitingVPBB->back();
2117
  // Require the latch terminator to be controlled by an active-lane-mask
  // (pattern head elided in this extract).
2118  using namespace llvm::VPlanPatternMatch;
2120                            m_VPValue(), m_VPValue(), m_VPValue())))))
2121    return false;
2122
2123  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2124  LLVMContext &Ctx = Plan.getContext();
2125
  // Emit UF vector.extract intrinsics pulling each VF-wide part out of the
  // wide mask ALM.
2126  auto ExtractFromALM = [&](VPInstruction *ALM,
2127                            SmallVectorImpl<VPValue *> &Extracts) {
2128    DebugLoc DL = ALM->getDebugLoc();
2129    for (unsigned Part = 0; Part < UF; ++Part) {
2131      Ops.append({ALM, Plan.getConstantInt(64, VF.getKnownMinValue() * Part)});
2132      auto *Ext =
2133          new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
2134                                     IntegerType::getInt1Ty(Ctx), {}, {}, DL);
2135      Extracts[Part] = Ext;
2136      Ext->insertAfter(ALM);
2137    }
2138  };
2139
2140  // Create a list of each active lane mask phi, ordered by unroll part.
2142  for (VPRecipeBase &R : Header->phis()) {
2144    if (!Phi)
2145      continue;
2146    VPValue *Index = nullptr;
2147    match(Phi->getBackedgeValue(),
2149    assert(Index && "Expected index from ActiveLaneMask instruction");
2150
    // The unroll part is recovered from the IV-increment multiplier feeding
    // the mask's index.
2151    uint64_t Part;
2152    if (match(Index,
2154                  m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
2155      Phis[Part] = Phi;
2156    else {
2157      // Anything other than a CanonicalIVIncrementForPart is part 0
2158      assert(!match(
2159          Index,
2161      Phis[0] = Phi;
2162    }
2163  }
2164
2165  assert(all_of(Phis, not_equal_to(nullptr)) &&
2166         "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2167
2168  auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
2169  auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
2170
2171  assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2172          LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2173         "Expected incoming values of Phi to be ActiveLaneMasks");
2174
2175  // When using wide lane masks, the return type of the get.active.lane.mask
2176  // intrinsic is VF x UF (last operand).
2177  VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
2178  EntryALM->setOperand(2, ALMMultiplier);
2179  LoopALM->setOperand(2, ALMMultiplier);
2180
2181  // Create UF x extract vectors and insert into preheader.
2182  SmallVector<VPValue *> EntryExtracts(UF);
2183  ExtractFromALM(EntryALM, EntryExtracts);
2184
2185  // Create UF x extract vectors and insert before the loop compare & branch,
2186  // updating the compare to use the first extract.
2187  SmallVector<VPValue *> LoopExtracts(UF);
2188  ExtractFromALM(LoopALM, LoopExtracts);
2189  VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2190  Not->setOperand(0, LoopExtracts[0]);
2191
2192  // Update the incoming values of active lane mask phis.
2193  for (unsigned Part = 0; Part < UF; ++Part) {
2194    Phis[Part]->setStartValue(EntryExtracts[Part]);
2195    Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2196  }
2197
2198  return true;
2199}
2200
2201/// Try to simplify the branch condition of \p Plan. This may restrict the
2202/// resulting plan to \p BestVF and \p BestUF.
// NOTE(review): the first signature line is elided in this extract; returns
// true iff the terminator was simplified.
2204                                              unsigned BestUF,
2206  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2207  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2208  auto *Term = &ExitingVPBB->back();
2209  VPValue *Cond;
2210  auto m_CanIVInc = m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF()));
2211  // Check if the branch condition compares the canonical IV increment (for main
2212  // loop), or the canonical IV increment plus an offset (for epilog loop).
2213  if (match(Term, m_BranchOnCount(
2214                      m_CombineOr(m_CanIVInc, m_c_Add(m_CanIVInc, m_LiveIn())),
2215                      m_VPValue())) ||
2217                            m_VPValue(), m_VPValue(), m_VPValue()))))) {
2218    // Try to simplify the branch condition if VectorTC <= VF * UF when the
2219    // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2220    const SCEV *VectorTripCount =
2222    if (isa<SCEVCouldNotCompute>(VectorTripCount))
2223      VectorTripCount =
2225    assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2226           "Trip count SCEV must be computable");
2227    ScalarEvolution &SE = *PSE.getSE();
2228    ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2229    const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2230    if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2231      return false;
2232  } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2234    // For BranchOnCond, check if we can prove the condition to be true using VF
2235    // and UF.
2236    if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2237      return false;
2238  } else {
2239    return false;
2240  }
2241
2242  // The vector loop region only executes once. Convert terminator of the
2243  // exiting block to exit in the first iteration.
2244  if (match(Term, m_BranchOnTwoConds())) {
    // For two-condition branches, forcing the second condition true is enough.
2245    Term->setOperand(1, Plan.getTrue());
2246    return true;
2247  }
2248
  // Otherwise replace the terminator entirely with an unconditional-style
  // BranchOnCond(true).
2249  auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, Plan.getTrue(), {},
2250                                {}, Term->getDebugLoc());
2251  ExitingVPBB->appendRecipe(BOC);
2252  Term->eraseFromParent();
2253
2254  return true;
2255}
2256
2257/// From the definition of llvm.experimental.get.vector.length,
2258/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
// NOTE(review): the function signature (orig lines 2259-2261) is elided in
// this extract; the body replaces an EVL recipe with its (truncated) AVL when
// SCEV proves AVL <= VF, returning true on change.
2262           vp_depth_first_deep(Plan.getEntry()))) {
2263    for (VPRecipeBase &R : *VPBB) {
2264      VPValue *AVL;
2265      if (!match(&R, m_EVL(m_VPValue(AVL))))
2266        continue;
2267
      // Prove AVL <= VF via SCEV; otherwise the EVL computation is needed.
2268      const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2269      if (isa<SCEVCouldNotCompute>(AVLSCEV))
2270        continue;
2271      ScalarEvolution &SE = *PSE.getSE();
2272      const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2273      if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2274        continue;
2275
      // Truncate AVL to i32 (the EVL's type); the truncation helper's head is
      // elided in this extract.
2277          AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2278          R.getDebugLoc());
      // Constant-fold the inserted truncate when its operands are live-ins.
2279      if (Trunc != AVL) {
2280        auto *TruncR = cast<VPSingleDefRecipe>(Trunc);
2281        const DataLayout &DL = Plan.getDataLayout();
2282        VPTypeAnalysis TypeInfo(Plan);
2283        if (VPValue *Folded =
2284                tryToFoldLiveIns(*TruncR, TruncR->operands(), DL, TypeInfo))
2285          Trunc = Folded;
2286      }
2287      R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2288      return true;
2289    }
2290  }
2291  return false;
2292}
2293
2295 unsigned BestUF,
2297 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2298 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2299
2300 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2301 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2302 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2303
2304 if (MadeChange) {
2305 Plan.setVF(BestVF);
2306 assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2307 }
2308}
2309
2310/// Sink users of \p FOR after the recipe defining the previous value \p
2311/// Previous of the recurrence. \returns true if all users of \p FOR could be
2312/// re-arranged as needed or false if it is not possible.
2313static bool
2315                                 VPRecipeBase *Previous,
2316                                 VPDominatorTree &VPDT) {
2317  // If Previous is a live-in (no defining recipe), it naturally dominates all
2318  // recipes in the loop, so no sinking is needed.
2319  if (!Previous)
2320    return true;
2321
2322  // Collect recipes that need sinking.
2325  Seen.insert(Previous);
2326  auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
2327    // The previous value must not depend on the users of the recurrence phi. In
2328    // that case, FOR is not a fixed order recurrence.
2329    if (SinkCandidate == Previous)
2330      return false;
2331
    // Header phis never sink; already-seen or already-dominated candidates
    // need no sinking.
2332    if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
2333        !Seen.insert(SinkCandidate).second ||
2334        VPDT.properlyDominates(Previous, SinkCandidate))
2335      return true;
2336
2337    if (cannotHoistOrSinkRecipe(*SinkCandidate))
2338      return false;
2339
2340    WorkList.push_back(SinkCandidate);
2341    return true;
2342  };
2343
2344  // Recursively sink users of FOR after Previous.
2345  WorkList.push_back(FOR);
2346  for (unsigned I = 0; I != WorkList.size(); ++I) {
2347    VPRecipeBase *Current = WorkList[I];
2348    assert(Current->getNumDefinedValues() == 1 &&
2349           "only recipes with a single defined value expected");
2350
2351    for (VPUser *User : Current->getVPSingleValue()->users()) {
2352      if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
2353        return false;
2354    }
2355  }
2356
2357  // Keep recipes to sink ordered by dominance so earlier instructions are
2358  // processed first.
2359  sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2360    return VPDT.properlyDominates(A, B);
2361  });
2362
  // Move each candidate directly after the last moved one, preserving their
  // relative (dominance) order.
2363  for (VPRecipeBase *SinkCandidate : WorkList) {
2364    if (SinkCandidate == FOR)
2365      continue;
2366
2367    SinkCandidate->moveAfter(Previous);
2368    Previous = SinkCandidate;
2369  }
2370  return true;
2371}
2372
2373/// Try to hoist \p Previous and its operands before all users of \p FOR.
// Returns true on success; used as the fallback when sinking the users after
// Previous is not possible.
2375                                        VPRecipeBase *Previous,
2376                                        VPDominatorTree &VPDT) {
2377  if (cannotHoistOrSinkRecipe(*Previous))
2378    return false;
2379
2380  // Collect recipes that need hoisting.
2381  SmallVector<VPRecipeBase *> HoistCandidates;
2383  VPRecipeBase *HoistPoint = nullptr;
2384  // Find the closest hoist point by looking at all users of FOR and selecting
2385  // the recipe dominating all other users.
2386  for (VPUser *U : FOR->users()) {
2387    auto *R = cast<VPRecipeBase>(U);
2388    if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
2389      HoistPoint = R;
2390  }
2391  assert(all_of(FOR->users(),
2392                [&VPDT, HoistPoint](VPUser *U) {
2393                  auto *R = cast<VPRecipeBase>(U);
2394                  return HoistPoint == R ||
2395                         VPDT.properlyDominates(HoistPoint, R);
2396                }) &&
2397         "HoistPoint must dominate all users of FOR");
2398
  // Returns the defining recipe if \p HoistCandidateV still needs hoisting
  // above HoistPoint, or nullptr if no hoisting is required.
2399  auto NeedsHoisting = [HoistPoint, &VPDT,
2400                        &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2401    VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2402    if (!HoistCandidate)
2403      return nullptr;
2404    VPRegionBlock *EnclosingLoopRegion =
2405        HoistCandidate->getParent()->getEnclosingLoopRegion();
2406    assert((!HoistCandidate->getRegion() ||
2407            HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2408           "CFG in VPlan should still be flat, without replicate regions");
2409    // Hoist candidate was already visited, no need to hoist.
2410    if (!Visited.insert(HoistCandidate).second)
2411      return nullptr;
2412
2413    // Candidate is outside loop region or a header phi, dominates FOR users w/o
2414    // hoisting.
2415    if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
2416      return nullptr;
2417
2418    // If we reached a recipe that dominates HoistPoint, we don't need to
2419    // hoist the recipe.
2420    if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
2421      return nullptr;
2422    return HoistCandidate;
2423  };
2424
2425  if (!NeedsHoisting(Previous->getVPSingleValue()))
2426    return true;
2427
2428  // Recursively try to hoist Previous and its operands before all users of FOR.
2429  HoistCandidates.push_back(Previous);
2430
2431  for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2432    VPRecipeBase *Current = HoistCandidates[I];
2433    assert(Current->getNumDefinedValues() == 1 &&
2434           "only recipes with a single defined value expected");
2435    if (cannotHoistOrSinkRecipe(*Current))
2436      return false;
2437
2438    for (VPValue *Op : Current->operands()) {
2439      // If we reach FOR, it means the original Previous depends on some other
2440      // recurrence that in turn depends on FOR. If that is the case, we would
2441      // also need to hoist recipes involving the other FOR, which may break
2442      // dependencies.
2443      if (Op == FOR)
2444        return false;
2445
2446      if (auto *R = NeedsHoisting(Op)) {
2447        // Bail out if the recipe defines multiple values.
2448        // TODO: Hoisting such recipes requires additional handling.
2449        if (R->getNumDefinedValues() != 1)
2450          return false;
2451        HoistCandidates.push_back(R);
2452      }
2453    }
2454  }
2455
2456  // Order recipes to hoist by dominance so earlier instructions are processed
2457  // first.
2458  sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2459    return VPDT.properlyDominates(A, B);
2460  });
2461
2462  for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2463    HoistCandidate->moveBefore(*HoistPoint->getParent(),
2464                               HoistPoint->getIterator());
2465  }
2466
2467  return true;
2468}
2469
// Adjust fixed-order recurrence phis: sink users after (or hoist the previous
// value before) the recurrence, then materialize the first-order-recurrence
// splice. The signature's first line is elided in this extract; returns false
// if a recurrence cannot be rearranged.
2471                                                  VPBuilder &LoopBuilder) {
2472  VPDominatorTree VPDT(Plan);
2473  VPTypeAnalysis TypeInfo(Plan);
2474
  // Collect all fixed-order recurrence phis first; the loop below mutates the
  // plan.
2476  for (VPRecipeBase &R :
2479      RecurrencePhis.push_back(FOR);
2480
2481  for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2483    VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2484    // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2485    // to terminate.
2486    while (auto *PrevPhi =
2488      assert(PrevPhi->getParent() == FOR->getParent());
2489      assert(SeenPhis.insert(PrevPhi).second);
2490      Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2491    }
2492
2493    if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2494        !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2495      return false;
2496
2497    // Introduce a recipe to combine the incoming and previous values of a
2498    // fixed-order recurrence.
2499    VPBasicBlock *InsertBlock =
2500        Previous ? Previous->getParent() : FOR->getParent();
2501    if (!Previous || isa<VPHeaderPHIRecipe>(Previous))
2502      LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2503    else
2504      LoopBuilder.setInsertPoint(InsertBlock,
2505                                 std::next(Previous->getIterator()));
2506
2507    auto *RecurSplice =
2509                                 {FOR, FOR->getBackedgeValue()});
2510
2511    FOR->replaceAllUsesWith(RecurSplice);
2512    // Set the first operand of RecurSplice to FOR again, after replacing
2513    // all users.
2514    RecurSplice->setOperand(0, FOR);
2515
2516    // Check for users extracting at the penultimate active lane of the FOR.
2517    // If only a single lane is active in the current iteration, we need to
2518    // select the last element from the previous iteration (from the FOR phi
2519    // directly).
2520    for (VPUser *U : RecurSplice->users()) {
2522                        m_Specific(RecurSplice))))
2523        continue;
2524
      // NOTE(review): the builder 'B' construction (orig line 2525) is elided
      // in this extract.
2526      VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
2527      Type *Ty = TypeInfo.inferScalarType(LastActiveLane);
2528      VPValue *Zero = Plan.getConstantInt(Ty, 0);
2529      VPValue *One = Plan.getConstantInt(Ty, 1);
      // Penultimate lane of this iteration vs. last lane of the previous
      // iteration, selected on whether any lane before the first is needed.
2530      VPValue *PenultimateIndex = B.createSub(LastActiveLane, One);
2531      VPValue *PenultimateLastIter =
2532          B.createNaryOp(VPInstruction::ExtractLane,
2533                         {PenultimateIndex, FOR->getBackedgeValue()});
2534      VPValue *LastPrevIter =
2535          B.createNaryOp(VPInstruction::ExtractLastLane, FOR);
2536
2537      VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero);
2538      VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter);
2539      cast<VPInstruction>(U)->replaceAllUsesWith(Sel);
2540    }
2541  }
2542  return true;
2543}
2544
// Drop poison-generating flags from users of add/mul/sub (and, presumably,
// further kinds elided at orig line 2553) reduction phis, since reductions
// may be reordered by vectorization. The function signature (orig line 2545)
// is elided in this extract.
2546  for (VPRecipeBase &R :
2548    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2549    if (!PhiR)
2550      continue;
2551    RecurKind RK = PhiR->getRecurrenceKind();
2552    if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2554      continue;
2555
    // Recursively strip nuw/nsw-style flags from the whole reduction chain.
2556    for (VPUser *U : collectUsersRecursively(PhiR))
2557      if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2558        RecWithFlags->dropPoisonGeneratingFlags();
2559      }
2560  }
2561}
2562
2563namespace {
// DenseMapInfo specialization used by cse() below: hashes and compares
// VPSingleDefRecipes structurally so equivalent recipes map to one entry.
2564struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
  // Sentinel keys used internally by DenseMap; never dereferenced.
2565  static bool isSentinel(const VPSingleDefRecipe *Def) {
2566    return Def == getEmptyKey() || Def == getTombstoneKey();
2567  }
2568
2569  /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2570  /// return that source element type.
2571  static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2572    // All VPInstructions that lower to GEPs must have the i8 source element
2573    // type (as they are PtrAdds), so we omit it.
    // NOTE(review): the TypeSwitch head (orig line 2574) is elided in this
    // extract.
2575        .Case([](const VPReplicateRecipe *I) -> Type * {
2576          if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2577            return GEP->getSourceElementType();
2578          return nullptr;
2579        })
2580        .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2581            [](auto *I) { return I->getSourceElementType(); })
2582        .Default([](auto *) { return nullptr; });
2583  }
2584
2585  /// Returns true if recipe \p Def can be safely handed for CSE.
2586  static bool canHandle(const VPSingleDefRecipe *Def) {
2587    // We can extend the list of handled recipes in the future,
2588    // provided we account for the data embedded in them while checking for
2589    // equality or hashing.
2590    auto C = getOpcodeOrIntrinsicID(Def);
2591
2592    // The issue with (Insert|Extract)Value is that the index of the
2593    // insert/extract is not a proper operand in LLVM IR, and hence also not in
2594    // VPlan.
2595    if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2596                             C->second == Instruction::ExtractValue)))
2597      return false;
2598
2599    // During CSE, we can only handle recipes that don't read from memory: if
2600    // they read from memory, there could be an intervening write to memory
2601    // before the next instance is CSE'd, leading to an incorrect result.
2602    return !Def->mayReadFromMemory();
2603  }
2604
2605  /// Hash the underlying data of \p Def.
2606  static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2607    const VPlan *Plan = Def->getParent()->getPlan();
2608    VPTypeAnalysis TypeInfo(*Plan);
    // Hash structural identity: recipe kind, opcode/intrinsic, GEP source
    // type, inferred scalar type, and (elided at orig line 2612, presumably
    // the operands).
2609    hash_code Result = hash_combine(
2610        Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2611        getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2613    if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2614      if (RFlags->hasPredicate())
2615        return hash_combine(Result, RFlags->getPredicate());
2616    return Result;
2617  }
2618
2619  /// Check equality of underlying data of \p L and \p R.
2620  static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2621    if (isSentinel(L) || isSentinel(R))
2622      return L == R;
2623    if (L->getVPRecipeID() != R->getVPRecipeID() ||
2625        getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2627        !equal(L->operands(), R->operands()))
2628      return false;
2630           "must have valid opcode info for both recipes");
    // Predicated recipes must agree on the compare predicate too.
2631    if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2632      if (LFlags->hasPredicate() &&
2633          LFlags->getPredicate() !=
2634              cast<VPRecipeWithIRFlags>(R)->getPredicate())
2635        return false;
2636    // Recipes in replicate regions implicitly depend on predicate. If either
2637    // recipe is in a replicate region, only consider them equal if both have
2638    // the same parent.
2639    const VPRegionBlock *RegionL = L->getRegion();
2640    const VPRegionBlock *RegionR = R->getRegion();
2641    if (((RegionL && RegionL->isReplicator()) ||
2642         (RegionR && RegionR->isReplicator())) &&
2643        L->getParent() != R->getParent())
2644      return false;
2645    const VPlan *Plan = L->getParent()->getPlan();
2646    VPTypeAnalysis TypeInfo(*Plan);
2647    return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2648  }
2649};
2650} // end anonymous namespace
2651
2652/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2653/// Plan.
// NOTE(review): the function signature (orig line 2654) is elided in this
// extract; the CSE map is keyed via VPCSEDenseMapInfo above.
2655  VPDominatorTree VPDT(Plan);
2657
2659                                                            Plan.getEntry());
2661    for (VPRecipeBase &R : *VPBB) {
2662      auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2663      if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2664        continue;
2665      if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2666        // V must dominate Def for a valid replacement.
2667        if (!VPDT.dominates(V->getParent(), VPBB))
2668          continue;
2669        // Only keep flags present on both V and Def.
2670        if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2671          RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2672        Def->replaceAllUsesWith(V);
2673        continue;
2674      }
      // First occurrence: remember it as the canonical recipe.
2675      CSEMap[Def] = Def;
2676    }
2677  }
2678}
2679
2680/// Move loop-invariant recipes out of the vector loop region in \p Plan.
2681static void licm(VPlan &Plan) {
2682  VPBasicBlock *Preheader = Plan.getVectorPreheader();
2683
2684  // Hoist any loop invariant recipes from the vector loop region to the
2685  // preheader. Perform a shallow traversal of the vector loop region, to
2686  // exclude recipes in replicate regions. Since the top-level blocks in the
2687  // vector loop region are guaranteed to execute if the vector pre-header is,
2688  // we don't need to check speculation safety.
2689  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2690  assert(Preheader->getSingleSuccessor() == LoopRegion &&
2691         "Expected vector prehader's successor to be the vector loop region");
2693           vp_depth_first_shallow(LoopRegion->getEntry()))) {
2694    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2696        continue;
      // A recipe is only hoistable if every operand is defined outside all
      // loop regions; otherwise it must stay in the loop.
2697      if (any_of(R.operands(), [](VPValue *Op) {
2698            return !Op->isDefinedOutsideLoopRegions();
2699          }))
2700        continue;
2701      R.moveBefore(*Preheader, Preheader->end());
2702    }
2703  }
2704
2705#ifndef NDEBUG
2706  VPDominatorTree VPDT(Plan);
2707#endif
2708  // Sink recipes with no users inside the vector loop region if all users are
2709  // in the same exit block of the region.
2710  // TODO: Extend to sink recipes from inner loops.
2712      LoopRegion->getEntry());
2714    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
2716        continue;
2717
2718      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
2719        assert(!RepR->isPredicated() &&
2720               "Expected prior transformation of predicated replicates to "
2721               "replicate regions");
2722        // narrowToSingleScalarRecipes should have already maximally narrowed
2723        // replicates to single-scalar replicates.
2724        // TODO: When unrolling, replicateByVF doesn't handle sunk
2725        // non-single-scalar replicates correctly.
2726        if (!RepR->isSingleScalar())
2727          continue;
2728      }
2729
2730      // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2731      // support recipes with multiple defined values (e.g., interleaved loads).
2732      auto *Def = cast<VPSingleDefRecipe>(&R);
2733      // Skip recipes without users as we cannot determine a sink block.
2734      // TODO: Clone sinkable recipes without users to all exit blocks to reduce
2735      // their execution frequency.
2736      if (Def->getNumUsers() == 0)
2737        continue;
2738
2739      VPBasicBlock *SinkBB = nullptr;
2740      // Cannot sink the recipe if any user
2741      // * is defined in any loop region, or
2742      // * is a phi, or
2743      // * multiple users in different blocks.
      // Note the lambda also computes SinkBB as a side effect: the unique
      // block containing all users, when such a block exists.
2744      if (any_of(Def->users(), [&SinkBB](VPUser *U) {
2745            auto *UserR = cast<VPRecipeBase>(U);
2746            VPBasicBlock *Parent = UserR->getParent();
2747            // TODO: If the user is a PHI node, we should check the block of
2748            // incoming value. Support PHI node users if needed.
2749            if (UserR->isPhi() || Parent->getEnclosingLoopRegion())
2750              return true;
2751            // TODO: Support sinking when users are in multiple blocks.
2752            if (SinkBB && SinkBB != Parent)
2753              return true;
2754            SinkBB = Parent;
2755            return false;
2756          }))
2757        continue;
2758
2759      // Only sink to dedicated exit blocks of the loop region.
2760      if (SinkBB->getSinglePredecessor() != LoopRegion)
2761        continue;
2762
2763      // TODO: This will need to be a check instead of an assert after
2764      // conditional branches in vectorized loops are supported.
2765      assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2766             "Defining block must dominate sink block");
2767      // TODO: Clone the recipe if users are on multiple exit paths, instead of
2768      // just moving.
2769      Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
2770    }
2771  }
2772}
2773
2775    VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2776  if (Plan.hasScalarVFOnly())
2777    return;
2778  // Keep track of created truncates, so they can be re-used. Note that we
2779  // cannot use RAUW after creating a new truncate, as this could make
2780  // other uses have different types for their operands, making them invalidly
2781  // typed.
2783  VPTypeAnalysis TypeInfo(Plan);
2784  VPBasicBlock *PH = Plan.getVectorPreheader();
2787    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2790        continue;
2791
2792      VPValue *ResultVPV = R.getVPSingleValue();
2793      auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
      // MinBWs maps an ingredient to the minimal bit width it can be narrowed
      // to; a zero lookup result means no narrowing was requested for it.
2794      unsigned NewResSizeInBits = MinBWs.lookup(UI);
2795      if (!NewResSizeInBits)
2796        continue;
2797
2798      // If the value wasn't vectorized, we must maintain the original scalar
2799      // type. Skip those here, after incrementing NumProcessedRecipes. Also
2800      // skip casts which do not need to be handled explicitly here, as
2801      // redundant casts will be removed during recipe simplification.
2803        continue;
2804
2805      Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2806      unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2807      assert(OldResTy->isIntegerTy() && "only integer types supported");
2808      (void)OldResSizeInBits;
2809
2810      auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2811
2812      // Any wrapping introduced by shrinking this operation shouldn't be
2813      // considered undefined behavior. So, we can't unconditionally copy
2814      // arithmetic wrapping flags to VPW.
2815      if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2816        VPW->dropPoisonGeneratingFlags();
2817
2818      if (OldResSizeInBits != NewResSizeInBits &&
2819          !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2820        // Extend result to original width.
2821        auto *Ext = new VPWidenCastRecipe(
2822            Instruction::ZExt, ResultVPV, OldResTy, nullptr,
2823            VPIRFlags::getDefaultFlags(Instruction::ZExt));
2824        Ext->insertAfter(&R);
        // RAUW first, then re-point the extend at the (narrowed) result so the
        // extend itself does not end up using its own result.
2825        ResultVPV->replaceAllUsesWith(Ext);
2826        Ext->setOperand(0, ResultVPV);
2827        assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2828      } else {
2829        assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2830               "Only ICmps should not need extending the result.");
2831      }
2832
2833      assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2835        continue;
2836
2837      // Shrink operands by introducing truncates as needed.
      // For selects, operand 0 is the (boolean) condition and must keep its
      // type, so start shrinking at operand 1.
2838      unsigned StartIdx =
2839          match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2840      for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2841        auto *Op = R.getOperand(Idx);
2842        unsigned OpSizeInBits =
2844        if (OpSizeInBits == NewResSizeInBits)
2845          continue;
2846        assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2847        auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2848        if (!IterIsEmpty) {
2849          R.setOperand(Idx, ProcessedIter->second);
2850          continue;
2851        }
2852
        // Truncate live-ins once in the preheader; loop-defined values are
        // truncated right before their user.
2853        VPBuilder Builder;
2854        if (isa<VPIRValue>(Op))
2855          Builder.setInsertPoint(PH);
2856        else
2857          Builder.setInsertPoint(&R);
2858        VPWidenCastRecipe *NewOp =
2859            Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2860        ProcessedIter->second = NewOp;
2861        R.setOperand(Idx, NewOp);
2862      }
2864    }
2865  }
2866}
2867
/// Remove BranchOnCond terminators whose condition is a known constant,
/// disconnecting the unreachable successor edge and erasing the branch. When
/// \p OnlyLatches is set, only latch terminators are considered (a dominator
/// tree is built to identify latches).
2868void VPlanTransforms::removeBranchOnConst(VPlan &Plan, bool OnlyLatches) {
2869  std::optional<VPDominatorTree> VPDT;
2870  if (OnlyLatches)
2871    VPDT.emplace(Plan);
2872
2875    VPValue *Cond;
2876    // Skip blocks that are not terminated by BranchOnCond.
2877    if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2878      continue;
2879
2880    if (OnlyLatches && !VPBlockUtils::isLatch(VPBB, *VPDT))
2881      continue;
2882
2883    assert(VPBB->getNumSuccessors() == 2 &&
2884           "Two successors expected for BranchOnCond");
    // With a constant condition exactly one successor is reachable; pick the
    // dead one (a non-constant condition leaves nothing to remove).
2885    unsigned RemovedIdx;
2886    if (match(Cond, m_True()))
2887      RemovedIdx = 1;
2888    else if (match(Cond, m_False()))
2889      RemovedIdx = 0;
2890    else
2891      continue;
2892
2893    VPBasicBlock *RemovedSucc =
2894        cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2895    assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2896           "There must be a single edge between VPBB and its successor");
2897    // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2898    // these recipes.
2899    for (VPRecipeBase &R : RemovedSucc->phis())
2900      cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2901
2902    // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2903    // automatically on VPlan destruction if it becomes unreachable.
2904    VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2905    VPBB->back().eraseFromParent();
2906  }
2907}
2908
2930
2931// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2932// the loop terminator with a branch-on-cond recipe with the negated
2933// active-lane-mask as operand. Note that this turns the loop into an
2934// uncountable one. Only the existing terminator is replaced, all other existing
2935// recipes/users remain unchanged, except for poison-generating flags being
2936// dropped from the canonical IV increment. Return the created
2937// VPActiveLaneMaskPHIRecipe.
2938//
2939// The function adds the following recipes:
2940//
2941// vector.ph:
2942//   %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
2943//   %EntryALM = active-lane-mask %EntryInc, TC
2944//
2945// vector.body:
2946//   ...
2947//   %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2948//   ...
2949//   %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
2950//   %ALM = active-lane-mask %InLoopInc, TC
2951//   %Negated = Not %ALM
2952//   branch-on-cond %Negated
2953//
2956  VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2957  VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2958  auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2959  VPValue *StartV = CanonicalIVPHI->getStartValue();
2960
2961  auto *CanonicalIVIncrement =
2962      cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2963  // TODO: Check if dropping the flags is needed.
2964  CanonicalIVIncrement->dropPoisonGeneratingFlags();
2965  DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2966  // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2967  // we have to take unrolling into account. Each part needs to start at
2968  //   Part * VF
2969  auto *VecPreheader = Plan.getVectorPreheader();
2970  VPBuilder Builder(VecPreheader);
2971
2972  // Create the ActiveLaneMask instruction using the correct start values.
2973  VPValue *TC = Plan.getTripCount();
2974  VPValue *VF = &Plan.getVF();
2975
2976  auto *EntryIncrement = Builder.createOverflowingOp(
2977      VPInstruction::CanonicalIVIncrementForPart, {StartV, VF}, {false, false},
2978      DL, "index.part.next");
2979
2980  // Create the active lane mask instruction in the VPlan preheader.
2981  VPValue *ALMMultiplier =
2982      Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2983  auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2984                                        {EntryIncrement, TC, ALMMultiplier}, DL,
2985                                        "active.lane.mask.entry");
2986
2987  // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2988  // preheader ActiveLaneMask instruction.
2989  auto *LaneMaskPhi =
2991  LaneMaskPhi->insertAfter(CanonicalIVPHI);
2992
2993  // Create the active lane mask for the next iteration of the loop before the
2994  // original terminator.
2995  VPRecipeBase *OriginalTerminator = EB->getTerminator();
2996  Builder.setInsertPoint(OriginalTerminator);
2997  auto *InLoopIncrement = Builder.createOverflowingOp(
2999      {CanonicalIVIncrement, &Plan.getVF()}, {false, false}, DL);
3000  auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
3001                                   {InLoopIncrement, TC, ALMMultiplier}, DL,
3002                                   "active.lane.mask.next");
  // Complete the phi by adding the backedge value computed in the loop.
3003  LaneMaskPhi->addOperand(ALM);
3004
3005  // Replace the original terminator with BranchOnCond. We have to invert the
3006  // mask here because a true condition means jumping to the exit block.
3007  auto *NotMask = Builder.createNot(ALM, DL);
3008  Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
3009  OriginalTerminator->eraseFromParent();
3010  return LaneMaskPhi;
3011}
3012
3014                                     bool UseActiveLaneMaskForControlFlow) {
3015  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3016  auto *FoundWidenCanonicalIVUser = find_if(
3018  assert(FoundWidenCanonicalIVUser &&
3019         "Must have widened canonical IV when tail folding!");
3020  VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
3021  auto *WideCanonicalIV =
3022      cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
3023  VPSingleDefRecipe *LaneMask;
  // Two modes: either the active-lane-mask also drives the exit branch (phi +
  // rewritten terminator), or it is only used as a data mask inserted right
  // after the widened canonical IV.
3024  if (UseActiveLaneMaskForControlFlow) {
3025    LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
3026  } else {
3027    VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
3028    VPValue *ALMMultiplier =
3029        Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
3030    LaneMask =
3031        B.createNaryOp(VPInstruction::ActiveLaneMask,
3032                       {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
3033                       nullptr, "active.lane.mask");
3034  }
3035
3036  // Walk users of WideCanonicalIV and replace the header mask of the form
3037  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
3038  // removing the old one to ensure there is always only a single header mask.
3039  HeaderMask->replaceAllUsesWith(LaneMask);
3040  HeaderMask->eraseFromParent();
3041}
3042
/// Matcher that strips a known mask \p In from a mask expression: matches
/// either \p In itself (then Out is set to nullptr) or (logical-and In, Out)
/// (then Out is set to the remaining mask operand).
3043template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
3044  Op0_t In;
3046
3047  RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
3048
  // Returns true on a match; Out reports the residual mask (nullptr when the
  // whole expression was exactly In).
3049  template <typename OpTy> bool match(OpTy *V) const {
3050    if (m_Specific(In).match(V)) {
3051      Out = nullptr;
3052      return true;
3053    }
3054    return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
3055  }
3056};
3057
3058/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
3059/// Returns the remaining part \p Out if so, or nullptr otherwise.
3060template <typename Op0_t, typename Op1_t>
3061static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
3062 Op1_t &Out) {
3063 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
3064}
3065
3066/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
3067/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
3068/// recipe could be created.
3069/// \p HeaderMask Header Mask.
3070/// \p CurRecipe Recipe to be transformed.
3071/// \p TypeInfo VPlan-based type analysis.
3072/// \p EVL The explicit vector length parameter of vector-predication
3073/// intrinsics.
3075                                       VPRecipeBase &CurRecipe,
3076                                       VPTypeAnalysis &TypeInfo, VPValue &EVL) {
3077  VPlan *Plan = CurRecipe.getParent()->getPlan();
3078  DebugLoc DL = CurRecipe.getDebugLoc();
3079  VPValue *Addr, *Mask, *EndPtr;
3080
3081  /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
3082  auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
3083    auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
3084    EVLEndPtr->insertBefore(&CurRecipe);
3085    EVLEndPtr->setOperand(1, &EVL);
3086    return EVLEndPtr;
3087  };
3088
  // Non-reverse masked load -> EVL load with the header mask stripped.
3089  if (match(&CurRecipe,
3090            m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
3091      !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
3092    return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
3093                                    EVL, Mask);
3094
  // Reversed masked load: emit an EVL load through the EVL-adjusted end
  // pointer, then reverse only the EVL lanes via vp.reverse.
3095  VPValue *ReversedVal;
3096  if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
3097      match(ReversedVal,
3098            m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
3099      match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
3100      cast<VPWidenLoadRecipe>(ReversedVal)->isReverse()) {
3101    auto *LoadR = new VPWidenLoadEVLRecipe(
3102        *cast<VPWidenLoadRecipe>(ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask);
3103    LoadR->insertBefore(&CurRecipe);
3104    return new VPWidenIntrinsicRecipe(
3105        Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
3106        TypeInfo.inferScalarType(LoadR), {}, {}, DL);
3107  }
3108
3109  VPValue *StoredVal;
3110  if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
3111                                      m_RemoveMask(HeaderMask, Mask))) &&
3112      !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
3113    return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3114                                     StoredVal, EVL, Mask);
3115
  // Reversed masked store: mirror of the load case above — reverse the stored
  // value with vp.reverse, then store through the EVL-adjusted end pointer.
3116  if (match(&CurRecipe,
3117            m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
3118                          m_RemoveMask(HeaderMask, Mask))) &&
3119      match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
3120      cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) {
3121    auto *NewReverse = new VPWidenIntrinsicRecipe(
3122        Intrinsic::experimental_vp_reverse,
3123        {ReversedVal, Plan->getTrue(), &EVL},
3124        TypeInfo.inferScalarType(ReversedVal), {}, {}, DL);
3125    NewReverse->insertBefore(&CurRecipe);
3126    return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
3127                                     AdjustEndPtr(EndPtr), NewReverse, EVL,
3128                                     Mask);
3129  }
3130
3131  if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
3132    if (Rdx->isConditional() &&
3133        match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
3134      return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
3135
3136  if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
3137    if (Interleave->getMask() &&
3138        match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
3139      return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
3140
  // Selects on the header mask (or header mask AND'ed with another mask)
  // become vp.merge with the residual mask (all-true if none).
3141  VPValue *LHS, *RHS;
3142  if (match(&CurRecipe,
3143            m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
3144    return new VPWidenIntrinsicRecipe(
3145        Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
3146        TypeInfo.inferScalarType(LHS), {}, {}, DL);
3147
3148  if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
3149                                 m_VPValue(RHS))))
3150    return new VPWidenIntrinsicRecipe(
3151        Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
3152        TypeInfo.inferScalarType(LHS), {}, {}, DL);
3153
  // The last active lane of the header mask is simply EVL - 1.
3154  if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
3155    Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
3156    VPValue *ZExt = VPBuilder(&CurRecipe)
3158                        &EVL, Ty, TypeInfo.inferScalarType(&EVL), DL);
3159    return new VPInstruction(
3160        Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, 1)},
3161        VPIRFlags::getDefaultFlags(Instruction::Sub), {}, DL);
3162  }
3163
3164  return nullptr;
3165}
3166
3167/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3168/// The transforms here need to preserve the original semantics.
3170  // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3171  VPValue *HeaderMask = nullptr, *EVL = nullptr;
3174                   m_VPValue(EVL))) &&
3175        match(EVL, m_EVL(m_VPValue()))) {
3176      HeaderMask = R.getVPSingleValue();
3177      break;
3178    }
3179  }
3180  if (!HeaderMask)
3181    return;
3182
  // First pass: rewrite each recipe masked by the header mask into its
  // EVL-based equivalent where possible.
3183  VPTypeAnalysis TypeInfo(Plan);
3184  SmallVector<VPRecipeBase *> OldRecipes;
3185  for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3187    if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, TypeInfo, *EVL)) {
3188      NewR->insertBefore(R);
3189      for (auto [Old, New] :
3190           zip_equal(R->definedValues(), NewR->definedValues()))
3191        Old->replaceAllUsesWith(New);
3192      OldRecipes.push_back(R);
3193    }
3194  }
3195
3196  // Replace remaining (HeaderMask && Mask) with vp.merge (True, Mask,
3197  // False, EVL)
3198  for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3199    VPValue *Mask;
3200    if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) {
3201      auto *LogicalAnd = cast<VPInstruction>(U);
3202      auto *Merge = new VPWidenIntrinsicRecipe(
3203          Intrinsic::vp_merge, {Plan.getTrue(), Mask, Plan.getFalse(), EVL},
3204          TypeInfo.inferScalarType(Mask), {}, {}, LogicalAnd->getDebugLoc());
3205      Merge->insertBefore(LogicalAnd);
3206      LogicalAnd->replaceAllUsesWith(Merge);
3207      OldRecipes.push_back(LogicalAnd);
3208    }
3209  }
3210
3211  // Erase old recipes at the end so we don't invalidate TypeInfo.
3212  for (VPRecipeBase *R : reverse(OldRecipes)) {
    // Snapshot the operands before erasing R so they can be cleaned up once
    // their last user is gone.
3213    SmallVector<VPValue *> PossiblyDead(R->operands());
3214    R->eraseFromParent();
3215    for (VPValue *Op : PossiblyDead)
3217  }
3218}
3219
3220/// After replacing the canonical IV with a EVL-based IV, fixup recipes that use
3221/// VF to use the EVL instead to avoid incorrect updates on the penultimate
3222/// iteration.
3223static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3224  VPTypeAnalysis TypeInfo(Plan);
3225  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3226  VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3227
3228  assert(all_of(Plan.getVF().users(),
3231         "User of VF that we can't transform to EVL.");
3232  Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3234  });
3235
3236  assert(all_of(Plan.getVFxUF().users(),
3238                        m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3239                                m_Specific(&Plan.getVFxUF())),
3241         "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3242         "increment of the canonical induction.");
3243  Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3244    // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
3245    // canonical induction must not be updated.
3247  });
3248
3249  // Create a scalar phi to track the previous EVL if fixed-order recurrence is
3250  // contained.
3251  bool ContainsFORs =
3253  if (ContainsFORs) {
3254    // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3255    VPValue *MaxEVL = &Plan.getVF();
3256    // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
3257    VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3258    MaxEVL = Builder.createScalarZExtOrTrunc(
3259        MaxEVL, Type::getInt32Ty(Plan.getContext()),
3260        TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
3261
3262    Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3263    VPValue *PrevEVL = Builder.createScalarPhi(
3264        {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3265
    // Rewrite first-order recurrence splices as vp.splice, combining the
    // previous iteration's vector (with its EVL) and the current one.
3268      for (VPRecipeBase &R : *VPBB) {
3269        VPValue *V1, *V2;
3270        if (!match(&R,
3272                       m_VPValue(V1), m_VPValue(V2))))
3273          continue;
3274        VPValue *Imm = Plan.getOrAddLiveIn(
3277            Intrinsic::experimental_vp_splice,
3278            {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3279            TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
3280            R.getDebugLoc());
3281        VPSplice->insertBefore(&R);
3282        R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3283      }
3284    }
3285  }
3286
3287  VPValue *HeaderMask = vputils::findHeaderMask(Plan);
3288  if (!HeaderMask)
3289    return;
3290
3291  // Replace header masks with a mask equivalent to predicating by EVL:
3292  //
3293  // icmp ule widen-canonical-iv backedge-taken-count
3294  // ->
3295  // icmp ult step-vector, EVL
3296  VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3297  VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3298  Type *EVLType = TypeInfo.inferScalarType(&EVL);
3299  VPValue *EVLMask = Builder.createICmp(
3301      Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3302  HeaderMask->replaceAllUsesWith(EVLMask);
3303}
3304
3305/// Converts a tail folded vector loop region to step by
3306/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3307/// iteration.
3308///
3309/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3310/// replaces all uses except the canonical IV increment of
3311/// VPCanonicalIVPHIRecipe with a VPCurrentIterationPHIRecipe.
3312/// VPCanonicalIVPHIRecipe is used only for loop iterations counting after
3313/// this transformation.
3314///
3315/// - The header mask is replaced with a header mask based on the EVL.
3316///
3317/// - Plans with FORs have a new phi added to keep track of the EVL of the
3318/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3319/// @llvm.vp.splice.
3320///
3321/// The function uses the following definitions:
3322/// %StartV is the canonical induction start value.
3323///
3324/// The function adds the following recipes:
3325///
3326/// vector.ph:
3327/// ...
3328///
3329/// vector.body:
3330/// ...
3331/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3332/// [ %NextIter, %vector.body ]
3333/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3334/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3335/// ...
3336/// %OpEVL = cast i32 %VPEVL to IVSize
3337/// %NextIter = add IVSize %OpEVL, %CurrentIter
3338/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3339/// ...
3340///
3341/// If MaxSafeElements is provided, the function adds the following recipes:
3342/// vector.ph:
3343/// ...
3344///
3345/// vector.body:
3346/// ...
3347/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3348/// [ %NextIter, %vector.body ]
3349/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3350/// %cmp = cmp ult %AVL, MaxSafeElements
3351/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3352/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3353/// ...
3354/// %OpEVL = cast i32 %VPEVL to IVSize
3355/// %NextIter = add IVSize %OpEVL, %CurrentIter
3356/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3357/// ...
3358///
3360    VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3361  if (Plan.hasScalarVFOnly())
3362    return;
3363  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3364  VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3365
3366  auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
3367  auto *CanIVTy = LoopRegion->getCanonicalIVType();
3368  VPValue *StartV = CanonicalIVPHI->getStartValue();
3369
3370  // Create the CurrentIteration recipe in the vector loop.
3371  auto *CurrentIteration =
3373  CurrentIteration->insertAfter(CanonicalIVPHI);
3374  VPBuilder Builder(Header, Header->getFirstNonPhi());
3375  // Create the AVL (application vector length), starting from TC -> 0 in steps
3376  // of EVL.
3377  VPPhi *AVLPhi = Builder.createScalarPhi(
3378      {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3379  VPValue *AVL = AVLPhi;
3380
3381  if (MaxSafeElements) {
3382    // Support for MaxSafeDist for correct loop emission.
    // Cap the AVL at the max safe distance: safe_avl = min(avl, MaxSafeElements).
3383    VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3384    VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3385    AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3386                               "safe_avl");
3387  }
3388  auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3389                                     DebugLoc::getUnknown(), "evl");
3390
3391  auto *CanonicalIVIncrement =
3392      cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
3393  Builder.setInsertPoint(CanonicalIVIncrement);
3394  VPValue *OpVPEVL = VPEVL;
3395
  // The EVL is i32; widen/narrow it to the canonical IV type before using it
  // in the IV and AVL updates.
3396  auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3397  OpVPEVL = Builder.createScalarZExtOrTrunc(
3398      OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3399
3400  auto *NextIter = Builder.createAdd(
3401      OpVPEVL, CurrentIteration, CanonicalIVIncrement->getDebugLoc(),
3402      "current.iteration.next", CanonicalIVIncrement->getNoWrapFlags());
3403  CurrentIteration->addOperand(NextIter);
3404
  // avl.next = avl - evl; NUW, as stated by the flags on the subtraction.
3405  VPValue *NextAVL =
3406      Builder.createSub(AVLPhi, OpVPEVL, DebugLoc::getCompilerGenerated(),
3407                        "avl.next", {/*NUW=*/true, /*NSW=*/false});
3408  AVLPhi->addOperand(NextAVL);
3409
3410  fixupVFUsersForEVL(Plan, *VPEVL);
3411  removeDeadRecipes(Plan);
3412
3413  // Replace all uses of VPCanonicalIVPHIRecipe by
3414  // VPCurrentIterationPHIRecipe except for the canonical IV increment.
3415  CanonicalIVPHI->replaceAllUsesWith(CurrentIteration);
3416  CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
3417  // TODO: support unroll factor > 1.
3418  Plan.setUF(1);
3419}
3420
3422  // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
3423  // There should be only one VPCurrentIteration in the entire plan.
3424  VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
3425
3428    for (VPRecipeBase &R : VPBB->phis())
3429      if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
3430        assert(!CurrentIteration &&
3431               "Found multiple CurrentIteration. Only one expected");
3432        CurrentIteration = PhiR;
3433      }
3434
3435  // Early return if it is not variable-length stepping.
3436  if (!CurrentIteration)
3437    return;
3438
3439  VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3440  VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3441
3442  // Convert CurrentIteration to concrete recipe.
3443  auto *ScalarR =
3444      VPBuilder(CurrentIteration)
3446              {CurrentIteration->getStartValue(), CurrentIterationIncr},
3447              CurrentIteration->getDebugLoc(), "current.iteration.iv");
3448  CurrentIteration->replaceAllUsesWith(ScalarR);
3449  CurrentIteration->eraseFromParent();
3450
3451  // Replace CanonicalIVInc with CurrentIteration increment.
  // The canonical IV is now redundant: route users of its increment to the
  // EVL-based increment, then drop both the phi and the increment.
3452  auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3453  VPValue *Backedge = CanonicalIV->getIncomingValue(1);
3454  assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
3455                                 m_Specific(&Plan.getVFxUF()))) &&
3456         "Unexpected canonical iv");
3457  Backedge->replaceAllUsesWith(CurrentIterationIncr);
3458
3459  // Remove unused phi and increment.
3460  VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
3461  CanonicalIVIncrement->eraseFromParent();
3462  CanonicalIV->eraseFromParent();
3463}
3464
3466  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3467  // The canonical IV may not exist at this stage.
3468  if (!LoopRegion ||
3470    return;
3471  VPCanonicalIVPHIRecipe *CanIV = LoopRegion->getCanonicalIV();
3472  if (std::next(CanIV->getIterator()) == CanIV->getParent()->end())
3473    return;
3474  // The EVL IV is always immediately after the canonical IV.
3476      std::next(CanIV->getIterator()));
3477  if (!EVLPhi)
3478    return;
3479
3480  // Bail if not an EVL tail folded loop.
3481  VPValue *AVL;
3482  if (!match(EVLPhi->getBackedgeValue(),
3483             m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))))
3484    return;
3485
3486  // The AVL may be capped to a safe distance.
3487  VPValue *SafeAVL, *UnsafeAVL;
3488  if (match(AVL,
3490                      m_VPValue(SafeAVL)),
3491             m_Deferred(UnsafeAVL), m_Deferred(SafeAVL))))
3492    AVL = UnsafeAVL;
3493
3494  VPValue *AVLNext;
3495  [[maybe_unused]] bool FoundAVLNext =
3497                m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3498  assert(FoundAVLNext && "Didn't find AVL backedge?");
3499
3500  VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3501  auto *LatchBr = cast<VPInstruction>(Latch->getTerminator());
  // A constant-true latch branch means a single-iteration loop; nothing to do.
3502  if (match(LatchBr, m_BranchOnCond(m_True())))
3503    return;
3504
3505  assert(
3506      match(LatchBr,
3509                     m_Specific(&Plan.getVectorTripCount())))) &&
3510      "Expected BranchOnCond with ICmp comparing CanIV increment with vector "
3511      "trip count");
3512
  // Rewrite the exit test in terms of the AVL: the loop is done exactly when
  // the remaining AVL reaches zero.
3513  Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3514  VPBuilder Builder(LatchBr);
3515  LatchBr->setOperand(
3516      0, Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, Plan.getZero(AVLTy)));
3517}
3518
3520    VPlan &Plan, PredicatedScalarEvolution &PSE,
3521    const DenseMap<Value *, const SCEV *> &StridesMap) {
3522  // Replace VPValues for known constant strides guaranteed by predicate scalar
3523  // evolution.
  // Versioned strides may only be used where the stride predicate is known to
  // hold: inside the vector loop region or in the block directly before it.
3524  auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3525    auto *R = cast<VPRecipeBase>(&U);
3526    return R->getRegion() ||
3527           R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3528  };
3529  ValueToSCEVMapTy RewriteMap;
3530  for (const SCEV *Stride : StridesMap.values()) {
3531    using namespace SCEVPatternMatch;
3532    auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3533    const APInt *StrideConst;
3534    if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3535      // Only handle constant strides for now.
3536      continue;
3537
3538    auto *CI = Plan.getConstantInt(*StrideConst);
3539    if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3540      StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3541
3542    // The versioned value may not be used in the loop directly but through a
3543    // sext/zext. Add new live-ins in those cases.
3544    for (Value *U : StrideV->users()) {
3546        continue;
3547      VPValue *StrideVPV = Plan.getLiveIn(U);
3548      if (!StrideVPV)
3549        continue;
      // Fold the extension into the constant at the extended bit width.
3550      unsigned BW = U->getType()->getScalarSizeInBits();
3551      APInt C =
3552          isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3553      VPValue *CI = Plan.getConstantInt(C);
3554      StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3555    }
3556    RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3557  }
3558
  // Also rewrite SCEV expansions in the entry block that reference versioned
  // strides, updating the trip count expression if needed.
3559  for (VPRecipeBase &R : *Plan.getEntry()) {
3560    auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3561    if (!ExpSCEV)
3562      continue;
3563    const SCEV *ScevExpr = ExpSCEV->getSCEV();
3564    auto *NewSCEV =
3565        SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3566    if (NewSCEV != ScevExpr) {
3567      VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3568      ExpSCEV->replaceAllUsesWith(NewExp);
3569      if (Plan.getTripCount() == ExpSCEV)
3570        Plan.resetTripCount(NewExp);
3571    }
3572  }
3573}
3574
// NOTE(review): the function's defining line is not visible in this excerpt;
// from the parameters and body, this walks the backward slices of address
// computations feeding consecutive widened loads/stores (and predicated
// interleave groups) and drops poison-generating flags from the recipes found,
// so masked-off lanes cannot introduce poison after vectorization.
3576 VPlan &Plan,
3577 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3578 // Collect recipes in the backward slice of `Root` that may generate a poison
3579 // value that is used after vectorization.
3581 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3583 Worklist.push_back(Root);
3584
3585 // Traverse the backward slice of Root through its use-def chain.
3586 while (!Worklist.empty()) {
3587 VPRecipeBase *CurRec = Worklist.pop_back_val();
3588
// Visited set guards against re-processing shared sub-expressions and
// cycles through header phis.
3589 if (!Visited.insert(CurRec).second)
3590 continue;
3591
3592 // Prune search if we find another recipe generating a widen memory
3593 // instruction. Widen memory instructions involved in address computation
3594 // will lead to gather/scatter instructions, which don't need to be
3595 // handled.
3597 VPHeaderPHIRecipe>(CurRec))
3598 continue;
3599
3600 // This recipe contributes to the address computation of a widen
3601 // load/store. If the underlying instruction has poison-generating flags,
3602 // drop them directly.
3603 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3604 VPValue *A, *B;
3605 // Dropping disjoint from an OR may yield incorrect results, as some
3606 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3607 // for dependence analysis). Instead, replace it with an equivalent Add.
3608 // This is possible as all users of the disjoint OR only access lanes
3609 // where the operands are disjoint or poison otherwise.
3610 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3611 RecWithFlags->isDisjoint()) {
3612 VPBuilder Builder(RecWithFlags);
3613 VPInstruction *New =
3614 Builder.createAdd(A, B, RecWithFlags->getDebugLoc());
// Keep the mapping to the original IR instruction so later passes
// can still consult the underlying value.
3615 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3616 RecWithFlags->replaceAllUsesWith(New);
3617 RecWithFlags->eraseFromParent();
// Continue the slice traversal from the replacement recipe.
3618 CurRec = New;
3619 } else
3620 RecWithFlags->dropPoisonGeneratingFlags();
3621 } else {
3624 (void)Instr;
// Sanity check: any recipe without IR flags must not wrap an
// instruction carrying poison-generating flags.
3625 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3626 "found instruction with poison generating flags not covered by "
3627 "VPRecipeWithIRFlags");
3628 }
3629
3630 // Add new definitions to the worklist.
3631 for (VPValue *Operand : CurRec->operands())
3632 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3633 Worklist.push_back(OpDef);
3634 }
3635 });
3636
3637 // Traverse all the recipes in the VPlan and collect the poison-generating
3638 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3639 // VPInterleaveRecipe.
3640 auto Iter = vp_depth_first_deep(Plan.getEntry());
3642 for (VPRecipeBase &Recipe : *VPBB) {
3643 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3644 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3645 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
// Only consecutive (non-gather/scatter) accesses in blocks that will
// be predicated need their address slice sanitized.
3646 if (AddrDef && WidenRec->isConsecutive() &&
3647 BlockNeedsPredication(UnderlyingInstr.getParent()))
3648 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3649 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3650 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3651 if (AddrDef) {
3652 // Check if any member of the interleave group needs predication.
3653 const InterleaveGroup<Instruction> *InterGroup =
3654 InterleaveRec->getInterleaveGroup();
3655 bool NeedPredication = false;
3656 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3657 I < NumMembers; ++I) {
3658 Instruction *Member = InterGroup->getMember(I);
// Gaps in the group show up as null members; skip them.
3659 if (Member)
3660 NeedPredication |= BlockNeedsPredication(Member->getParent());
3661 }
3662
3663 if (NeedPredication)
3664 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3665 }
3666 }
3667 }
3668 }
3669 }
3670
// NOTE(review): the function's defining line is not visible in this excerpt;
// from the parameters and body, this replaces the widened memory recipes of
// each interleave group with a single VPInterleaveRecipe at the group's
// insert position.
3672 VPlan &Plan,
3674 &InterleaveGroups,
3675 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3676 if (InterleaveGroups.empty())
3677 return;
3678
3679 // Interleave memory: for each Interleave Group we marked earlier as relevant
3680 // for this VPlan, replace the Recipes widening its memory instructions with a
3681 // single VPInterleaveRecipe at its insertion point.
3682 VPDominatorTree VPDT(Plan);
3683 for (const auto *IG : InterleaveGroups) {
3684 auto *Start =
3685 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
// Seed the combined metadata from member zero, then intersect with the
// metadata of every other member below.
3686 VPIRMetadata InterleaveMD(*Start);
3687 SmallVector<VPValue *, 4> StoredValues;
3688 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3689 StoredValues.push_back(StoreR->getStoredValue());
3690 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3691 Instruction *MemberI = IG->getMember(I);
// Null members are gaps in the interleave group.
3692 if (!MemberI)
3693 continue;
3694 VPWidenMemoryRecipe *MemoryR =
3695 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3696 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3697 StoredValues.push_back(StoreR->getStoredValue());
3698 InterleaveMD.intersect(*MemoryR);
3699 }
3700
// A gap mask is required when the tail cannot be handled by a scalar
// epilogue, or when storing to a group with gaps.
3701 bool NeedsMaskForGaps =
3702 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3703 (!StoredValues.empty() && !IG->isFull());
3704
3705 Instruction *IRInsertPos = IG->getInsertPos();
3706 auto *InsertPos =
3707 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3709
// Inherit GEP no-wrap flags from the insert position's address, but drop
// nuw: the PtrAdd below may use a negative offset.
3710 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3711 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3712 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3713
3714 // Get or create the start address for the interleave group.
3715 VPValue *Addr = Start->getAddr();
3716 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3717 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3718 // We cannot re-use the address of member zero because it does not
3719 // dominate the insert position. Instead, use the address of the insert
3720 // position and create a PtrAdd adjusting it to the address of member
3721 // zero.
3722 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3723 // InsertPos or sink loads above zero members to join it.
3724 assert(IG->getIndex(IRInsertPos) != 0 &&
3725 "index of insert position shouldn't be zero");
3726 auto &DL = IRInsertPos->getDataLayout();
// Byte distance from member zero to the insert position; negated below
// to step back to member zero.
3727 APInt Offset(32,
3728 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3729 IG->getIndex(IRInsertPos),
3730 /*IsSigned=*/true);
3731 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3732 VPBuilder B(InsertPos);
3733 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3734 }
3735 // If the group is reverse, adjust the index to refer to the last vector
3736 // lane instead of the first. We adjust the index from the first vector
3737 // lane, rather than directly getting the pointer for lane VF - 1, because
3738 // the pointer operand of the interleaved access is supposed to be uniform.
3739 if (IG->isReverse()) {
3740 auto *ReversePtr = new VPVectorEndPointerRecipe(
3741 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3742 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3743 ReversePtr->insertBefore(InsertPos);
3744 Addr = ReversePtr;
3745 }
3746 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3747 InsertPos->getMask(), NeedsMaskForGaps,
3748 InterleaveMD, InsertPos->getDebugLoc());
3749 VPIG->insertBefore(InsertPos);
3750
// Rewire users of each (non-void) member recipe to the corresponding
// result of the interleave recipe, then erase the member recipes.
3751 unsigned J = 0;
3752 for (unsigned i = 0; i < IG->getFactor(); ++i)
3753 if (Instruction *Member = IG->getMember(i)) {
3754 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3755 if (!Member->getType()->isVoidTy()) {
3756 VPValue *OriginalV = MemberR->getVPSingleValue();
3757 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3758 J++;
3759 }
3760 MemberR->eraseFromParent();
3761 }
3762 }
3763 }
3764
3765/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3766/// value, phi and backedge value. In the following example:
3767///
3768/// vector.ph:
3769/// Successor(s): vector loop
3770///
3771/// <x1> vector loop: {
3772/// vector.body:
3773/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3774/// ...
3775/// EMIT branch-on-count ...
3776/// No successors
3777/// }
3778///
3779/// WIDEN-INDUCTION will get expanded to:
3780///
3781/// vector.ph:
3782/// ...
3783/// vp<%induction.start> = ...
3784/// vp<%induction.increment> = ...
3785///
3786/// Successor(s): vector loop
3787///
3788/// <x1> vector loop: {
3789/// vector.body:
3790/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3791/// ...
3792/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3793/// EMIT branch-on-count ...
3794/// No successors
3795/// }
3796 static void
// Expands a VPWidenIntOrFpInductionRecipe into explicit recipes for the
// initial vector value, the widened phi and the backedge increment; see the
// comment block above for the before/after VPlan shape.
3798 VPTypeAnalysis &TypeInfo) {
3799 VPlan *Plan = WidenIVR->getParent()->getPlan();
3800 VPValue *Start = WidenIVR->getStartValue();
3801 VPValue *Step = WidenIVR->getStepValue();
3802 VPValue *VF = WidenIVR->getVFValue();
3803 DebugLoc DL = WidenIVR->getDebugLoc();
3804
3805 // The value from the original loop to which we are mapping the new induction
3806 // variable.
3807 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3808
// Pick integer or floating-point add/mul opcodes depending on the kind of
// induction described by the InductionDescriptor.
3809 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3812 VPIRFlags Flags = *WidenIVR;
3813 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3814 AddOp = Instruction::Add;
3815 MulOp = Instruction::Mul;
3816 } else {
3817 AddOp = ID.getInductionOpcode();
3818 MulOp = Instruction::FMul;
3819 }
3820
3821 // If the phi is truncated, truncate the start and step values.
3822 VPBuilder Builder(Plan->getVectorPreheader());
3823 Type *StepTy = TypeInfo.inferScalarType(Step);
3824 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3825 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3826 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3827 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3828 StepTy = Ty;
3829 }
3830
3831 // Construct the initial value of the vector IV in the vector loop preheader.
3832 Type *IVIntTy =
// Init = Start + StepVector * Step, broadcast across all lanes. For FP
// inductions the integer step vector is first converted via uitofp.
3834 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3835 if (StepTy->isFloatingPointTy())
3836 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3837
3838 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3839 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3840
3841 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3842 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3843 DebugLoc::getUnknown(), "induction");
3844
3845 // Create the widened phi of the vector IV.
3846 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3847 WidenIVR->getDebugLoc(), "vec.ind");
3848 WidePHI->insertBefore(WidenIVR);
3849
3850 // Create the backedge value for the vector IV.
3851 VPValue *Inc;
3852 VPValue *Prev;
3853 // If unrolled, use the increment and prev value from the operands.
3854 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3855 Inc = SplatVF;
3856 Prev = WidenIVR->getLastUnrolledPartOperand();
3857 } else {
// Emit the VF * Step computation right after VF's definition (or in the
// preheader if VF is a live-in with no defining recipe).
3858 if (VPRecipeBase *R = VF->getDefiningRecipe())
3859 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3860 // Multiply the vectorization factor by the step using integer or
3861 // floating-point arithmetic as appropriate.
3862 if (StepTy->isFloatingPointTy())
3863 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3864 DL);
3865 else
3866 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3867 TypeInfo.inferScalarType(VF), DL);
3868
3869 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3870 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3871 Prev = WidePHI;
3872 }
3873
// The vec.ind.next increment is emitted just before the exiting block's
// terminator, and becomes the phi's backedge operand.
3875 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3876 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3877 WidenIVR->getDebugLoc(), "vec.ind.next");
3878
3879 WidePHI->addOperand(Next);
3880
3881 WidenIVR->replaceAllUsesWith(WidePHI);
3882 }
3883
3884/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3885/// initial value, phi and backedge value. In the following example:
3886///
3887/// <x1> vector loop: {
3888/// vector.body:
3889/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3890/// ...
3891/// EMIT branch-on-count ...
3892/// }
3893///
3894/// WIDEN-POINTER-INDUCTION will get expanded to:
3895///
3896/// <x1> vector loop: {
3897/// vector.body:
3898/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3899/// EMIT %mul = mul %stepvector, %step
3900/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3901/// ...
3902/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3903/// EMIT branch-on-count ...
3904/// }
// NOTE(review): the function's defining line is not visible in this excerpt;
// per the comment block above, this expands a VPWidenPointerInductionRecipe
// into a scalar pointer phi, a wide ptradd producing the per-lane addresses,
// and the scalar backedge increment.
3906 VPTypeAnalysis &TypeInfo) {
3907 VPlan *Plan = R->getParent()->getPlan();
3908 VPValue *Start = R->getStartValue();
3909 VPValue *Step = R->getStepValue();
3910 VPValue *VF = R->getVFValue();
3911
3912 assert(R->getInductionDescriptor().getKind() ==
3914 "Not a pointer induction according to InductionDescriptor!");
3915 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
// Scalar-only pointer inductions are scalarized elsewhere before reaching
// this expansion.
3916 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3917 "Recipe should have been replaced");
3918
3919 VPBuilder Builder(R);
3920 DebugLoc DL = R->getDebugLoc();
3921
3922 // Build a scalar pointer phi.
3923 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3924
3925 // Create actual address geps that use the pointer phi as base and a
3926 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3927 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3928 Type *StepTy = TypeInfo.inferScalarType(Step);
3929 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3930 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3931 VPValue *PtrAdd =
3932 Builder.createWidePtrAdd(ScalarPtrPhi, Offset, DL, "vector.gep");
3933 R->replaceAllUsesWith(PtrAdd);
3934
3935 // Create the backedge value for the scalar pointer phi.
// Advance the scalar phi by VF * Step each iteration; the increment is
// emitted before the exiting block's terminator.
3937 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3938 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3939 DL);
3940 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3941
3942 VPValue *InductionGEP =
3943 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3944 ScalarPtrPhi->addOperand(InductionGEP);
3945 }
3946
3948 // Replace loop regions with explicit CFG.
// Collect the non-replicator (loop) regions first, then dissolve them in a
// second pass so the depth-first traversal is not mutated while iterating.
3949 SmallVector<VPRegionBlock *> LoopRegions;
3951 vp_depth_first_deep(Plan.getEntry()))) {
3952 if (!R->isReplicator())
3953 LoopRegions.push_back(R);
3954 }
3955 for (VPRegionBlock *R : LoopRegions)
3956 R->dissolveToCFGLoop();
3957 }
3958
// NOTE(review): the function's defining line is not visible in this excerpt;
// the body rewrites every BranchOnTwoConds terminator (3 successors) into two
// chained single-condition branches via a new interim block.
3961 // The transform runs after dissolving loop regions, so all VPBasicBlocks
3962 // terminated with BranchOnTwoConds are reached via a shallow traversal.
3965 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
3966 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
3967 }
3968
3969 // Expand BranchOnTwoConds instructions into explicit CFG with two new
3970 // single-condition branches:
3971 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
3972 // the first condition is true, and otherwise jumps to a new interim block.
3973 // 2. A branch that ends the interim block, jumps to the second successor if
3974 // the second condition is true, and otherwise jumps to the third
3975 // successor.
3976 for (VPInstruction *Br : WorkList) {
3977 assert(Br->getNumOperands() == 2 &&
3978 "BranchOnTwoConds must have exactly 2 conditions");
3979 DebugLoc DL = Br->getDebugLoc();
3980 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
// Copy the successor list before disconnecting; disconnectBlocks mutates
// the underlying successor/predecessor vectors.
3981 const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
3982 assert(Successors.size() == 3 &&
3983 "BranchOnTwoConds must have exactly 3 successors");
3984
3985 for (VPBlockBase *Succ : Successors)
3986 VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
3987
3988 VPValue *Cond0 = Br->getOperand(0);
3989 VPValue *Cond1 = Br->getOperand(1);
3990 VPBlockBase *Succ0 = Successors[0];
3991 VPBlockBase *Succ1 = Successors[1];
3992 VPBlockBase *Succ2 = Successors[2];
3993 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
3994 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
3995
3996 VPBasicBlock *InterimBB =
3997 Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
3998
// First branch: Cond0 ? Succ0 : InterimBB.
3999 VPBuilder(BrOnTwoCondsBB)
4001 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
4002 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
4003
// Second branch (in InterimBB): Cond1 ? Succ1 : Succ2.
4005 VPBlockUtils::connectBlocks(InterimBB, Succ1);
4006 VPBlockUtils::connectBlocks(InterimBB, Succ2);
4007 Br->eraseFromParent();
4008 }
4009 }
4010
// NOTE(review): the function's defining line is not visible in this excerpt;
// the body lowers abstract recipes (widened IVs, blends, expressions,
// LastActiveLane, MaskedCond, CanonicalIVIncrementForPart, BranchOnCount,
// WideIVStep) into concrete, executable recipes. Recipes replaced here are
// queued in ToRemove and erased at the end, after iteration completes.
4012 VPTypeAnalysis TypeInfo(Plan);
4015 vp_depth_first_deep(Plan.getEntry()))) {
4016 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4017 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
4018 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
4019 ToRemove.push_back(WidenIVR);
4020 continue;
4021 }
4022
4023 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
4024 // If the recipe only generates scalars, scalarize it instead of
4025 // expanding it.
4026 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
4027 VPBuilder Builder(WidenIVR);
4028 VPValue *PtrAdd =
4029 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
4030 WidenIVR->replaceAllUsesWith(PtrAdd);
4031 ToRemove.push_back(WidenIVR);
4032 continue;
4033 }
4034 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
4035 ToRemove.push_back(WidenIVR);
4036 continue;
4037 }
4038
4039 // Expand VPBlendRecipe into VPInstruction::Select.
4040 VPBuilder Builder(&R);
4041 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
// Fold the incoming values into a chain of selects, starting from
// incoming value 0 as the default.
4042 VPValue *Select = Blend->getIncomingValue(0);
4043 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
4044 Select = Builder.createSelect(Blend->getMask(I),
4045 Blend->getIncomingValue(I), Select,
4046 R.getDebugLoc(), "predphi", *Blend);
4047 Blend->replaceAllUsesWith(Select);
4048 ToRemove.push_back(Blend);
4049 }
4050
4051 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
4052 if (!VEPR->getOffset()) {
4053 assert(Plan.getConcreteUF() == 1 &&
4054 "Expected unroller to have materialized offset for UF != 1");
4055 VEPR->materializeOffset();
4056 }
4057 }
4058
// Expression recipes are bundles; decompose them back into their
// constituent recipes before execution.
4059 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
4060 Expr->decompose();
4061 ToRemove.push_back(Expr);
4062 }
4063
4064 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
4065 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
4066 if (LastActiveL &&
4067 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
4068 // Create Not(Mask) for all operands.
4070 for (VPValue *Op : LastActiveL->operands()) {
4071 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
4072 NotMasks.push_back(NotMask);
4073 }
4074
4075 // Create FirstActiveLane on the inverted masks.
4076 VPValue *FirstInactiveLane = Builder.createNaryOp(
4078 LastActiveL->getDebugLoc(), "first.inactive.lane");
4079
4080 // Subtract 1 to get the last active lane.
4081 VPValue *One =
4082 Plan.getConstantInt(TypeInfo.inferScalarType(FirstInactiveLane), 1);
4083 VPValue *LastLane =
4084 Builder.createSub(FirstInactiveLane, One,
4085 LastActiveL->getDebugLoc(), "last.active.lane");
4086
4087 LastActiveL->replaceAllUsesWith(LastLane);
4088 ToRemove.push_back(LastActiveL);
4089 continue;
4090 }
4091
4092 // Lower MaskedCond with block mask to LogicalAnd.
4094 auto *VPI = cast<VPInstruction>(&R);
4095 assert(VPI->isMasked() &&
4096 "Unmasked MaskedCond should be simplified earlier");
4097 VPI->replaceAllUsesWith(Builder.createNaryOp(
4098 VPInstruction::LogicalAnd, {VPI->getMask(), VPI->getOperand(0)}));
4099 ToRemove.push_back(VPI);
4100 continue;
4101 }
4102
4103 // Lower CanonicalIVIncrementForPart to plain Add.
4104 if (match(
4105 &R,
4107 auto *VPI = cast<VPInstruction>(&R);
4108 VPValue *Add = Builder.createOverflowingOp(
4109 Instruction::Add, VPI->operands(), VPI->getNoWrapFlags(),
4110 VPI->getDebugLoc());
4111 VPI->replaceAllUsesWith(Add);
4112 ToRemove.push_back(VPI);
4113 continue;
4114 }
4115
4116 // Lower BranchOnCount to ICmp + BranchOnCond.
4117 VPValue *IV, *TC;
4118 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
4119 auto *BranchOnCountInst = cast<VPInstruction>(&R);
4120 DebugLoc DL = BranchOnCountInst->getDebugLoc();
4121 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
4122 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
4123 ToRemove.push_back(BranchOnCountInst);
4124 continue;
4125 }
4126
// Anything not matching WideIVStep below is left untouched.
4127 VPValue *VectorStep;
4128 VPValue *ScalarStep;
4130 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
4131 continue;
4132
4133 // Expand WideIVStep.
4134 auto *VPI = cast<VPInstruction>(&R);
4135 Type *IVTy = TypeInfo.inferScalarType(VPI);
// Reconcile the step operand types with the IV type: widen via uitofp
// for FP IVs, otherwise truncate.
4136 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
4138 ? Instruction::UIToFP
4139 : Instruction::Trunc;
4140 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
4141 }
4142
4143 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
4144 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
4145 ScalarStep =
4146 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
4147 }
4148
// FP steps keep the instruction's fast-math flags; integer steps use the
// default flags for Mul.
4149 VPIRFlags Flags;
4150 unsigned MulOpc;
4151 if (IVTy->isFloatingPointTy()) {
4152 MulOpc = Instruction::FMul;
4153 Flags = VPI->getFastMathFlags();
4154 } else {
4155 MulOpc = Instruction::Mul;
4156 Flags = VPIRFlags::getDefaultFlags(MulOpc);
4157 }
4158
4159 VPInstruction *Mul = Builder.createNaryOp(
4160 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
4161 VectorStep = Mul;
4162 VPI->replaceAllUsesWith(VectorStep);
4163 ToRemove.push_back(VPI);
4164 }
4165 }
4166
4167 for (VPRecipeBase *R : ToRemove)
4168 R->eraseFromParent();
4169 }
4170
// NOTE(review): the function's defining line is not visible in this excerpt;
// from the parameters and body, this rewires all uncountable early exits of
// the vector loop through a dispatch chain: the latch tests an AnyOf of the
// combined exit conditions via BranchOnTwoConds, and per-exit
// vector.early.exit blocks extract exit values at the first active lane.
4172 VPBasicBlock *HeaderVPBB,
4173 VPBasicBlock *LatchVPBB,
4174 VPBasicBlock *MiddleVPBB,
4175 UncountableExitStyle Style) {
4176 struct EarlyExitInfo {
4177 VPBasicBlock *EarlyExitingVPBB;
4178 VPIRBasicBlock *EarlyExitVPBB;
4179 VPValue *CondToExit;
4180 };
4181
4182 VPDominatorTree VPDT(Plan);
4183 VPBuilder Builder(LatchVPBB->getTerminator());
// Gather every early-exit edge: exit-block predecessors other than the
// middle block are early exiting blocks.
4185 for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
4186 for (VPBlockBase *Pred : to_vector(ExitBlock->getPredecessors())) {
4187 if (Pred == MiddleVPBB)
4188 continue;
4189 // Collect condition for this early exit.
4190 auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
4191 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
4192 VPValue *CondOfEarlyExitingVPBB;
4193 [[maybe_unused]] bool Matched =
4194 match(EarlyExitingVPBB->getTerminator(),
4195 m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
4196 assert(Matched && "Terminator must be BranchOnCond");
4197
4198 // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
4199 // the correct block mask.
4200 VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
// Normalize the condition so "true" always means "take this exit";
// negate when the exit block is the false successor.
4201 auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
4203 TrueSucc == ExitBlock
4204 ? CondOfEarlyExitingVPBB
4205 : EarlyExitingBuilder.createNot(CondOfEarlyExitingVPBB));
4206 assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
4207 !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
4208 VPDT.properlyDominates(
4209 CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
4210 LatchVPBB)) &&
4211 "exit condition must dominate the latch");
4212 Exits.push_back({
4213 EarlyExitingVPBB,
4214 ExitBlock,
4215 CondToEarlyExit,
4216 });
4217 }
4218 }
4219
4220 assert(!Exits.empty() && "must have at least one early exit");
4221 // Sort exits by RPO order to get correct program order. RPO gives a
4222 // topological ordering of the CFG, ensuring upstream exits are checked
4223 // before downstream exits in the dispatch chain.
4225 HeaderVPBB);
4227 for (const auto &[Num, VPB] : enumerate(RPOT))
4228 RPOIdx[VPB] = Num;
4229 llvm::sort(Exits, [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
4230 return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
4231 });
4232 #ifndef NDEBUG
4233 // After RPO sorting, verify that for any pair where one exit dominates
4234 // another, the dominating exit comes first. This is guaranteed by RPO
4235 // (topological order) and is required for the dispatch chain correctness.
4236 for (unsigned I = 0; I + 1 < Exits.size(); ++I)
4237 for (unsigned J = I + 1; J < Exits.size(); ++J)
4238 assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
4239 Exits[I].EarlyExitingVPBB) &&
4240 "RPO sort must place dominating exits before dominated ones");
4241 #endif
4242
4243 // Build the AnyOf condition for the latch terminator using logical OR
4244 // to avoid poison propagation from later exit conditions when an earlier
4245 // exit is taken.
4246 VPValue *Combined = Exits[0].CondToExit;
4247 for (const EarlyExitInfo &Info : drop_begin(Exits))
4248 Combined = Builder.createLogicalOr(Combined, Info.CondToExit);
4249
4250 VPValue *IsAnyExitTaken =
4251 Builder.createNaryOp(VPInstruction::AnyOf, {Combined});
4252
4254 "Early exit store masking not implemented");
4255
4256 // Create the vector.early.exit blocks.
// A single exit keeps the plain "vector.early.exit" name; multiple exits
// get a ".<idx>" suffix.
4257 SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
4258 for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
4259 Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
4260 VPBasicBlock *VectorEarlyExitVPBB =
4261 Plan.createVPBasicBlock("vector.early.exit" + BlockSuffix);
4262 VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
4263 }
4264
4265 // Create the dispatch block (or reuse the single exit block if only one
4266 // exit). The dispatch block computes the first active lane of the combined
4267 // condition and, for multiple exits, chains through conditions to determine
4268 // which exit to take.
4269 VPBasicBlock *DispatchVPBB =
4270 Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
4271 : Plan.createVPBasicBlock("vector.early.exit.check");
4272 VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
4273 VPValue *FirstActiveLane =
4274 DispatchBuilder.createNaryOp(VPInstruction::FirstActiveLane, {Combined},
4275 DebugLoc::getUnknown(), "first.active.lane");
4276
4277 // For each early exit, disconnect the original exiting block
4278 // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
4279 // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
4280 // values at the first active lane:
4281 //
4282 // Input:
4283 // early.exiting.I:
4284 // ...
4285 // EMIT branch-on-cond vp<%cond.I>
4286 // Successor(s): in.loop.succ, ir-bb<exit.I>
4287 //
4288 // ir-bb<exit.I>:
4289 // IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
4290 //
4291 // Output:
4292 // early.exiting.I:
4293 // ...
4294 // Successor(s): in.loop.succ
4295 //
4296 // vector.early.exit.I:
4297 // EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
4298 // Successor(s): ir-bb<exit.I>
4299 //
4300 // ir-bb<exit.I>:
4301 // IR %phi = phi ... (extra operand: vp<%exit.val> from
4302 // vector.early.exit.I)
4303 //
4304 for (auto [Exit, VectorEarlyExitVPBB] :
4305 zip_equal(Exits, VectorEarlyExitVPBBs)) {
4306 auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
4307 // Adjust the phi nodes in EarlyExitVPBB.
4308 // 1. remove incoming values from EarlyExitingVPBB,
4309 // 2. extract the incoming value at FirstActiveLane
4310 // 3. add back the extracts as last operands for the phis
4311 // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
4312 // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
4313 // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
4314 // values from VectorEarlyExitVPBB.
4315 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
4316 auto *ExitIRI = cast<VPIRPhi>(&R);
4317 VPValue *IncomingVal =
4318 ExitIRI->getIncomingValueForBlock(EarlyExitingVPBB);
4319 VPValue *NewIncoming = IncomingVal;
// Live-in IR values are uniform; only recipe-defined values need an
// extract-lane at the first active lane.
4320 if (!isa<VPIRValue>(IncomingVal)) {
4321 VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
4322 NewIncoming = EarlyExitBuilder.createNaryOp(
4323 VPInstruction::ExtractLane, {FirstActiveLane, IncomingVal},
4324 DebugLoc::getUnknown(), "early.exit.value");
4325 }
4326 ExitIRI->removeIncomingValueFor(EarlyExitingVPBB);
4327 ExitIRI->addOperand(NewIncoming);
4328 }
4329
4330 EarlyExitingVPBB->getTerminator()->eraseFromParent();
4331 VPBlockUtils::disconnectBlocks(EarlyExitingVPBB, EarlyExitVPBB);
4332 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
4333 }
4334
4335 // Chain through exits: for each exit, check if its condition is true at
4336 // the first active lane. If so, take that exit; otherwise, try the next.
4337 // The last exit needs no check since it must be taken if all others fail.
4338 //
4339 // For 3 exits (cond.0, cond.1, cond.2), this creates:
4340 //
4341 // latch:
4342 // ...
4343 // EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
4344 // ...
4345 //
4346 // vector.early.exit.check:
4347 // EMIT vp<%first.lane> = first-active-lane vp<%combined>
4348 // EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
4349 // EMIT branch-on-cond vp<%at.cond.0>
4350 // Successor(s): vector.early.exit.0, vector.early.exit.check.0
4351 //
4352 // vector.early.exit.check.0:
4353 // EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
4354 // EMIT branch-on-cond vp<%at.cond.1>
4355 // Successor(s): vector.early.exit.1, vector.early.exit.2
4356 VPBasicBlock *CurrentBB = DispatchVPBB;
4357 for (auto [I, Exit] : enumerate(ArrayRef(Exits).drop_back())) {
4358 VPValue *LaneVal = DispatchBuilder.createNaryOp(
4359 VPInstruction::ExtractLane, {FirstActiveLane, Exit.CondToExit},
4360 DebugLoc::getUnknown(), "exit.cond.at.lane");
4361
4362 // For the last dispatch, branch directly to the last exit on false;
4363 // otherwise, create a new check block.
4364 bool IsLastDispatch = (I + 2 == Exits.size());
4365 VPBasicBlock *FalseBB =
4366 IsLastDispatch ? VectorEarlyExitVPBBs.back()
4367 : Plan.createVPBasicBlock(
4368 Twine("vector.early.exit.check.") + Twine(I));
4369
4370 DispatchBuilder.createNaryOp(VPInstruction::BranchOnCond, {LaneVal});
4371 CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
4372 VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
4373 FalseBB->setPredecessors({CurrentBB});
4374
4375 CurrentBB = FalseBB;
4376 DispatchBuilder.setInsertPoint(CurrentBB);
4377 }
4378
4379 // Replace the latch terminator with the new branching logic.
4380 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4381 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
4382 "Unexpected terminator");
// Lower the BranchOnCount comparison explicitly since the latch now needs
// a two-condition branch (early exit vs. trip-count exit vs. loop back).
4383 auto *IsLatchExitTaken =
4384 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
4385 LatchExitingBranch->getOperand(1));
4386
4387 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4388 LatchExitingBranch->eraseFromParent();
4389 Builder.setInsertPoint(LatchVPBB);
4390 Builder.createNaryOp(VPInstruction::BranchOnTwoConds,
4391 {IsAnyExitTaken, IsLatchExitTaken}, LatchDL);
// Successor order must match BranchOnTwoConds semantics: dispatch on any
// early exit, middle block on trip-count exhaustion, else loop header.
4392 LatchVPBB->clearSuccessors();
4393 LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
4394 DispatchVPBB->setPredecessors({LatchVPBB});
4395 }
4396
4397/// This function tries convert extended in-loop reductions to
4398/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
4399/// valid. The created recipe must be decomposed to its constituent
4400/// recipes before execution.
4401 static VPExpressionRecipe *
// Bundles reduce(ext(A)) into a VPExpressionRecipe when TTI reports the
// extended-reduction cost as cheaper than the separate extend + reduction.
4403 VFRange &Range) {
4404 Type *RedTy = Ctx.Types.inferScalarType(Red);
4405 VPValue *VecOp = Red->getVecOp();
4406
4407 // For partial reductions, the decision has already been made at the point of
4408 // transforming reductions -> partial reductions for a given plan, based on
4409 // the cost-model.
4410 if (Red->isPartialReduction())
4411 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4412
4413 // Clamp the range if using extended-reduction is profitable.
4414 auto IsExtendedRedValidAndClampRange =
4415 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4417 [&](ElementCount VF) {
4418 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4420
// Compare cost of the fused extended reduction against the sum of
// the separate extend and reduction recipes.
4422 InstructionCost ExtCost =
4423 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4424 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4425
4426 // TTI::getExtendedReductionCost for in-loop reductions
4427 // only supports integer types.
4428 if (RedTy->isFloatingPointTy())
4429 return false;
4430 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4431 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4432 Red->getFastMathFlags(), CostKind);
4433 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4434 },
4435 Range);
4436 };
4437
4438 VPValue *A;
4439 // Match reduce(ext)).
4442 IsExtendedRedValidAndClampRange(
4443 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4444 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
4445 Ctx.Types.inferScalarType(A)))
4446 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4447
// No profitable extended-reduction pattern matched.
4448 return nullptr;
4449 }
4450
4451/// This function tries convert extended in-loop reductions to
4452/// VPExpressionRecipe and clamp the \p Range if it is beneficial
4453/// and valid. The created VPExpressionRecipe must be decomposed to its
4454/// constituent recipes before execution. Patterns of the
4455/// VPExpressionRecipe:
4456/// reduce.add(mul(...)),
4457/// reduce.add(mul(ext(A), ext(B))),
4458/// reduce.add(ext(mul(ext(A), ext(B)))).
4459/// reduce.fadd(fmul(ext(A), ext(B)))
4460static VPExpressionRecipe *
4462 VPCostContext &Ctx, VFRange &Range) {
4463 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4464 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4465 Opcode != Instruction::FAdd)
4466 return nullptr;
4467
4468 Type *RedTy = Ctx.Types.inferScalarType(Red);
4469
4470 // Clamp the range if using multiply-accumulate-reduction is profitable.
// For each candidate VF, the lambda compares the multiply-accumulate
// reduction cost against the sum of the individual mul/ext/reduction costs;
// partial reductions are accepted unconditionally (decision already made).
4471 auto IsMulAccValidAndClampRange =
4473 VPWidenCastRecipe *OuterExt) -> bool {
4475 [&](ElementCount VF) {
4476 // For partial reductions, the decision has already been made at the
4477 // point of transforming reductions -> partial reductions for a given
4478 // plan, based on the cost-model.
4479 if (Red->isPartialReduction())
4480 return true;
4481
4483 Type *SrcTy =
4484 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
4485 InstructionCost MulAccCost;
4486
4487 // Only partial reductions support mixed or floating-point extends at
4488 // the moment.
4489 if (Ext0 && Ext1 &&
4490 (Ext0->getOpcode() != Ext1->getOpcode() ||
4491 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4492 return false;
4493
4494 bool IsZExt =
4495 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4496 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4497 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4498 SrcVecTy, CostKind);
4499
4500 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4501 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4502 InstructionCost ExtCost = 0;
4503 if (Ext0)
4504 ExtCost += Ext0->computeCost(VF, Ctx);
4505 if (Ext1)
4506 ExtCost += Ext1->computeCost(VF, Ctx);
4507 if (OuterExt)
4508 ExtCost += OuterExt->computeCost(VF, Ctx);
4509
4510 return MulAccCost.isValid() &&
4511 MulAccCost < ExtCost + MulCost + RedCost;
4512 },
4513 Range);
4514 };
4515
// State for the pattern matching below: VecOp walks through the reduction's
// vector operand; Sub records an optional negation feeding an add reduction.
4516 VPValue *VecOp = Red->getVecOp();
4517 VPRecipeBase *Sub = nullptr;
4518 VPValue *A, *B;
4519 VPValue *Tmp = nullptr;
4520
4521 // Try to match reduce.fadd(fmul(fpext(...), fpext(...))).
4522 if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue())))) {
4523 assert(Opcode == Instruction::FAdd &&
4524 "MulAccumulateReduction from an FMul must accumulate into an FAdd "
4525 "instruction");
4526 auto *FMul = dyn_cast<VPWidenRecipe>(VecOp);
4527 if (!FMul)
4528 return nullptr;
4529
4530 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(FMul->getOperand(0));
4531 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(FMul->getOperand(1));
4532
4533 if (RecipeA && RecipeB &&
4534 IsMulAccValidAndClampRange(FMul, RecipeA, RecipeB, nullptr)) {
4535 return new VPExpressionRecipe(RecipeA, RecipeB, FMul, Red);
4536 }
4537 }
// Beyond the fmul pattern above, only integer reductions are handled.
4538 if (RedTy->isFloatingPointTy())
4539 return nullptr;
4540
4541 // Sub reductions could have a sub between the add reduction and vec op.
4542 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4543 Sub = VecOp->getDefiningRecipe();
4544 VecOp = Tmp;
4545 }
4546
4547 // If ValB is a constant and can be safely extended, truncate it to the same
4548 // type as ExtA's operand, then extend it to the same type as ExtA. This
4549 // creates two uniform extends that can more easily be matched by the rest of
4550 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4551 // replaced with the new extend of the constant.
4552 auto ExtendAndReplaceConstantOp = [&Ctx, &Red](VPWidenCastRecipe *ExtA,
4553 VPWidenCastRecipe *&ExtB,
4554 VPValue *&ValB,
4555 VPWidenRecipe *Mul) {
4556 if (!ExtA || ExtB || !isa<VPIRValue>(ValB) || Red->isPartialReduction())
4557 return;
4558 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
4559 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4560 const APInt *Const;
4561 if (!match(ValB, m_APInt(Const)) ||
4563 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4564 return;
4565 // The truncate ensures that the type of each extended operand is the
4566 // same, and it's been proven that the constant can be extended from
4567 // NarrowTy safely. Necessary since ExtA's extended operand would be
4568 // e.g. an i8, while the const will likely be an i32. This will be
4569 // elided by later optimisations.
4570 VPBuilder Builder(Mul);
4571 auto *Trunc =
4572 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4573 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
4574 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4575 Mul->setOperand(1, ExtB);
4576 };
4577
4578 // Try to match reduce.add(mul(...)).
4579 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4582 auto *Mul = cast<VPWidenRecipe>(VecOp);
4583
4584 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4585 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4586
4587 // Match reduce.add/sub(mul(ext, ext)).
4588 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4589 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4590 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4591 if (Sub)
4592 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4593 cast<VPWidenRecipe>(Sub), Red);
4594 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4595 }
4596 // TODO: Add an expression type for this variant with a negated mul
4597 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4598 return new VPExpressionRecipe(Mul, Red);
4599 }
4600 // TODO: Add an expression type for negated versions of other expression
4601 // variants.
4602 if (Sub)
4603 return nullptr;
4604
4605 // Match reduce.add(ext(mul(A, B))).
4606 if (!Red->isPartialReduction() &&
4607 match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4608 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4609 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4612
4613 // reduce.add(ext(mul(ext, const)))
4614 // -> reduce.add(ext(mul(ext, ext(const))))
4615 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4616
4617 // reduce.add(ext(mul(ext(A), ext(B))))
4618 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4619 // The inner extends must either have the same opcode as the outer extend or
4620 // be the same, in which case the multiply can never result in a negative
4621 // value and the outer extend can be folded away by doing wider
4622 // extends for the operands of the mul.
4623 if (Ext0 && Ext1 &&
4624 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4625 Ext0->getOpcode() == Ext1->getOpcode() &&
4626 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
4627 auto *NewExt0 = new VPWidenCastRecipe(
4628 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
4629 *Ext0, *Ext0, Ext0->getDebugLoc())
4630 NewExt0->insertBefore(Ext0);
4631
4632 VPWidenCastRecipe *NewExt1 = NewExt0;
4633 if (Ext0 != Ext1) {
4634 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4635 Ext->getResultType(), nullptr, *Ext1,
4636 *Ext1, Ext1->getDebugLoc());
4637 NewExt1->insertBefore(Ext1);
4638 }
4639 Mul->setOperand(0, NewExt0);
4640 Mul->setOperand(1, NewExt1);
4641 Red->setOperand(1, Mul);
4642 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4643 }
4644 }
4645 return nullptr;
4646}
4647
4648/// This function tries to create abstract recipes from the reduction recipe for
4649/// following optimizations and cost estimation.
4651 VPCostContext &Ctx,
4652 VFRange &Range) {
4653 VPExpressionRecipe *AbstractR = nullptr;
4654 auto IP = std::next(Red->getIterator());
4655 auto *VPBB = Red->getParent();
// Prefer the multiply-accumulate pattern; fall back to a plain extended
// reduction if it does not match.
4656 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4657 AbstractR = MulAcc;
4658 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4659 AbstractR = ExtRed;
4660 // Cannot create abstract inloop reduction recipes.
4661 if (!AbstractR)
4662 return;
4663
// Insert the abstract recipe right after the original reduction and reroute
// all its users to it; the constituent recipes remain as its operands.
4664 AbstractR->insertBefore(*VPBB, IP);
4665 Red->replaceAllUsesWith(AbstractR);
4666}
4667
4678
4680 if (Plan.hasScalarVFOnly())
4681 return;
4682
4683#ifndef NDEBUG
4684 VPDominatorTree VPDT(Plan);
4685#endif
4686
// Candidates: the backedge-taken count, live-ins and values defined in the
// plan's entry block; all are defined outside the vector loop.
4687 SmallVector<VPValue *> VPValues;
4688 if (VPValue *BTC = Plan.getBackedgeTakenCount())
4689 VPValues.push_back(BTC);
4690 append_range(VPValues, Plan.getLiveIns());
4691 for (VPRecipeBase &R : *Plan.getEntry())
4692 append_range(VPValues, R.definedValues());
4693
4694 auto *VectorPreheader = Plan.getVectorPreheader();
4695 for (VPValue *VPV : VPValues) {
4697 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
4698 continue;
4699
4700 // Add explicit broadcast at the insert point that dominates all users.
4701 VPBasicBlock *HoistBlock = VectorPreheader;
4702 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4703 for (VPUser *User : VPV->users()) {
4704 if (User->usesScalars(VPV))
4705 continue;
4706 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4707 HoistPoint = HoistBlock->begin();
4708 else
4709 assert(VPDT.dominates(VectorPreheader,
4710 cast<VPRecipeBase>(User)->getParent()) &&
4711 "All users must be in the vector preheader or dominated by it");
4712 }
4713
// Create the broadcast and redirect all vector (non-scalar) users to it.
4714 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4715 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
4716 VPV->replaceUsesWithIf(Broadcast,
4717 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4718 return Broadcast != &U && !U.usesScalars(VPV);
4719 });
4720 }
4721}
4722
4724 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4725
4726 // Collect candidate loads with invariant addresses and noalias scopes
4727 // metadata and memory-writing recipes with noalias metadata.
4731 vp_depth_first_shallow(LoopRegion->getEntry()))) {
4732 for (VPRecipeBase &R : *VPBB) {
4733 // Only handle single-scalar replicated loads with invariant addresses.
4734 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4735 if (RepR->isPredicated() || !RepR->isSingleScalar() ||
4736 RepR->getOpcode() != Instruction::Load)
4737 continue;
4738
4739 VPValue *Addr = RepR->getOperand(0);
4740 if (Addr->isDefinedOutsideLoopRegions()) {
4742 if (!Loc.AATags.Scope)
4743 continue;
4744 CandidateLoads.push_back({RepR, Loc});
4745 }
4746 }
// Any write lacking full noalias scope metadata makes hoisting unprovable;
// give up on the whole transform (early return, not continue).
4747 if (R.mayWriteToMemory()) {
4749 if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
4750 return;
4751 Stores.push_back(*Loc);
4752 }
4753 }
4754 }
4755
4756 VPBasicBlock *Preheader = Plan.getVectorPreheader();
4757 for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
4758 // Hoist the load to the preheader if it doesn't alias with any stores
4759 // according to the noalias metadata. Other loads should have been hoisted
4760 // by other passes.
4761 const AAMDNodes &LoadAA = LoadLoc.AATags;
4762 if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
4764 LoadAA.Scope, StoreLoc.AATags.NoAlias);
4765 })) {
4766 LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
4767 }
4768 }
4769}
4770
4771// Collect common metadata from a group of replicate recipes by intersecting
4772// metadata from all recipes in the group.
4774 VPIRMetadata CommonMetadata = *Recipes.front();
// Intersect pairwise; only metadata present on every recipe in the group
// survives, making the result safe to attach to a merged recipe.
4775 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4776 CommonMetadata.intersect(*Recipe);
4777 return CommonMetadata;
4778}
4779
4780template <unsigned Opcode>
4784 const Loop *L) {
4785 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4786 "Only Load and Store opcodes supported");
4787 constexpr bool IsLoad = (Opcode == Instruction::Load);
4788 VPTypeAnalysis TypeInfo(Plan);
4789
4790 // For each address, collect operations with the same or complementary masks.
// For loads the accessed type is the recipe's own result type; for stores it
// is the type of the stored operand (operand 0).
4792 auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4793 return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
4794 };
4796 Plan, PSE, L,
4797 [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
4798 for (auto Recipes : Groups) {
4799 if (Recipes.size() < 2)
4800 continue;
4801
4802 // Collect groups with the same or complementary masks.
// Entries are nulled out once claimed, so each recipe joins exactly one
// sub-group.
4803 for (VPReplicateRecipe *&RecipeI : Recipes) {
4804 if (!RecipeI)
4805 continue;
4806
4807 VPValue *MaskI = RecipeI->getMask();
4808 Type *TypeI = GetLoadStoreValueType(RecipeI);
4810 Group.push_back(RecipeI);
4811 RecipeI = nullptr;
4812
4813 // Find all operations with the same or complementary masks.
4814 bool HasComplementaryMask = false;
4815 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4816 if (!RecipeJ)
4817 continue;
4818
4819 VPValue *MaskJ = RecipeJ->getMask();
4820 Type *TypeJ = GetLoadStoreValueType(RecipeJ);
4821 if (TypeI == TypeJ) {
4822 // Check if any operation in the group has a complementary mask with
4823 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4824 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4825 match(MaskJ, m_Not(m_Specific(MaskI)));
4826 Group.push_back(RecipeJ);
4827 RecipeJ = nullptr;
4828 }
4829 }
4830
// Only groups containing at least one complementary mask pair are returned
// to the callers.
4831 if (HasComplementaryMask) {
4832 assert(Group.size() >= 2 && "must have at least 2 entries");
4833 AllGroups.push_back(std::move(Group));
4834 }
4835 }
4836 }
4837
4838 return AllGroups;
4839}
4840
4841// Find the recipe with minimum alignment in the group.
// Ties resolve to the earliest group member, per min_element semantics.
4842template <typename InstType>
4843static VPReplicateRecipe *
4845 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4846 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4847 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4848 });
4849}
4850
4853 const Loop *L) {
4854 auto Groups =
4856 if (Groups.empty())
4857 return;
4858
4859 // Process each group of loads.
4860 for (auto &Group : Groups) {
4861 // Try to use the earliest (most dominating) load to replace all others.
4862 VPReplicateRecipe *EarliestLoad = Group[0];
4863 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4864 VPBasicBlock *LastBB = Group.back()->getParent();
4865
4866 // Check that the load doesn't alias with stores between first and last.
4867 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4868 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
4869 continue;
4870
4871 // Collect common metadata from all loads in the group.
4872 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4873
4874 // Find the load with minimum alignment to use.
// Using the smallest alignment in the group is conservatively correct for
// every member.
4875 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4876
4877 bool IsSingleScalar = EarliestLoad->isSingleScalar();
4878 assert(all_of(Group,
4879 [IsSingleScalar](VPReplicateRecipe *R) {
4880 return R->isSingleScalar() == IsSingleScalar;
4881 }) &&
4882 "all members in group must agree on IsSingleScalar");
4883
4884 // Create an unpredicated version of the earliest load with common
4885 // metadata.
4886 auto *UnpredicatedLoad = new VPReplicateRecipe(
4887 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4888 IsSingleScalar, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);
4889
4890 UnpredicatedLoad->insertBefore(EarliestLoad);
4891
4892 // Replace all loads in the group with the unpredicated load.
4893 for (VPReplicateRecipe *Load : Group) {
4894 Load->replaceAllUsesWith(UnpredicatedLoad);
4895 Load->eraseFromParent();
4896 }
4897 }
4898}
4899
4900static bool
4902 PredicatedScalarEvolution &PSE, const Loop &L,
4903 VPTypeAnalysis &TypeInfo) {
// Without noalias scope metadata there is no basis for the alias check.
4904 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
4905 if (!StoreLoc || !StoreLoc->AATags.Scope)
4906 return false;
4907
4908 // When sinking a group of stores, all members of the group alias each other.
4909 // Skip them during the alias checks.
4910 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4911 StoresToSink.end());
4912
4913 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4914 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4915 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
4916 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
4917}
4918
4921 const Loop *L) {
4922 auto Groups =
4924 if (Groups.empty())
4925 return;
4926
4927 VPTypeAnalysis TypeInfo(Plan);
4928
4929 for (auto &Group : Groups) {
4930 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L, TypeInfo))
4931 continue;
4932
4933 // Use the last (most dominated) store's location for the unconditional
4934 // store.
4935 VPReplicateRecipe *LastStore = Group.back();
4936 VPBasicBlock *InsertBB = LastStore->getParent();
4937
4938 // Collect common alias metadata from all stores in the group.
4939 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4940
4941 // Build select chain for stored values.
4942 VPValue *SelectedValue = Group[0]->getOperand(0);
4943 VPBuilder Builder(InsertBB, LastStore->getIterator());
4944
4945 bool IsSingleScalar = Group[0]->isSingleScalar();
// Later (more dominated) group members wrap earlier values in the select
// chain, so they take priority, matching the original store order.
4946 for (unsigned I = 1; I < Group.size(); ++I) {
4947 assert(IsSingleScalar == Group[I]->isSingleScalar() &&
4948 "all members in group must agree on IsSingleScalar");
4949 VPValue *Mask = Group[I]->getMask();
4950 VPValue *Value = Group[I]->getOperand(0);
4951 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4952 Group[I]->getDebugLoc());
4953 }
4954
4955 // Find the store with minimum alignment to use.
4956 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4957
4958 // Create unconditional store with selected value and common metadata.
4959 auto *UnpredicatedStore = new VPReplicateRecipe(
4960 StoreWithMinAlign->getUnderlyingInstr(),
4961 {SelectedValue, LastStore->getOperand(1)}, IsSingleScalar,
4962 /*Mask=*/nullptr, *LastStore, CommonMetadata);
4963 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
4964
4965 // Remove all predicated stores from the group.
4966 for (VPReplicateRecipe *Store : Group)
4967 Store->eraseFromParent();
4968 }
4969}
4970
4972 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4974 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4975 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4976
4977 VPValue *TC = Plan.getTripCount();
4978 if (TC->getNumUsers() == 0)
4979 return;
4980
4981 // Skip cases for which the trip count may be non-trivial to materialize.
4982 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
4983 // tail is required.
4984 if (!Plan.hasScalarTail() ||
4986 Plan.getScalarPreheader() ||
4987 !isa<VPIRValue>(TC))
4988 return;
4989
4990 // Materialize vector trip counts for constants early if it can simply
4991 // be computed as (Original TC / VF * UF) * VF * UF.
4992 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4993 // tail-folded loops.
4994 ScalarEvolution &SE = *PSE.getSE();
4995 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
4996 if (!isa<SCEVConstant>(TCScev))
4997 return;
// Round the trip count down to a multiple of VF * UF; only materialize when
// the result folds to a constant.
4998 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
4999 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
5000 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
5001 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
5002}
5003
5005 VPBasicBlock *VectorPH) {
5007 if (BTC->getNumUsers() == 0)
5008 return;
5009
// Materialize the backedge-taken count as trip-count - 1, computed once at
// the start of the vector preheader.
5010 VPBuilder Builder(VectorPH, VectorPH->begin());
5011 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
5012 auto *TCMO =
5013 Builder.createSub(Plan.getTripCount(), Plan.getConstantInt(TCTy, 1),
5014 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
5015 BTC->replaceAllUsesWith(TCMO);
5016}
5017
5019 if (Plan.hasScalarVFOnly())
5020 return;
5021
5022 VPTypeAnalysis TypeInfo(Plan);
5023 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
5024 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5026 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5027 vp_depth_first_shallow(LoopRegion->getEntry()));
5028 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
5029 // VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
5030 // regions. Those are not materialized explicitly yet. Those vector users are
5031 // still handled in VPReplicateRegion::execute(), via shouldPack().
5032 // TODO: materialize build vectors for replicating recipes in replicating
5033 // regions.
5034 for (VPBasicBlock *VPBB :
5035 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
5036 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
5038 continue;
5039 auto *DefR = cast<VPSingleDefRecipe>(&R);
5040 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
5041 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5042 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
5043 };
5044 if ((isa<VPReplicateRecipe>(DefR) &&
5045 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
5046 (isa<VPInstruction>(DefR) &&
5048 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
5049 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
5050 continue;
5051
// Pick the pack opcode based on whether the scalar type is a struct.
5052 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
5053 unsigned Opcode = ScalarTy->isStructTy()
5056 auto *BuildVector = new VPInstruction(Opcode, {DefR});
5057 BuildVector->insertAfter(DefR);
5058
5059 DefR->replaceUsesWithIf(
5060 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
5061 VPUser &U, unsigned) {
5062 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
5063 });
5064 }
5065 }
5066
5067 // Create explicit VPInstructions to convert vectors to scalars. The current
5068 // implementation is conservative - it may miss some cases that may or may not
5069 // be vector values. TODO: introduce Unpacks speculatively - remove them later
5070 // if they are known to operate on scalar values.
5071 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
5072 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
5075 continue;
5076 for (VPValue *Def : R.definedValues()) {
5077 // Skip recipes that are single-scalar or only have their first lane
5078 // used.
5079 // TODO: The Defs skipped here may or may not be vector values.
5080 // Introduce Unpacks, and remove them later, if they are guaranteed to
5081 // produce scalar values.
5083 continue;
5084
5085 // At the moment, we create unpacks only for scalar users outside
5086 // replicate regions. Recipes inside replicate regions still extract the
5087 // required lanes implicitly.
5088 // TODO: Remove once replicate regions are unrolled completely.
5089 auto IsCandidateUnpackUser = [Def](VPUser *U) {
5090 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5091 return U->usesScalars(Def) &&
5092 (!ParentRegion || !ParentRegion->isReplicator());
5093 };
5094 if (none_of(Def->users(), IsCandidateUnpackUser))
5095 continue;
5096
// Phi results must be unpacked after the phi section; others directly
// after the defining recipe.
5097 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
5098 if (R.isPhi())
5099 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
5100 else
5101 Unpack->insertAfter(&R);
5102 Def->replaceUsesWithIf(Unpack,
5103 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
5104 return IsCandidateUnpackUser(&U);
5105 });
5106 }
5107 }
5108 }
5109}
5110
5112 VPBasicBlock *VectorPHVPBB,
5113 bool TailByMasking,
5114 bool RequiresScalarEpilogue,
5115 VPValue *Step) {
5116 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
5117 // There's nothing to do if there are no users of the vector trip count or its
5118 // IR value has already been set.
5119 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
5120 return;
5121
5122 VPValue *TC = Plan.getTripCount();
5123 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
5124 VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
5125 if (auto *StepR = Step->getDefiningRecipe()) {
5126 assert(StepR->getParent() == VectorPHVPBB &&
5127 "Step must be defined in VectorPHVPBB");
5128 // Insert after Step's definition to maintain valid def-use ordering.
5129 InsertPt = std::next(StepR->getIterator());
5130 }
5131 VPBuilder Builder(VectorPHVPBB, InsertPt);
5132
5133 // If the tail is to be folded by masking, round the number of iterations N
5134 // up to a multiple of Step instead of rounding down. This is done by first
5135 // adding Step-1 and then rounding down. Note that it's ok if this addition
5136 // overflows: the vector induction variable will eventually wrap to zero given
5137 // that it starts at zero and its Step is a power of two; the loop will then
5138 // exit, with the last early-exit vector comparison also producing all-true.
5139 if (TailByMasking) {
5140 TC = Builder.createAdd(
5141 TC, Builder.createSub(Step, Plan.getConstantInt(TCTy, 1)),
5142 DebugLoc::getCompilerGenerated(), "n.rnd.up");
5143 }
5144
5145 // Now we need to generate the expression for the part of the loop that the
5146 // vectorized body will execute. This is equal to N - (N % Step) if scalar
5147 // iterations are not required for correctness, or N - Step, otherwise. Step
5148 // is equal to the vectorization factor (number of SIMD elements) times the
5149 // unroll factor (number of SIMD instructions).
5150 VPValue *R =
5151 Builder.createNaryOp(Instruction::URem, {TC, Step},
5152 DebugLoc::getCompilerGenerated(), "n.mod.vf");
5153
5154 // There are cases where we *must* run at least one iteration in the remainder
5155 // loop. See the cost model for when this can happen. If the step evenly
5156 // divides the trip count, we set the remainder to be equal to the step. If
5157 // the step does not evenly divide the trip count, no adjustment is necessary
5158 // since there will already be scalar iterations. Note that the minimum
5159 // iterations check ensures that N >= Step.
// NOTE(review): the assert message below says "fail folding"; this looks like
// a typo for "tail folding".
5160 if (RequiresScalarEpilogue) {
5161 assert(!TailByMasking &&
5162 "requiring scalar epilogue is not supported with fail folding");
5163 VPValue *IsZero =
5164 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getZero(TCTy));
5165 R = Builder.createSelect(IsZero, Step, R);
5166 }
5167
5168 VPValue *Res =
5169 Builder.createSub(TC, R, DebugLoc::getCompilerGenerated(), "n.vec");
5170 VectorTC.replaceAllUsesWith(Res);
5171}
5172
5174 ElementCount VFEC) {
5175 // If VF and VFxUF have already been materialized (no remaining users),
5176 // there's nothing more to do.
5177 if (Plan.getVF().isMaterialized()) {
5178 assert(Plan.getVFxUF().isMaterialized() &&
5179 "VF and VFxUF must be materialized together");
5180 return;
5181 }
5182
5183 VPBuilder Builder(VectorPH, VectorPH->begin());
5184 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
5185 VPValue &VF = Plan.getVF();
5186 VPValue &VFxUF = Plan.getVFxUF();
5187 // If there are no users of the runtime VF, compute VFxUF by constant folding
5188 // the multiplication of VF and UF.
5189 if (VF.getNumUsers() == 0) {
5190 VPValue *RuntimeVFxUF =
5191 Builder.createElementCount(TCTy, VFEC * Plan.getConcreteUF());
5192 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
5193 return;
5194 }
5195
5196 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
5197 // vscale) * UF.
5198 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
// Broadcast the runtime VF for users that consume it as a vector; scalar
// users keep using the scalar value.
5200 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
5202 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
5203 }
5204 VF.replaceAllUsesWith(RuntimeVF);
5205
// VFxUF = RuntimeVF * UF, with nuw set on the multiply.
5206 VPValue *MulByUF = Builder.createOverflowingOp(
5207 Instruction::Mul,
5208 {RuntimeVF, Plan.getConstantInt(TCTy, Plan.getConcreteUF())},
5209 {true, false});
5210 VFxUF.replaceAllUsesWith(MulByUF);
5211}
5212
5215 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
5216
5217 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
5218 BasicBlock *EntryBB = Entry->getIRBasicBlock();
// Expand each VPExpandSCEVRecipe to IR in the entry block and replace the
// recipe with a live-in for the expanded value.
5219 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
5220 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5222 continue;
5223 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5224 if (!ExpSCEV)
5225 break;
5226 const SCEV *Expr = ExpSCEV->getSCEV();
5227 Value *Res =
5228 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
5229 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
5230 VPValue *Exp = Plan.getOrAddLiveIn(Res);
5231 ExpSCEV->replaceAllUsesWith(Exp);
// Keep the plan's trip count pointing at the expanded value if the recipe
// being erased was the trip count.
5232 if (Plan.getTripCount() == ExpSCEV)
5233 Plan.resetTripCount(Exp);
5234 ExpSCEV->eraseFromParent();
5235 }
5237 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
5238 "before any VPIRInstructions");
5239 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
5240 // to the VPIRBasicBlock.
5241 auto EI = Entry->begin();
5242 for (Instruction &I : drop_end(*EntryBB)) {
5243 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
5244 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
5245 EI++;
5246 continue;
5247 }
5249 }
5250
5251 return ExpandedSCEVs;
5252}
5253
5254/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
5255/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
5256/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
5257/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
5258/// an index-independent load if it feeds all wide ops at all indices (\p OpV
5259/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
5260/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
5261/// is defined at \p Idx of a load interleave group.
5262static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5263 VPValue *OpV, unsigned Idx, bool IsScalable) {
5264 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
5265 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5266 if (!Member0OpR)
5267 return Member0Op == OpV;
5268 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
5269 // For scalable VFs, the narrowed plan processes vscale iterations at once,
5270 // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
5271 return !IsScalable && !W->getMask() && W->isConsecutive() &&
5272 Member0Op == OpV;
5273 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
5274 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
5275 return false;
5276}
5277
5278static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
5280 auto *WideMember0 = dyn_cast<VPSingleDefRecipe>(Ops[0]);
5281 if (!WideMember0)
5282 return false;
// All members must be single-def recipes with the same opcode/intrinsic as
// the member at index 0.
5283 for (VPValue *V : Ops) {
5285 return false;
5286 auto *R = cast<VPSingleDefRecipe>(V);
5287 if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
5288 return false;
5289 }
5290
// Recurse into operands: for each operand index, the operands across all
// members must themselves be narrowable, or be narrowable loads/interleave
// results relative to WideMember0.
5291 for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
5293 for (VPValue *Op : Ops)
5294 OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));
5295
5296 if (canNarrowOps(OpsI, IsScalable))
5297 continue;
5298
5299 if (any_of(enumerate(OpsI), [WideMember0, Idx, IsScalable](const auto &P) {
5300 const auto &[OpIdx, OpV] = P;
5301 return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
5302 }))
5303 return false;
5304 }
5305
5306 return true;
5307}
5308
5309/// Returns VF from \p VFs if \p IR is a full interleave group with factor and
5310/// number of members both equal to VF. The interleave group must also access
5311/// the full vector width.
5312static std::optional<ElementCount> isConsecutiveInterleaveGroup(
5314 VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
5315 if (!InterleaveR || InterleaveR->getMask())
5316 return std::nullopt;
5317
// All members must share one element type; pick it from the loaded values
// (defined values) for a load group, or from the stored values otherwise.
5318 Type *GroupElementTy = nullptr;
5319 if (InterleaveR->getStoredValues().empty()) {
5320 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
5321 if (!all_of(InterleaveR->definedValues(),
5322 [&TypeInfo, GroupElementTy](VPValue *Op) {
5323 return TypeInfo.inferScalarType(Op) == GroupElementTy;
5324 }))
5325 return std::nullopt;
5326 } else {
5327 GroupElementTy =
5328 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
5329 if (!all_of(InterleaveR->getStoredValues(),
5330 [&TypeInfo, GroupElementTy](VPValue *Op) {
5331 return TypeInfo.inferScalarType(Op) == GroupElementTy;
5332 }))
5333 return std::nullopt;
5334 }
5335
5336 auto IG = InterleaveR->getInterleaveGroup();
5337 if (IG->getFactor() != IG->getNumMembers())
5338 return std::nullopt;
5339
5340 auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
5341 TypeSize Size = TTI.getRegisterBitWidth(
5344 assert(Size.isScalable() == VF.isScalable() &&
5345 "if Size is scalable, VF must be scalable and vice versa");
5346 return Size.getKnownMinValue();
5347 };
5348
// Return the VF whose member count matches the interleave factor and whose
// total group size fills the target's vector register width.
5349 for (ElementCount VF : VFs) {
5350 unsigned MinVal = VF.getKnownMinValue();
5351 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
5352 if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
5353 return {VF};
5354 }
5355 return std::nullopt;
5356}
5357
5358/// Returns true if \p VPValue is a narrow VPValue.
5359static bool isAlreadyNarrow(VPValue *VPV) {
5360 if (isa<VPIRValue>(VPV))
5361 return true;
5362 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
5363 return RepR && RepR->isSingleScalar();
5364}
5365
5366// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
5367// a narrow variant.
// Recursively narrows the defining recipe of \p V: already-narrow values are
// returned as-is, wide recipes get their operands narrowed in place, load
// interleave groups become a single consecutive wide load, and wide loads
// become uniform scalar loads. NarrowedOps memoizes converted values.
// NOTE(review): the signature/parameter line (original 5369) and the guard
// opening the first branch (original 5377, presumably a dyn_cast of R to a
// wide recipe) are missing from this extraction — confirm against upstream.
5368static VPValue *
5370 auto *R = V->getDefiningRecipe();
5371 if (!R || NarrowedOps.contains(V))
5372 return V;
5373
5374 if (isAlreadyNarrow(V))
5375 return V;
5376
// Wide recipe: narrow each operand recursively, keep the recipe itself.
5378 auto *WideMember0 = cast<VPSingleDefRecipe>(R);
5379 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
5380 WideMember0->setOperand(
5381 Idx,
5382 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
5383 return V;
5384 }
5385
5386 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
5387 // Narrow interleave group to wide load, as transformed VPlan will only
5388 // process one original iteration.
5389 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
5390 auto *L = new VPWidenLoadRecipe(
5391 *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
5392 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
5393 L->insertBefore(LoadGroup);
5394 NarrowedOps.insert(L);
5395 return L;
5396 }
5397
// A single-scalar replicated load is already narrow; just memoize it.
5398 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
5399 assert(RepR->isSingleScalar() &&
5400 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
5401 "must be a single scalar load");
5402 NarrowedOps.insert(RepR);
5403 return RepR;
5404 }
5405
5406 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
5407 VPValue *PtrOp = WideLoad->getAddr();
5408 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
5409 PtrOp = VecPtr->getOperand(0);
5410 // Narrow wide load to uniform scalar load, as transformed VPlan will only
5411 // process one original iteration.
5412 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
5413 /*IsUniform*/ true,
5414 /*Mask*/ nullptr, {}, *WideLoad);
5415 N->insertBefore(WideLoad);
5416 NarrowedOps.insert(N);
5417 return N;
5418}
5419
// Narrow interleave groups in \p Plan to wide loads/stores when every group
// is full, consecutive, and saturates the vector width for a single VF.
// Returns a clone of the Plan that keeps the remaining VFs (or nullptr when
// the transform does not apply); the original Plan is reduced to the single
// narrowed VF and its induction step is adjusted to process one original
// iteration per vector iteration.
// NOTE(review): several original lines are missing from this extraction
// (function name/parameter line 5421, loop-body guards 5446/5449-5450, the
// StoreGroups declaration 5443, the VFs declaration 5473, and parts of the
// asserts and preheader ops) — confirm against upstream.
5420 std::unique_ptr<VPlan>
5422 const TargetTransformInfo &TTI) {
5423 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5424
5425 if (!VectorLoop)
5426 return nullptr;
5427
5428 // Only handle single-block loops for now.
5429 if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
5430 return nullptr;
5431
5432 // Skip plans when we may not be able to properly narrow.
5433 VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
5434 if (!match(&Exiting->back(), m_BranchOnCount()))
5435 return nullptr;
5436
5437 assert(match(&Exiting->back(),
5439 m_Specific(&Plan.getVectorTripCount()))) &&
5440 "unexpected branch-on-count");
5441
5442 VPTypeAnalysis TypeInfo(Plan);
5444 std::optional<ElementCount> VFToOptimize;
// Scan all recipes: reject unsupported ones, and collect store interleave
// groups whose operands can be narrowed, while settling on a single VF.
5445 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
5447 continue;
5448
5451 continue;
5452
5453 // Bail out on recipes not supported at the moment:
5454 // * phi recipes other than the canonical induction
5455 // * recipes writing to memory except interleave groups
5456 // Only support plans with a canonical induction phi.
5457 if (R.isPhi())
5458 return nullptr;
5459
5460 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
5461 if (R.mayWriteToMemory() && !InterleaveR)
5462 return nullptr;
5463
5464 // All other ops are allowed, but we reject uses that cannot be converted
5465 // when checking all allowed consumers (store interleave groups) below.
5466 if (!InterleaveR)
5467 continue;
5468
5469 // Try to find a single VF, where all interleave groups are consecutive and
5470 // saturate the full vector width. If we already have a candidate VF, check
5471 // if it is applicable for the current InterleaveR, otherwise look for a
5472 // suitable VF across the Plan's VFs.
5474 VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
5475 : to_vector(Plan.vectorFactors())
5476 std::optional<ElementCount> NarrowedVF =
5477 isConsecutiveInterleaveGroup(InterleaveR, VFs, TypeInfo, TTI);
5478 if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
5479 return nullptr;
5480 VFToOptimize = NarrowedVF;
5481
5482 // Skip read interleave groups.
5483 if (InterleaveR->getStoredValues().empty())
5484 continue;
5485
5486 // Narrow interleave groups, if all operands are already matching narrow
5487 // ops.
5488 auto *Member0 = InterleaveR->getStoredValues()[0];
5489 if (isAlreadyNarrow(Member0) &&
5490 all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
5491 StoreGroups.push_back(InterleaveR);
5492 continue;
5493 }
5494
5495 // For now, we only support full interleave groups storing load interleave
5496 // groups.
5497 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
5498 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
5499 if (!DefR)
5500 return false;
5501 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
5502 return IR && IR->getInterleaveGroup()->isFull() &&
5503 IR->getVPValue(Op.index()) == Op.value();
5504 })) {
5505 StoreGroups.push_back(InterleaveR);
5506 continue;
5507 }
5508
5509 // Check if all values feeding InterleaveR are matching wide recipes, which
5510 // operands that can be narrowed.
5511 if (!canNarrowOps(InterleaveR->getStoredValues(),
5512 VFToOptimize->isScalable()))
5513 return nullptr;
5514 StoreGroups.push_back(InterleaveR);
5515 }
5516
5517 if (StoreGroups.empty())
5518 return nullptr;
5519
5520 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5521 bool RequiresScalarEpilogue =
5522 MiddleVPBB->getNumSuccessors() == 1 &&
5523 MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
5524 // Bail out for tail-folding (middle block with a single successor to exit).
5525 if (MiddleVPBB->getNumSuccessors() != 2 && !RequiresScalarEpilogue)
5526 return nullptr;
5527
5528 // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
5529 // original Plan into 2: a) a new clone which contains all VFs of Plan, except
5530 // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
5531 // TODO: Handle cases where only some interleave groups can be narrowed.
5532 std::unique_ptr<VPlan> NewPlan;
5533 if (size(Plan.vectorFactors()) != 1) {
5534 NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
5535 Plan.setVF(*VFToOptimize);
5536 NewPlan->removeVF(*VFToOptimize);
5537 }
5538
5539 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
5540 SmallPtrSet<VPValue *, 4> NarrowedOps;
5541 // Narrow operation tree rooted at store groups.
5542 for (auto *StoreGroup : StoreGroups) {
5543 VPValue *Res =
5544 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
5545 auto *SI =
5546 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
5547 auto *S = new VPWidenStoreRecipe(
5548 *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
5549 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
5550 S->insertBefore(StoreGroup);
5551 StoreGroup->eraseFromParent();
5552 }
5553
5554 // Adjust induction to reflect that the transformed plan only processes one
5555 // original iteration.
5556 auto *CanIV = VectorLoop->getCanonicalIV();
5557 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
5558 VPBasicBlock *VectorPH = Plan.getVectorPreheader();
5559 VPBuilder PHBuilder(VectorPH, VectorPH->begin());
5560
// For a scalable VF the narrowed plan advances by vscale * UF per iteration;
// for a fixed VF the step is just UF.
5561 VPValue *UF = &Plan.getUF();
5562 VPValue *Step;
5563 if (VFToOptimize->isScalable()) {
5564 VPValue *VScale = PHBuilder.createElementCount(
5566 Step = PHBuilder.createOverflowingOp(Instruction::Mul, {VScale, UF},
5567 {true, false});
5568 Plan.getVF().replaceAllUsesWith(VScale);
5569 } else {
5570 Step = UF;
5572 Plan.getConstantInt(CanIV->getScalarType(), 1));
5573 }
5574 // Materialize vector trip count with the narrowed step.
5575 materializeVectorTripCount(Plan, VectorPH, /*TailByMasking=*/false,
5576 RequiresScalarEpilogue, Step);
5577
5578 Inc->setOperand(1, Step);
5579 Plan.getVFxUF().replaceAllUsesWith(Step);
5580
5581 removeDeadRecipes(Plan);
5582 assert(none_of(*VectorLoop->getEntryBasicBlock(),
5584 "All VPVectorPointerRecipes should have been removed");
5585 return NewPlan;
5586}
5587
5588/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5589/// BranchOnCond recipe.
// Weights are {1, VectorStep - 1}: the middle-block branch to the scalar loop
// is assumed taken once per VectorStep iterations on average, where
// VectorStep = concrete UF * known-min VF (scaled by VScaleForTuning when
// the VF is scalable).
// NOTE(review): the function name line (original 5590) and the initializer of
// MiddleTerm (original 5594) are missing from this extraction.
5591 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5592 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5593 auto *MiddleTerm =
5595 // Only add branch metadata if there is a (conditional) terminator.
5596 if (!MiddleTerm)
5597 return;
5598
5599 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5600 "must have a BranchOnCond");
5601 // Assume that `TripCount % VectorStep ` is equally distributed.
5602 unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
5603 if (VF.isScalable() && VScaleForTuning.has_value())
5604 VectorStep *= *VScaleForTuning;
5605 assert(VectorStep > 0 && "trip count should not be zero");
5606 MDBuilder MDB(Plan.getContext());
5607 MDNode *BranchWeights =
5608 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
5609 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
5610}
5611
// Second phase of first-order recurrence vectorization: for each
// VPFirstOrderRecurrencePHIRecipe, create an ExtractPenultimateElement in the
// middle block and rewire LCSSA exit phis to use it (see the worked example
// in the comments below). Bails out of the whole transform for VF vscale x 1,
// where the penultimate element cannot be extracted.
// NOTE(review): the function name line (original 5612) and a few guards in
// the middle-block loop (originals 5700/5702/5710) are missing from this
// extraction — confirm against upstream.
5613 VFRange &Range) {
5614 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5615 auto *MiddleVPBB = Plan.getMiddleBlock();
5616 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5617
5618 auto IsScalableOne = [](ElementCount VF) -> bool {
5619 return VF == ElementCount::getScalable(1);
5620 };
5621
5622 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5623 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5624 if (!FOR)
5625 continue;
5626
5627 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5628 "Cannot handle loops with uncountable early exits");
5629
5630 // This is the second phase of vectorizing first-order recurrences, creating
5631 // extract for users outside the loop. An overview of the transformation is
5632 // described below. Suppose we have the following loop with some use after
5633 // the loop of the last a[i-1],
5634 //
5635 // for (int i = 0; i < n; ++i) {
5636 // t = a[i - 1];
5637 // b[i] = a[i] - t;
5638 // }
5639 // use t;
5640 //
5641 // There is a first-order recurrence on "a". For this loop, the shorthand
5642 // scalar IR looks like:
5643 //
5644 // scalar.ph:
5645 // s.init = a[-1]
5646 // br scalar.body
5647 //
5648 // scalar.body:
5649 // i = phi [0, scalar.ph], [i+1, scalar.body]
5650 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5651 // s2 = a[i]
5652 // b[i] = s2 - s1
5653 // br cond, scalar.body, exit.block
5654 //
5655 // exit.block:
5656 // use = lcssa.phi [s1, scalar.body]
5657 //
5658 // In this example, s1 is a recurrence because it's value depends on the
5659 // previous iteration. In the first phase of vectorization, we created a
5660 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5661 // for users in the scalar preheader and exit block.
5662 //
5663 // vector.ph:
5664 // v_init = vector(..., ..., ..., a[-1])
5665 // br vector.body
5666 //
5667 // vector.body
5668 // i = phi [0, vector.ph], [i+4, vector.body]
5669 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5670 // v2 = a[i, i+1, i+2, i+3]
5671 // b[i] = v2 - v1
5672 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
5673 // b[i, i+1, i+2, i+3] = v2 - v1
5674 // br cond, vector.body, middle.block
5675 //
5676 // middle.block:
5677 // vector.recur.extract.for.phi = v2(2)
5678 // vector.recur.extract = v2(3)
5679 // br cond, scalar.ph, exit.block
5680 //
5681 // scalar.ph:
5682 // scalar.recur.init = phi [vector.recur.extract, middle.block],
5683 // [s.init, otherwise]
5684 // br scalar.body
5685 //
5686 // scalar.body:
5687 // i = phi [0, scalar.ph], [i+1, scalar.body]
5688 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5689 // s2 = a[i]
5690 // b[i] = s2 - s1
5691 // br cond, scalar.body, exit.block
5692 //
5693 // exit.block:
5694 // lo = lcssa.phi [s1, scalar.body],
5695 // [vector.recur.extract.for.phi, middle.block]
5696 //
5697 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
5698 // Extract the penultimate value of the recurrence and use it as operand for
5699 // the VPIRInstruction modeling the phi.
5701 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
5703 continue;
5704
5705 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5706 // penultimate value of the recurrence. Instead we rely on the existing
5707 // extract of the last element from the result of
5708 // VPInstruction::FirstOrderRecurrenceSplice.
5709 // TODO: Consider vscale_range info and UF.
5711 Range))
5712 return;
5713 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5714 VPInstruction::ExtractPenultimateElement, FOR->getBackedgeValue(), {},
5715 "vector.recur.extract.for.phi");
5716 for (VPUser *U : to_vector(cast<VPInstruction>(&R)->users())) {
5717 auto *ExitPhi = dyn_cast<VPIRPhi>(U);
5718 if (!ExitPhi)
5719 continue;
5720 ExitPhi->replaceUsesOfWith(cast<VPInstruction>(&R), PenultimateElement);
5721 }
5722 }
5723 }
5724}
5725
5726/// Check if \p V is a binary expression of a widened IV and a loop-invariant
5727/// value. Returns the widened IV if found, nullptr otherwise.
// Integer div/rem are excluded (cannot be hoisted/sunk safely); the widened
// IV may appear as either operand.
// NOTE(review): the signature line (original 5728) is missing from this
// extraction — presumably `static VPWidenIntOrFpInductionRecipe *
// getExpressionIV(VPValue *V) {`; confirm against upstream.
5729 auto *BinOp = dyn_cast<VPWidenRecipe>(V);
5730 if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()) ||
5731 Instruction::isIntDivRem(BinOp->getOpcode()))
5732 return nullptr;
5733
// Canonicalize so WidenIVCandidate holds the (potential) widened induction.
5734 VPValue *WidenIVCandidate = BinOp->getOperand(0);
5735 VPValue *InvariantCandidate = BinOp->getOperand(1);
5736 if (!isa<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate))
5737 std::swap(WidenIVCandidate, InvariantCandidate);
5738
5739 if (!InvariantCandidate->isDefinedOutsideLoopRegions())
5740 return nullptr;
5741
5742 return dyn_cast<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate);
5743}
5744
5745/// Create a scalar version of \p BinOp, with its \p WidenIV operand replaced
5746/// by \p ScalarIV, and place it after \p ScalarIV's defining recipe.
// NOTE(review): the signature lines (originals 5747-5749, including the start
// of the assert whose tail appears below) are missing from this extraction —
// confirm against upstream.
5750 BinOp->getNumOperands() == 2 && "BinOp must have 2 operands");
5751 auto *ClonedOp = BinOp->clone();
// Substitute ScalarIV for whichever operand slot held the widened IV.
5752 if (ClonedOp->getOperand(0) == WidenIV) {
5753 ClonedOp->setOperand(0, ScalarIV);
5754 } else {
5755 assert(ClonedOp->getOperand(1) == WidenIV && "one operand must be WideIV");
5756 ClonedOp->setOperand(1, ScalarIV);
5757 }
5758 ClonedOp->insertAfter(ScalarIV->getDefiningRecipe());
5759 return ClonedOp;
5760}
5761
// Transforms find-last-IV style reductions into FindIV reduction phis: the
// selected IV values are reduced with a signed/unsigned min/max, either using
// a sentinel value the IV range provably excludes, or — when no sentinel
// exists — an auxiliary boolean AnyOf reduction that records whether the
// condition was ever true. Simple binop-of-IV expressions may instead be
// sunk to the middle block and recomputed from the reduced scalar IV.
// NOTE(review): the function name/parameter lines (originals 5762-5763) and
// several body lines (e.g. 5790, 5828, 5881/5883, 5917) are missing from
// this extraction — confirm against upstream.
5764 Loop &L) {
5765 ScalarEvolution &SE = *PSE.getSE();
5766 VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();
5767
5768 // Helper lambda to check if the IV range excludes the sentinel value. Try
5769 // signed first, then unsigned. Return an excluded sentinel if found,
5770 // otherwise return std::nullopt.
5771 auto CheckSentinel = [&SE](const SCEV *IVSCEV,
5772 bool UseMax) -> std::optional<APSInt> {
5773 unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
5774 for (bool Signed : {true, false}) {
5775 APSInt Sentinel = UseMax ? APSInt::getMinValue(BW, /*Unsigned=*/!Signed)
5776 : APSInt::getMaxValue(BW, /*Unsigned=*/!Signed);
5777
5778 ConstantRange IVRange =
5779 Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
5780 if (!IVRange.contains(Sentinel))
5781 return Sentinel;
5782 }
5783 return std::nullopt;
5784 };
5785
5786 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
5787 for (VPRecipeBase &Phi :
5788 make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
5789 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
5791 PhiR->getRecurrenceKind()))
5792 continue;
5793
// Only integer IVs are handled; pointer/FP phis are skipped.
5794 Type *PhiTy = VPTypeAnalysis(Plan).inferScalarType(PhiR);
5795 if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
5796 continue;
5797
5798 // If there's a header mask, the backedge select will not be the find-last
5799 // select.
5800 VPValue *BackedgeVal = PhiR->getBackedgeValue();
5801 auto *FindLastSelect = cast<VPSingleDefRecipe>(BackedgeVal);
5802 if (HeaderMask &&
5803 !match(BackedgeVal,
5804 m_Select(m_Specific(HeaderMask),
5805 m_VPSingleDefRecipe(FindLastSelect), m_Specific(PhiR))))
5806 llvm_unreachable("expected header mask select");
5807
5808 // Get the find-last expression from the find-last select of the reduction
5809 // phi. The find-last select should be a select between the phi and the
5810 // find-last expression.
5811 VPValue *Cond, *FindLastExpression;
5812 if (!match(FindLastSelect, m_Select(m_VPValue(Cond), m_Specific(PhiR),
5813 m_VPValue(FindLastExpression))) &&
5814 !match(FindLastSelect,
5815 m_Select(m_VPValue(Cond), m_VPValue(FindLastExpression),
5816 m_Specific(PhiR))))
5817 continue;
5818
5819 // Check if FindLastExpression is a simple expression of a widened IV. If
5820 // so, we can track the underlying IV instead and sink the expression.
5821 auto *IVOfExpressionToSink = getExpressionIV(FindLastExpression);
5822 const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(
5823 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression, PSE,
5824 &L);
5825 const SCEV *Step;
5826 if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
5827 assert(!match(vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L),
5829 "IVOfExpressionToSink not being an AddRec must imply "
5830 "FindLastExpression not being an AddRec.");
5831 continue;
5832 }
5833
5834 // Determine direction from SCEV step.
5835 if (!SE.isKnownNonZero(Step))
5836 continue;
5837
5838 // Positive step means we need UMax/SMax to find the last IV value, and
5839 // UMin/SMin otherwise.
5840 bool UseMax = SE.isKnownPositive(Step);
5841 std::optional<APSInt> SentinelVal = CheckSentinel(IVSCEV, UseMax);
5842 bool UseSigned = SentinelVal && SentinelVal->isSigned();
5843
5844 // Sinking an expression will disable epilogue vectorization. Only use it,
5845 // if FindLastExpression cannot be vectorized via a sentinel. Sinking may
5846 // also prevent vectorizing using a sentinel (e.g., if the expression is a
5847 // multiply or divide by large constant, respectively), which also makes
5848 // sinking undesirable.
5849 if (IVOfExpressionToSink) {
5850 const SCEV *FindLastExpressionSCEV =
5851 vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L);
5852 if (match(FindLastExpressionSCEV,
5853 m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
5854 bool NewUseMax = SE.isKnownPositive(Step);
5855 if (auto NewSentinel =
5856 CheckSentinel(FindLastExpressionSCEV, NewUseMax)) {
5857 // The original expression already has a sentinel, so prefer not
5858 // sinking to keep epilogue vectorization possible.
5859 SentinelVal = *NewSentinel;
5860 UseSigned = NewSentinel->isSigned();
5861 UseMax = NewUseMax;
5862 IVSCEV = FindLastExpressionSCEV;
5863 IVOfExpressionToSink = nullptr;
5864 }
5865 }
5866 }
5867
5868 // If no sentinel was found, fall back to a boolean AnyOf reduction to track
5869 // if the condition was ever true. Requires the IV to not wrap, otherwise we
5870 // cannot use min/max.
5871 if (!SentinelVal) {
5872 auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
5873 if (AR->hasNoSignedWrap())
5874 UseSigned = true;
5875 else if (AR->hasNoUnsignedWrap())
5876 UseSigned = false;
5877 else
5878 continue;
5879 }
5880
5882 BackedgeVal,
5884
5885 VPValue *NewFindLastSelect = BackedgeVal;
5886 VPValue *SelectCond = Cond;
5887 if (!SentinelVal || IVOfExpressionToSink) {
5888 // When we need to create a new select, normalize the condition so that
5889 // PhiR is the last operand and include the header mask if needed.
5890 DebugLoc DL = FindLastSelect->getDefiningRecipe()->getDebugLoc();
5891 VPBuilder LoopBuilder(FindLastSelect->getDefiningRecipe());
5892 if (FindLastSelect->getDefiningRecipe()->getOperand(1) == PhiR)
5893 SelectCond = LoopBuilder.createNot(SelectCond);
5894
5895 // When tail folding, mask the condition with the header mask to prevent
5896 // propagating poison from inactive lanes in the last vector iteration.
5897 if (HeaderMask)
5898 SelectCond = LoopBuilder.createLogicalAnd(HeaderMask, SelectCond);
5899
5900 if (SelectCond != Cond || IVOfExpressionToSink) {
5901 NewFindLastSelect = LoopBuilder.createSelect(
5902 SelectCond,
5903 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression,
5904 PhiR, DL);
5905 }
5906 }
5907
5908 // Create the reduction result in the middle block using sentinel directly.
5909 RecurKind MinMaxKind =
5910 UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
5911 : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
5912 VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
5913 FastMathFlags());
5914 DebugLoc ExitDL = RdxResult->getDebugLoc();
5915 VPBuilder MiddleBuilder(RdxResult);
5916 VPValue *ReducedIV =
5918 NewFindLastSelect, Flags, ExitDL);
5919
5920 // If IVOfExpressionToSink is an expression to sink, sink it now.
5921 VPValue *VectorRegionExitingVal = ReducedIV;
5922 if (IVOfExpressionToSink)
5923 VectorRegionExitingVal =
5924 cloneBinOpForScalarIV(cast<VPWidenRecipe>(FindLastExpression),
5925 ReducedIV, IVOfExpressionToSink);
5926
5927 VPValue *NewRdxResult;
5928 VPValue *StartVPV = PhiR->getStartValue();
5929 if (SentinelVal) {
5930 // Sentinel-based approach: reduce IVs with min/max, compare against
5931 // sentinel to detect if condition was ever true, select accordingly.
5932 VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
5933 auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
5934 Sentinel, ExitDL);
5935 NewRdxResult = MiddleBuilder.createSelect(Cmp, VectorRegionExitingVal,
5936 StartVPV, ExitDL);
5937 StartVPV = Sentinel;
5938 } else {
5939 // Introduce a boolean AnyOf reduction to track if the condition was ever
5940 // true in the loop. Use it to select the initial start value, if it was
5941 // never true.
5942 auto *AnyOfPhi = new VPReductionPHIRecipe(
5943 /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
5944 RdxUnordered{1}, {}, /*HasUsesOutsideReductionChain=*/false);
5945 AnyOfPhi->insertAfter(PhiR);
5946
5947 VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
5948 VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, SelectCond);
5949 AnyOfPhi->setOperand(1, OrVal);
5950
5951 VPIRFlags OrFlags(RecurKind::Or, /*IsOrdered=*/false,
5952 /*IsInLoop=*/false, FastMathFlags());
5953 auto *OrReduce = MiddleBuilder.createNaryOp(
5954 VPInstruction::ComputeReductionResult, {OrVal}, OrFlags, ExitDL);
5955 NewRdxResult = MiddleBuilder.createNaryOp(
5957 {StartVPV, VectorRegionExitingVal, OrReduce}, {}, ExitDL);
5958
5959 // Initialize the IV reduction phi with the neutral element, not the
5960 // original start value, to ensure correct min/max reduction results.
5961 StartVPV = Plan.getOrAddLiveIn(
5962 getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
5963 }
5964 RdxResult->replaceAllUsesWith(NewRdxResult);
5965 RdxResult->eraseFromParent();
5966
// Swap in the FindIV reduction phi for the original reduction phi.
5967 auto *NewPhiR = new VPReductionPHIRecipe(
5968 cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
5969 *NewFindLastSelect, RdxUnordered{1}, {},
5970 PhiR->hasUsesOutsideReductionChain());
5971 NewPhiR->insertBefore(PhiR);
5972 PhiR->replaceAllUsesWith(NewPhiR);
5973 PhiR->eraseFromParent();
5974 }
5975}
5976
5977namespace {
5978
5979using ExtendKind = TTI::PartialReductionExtendKind;
/// Describes a single extend feeding a partial reduction: the scalar type of
/// the value before extension and the kind of extension applied (defaults to
/// "no extend").
5980 struct ReductionExtend {
// Scalar type of the value before it is extended.
5981 Type *SrcType = nullptr;
// Kind of extension (sign/zero/fp or PR_None when absent).
5982 ExtendKind Kind = ExtendKind::PR_None;
5983 };
5984
5985/// Describes the extends used to compute the extended reduction operand.
5986/// ExtendB is optional. If ExtendB is present, ExtendsUser is a binary
5987/// operation.
// (If only ExtendA is present, ExtendsUser consumes a single extend.)
5988 struct ExtendedReductionOperand {
5989 /// The recipe that consumes the extends.
5990 VPWidenRecipe *ExtendsUser = nullptr;
5991 /// Extend descriptions (inputs to getPartialReductionCost).
// ExtendB.Kind == ExtendKind::PR_None means no second extend is involved.
5992 ReductionExtend ExtendA, ExtendB;
5993 };
5994
5995/// A chain of recipes that form a partial reduction. Matches either
5996/// reduction_bin_op (extend (A), accumulator), or
5997/// reduction_bin_op (bin_op (extend (A), (extend (B))), accumulator).
5998 struct VPPartialReductionChain {
5999 /// The top-level binary operation that forms the reduction to a scalar
6000 /// after the loop body.
6001 VPWidenRecipe *ReductionBinOp = nullptr;
6002 /// The user of the extends that is then reduced.
6003 ExtendedReductionOperand ExtendedOp;
// VF scale factor applied when materializing the partial reduction; it is
// passed as RdxUnordered's VFScaleFactor and set on the reduction phi.
6004 unsigned ScaleFactor;
6005 /// The recurrence kind for the entire partial reduction chain.
6006 /// This allows distinguishing between Sub and AddWithSub recurrences,
6007 /// when the ReductionBinOp is a Instruction::Sub.
6008 RecurKind RK;
6009 };
6010
// Rewrites the multiply feeding a partial reduction so both operands are
// explicit extends of narrow values, which the partial-reduction cost/codegen
// can exploit. Returns the (possibly replaced) recipe to reduce.
// NOTE(review): two original lines are missing from this extraction — the
// second half of the first guard (original 6022, presumably a TTI
// profitability query taking Const/NarrowTy) and the match opening the
// second pattern (original 6037) — confirm against upstream.
6011 static VPSingleDefRecipe *
6012 optimizeExtendsForPartialReduction(VPSingleDefRecipe *BinOp,
6013 VPTypeAnalysis &TypeInfo) {
6014 // reduce.add(mul(ext(A), C))
6015 // -> reduce.add(mul(ext(A), ext(trunc(C))))
6016 const APInt *Const;
6017 if (match(BinOp, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(Const)))) {
6018 auto *ExtA = cast<VPWidenCastRecipe>(BinOp->getOperand(0));
6019 Instruction::CastOps ExtOpc = ExtA->getOpcode();
6020 Type *NarrowTy = TypeInfo.inferScalarType(ExtA->getOperand(0));
6021 if (!BinOp->hasOneUse() ||
6023 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
6024 return BinOp;
6025
// Re-express the constant as ext(trunc(C)) with the same extend kind as A.
6026 VPBuilder Builder(BinOp);
6027 auto *Trunc = Builder.createWidenCast(Instruction::CastOps::Trunc,
6028 BinOp->getOperand(1), NarrowTy);
6029 Type *WideTy = TypeInfo.inferScalarType(ExtA);
6030 BinOp->setOperand(1, Builder.createWidenCast(ExtOpc, Trunc, WideTy));
6031 return BinOp;
6032 }
6033
6034 // reduce.add(ext(mul(ext(A), ext(B))))
6035 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
6036 // TODO: Support this optimization for float types.
6038 m_ZExtOrSExt(m_VPValue()))))) {
6039 auto *Ext = cast<VPWidenCastRecipe>(BinOp);
6040 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
6041 auto *MulLHS = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6042 auto *MulRHS = cast<VPWidenCastRecipe>(Mul->getOperand(1));
// Only fold when the inner extends agree with each other (and with the
// outer extend, unless LHS and RHS are the same value).
6043 if (!Mul->hasOneUse() ||
6044 (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
6045 MulLHS->getOpcode() != MulRHS->getOpcode())
6046 return BinOp;
6047 VPBuilder Builder(Mul);
6048 Mul->setOperand(0, Builder.createWidenCast(MulLHS->getOpcode(),
6049 MulLHS->getOperand(0),
6050 Ext->getResultType()));
6051 Mul->setOperand(1, MulLHS == MulRHS
6052 ? Mul->getOperand(0)
6053 : Builder.createWidenCast(MulRHS->getOpcode(),
6054 MulRHS->getOperand(0),
6055 Ext->getResultType()));
6056 return Mul;
6057 }
6058
6059 return BinOp;
6060}
6061
6062 // Helper to transform a partial reduction chain into a partial reduction
6063 // recipe. Assumes profitability has been checked.
// Replaces Chain.ReductionBinOp with a VPReductionRecipe scaled by
// Chain.ScaleFactor; for Sub chains the reduction stays an add and a final
// subtract is emitted in the middle block.
// NOTE(review): several original lines are missing from this extraction
// (e.g. the condition opening the accumulator swap at 6074, the debug-loc
// argument at 6100, the ExitValue match at 6111, the RdxKind initializer at
// 6121, and the RdxResult lookup at 6158) — confirm against upstream.
6064 static void transformToPartialReduction(const VPPartialReductionChain &Chain,
6065 VPTypeAnalysis &TypeInfo, VPlan &Plan,
6066 VPReductionPHIRecipe *RdxPhi) {
6067 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
6068 assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
6069
6070 VPValue *BinOpVal = WidenRecipe->getOperand(0);
6071 VPValue *Accumulator = WidenRecipe->getOperand(1);
6072
6073 // Swap if needed to ensure Accumulator is the PHI or partial reduction.
6075 isa<VPExpressionRecipe>(BinOpVal))
6076 std::swap(BinOpVal, Accumulator);
6077 auto *BinOp = cast<VPSingleDefRecipe>(BinOpVal->getDefiningRecipe());
6078
6079 // Sub-reductions can be implemented in two ways:
6080 // (1) negate the operand in the vector loop (the default way).
6081 // (2) subtract the reduced value from the init value in the middle block.
6082 // Both ways keep the reduction itself as an 'add' reduction.
6083 //
6084 // The ISD nodes for partial reductions don't support folding the
6085 // sub/negation into its operands because the following is not a valid
6086 // transformation:
6087 // sub(0, mul(ext(a), ext(b)))
6088 // -> mul(ext(a), ext(sub(0, b)))
6089 //
6090 // It's therefore better to choose option (2) such that the partial
6091 // reduction is always positive (starting at '0') and to do a final
6092 // subtract in the middle block.
6093 if (WidenRecipe->getOpcode() == Instruction::Sub &&
6094 Chain.RK != RecurKind::Sub) {
6095 VPBuilder Builder(WidenRecipe);
6096 Type *ElemTy = TypeInfo.inferScalarType(BinOp);
6097 auto *Zero = Plan.getZero(ElemTy);
6098 auto *NegRecipe =
6099 new VPWidenRecipe(Instruction::Sub, {Zero, BinOp}, VPIRFlags(),
6101 Builder.insert(NegRecipe);
6102 BinOp = NegRecipe;
6103 }
6104
6105 // FIXME: Do these transforms before invoking the cost-model.
6106 BinOp = optimizeExtendsForPartialReduction(BinOp, TypeInfo);
6107
6108 // Check if WidenRecipe is the final result of the reduction. If so look
6109 // through selects for predicated reductions.
6110 VPValue *Cond = nullptr;
6112 WidenRecipe,
6113 m_Select(m_VPValue(Cond), m_Specific(WidenRecipe), m_Specific(RdxPhi))));
6114 bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
6115 RdxPhi->getBackedgeValue() == ExitValue;
6116 assert((!ExitValue || IsLastInChain) &&
6117 "if we found ExitValue, it must match RdxPhi's backedge value");
6118
6119 Type *PhiType = TypeInfo.inferScalarType(RdxPhi);
6120 RecurKind RdxKind =
6122 auto *PartialRed = new VPReductionRecipe(
6123 RdxKind,
6124 RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlags()
6125 : FastMathFlags(),
6126 WidenRecipe->getUnderlyingInstr(), Accumulator, BinOp, Cond,
6127 RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
6128 PartialRed->insertBefore(WidenRecipe);
6129
6130 if (Cond)
6131 ExitValue->replaceAllUsesWith(PartialRed);
6132 WidenRecipe->replaceAllUsesWith(PartialRed);
6133
6134 // We only need to update the PHI node once, which is when we find the
6135 // last reduction in the chain.
6136 if (!IsLastInChain)
6137 return;
6138
6139 // Scale the PHI and ReductionStartVector by the VFScaleFactor
6140 assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
6141 RdxPhi->setVFScaleFactor(Chain.ScaleFactor);
6142
6143 auto *StartInst = cast<VPInstruction>(RdxPhi->getStartValue());
6144 assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
6145 auto *NewScaleFactor = Plan.getConstantInt(32, Chain.ScaleFactor);
6146 StartInst->setOperand(2, NewScaleFactor);
6147
6148 // If this is the last value in a sub-reduction chain, then update the PHI
6149 // node to start at `0` and update the reduction-result to subtract from
6150 // the PHI's start value.
6151 if (Chain.RK != RecurKind::Sub)
6152 return;
6153
6154 VPValue *OldStartValue = StartInst->getOperand(0);
6155 StartInst->setOperand(0, StartInst->getOperand(1));
6156
6157 // Replace reduction_result by 'sub (startval, reductionresult)'.
6159 assert(RdxResult && "Could not find reduction result");
6160
6161 VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult);
6162 constexpr unsigned SubOpc = Instruction::BinaryOps::Sub;
6163 VPInstruction *NewResult = Builder.createNaryOp(
6164 SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc),
6165 RdxPhi->getDebugLoc());
// Redirect all users of the old result except the new subtract itself.
6166 RdxResult->replaceUsesWithIf(
6167 NewResult,
6168 [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
6169}
6170
6171 /// Returns the cost of a link in a partial-reduction chain for a given VF.
/// The cost is queried from TTI::getPartialReductionCost using the source
/// types and extend kinds of the link's (up to two) extended operands.
6172 static InstructionCost
6173 getPartialReductionLinkCost(VPCostContext &CostCtx,
6174 const VPPartialReductionChain &Link,
6175 ElementCount VF) {
// Scalar type of the reduction accumulator, inferred from the update op.
6176 Type *RdxType = CostCtx.Types.inferScalarType(Link.ReductionBinOp);
6177 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6178 std::optional<unsigned> BinOpc = std::nullopt;
6179 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6180 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6181 BinOpc = ExtendedOp.ExtendsUser->getOpcode();
6182
// Fast-math flags are only meaningful for floating-point reductions.
6183 std::optional<llvm::FastMathFlags> Flags;
6184 if (RdxType->isFloatingPointTy())
6185 Flags = Link.ReductionBinOp->getFastMathFlags();
6186
// Sub reductions are costed as an Add partial reduction: the negation is
// matched on the operand side (see the neg(...) handling in
// matchExtendedReductionOperand).
6187 unsigned Opcode = Link.RK == RecurKind::Sub
6188 ? (unsigned)Instruction::Add
6189 : Link.ReductionBinOp->getOpcode();
6190 return CostCtx.TTI.getPartialReductionCost(
6191 Opcode, ExtendedOp.ExtendA.SrcType, ExtendedOp.ExtendB.SrcType, RdxType,
6192 VF, ExtendedOp.ExtendA.Kind, ExtendedOp.ExtendB.Kind, BinOpc,
6193 CostCtx.CostKind, Flags);
6194}
6195
// Maps a widening cast recipe (zext/sext/fpext) to the corresponding TTI
// partial-reduction extend kind; callers compare the result against
// TTI::PartialReductionExtendKind values.
// NOTE(review): the function body (listing line 6197) is elided in this
// rendered page — confirm the exact mapping against the original source.
6196 static ExtendKind getPartialReductionExtendKind(VPWidenCastRecipe *Cast) {
6198 }
6199
6200 /// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
6201 /// operand. This is an operand where the source of the value (e.g. a load) has
6202 /// been extended (sext, zext, or fpext) before it is used in the reduction.
6203 ///
6204 /// Possible forms matched by this function:
6205 /// - UpdateR(PrevValue, ext(...))
6206 /// - UpdateR(PrevValue, BinOp(ext(...), ext(...)))
6207 /// - UpdateR(PrevValue, BinOp(ext(...), Constant))
6208 /// - UpdateR(PrevValue, neg(BinOp(ext(...), ext(...))))
6209 /// - UpdateR(PrevValue, neg(BinOp(ext(...), Constant)))
6210 /// - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
6211 /// - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
6212 ///
6213 /// Note: The second operand of UpdateR corresponds to \p Op in the examples.
/// Returns std::nullopt when none of the forms match.
6214 static std::optional<ExtendedReductionOperand>
6215 matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op,
6216 VPTypeAnalysis &TypeInfo) {
6217 assert(is_contained(UpdateR->operands(), Op) &&
6218 "Op should be operand of UpdateR");
6219
6220 std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
// NOTE(review): listing line 6221 — the condition opening this block (closed
// at listing line 6243; presumably a check that Op is a widening cast) — is
// elided in this rendered page; confirm against the original source.
6222 auto *CastRecipe = cast<VPWidenCastRecipe>(Op);
6223 VPValue *CastSource = CastRecipe->getOperand(0);
6224 OuterExtKind = getPartialReductionExtendKind(CastRecipe);
6225 if (match(CastSource, m_Mul(m_VPValue(), m_VPValue())) ||
6226 match(CastSource, m_FMul(m_VPValue(), m_VPValue()))) {
6227 // Match: ext(mul(...))
6228 // Record the outer extend kind and set `Op` to the mul. We can then match
6229 // this as a binary operation. Note: We can optimize out the outer extend
6230 // by widening the inner extends to match it. See
6231 // optimizeExtendsForPartialReduction.
6232 Op = CastSource;
6233 } else if (UpdateR->getOpcode() == Instruction::Add ||
6234 UpdateR->getOpcode() == Instruction::FAdd) {
6235 // Match: UpdateR(PrevValue, ext(...))
6236 // TODO: Remove the add/fadd restriction (we should be able to handle this
6237 // case for sub reductions too).
6238 return ExtendedReductionOperand{
6239 UpdateR,
6240 /*ExtendA=*/{TypeInfo.inferScalarType(CastSource), *OuterExtKind},
6241 /*ExtendB=*/{}};
6242 }
6243 }
6244
// From here on the matched operand will be absorbed into the partial
// reduction, so it must not have other users.
6245 if (!Op->hasOneUse())
6246 return std::nullopt;
6247
6248 // Handle neg(...) pattern (aka sub(0, ...)).
6249 VPValue *NegatedOp = nullptr;
6250 if (match(Op, m_Sub(m_ZeroInt(), m_VPValue(NegatedOp))))
6251 Op = NegatedOp;
6252
// NOTE(review): listing line 6253 — the definition of BinOp (presumably a
// dyn_cast of Op to a widen recipe) — is elided in this rendered page;
// confirm against the original source.
6254 if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()))
6255 return std::nullopt;
6256
6257 // The rest of the matching assumes `Op` is a (possibly extended/negated)
6258 // binary operation.
6259
6260 VPValue *LHS = BinOp->getOperand(0);
6261 VPValue *RHS = BinOp->getOperand(1);
6262
6263 // The LHS of the operation must always be an extend.
// NOTE(review): listing line 6264 — the condition guarding this early return
// (presumably checking that LHS is a VPWidenCastRecipe, see the cast below) —
// is elided in this rendered page; confirm against the original source.
6265 return std::nullopt;
6266
6267 auto *LHSCast = cast<VPWidenCastRecipe>(LHS);
6268 Type *LHSInputType = TypeInfo.inferScalarType(LHSCast->getOperand(0));
6269 ExtendKind LHSExtendKind = getPartialReductionExtendKind(LHSCast);
6270
6271 // The RHS of the operation can be an extend or a constant integer.
6272 const APInt *RHSConst = nullptr;
6273 VPWidenCastRecipe *RHSCast = nullptr;
// NOTE(review): listing line 6274 — the condition selecting this branch
// (presumably an isa<VPWidenCastRecipe>(RHS) check) — is elided in this
// rendered page; confirm against the original source.
6275 RHSCast = cast<VPWidenCastRecipe>(RHS);
6276 else if (!match(RHS, m_APInt(RHSConst)) ||
6277 !canConstantBeExtended(RHSConst, LHSInputType, LHSExtendKind))
6278 return std::nullopt;
6279
6280 // The outer extend kind must match the inner extends for folding.
6281 for (VPWidenCastRecipe *Cast : {LHSCast, RHSCast})
6282 if (Cast && OuterExtKind &&
6283 getPartialReductionExtendKind(Cast) != OuterExtKind)
6284 return std::nullopt;
6285
// A constant RHS is treated as if it were extended exactly like the LHS
// (canConstantBeExtended above guarantees this is lossless).
6286 Type *RHSInputType = LHSInputType;
6287 ExtendKind RHSExtendKind = LHSExtendKind;
6288 if (RHSCast) {
6289 RHSInputType = TypeInfo.inferScalarType(RHSCast->getOperand(0));
6290 RHSExtendKind = getPartialReductionExtendKind(RHSCast);
6291 }
6292
6293 return ExtendedReductionOperand{
6294 BinOp, {LHSInputType, LHSExtendKind}, {RHSInputType, RHSExtendKind}};
6295 }
6296
6297 /// Examines each operation in the reduction chain corresponding to \p RedPhiR,
6298 /// and determines if the target can use a cheaper operation with a wider
6299 /// per-iteration input VF and narrower PHI VF. If successful, returns the chain
6300 /// of operations in the reduction.
/// Returns std::nullopt as soon as any link fails to match (no reduction
/// result found, a non-binary update, no extended operand, or a PHI size that
/// is not a known multiple of the extend's source size).
6301 static std::optional<SmallVector<VPPartialReductionChain>>
6302 getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPCostContext &CostCtx,
6303 VFRange &Range) {
6304 // Get the backedge value from the reduction PHI and find the
6305 // ComputeReductionResult that uses it (directly or through a select for
6306 // predicated reductions).
6307 auto *RdxResult = vputils::findComputeReductionResult(RedPhiR);
6308 if (!RdxResult)
6309 return std::nullopt;
6310 VPValue *ExitValue = RdxResult->getOperand(0);
// For predicated (tail-folded) reductions, look through the select and use
// its true operand as the exit value (m_VPValue(ExitValue) rebinds it).
6311 match(ExitValue, m_Select(m_VPValue(), m_VPValue(ExitValue), m_VPValue()));
6312
6313 VPTypeAnalysis &TypeInfo = CostCtx.Types;
// NOTE(review): listing line 6314 — the declaration of `Chain` (used by
// Chain.push_back below; presumably SmallVector<VPPartialReductionChain>) —
// is elided in this rendered page; confirm against the original source.
6315 RecurKind RK = RedPhiR->getRecurrenceKind();
6316 Type *PhiType = TypeInfo.inferScalarType(RedPhiR);
6317 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6318
6319 // Work backwards from the ExitValue examining each reduction operation.
6320 VPValue *CurrentValue = ExitValue;
6321 while (CurrentValue != RedPhiR) {
6322 auto *UpdateR = dyn_cast<VPWidenRecipe>(CurrentValue);
6323 if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
6324 return std::nullopt;
6325
6326 VPValue *Op = UpdateR->getOperand(1);
6327 VPValue *PrevValue = UpdateR->getOperand(0);
6328
6329 // Find the extended operand. The other operand (PrevValue) is the next link
6330 // in the reduction chain.
6331 std::optional<ExtendedReductionOperand> ExtendedOp =
6332 matchExtendedReductionOperand(UpdateR, Op, TypeInfo);
6333 if (!ExtendedOp) {
// The extended operand may be in either position; retry with the operands
// swapped before giving up.
6334 ExtendedOp = matchExtendedReductionOperand(UpdateR, PrevValue, TypeInfo);
6335 if (!ExtendedOp)
6336 return std::nullopt;
6337 std::swap(Op, PrevValue);
6338 }
6339
// The scale factor is PHI-size / extend-source-size; it must divide evenly.
6340 Type *ExtSrcType = ExtendedOp->ExtendA.SrcType;
6341 TypeSize ExtSrcSize = ExtSrcType->getPrimitiveSizeInBits();
6342 if (!PHISize.hasKnownScalarFactor(ExtSrcSize))
6343 return std::nullopt;
6344
6345 // Check if a partial reduction chain is supported by the target (i.e. does
6346 // not have an invalid cost) for the given VF range. Clamps the range and
6347 // returns true if feasible for any VF.
6348 VPPartialReductionChain Link(
6349 {UpdateR, *ExtendedOp,
6350 static_cast<unsigned>(PHISize.getKnownScalarFactor(ExtSrcSize)), RK});
6351 Chain.push_back(Link);
6352 CurrentValue = PrevValue;
6353 }
6354
6355 // The chain links were collected by traversing backwards from the exit value.
6356 // Reverse the chains so they are in program order.
6357 std::reverse(Chain.begin(), Chain.end());
6358 return Chain;
6359}
6360} // namespace
6361
// Entry point that discovers, validates, cost-compares, and finally forms
// partial reductions for every reduction PHI in the vector loop header.
// NOTE(review): the first line of the signature (listing line 6362 — likely
// `void VPlanTransforms::...(VPlan &Plan,`) is elided in this rendered page;
// confirm the exact name and qualifier against the original source.
6363 VPCostContext &CostCtx,
6364 VFRange &Range) {
6365 // Find all possible valid partial reductions, grouping chains by their PHI.
6366 // This grouping allows invalidating the whole chain, if any link is not a
6367 // valid partial reduction.
// NOTE(review): listing line 6368 — the declared type of ChainsByPhi — is
// elided; usage below (try_emplace, deterministic iteration over
// [Phi, Chains]) suggests a MapVector from VPReductionPHIRecipe* to
// SmallVector<VPPartialReductionChain>; confirm against the original source.
6369 ChainsByPhi;
6370 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
6371 for (VPRecipeBase &R : HeaderVPBB->phis()) {
6372 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
6373 if (!RedPhiR)
6374 continue;
6375
6376 if (auto Chains = getScaledReductions(RedPhiR, CostCtx, Range))
6377 ChainsByPhi.try_emplace(RedPhiR, std::move(*Chains));
6378 }
6379
6380 if (ChainsByPhi.empty())
6381 return;
6382
6383 // Build set of partial reduction operations for extend user validation and
6384 // a map of reduction bin ops to their scale factors for scale validation.
6385 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
6386 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
6387 for (const auto &[_, Chains] : ChainsByPhi)
6388 for (const VPPartialReductionChain &Chain : Chains) {
6389 PartialReductionOps.insert(Chain.ExtendedOp.ExtendsUser);
6390 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
6391 }
6392
6393 // A partial reduction is invalid if any of its extends are used by
6394 // something that isn't another partial reduction. This is because the
6395 // extends are intended to be lowered along with the reduction itself.
6396 auto ExtendUsersValid = [&](VPValue *Ext) {
6397 return !isa<VPWidenCastRecipe>(Ext) || all_of(Ext->users(), [&](VPUser *U) {
6398 return PartialReductionOps.contains(cast<VPRecipeBase>(U));
6399 });
6400 };
6401
// Cost model: a chain is profitable for a given VF only if every link has a
// valid partial-reduction cost and the summed partial cost does not exceed
// the summed cost of the regular (widened op + extends) formulation.
6402 auto IsProfitablePartialReductionChainForVF =
6403 [&](ArrayRef<VPPartialReductionChain> Chain, ElementCount VF) -> bool {
6404 InstructionCost PartialCost = 0, RegularCost = 0;
6405
6406 // The chain is a profitable partial reduction chain if the cost of handling
6407 // the entire chain is cheaper when using partial reductions than when
6408 // handling the entire chain using regular reductions.
6409 for (const VPPartialReductionChain &Link : Chain) {
6410 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6411 InstructionCost LinkCost = getPartialReductionLinkCost(CostCtx, Link, VF);
6412 if (!LinkCost.isValid())
6413 return false;
6414
6415 PartialCost += LinkCost;
6416 RegularCost += Link.ReductionBinOp->computeCost(VF, CostCtx);
6417 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6418 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6419 RegularCost += ExtendedOp.ExtendsUser->computeCost(VF, CostCtx);
6420 for (VPValue *Op : ExtendedOp.ExtendsUser->operands())
6421 if (auto *Extend = dyn_cast<VPWidenCastRecipe>(Op))
6422 RegularCost += Extend->computeCost(VF, CostCtx);
6423 }
6424 return PartialCost.isValid() && PartialCost <= RegularCost;
6425 };
6426
6427 // Validate chains: check that extends are only used by partial reductions,
6428 // and that reduction bin ops are only used by other partial reductions with
6429 // matching scale factors, are outside the loop region or the select
6430 // introduced by tail-folding. Otherwise we would create users of scaled
6431 // reductions where the types of the other operands don't match.
6432 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
6433 for (const VPPartialReductionChain &Chain : Chains) {
6434 if (!all_of(Chain.ExtendedOp.ExtendsUser->operands(), ExtendUsersValid)) {
6435 Chains.clear();
6436 break;
6437 }
6438 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
6439 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
6440 return PhiR == RedPhiR;
6441 auto *R = cast<VPSingleDefRecipe>(U);
6442 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
// NOTE(review): listing line 6443 — the middle term of this disjunction
// (presumably a match of R against a ComputeReductionResult-style pattern
// over Chain.ReductionBinOp, given the dangling m_Specific below) — is
// elided in this rendered page; confirm against the original source.
6444 m_Specific(Chain.ReductionBinOp))) ||
6445 match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
6446 m_Specific(RedPhiR)));
6447 };
6448 if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
6449 Chains.clear();
6450 break;
6451 }
6452
6453 // Check if the compute-reduction-result is used by a sunk store.
6454 // TODO: Also form partial reductions in those cases.
6455 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
6456 if (any_of(RdxResult->users(), [](VPUser *U) {
6457 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
6458 return RepR && isa<StoreInst>(RepR->getUnderlyingInstr());
6459 })) {
6460 Chains.clear();
6461 break;
6462 }
6463 }
6464 }
6465
6466 // Clear the chain if it is not profitable.
// NOTE(review): listing line 6467 — the condition opening this if (likely
// `if (!LoopVectorizationPlanner::getDecisionAndClampRange(`, which would
// also clamp Range to VFs where the chain is profitable) — is elided in this
// rendered page; confirm against the original source.
6468 [&, &Chains = Chains](ElementCount VF) {
6469 return IsProfitablePartialReductionChainForVF(Chains, VF);
6470 },
6471 Range))
6472 Chains.clear();
6473 }
6474
// Any chain that survived validation and profitability checks is now
// rewritten link-by-link into partial reductions.
6475 for (auto &[Phi, Chains] : ChainsByPhi)
6476 for (const VPPartialReductionChain &Chain : Chains)
6477 transformToPartialReduction(Chain, CostCtx.Types, Plan, Phi);
6478}
assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
#define _
iv Induction Variable Users
Definition IVUsers.cpp:48
iv users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:81
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool sinkScalarOperands(VPlan &Plan)
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R)
Return true if we do not know how to (mechanically) hoist or sink R out of a loop region.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static VPValue * cloneBinOpForScalarIV(VPWidenRecipe *BinOp, VPValue *ScalarIV, VPWidenIntOrFpInductionRecipe *WidenIV)
Create a scalar version of BinOp, with its WidenIV operand replaced by ScalarIV, and place it after S...
static VPWidenIntOrFpInductionRecipe * getExpressionIV(VPValue *V)
Check if V is a binary expression of a widened IV and a loop-invariant value.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with a EVL-based IV, fixup recipes that use VF to use the EVL instea...
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx, bool IsScalable)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static std::optional< ElementCount > isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ArrayRef< ElementCount > VFs, VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI)
Returns VF from VFs if IR is a full interleave group with factor and number of members both equal to ...
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectGroupedReplicateMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L, function_ref< bool(VPReplicateRecipe *)> FilterFn)
Collect either replicated Loads or Stores grouped by their address SCEV, in a deep-traversal of the v...
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute the end value for WideIV, unless it is truncated.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant ExpandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static void recursivelyDeleteDeadRecipes(VPValue *V)
static void reassociateHeaderMask(VPlan &Plan)
Reassociate (headermask && x) && y -> headermask && (x && y) to allow the header mask to be simplifie...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan)
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations using scoped noalias metadata,...
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool canNarrowOps(ArrayRef< VPValue * > Ops, bool IsScalable)
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static void narrowToSingleScalarRecipes(VPlan &Plan)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1043
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1527
APInt abs() const
Get the absolute value.
Definition APInt.h:1810
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:1016
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getMinValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the minimum integer value with the given bit width and signedness.
Definition APSInt.h:310
static APSInt getMaxValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the maximum integer value with the given bit width and signedness.
Definition APSInt.h:302
@ NoAlias
The two locations do not alias at all.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
iterator begin() const
Definition ArrayRef.h:130
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This class represents a range of values.
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:215
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1618
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1080
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
bool empty() const
Definition MapVector.h:77
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:116
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describe the aliasing of the location (each member is null if that kind of information is unavailable).
Post-order traversal of a graph.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(SCEVUse SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getUDivExpr(SCEVUse LHS, SCEVUse RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
LLVM_ABI bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
static LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
op_range operands()
Definition User.h:267
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3889
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4253
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4328
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4280
iterator end()
Definition VPlan.h:4290
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4288
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4341
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:232
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:598
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:565
const VPRecipeBase & front() const
Definition VPlan.h:4300
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:644
const VPRecipeBase & back() const
Definition VPlan.h:4302
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2794
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2830
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2820
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2836
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2816
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:98
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccs as a successor of this VPBlockBase.
Definition VPlan.h:319
VPRegionBlock * getParent()
Definition VPlan.h:190
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:202
size_t getNumSuccessors() const
Definition VPlan.h:241
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:310
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:226
VPlan * getPlan()
Definition VPlan.cpp:177
const std::string & getName() const
Definition VPlan.h:181
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:329
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:237
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:182
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:283
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:231
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:215
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:299
static bool isLatch(const VPBlockBase *VPB, const VPDominatorTree &VPDT)
Returns true if VPB is a loop latch, using isHeader().
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:200
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:218
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:236
static auto blocksOnly(T &&Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:272
static void transferSuccessors(VPBlockBase *Old, VPBlockBase *New)
Transfer successors from Old to New. New must have no successors.
Definition VPlanUtils.h:256
static SmallVector< VPBasicBlock * > blocksInSingleSuccessorChainBetween(VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Returns the blocks between FirstBB and LastBB, where FirstBB to LastBB forms a single-successor chain.
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3298
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRMetadata &Metadata={})
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3831
A recipe for generating the phi node tracking the current scalar iteration index.
Definition VPlan.h:3921
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:465
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:438
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:450
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:460
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:4001
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3343
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2306
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2348
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2337
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4406
Class to record and manage LLVM IR flags.
Definition VPlan.h:690
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for I, if it is a PHINode; otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1170
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1225
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1327
@ ComputeAnyOfResult
Compute the final result of a AnyOf reduction with select(cmp(),x,y), where one of (x,...
Definition VPlan.h:1272
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1269
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1321
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1264
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1261
@ CanonicalIVIncrementForPart
Definition VPlan.h:1245
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2939
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2931
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2960
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:3012
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2970
VPValue * getIncomingValue(unsigned Idx) const
Returns the incoming VPValue with index Idx.
Definition VPlan.h:1593
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3485
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:406
VPRegionBlock * getRegion()
Definition VPlan.h:4558
VPBasicBlock * getParent()
Definition VPlan.h:481
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:555
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3172
A recipe for handling reduction phis.
Definition VPlan.h:2700
void setVFScaleFactor(unsigned ScaleFactor)
Set the VFScaleFactor for this reduction phi.
Definition VPlan.h:2747
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2740
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2758
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3063
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4441
const VPBlockBase * getEntry() const
Definition VPlan.h:4477
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4552
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4509
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4494
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4539
const VPBlockBase * getExiting() const
Definition VPlan.h:4489
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4502
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3217
bool isSingleScalar() const
Definition VPlan.h:3258
bool isPredicated() const
Definition VPlan.h:3260
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3282
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:4073
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:607
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:675
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:296
operand_range operands()
Definition VPlanValue.h:364
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:340
unsigned getNumOperands() const
Definition VPlanValue.h:334
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:335
void addOperand(VPValue *Operand)
Definition VPlanValue.h:329
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:46
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:137
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1446
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:127
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:70
bool hasOneUse() const
Definition VPlanValue.h:166
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:196
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1449
unsigned getNumUsers() const
Definition VPlanValue.h:107
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1455
user_range users()
Definition VPlanValue.h:149
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2154
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3964
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1840
Instruction::CastOps getOpcode() const
Definition VPlan.h:1878
A recipe for handling GEP instructions.
Definition VPlan.h:2090
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2372
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2400
PHINode * getPHINode() const
Returns the underlying PHINode if one exists, or null otherwise.
Definition VPlan.h:2418
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2403
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2423
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2454
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2501
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2505
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2532
A recipe for widening vector intrinsics.
Definition VPlan.h:1892
A common base class for widening memory operations.
Definition VPlan.h:3528
A recipe for widened phis.
Definition VPlan.h:2590
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1784
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
VPWidenRecipe * clone() override
Clone the current recipe.
Definition VPlan.h:1804
unsigned getOpcode() const
Definition VPlan.h:1821
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition VPlan.h:4571
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4886
bool hasVF(ElementCount VF) const
Definition VPlan.h:4784
const DataLayout & getDataLayout() const
Definition VPlan.h:4766
LLVMContext & getContext() const
Definition VPlan.h:4762
VPBasicBlock * getEntry()
Definition VPlan.h:4663
bool hasScalableVF() const
Definition VPlan.h:4785
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4721
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4742
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4791
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:4857
VPSymbolicValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4760
VPIRValue * getAllOnesValue(Type *Ty)
Return a VPIRValue wrapping the AllOnes value of type Ty.
Definition VPlan.h:4863
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4933
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4889
bool hasUF(unsigned UF) const
Definition VPlan.h:4809
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4711
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4750
VPValue * getBackedgeTakenCount() const
Definition VPlan.h:4747
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4834
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:4860
void setVF(ElementCount VF)
Definition VPlan.h:4772
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4825
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1067
unsigned getConcreteUF() const
Returns the concrete UF of the plan, after unrolling.
Definition VPlan.h:4812
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4735
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4688
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4912
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:4854
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4757
bool hasScalarVFOnly() const
Definition VPlan.h:4802
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4702
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4668
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4753
void setUF(unsigned UF)
Definition VPlan.h:4817
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4965
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1215
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4868
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:261
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2803
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
auto m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
bool match(const SCEV *S, const Pattern &P)
SCEVAffineAddRec_match< Op0_t, Op1_t, match_isa< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
auto m_WidenAnyExtend(const Op0_t &Op0)
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
auto m_VPPhi(const Op0_t &Op0, const Op1_t &Op1)
bind_ty< VPSingleDefRecipe > m_VPSingleDefRecipe(VPSingleDefRecipe *&V)
Match a VPSingleDefRecipe, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExitingIVValue, Op0_t > m_ExitingIVValue(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
bind_ty< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
auto m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector is matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
auto m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:111
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
static VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
Definition VPlanUtils.h:132
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2180
void stable_sort(R &&Range)
Definition STLExtras.h:2116
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:279
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1152
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
UncountableExitStyle
Different methods of handling early exits.
Definition VPlan.h:83
@ ReadOnly
No side effects to worry about, so we can process any uncountable exits in the loop and branch either...
Definition VPlan.h:88
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:552
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1811
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about an recurrence kind, return the identity for the @llvm.vector....
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2088
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
Definition STLExtras.h:2166
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2146
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:789
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2682
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
TargetTransformInfo::TargetCostKind CostKind
VPTypeAnalysis Types
const TargetTransformInfo & TTI
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2638
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:207
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:141
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:247
bool isMaterialized() const
Returns true if this symbolic value has been materialized.
Definition VPlanValue.h:255
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3661
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3619
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3746
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3702
static void handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, UncountableExitStyle Style)
Update Plan to account for uncountable early exits by introducing appropriate branching logic in the ...
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL is known to be <= VF,...
static void removeBranchOnConst(VPlan &Plan, bool OnlyLatches=false)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static bool mergeBlocksIntoPredecessors(VPlan &Plan)
Remove redundant VPBasicBlocks by merging them into their single predecessor if the latter has a sing...
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first order reductions in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replaces all uses except the canoni...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step)
Materialize vector trip count computations to a set of VPInstructions.
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...