VPlanTransforms.cpp
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
31#include "llvm/Analysis/IVDescriptors.h"
32#include "llvm/Analysis/InstSimplifyFolder.h"
33#include "llvm/Analysis/LoopInfo.h"
34#include "llvm/Analysis/MemoryLocation.h"
35#include "llvm/Analysis/ScalarEvolutionExpressions.h"
36#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
37#include "llvm/Analysis/VectorUtils.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
45
46using namespace llvm;
47using namespace VPlanPatternMatch;
48using namespace SCEVPatternMatch;
49
50bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
51 VPlan &Plan, const TargetLibraryInfo &TLI) {
52
53 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
54 Plan.getVectorLoopRegion());
55 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
69 Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 NewRecipe = new VPWidenPHIRecipe(PhiR->operands(), PhiR->getDebugLoc(),
75 Phi->getName());
76 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
77 assert(!isa<PHINode>(Inst) && "phis should be handled above");
78 // Create VPWidenMemoryRecipe for loads and stores.
79 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
80 NewRecipe = new VPWidenLoadRecipe(
81 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
82 false /*Consecutive*/, *VPI, Ingredient.getDebugLoc());
83 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
84 NewRecipe = new VPWidenStoreRecipe(
85 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
86 nullptr /*Mask*/, false /*Consecutive*/, *VPI,
87 Ingredient.getDebugLoc());
88 } else if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
89 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
90 Ingredient.getDebugLoc());
91 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
92 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
93 if (VectorID == Intrinsic::not_intrinsic)
94 return false;
95
96 // The noalias.scope.decl intrinsic declares a noalias scope that
97 // is valid for a single iteration. Emitting it as a single-scalar
98 // replicate would incorrectly extend the scope across multiple
99 // original iterations packed into one vector iteration.
100 // FIXME: If we want to vectorize this loop, then we have to drop
101 // all the associated !alias.scope and !noalias.
102 if (VectorID == Intrinsic::experimental_noalias_scope_decl)
103 return false;
104
105 // These intrinsics are recognized by getVectorIntrinsicIDForCall
106 // but are not widenable. Emit them as replicate recipes instead of widening.
107 if (VectorID == Intrinsic::assume ||
108 VectorID == Intrinsic::lifetime_end ||
109 VectorID == Intrinsic::lifetime_start ||
110 VectorID == Intrinsic::sideeffect ||
111 VectorID == Intrinsic::pseudoprobe) {
112 // If the operand of llvm.assume holds before vectorization, it will
113 // also hold per lane.
114 // llvm.pseudoprobe needs to be duplicated per lane for an accurate
115 // sample count.
116 const bool IsSingleScalar = VectorID != Intrinsic::assume &&
117 VectorID != Intrinsic::pseudoprobe;
118 NewRecipe = new VPReplicateRecipe(CI, Ingredient.operands(),
119 /*IsSingleScalar=*/IsSingleScalar,
120 /*Mask=*/nullptr, *VPI, *VPI,
121 Ingredient.getDebugLoc());
122 } else {
123 NewRecipe = new VPWidenIntrinsicRecipe(
124 *CI, VectorID, drop_end(Ingredient.operands()), CI->getType(),
125 VPIRFlags(*CI), *VPI, CI->getDebugLoc());
126 }
127 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
128 NewRecipe = new VPWidenCastRecipe(
129 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
130 VPIRFlags(*CI), VPIRMetadata(*CI));
131 } else {
132 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
133 *VPI, Ingredient.getDebugLoc());
134 }
135 } else {
137 "inductions must be created earlier");
138 continue;
139 }
140
141 NewRecipe->insertBefore(&Ingredient);
142 if (NewRecipe->getNumDefinedValues() == 1)
143 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
144 else
145 assert(NewRecipe->getNumDefinedValues() == 0 &&
146 "Only recipes with zero or one defined value expected");
147 Ingredient.eraseFromParent();
148 }
149 }
150 return true;
151}
152
153/// Helper for extra no-alias checks via a known-safe recipe and SCEV.
154class SinkStoreInfo {
155 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
156 VPReplicateRecipe &GroupLeader;
157 PredicatedScalarEvolution &PSE;
158 const Loop &L;
159 VPTypeAnalysis &TypeInfo;
160
161 // Return true if \p A and \p B are known to not alias for all VFs in the
162 // plan, checked via the distance between the accesses
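  // For example, for two i32 stores whose addresses are a constant 64 bytes
  // apart and a maximum fixed VF of 8, each store covers at most 8 * 4 = 32
  // bytes per vector iteration, so the accessed ranges cannot overlap.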
163 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
164 if (A->getOpcode() != Instruction::Store ||
165 B->getOpcode() != Instruction::Store)
166 return false;
167
168 VPValue *AddrA = A->getOperand(1);
169 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
170 VPValue *AddrB = B->getOperand(1);
171 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
172 if (isa<SCEVCouldNotCompute>(SCEVA) || isa<SCEVCouldNotCompute>(SCEVB))
173 return false;
174
175 const APInt *Distance;
176 ScalarEvolution &SE = *PSE.getSE();
177 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
178 return false;
179
180 const DataLayout &DL = SE.getDataLayout();
181 Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
182 uint64_t SizeA = DL.getTypeStoreSize(TyA);
183 Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
184 uint64_t SizeB = DL.getTypeStoreSize(TyB);
185
186 // Use the maximum store size to ensure no overlap from either direction.
187 // Currently only handles fixed sizes, as it is only used for
188 // replicating VPReplicateRecipes.
189 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
190
191 auto VFs = B->getParent()->getPlan()->vectorFactors();
193 if (MaxVF.isScalable())
194 return false;
195 return Distance->abs().uge(
196 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
197 }
198
199public:
200 SinkStoreInfo(const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes,
201 VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE,
202 const Loop &L, VPTypeAnalysis &TypeInfo)
203 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
204 L(L), TypeInfo(TypeInfo) {}
205
206 /// Return true if \p R should be skipped during alias checking, either
207 /// because it's in the exclude set or because no-alias can be proven via
208 /// SCEV.
209 bool shouldSkip(VPRecipeBase &R) const {
210 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
211 return ExcludeRecipes.contains(&R) ||
212 (Store && isNoAliasViaDistance(Store, &GroupLeader));
213 }
214};
215
216/// Check if a memory operation doesn't alias with memory operations using
217/// scoped noalias metadata, in blocks in the single-successor chain between \p
218/// FirstBB and \p LastBB. If \p SinkInfo is std::nullopt, only recipes that may
219/// write to memory are checked (for load hoisting). Otherwise recipes that
220/// read or write memory are checked, and SCEV is used to prove no-alias
221/// between the group leader and other replicate recipes (for store sinking).
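/// For example, a load carrying !alias.scope metadata cannot alias a store
/// whose !noalias metadata lists the same scope, so such stores do not block
/// hoisting the load across them.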
222static bool
224 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
225 std::optional<SinkStoreInfo> SinkInfo = {}) {
226 bool CheckReads = SinkInfo.has_value();
227 if (!MemLoc.AATags.Scope)
228 return false;
229
230 for (VPBasicBlock *VPBB :
232 for (VPRecipeBase &R : *VPBB) {
233 if (SinkInfo && SinkInfo->shouldSkip(R))
234 continue;
235
236 // Skip recipes that don't need checking.
237 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
238 continue;
239
241 if (!Loc)
242 // Conservatively assume aliasing for memory operations without
243 // location.
244 return false;
245
247 return false;
248 }
249 }
250 return true;
251}
252
253/// Collect either replicated Loads or Stores grouped by their address SCEV, in
254/// a deep-traversal of the vector loop region in \p Plan.
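/// For example, two replicating stores to A[i] in different replicate regions
/// share the same address SCEV (e.g. {%A,+,4}) and end up in one group, which
/// is then sorted by dominance.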
255template <unsigned Opcode>
258 VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
259 function_ref<bool(VPReplicateRecipe *)> FilterFn) {
260 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
261 "Only Load and Store opcodes supported");
262 constexpr bool IsLoad = (Opcode == Instruction::Load);
264 RecipesByAddress;
267 for (VPRecipeBase &R : *VPBB) {
268 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
269 if (!RepR || RepR->getOpcode() != Opcode || !FilterFn(RepR))
270 continue;
271
272 // For loads, operand 0 is address; for stores, operand 1 is address.
273 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
274 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
275 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
276 RecipesByAddress[AddrSCEV].push_back(RepR);
277 }
278 }
279 auto Groups = to_vector(RecipesByAddress.values());
280 VPDominatorTree VPDT(Plan);
281 for (auto &Group : Groups) {
282 // Sort mem ops by dominance order, with earliest (most dominating) first.
283 sort(Group, [&VPDT](VPRecipeBase *A, VPRecipeBase *B) {
284 return VPDT.properlyDominates(A, B);
285 });
286 }
287 return Groups;
288}
289
290static bool sinkScalarOperands(VPlan &Plan) {
291 auto Iter = vp_depth_first_deep(Plan.getEntry());
292 bool ScalarVFOnly = Plan.hasScalarVFOnly();
293 bool Changed = false;
294
295 SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
296 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
297 VPBasicBlock *SinkTo, VPValue *Op) {
298 auto *Candidate =
299 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
300 if (!Candidate)
301 return;
302
303 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
304 // for now.
305 if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate))
306 return;
307
308 if (Candidate->getParent() == SinkTo ||
309 vputils::cannotHoistOrSinkRecipe(*Candidate, /*Sinking=*/true))
310 return;
311
312 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
313 if (!ScalarVFOnly && RepR->isSingleScalar())
314 return;
315
316 WorkList.insert({SinkTo, Candidate});
317 };
318
319 // First, collect the operands of all recipes in replicate blocks as seeds for
320 // sinking.
321 for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
322 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
323 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
324 continue;
325 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
326 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
327 continue;
328 for (auto &Recipe : *VPBB)
329 for (VPValue *Op : Recipe.operands())
330 InsertIfValidSinkCandidate(VPBB, Op);
331 }
332
333 // Try to sink each replicate or scalar IV steps recipe in the worklist.
334 for (unsigned I = 0; I != WorkList.size(); ++I) {
335 VPBasicBlock *SinkTo;
336 VPSingleDefRecipe *SinkCandidate;
337 std::tie(SinkTo, SinkCandidate) = WorkList[I];
338
339 // All recipe users of SinkCandidate must be in the same block SinkTo or all
340 // users outside of SinkTo must only use the first lane of SinkCandidate. In
341 // the latter case, we need to duplicate SinkCandidate.
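  // For example, an address computation used only by a predicated store in
  // SinkTo can simply be moved there; if it also feeds a first-lane-only user
  // elsewhere, a single-scalar clone is left behind for that user.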
342 auto UsersOutsideSinkTo =
343 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
344 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
345 });
346 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
347 return !U->usesFirstLaneOnly(SinkCandidate);
348 }))
349 continue;
350 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
351
352 if (NeedsDuplicating) {
353 if (ScalarVFOnly)
354 continue;
355 VPSingleDefRecipe *Clone;
356 if (auto *SinkCandidateRepR =
357 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
358 // TODO: Handle converting to uniform recipes as separate transform,
359 // then cloning should be sufficient here.
360 Instruction *I = SinkCandidate->getUnderlyingInstr();
361 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
362 nullptr /*Mask*/, *SinkCandidateRepR,
363 *SinkCandidateRepR);
364 // TODO: add ".cloned" suffix to name of Clone's VPValue.
365 } else {
366 Clone = SinkCandidate->clone();
367 }
368
369 Clone->insertBefore(SinkCandidate);
370 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
371 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
372 });
373 }
374 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
375 for (VPValue *Op : SinkCandidate->operands())
376 InsertIfValidSinkCandidate(SinkTo, Op);
377 Changed = true;
378 }
379 return Changed;
380}
381
382/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
383/// the mask.
384static VPValue *getPredicatedMask(VPRegionBlock *R) {
385 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
386 if (!EntryBB || EntryBB->size() != 1 ||
387 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
388 return nullptr;
389
390 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
391}
392
393/// If \p R is a triangle region, return the 'then' block of the triangle.
394static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
395 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
396 if (EntryBB->getNumSuccessors() != 2)
397 return nullptr;
398
399 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
400 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
401 if (!Succ0 || !Succ1)
402 return nullptr;
403
404 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
405 return nullptr;
406 if (Succ0->getSingleSuccessor() == Succ1)
407 return Succ0;
408 if (Succ1->getSingleSuccessor() == Succ0)
409 return Succ1;
410 return nullptr;
411}
412
413// Merge replicate regions in their successor region, if a replicate region
414// is connected to a successor replicate region with the same predicate by a
415// single, empty VPBasicBlock.
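// For example, two consecutive triangle regions guarded by the same mask,
//   region1 (entry -> then -> merge) -> empty block -> region2 (...),
// are merged: region1's 'then' recipes move into region2's 'then' block and
// region1 is removed.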
416static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
417 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
418
419 // Collect replicate regions followed by an empty block, followed by another
420 // replicate region with a matching mask, to process up front. Collecting
421 // first avoids iterator invalidation issues while merging regions.
422 SmallVector<VPRegionBlock *, 8> WorkList;
423 for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
424 vp_depth_first_deep(Plan.getEntry()))) {
425 if (!Region1->isReplicator())
426 continue;
427 auto *MiddleBasicBlock =
428 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
429 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
430 continue;
431
432 auto *Region2 =
433 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
434 if (!Region2 || !Region2->isReplicator())
435 continue;
436
437 VPValue *Mask1 = getPredicatedMask(Region1);
438 VPValue *Mask2 = getPredicatedMask(Region2);
439 if (!Mask1 || Mask1 != Mask2)
440 continue;
441
442 assert(Mask1 && Mask2 && "both regions must have conditions");
443 WorkList.push_back(Region1);
444 }
445
446 // Move recipes from Region1 to its successor region, if both are triangles.
447 for (VPRegionBlock *Region1 : WorkList) {
448 if (TransformedRegions.contains(Region1))
449 continue;
450 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
451 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
452
453 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
454 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
455 if (!Then1 || !Then2)
456 continue;
457
458 // Note: No fusion-preventing memory dependencies are expected in either
459 // region. Such dependencies should be rejected during earlier dependence
460 // checks, which guarantee accesses can be re-ordered for vectorization.
461 //
462 // Move recipes to the successor region.
463 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
464 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
465
466 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
467 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
468
469 // Move VPPredInstPHIRecipes from the merge block to the successor region's
470 // merge block. Update all users inside the successor region to use the
471 // original values.
472 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
473 VPValue *PredInst1 =
474 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
475 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
476 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
477 return cast<VPRecipeBase>(&U)->getParent() == Then2;
478 });
479
480 // Remove phi recipes that are unused after merging the regions.
481 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
482 Phi1ToMove.eraseFromParent();
483 continue;
484 }
485 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
486 }
487
488 // Remove the dead recipes in Region1's entry block.
489 for (VPRecipeBase &R :
490 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
491 R.eraseFromParent();
492
493 // Finally, remove the first region.
494 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
495 VPBlockUtils::disconnectBlocks(Pred, Region1);
496 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
497 }
498 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
499 TransformedRegions.insert(Region1);
500 }
501
502 return !TransformedRegions.empty();
503}
504
505static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
506 VPRegionBlock *ParentRegion,
507 VPlan &Plan) {
508 Instruction *Instr = PredRecipe->getUnderlyingInstr();
509 // Build the triangular if-then region.
510 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
511 assert(Instr->getParent() && "Predicated instruction not in any basic block");
512 auto *BlockInMask = PredRecipe->getMask();
513 auto *MaskDef = BlockInMask->getDefiningRecipe();
514 auto *BOMRecipe = new VPBranchOnMaskRecipe(
515 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
516 auto *Entry =
517 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
518
519 // Replace predicated replicate recipe with a replicate recipe without a
520 // mask but in the replicate region.
521 auto *RecipeWithoutMask = new VPReplicateRecipe(
522 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
523 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
524 PredRecipe->getDebugLoc());
525 auto *Pred =
526 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
527 auto *Exiting = Plan.createVPBasicBlock(Twine(RegionName) + ".continue");
528 VPRegionBlock *Region =
529 Plan.createReplicateRegion(Entry, Exiting, RegionName);
530
531 // Note: first set Entry as region entry and then connect successors starting
532 // from it in order, to propagate the "parent" of each VPBasicBlock.
533 Region->setParent(ParentRegion);
534 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
535 VPBlockUtils::connectBlocks(Pred, Exiting);
536
537 if (PredRecipe->getNumUsers() != 0) {
538 auto *PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
539 RecipeWithoutMask->getDebugLoc());
540 Exiting->appendRecipe(PHIRecipe);
541 PredRecipe->replaceAllUsesWith(PHIRecipe);
542 }
543 PredRecipe->eraseFromParent();
544 return Region;
545}
546
547static void addReplicateRegions(VPlan &Plan) {
548 SmallVector<VPReplicateRecipe *> WorkList;
549 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
550 vp_depth_first_deep(Plan.getEntry()))) {
551 for (VPRecipeBase &R : *VPBB)
552 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
553 if (RepR->isPredicated())
554 WorkList.push_back(RepR);
555 }
556 }
557
558 unsigned BBNum = 0;
559 for (VPReplicateRecipe *RepR : WorkList) {
560 VPBasicBlock *CurrentBlock = RepR->getParent();
561 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
562
563 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
564 SplitBlock->setName(
565 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
566 // Record predicated instructions for above packing optimizations.
567 VPRegionBlock *Region =
568 createReplicateRegion(RepR, CurrentBlock->getParent(), Plan);
570
571 VPRegionBlock *ParentRegion = Region->getParent();
572 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
573 ParentRegion->setExiting(SplitBlock);
574 }
575}
576
577static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
578 SmallVector<VPBasicBlock *> WorkList;
579 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
580 vp_depth_first_deep(Plan.getEntry()))) {
581 // Don't fold the blocks in the skeleton of the Plan into their single
582 // predecessors for now.
583 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
584 if (!VPBB->getParent())
585 continue;
586 auto *PredVPBB =
587 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
588 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
589 isa<VPIRBasicBlock>(PredVPBB))
590 continue;
591 WorkList.push_back(VPBB);
592 }
593
594 for (VPBasicBlock *VPBB : WorkList) {
595 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
596 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
597 R.moveBefore(*PredVPBB, PredVPBB->end());
598 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
599 auto *ParentRegion = VPBB->getParent();
600 if (ParentRegion && ParentRegion->getExiting() == VPBB)
601 ParentRegion->setExiting(PredVPBB);
602 VPBlockUtils::transferSuccessors(VPBB, PredVPBB);
603 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
604 }
605 return !WorkList.empty();
606}
607
608void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
609 // Convert masked VPReplicateRecipes to if-then region blocks.
610 addReplicateRegions(Plan);
611
612 bool ShouldSimplify = true;
613 while (ShouldSimplify) {
614 ShouldSimplify = sinkScalarOperands(Plan);
615 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
616 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
617 }
618}
619
620/// Remove redundant casts of inductions.
621///
622/// Such redundant casts are casts of induction variables that can be ignored,
623/// because we already proved that the casted phi is equal to the uncasted phi
624/// in the vectorized loop. There is no need to vectorize the cast - the same
625/// value can be used for both the phi and casts in the vector loop.
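/// For example, if the induction descriptor recorded a cast chain such as
///   %iv = phi i32 ... ; %cast = zext i32 %iv to i64
/// that is provably equal to the widened IV, users of the final cast are
/// redirected to the IV recipe and the dead casts are cleaned up later.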
626static void removeRedundantInductionCasts(VPlan &Plan) {
627 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
628 auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
629 if (!IV || IV->getTruncInst())
630 continue;
631
632 // A sequence of IR Casts has potentially been recorded for IV, which
633 // *must be bypassed* when the IV is vectorized, because the vectorized IV
634 // will produce the desired casted value. This sequence forms a def-use
635 // chain and is provided in reverse order, ending with the cast that uses
636 // the IV phi. Search for the recipe of the last cast in the chain and
637 // replace it with the original IV. Note that only the final cast is
638 // expected to have users outside the cast-chain and the dead casts left
639 // over will be cleaned up later.
640 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
641 VPValue *FindMyCast = IV;
642 for (Instruction *IRCast : reverse(Casts)) {
643 VPSingleDefRecipe *FoundUserCast = nullptr;
644 for (auto *U : FindMyCast->users()) {
645 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
646 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
647 FoundUserCast = UserCast;
648 break;
649 }
650 }
651 FindMyCast = FoundUserCast;
652 }
653 FindMyCast->replaceAllUsesWith(IV);
654 }
655}
656
657static VPScalarIVStepsRecipe *
658createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
659 Instruction::BinaryOps InductionOpcode,
660 FPMathOperator *FPBinOp, Instruction *TruncI,
661 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
662 VPBuilder &Builder) {
663 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
664 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
665 VPValue *CanonicalIV = LoopRegion->getCanonicalIV();
666 VPSingleDefRecipe *BaseIV =
667 Builder.createDerivedIV(Kind, FPBinOp, StartV, CanonicalIV, Step);
668
669 // Truncate base induction if needed.
670 VPTypeAnalysis TypeInfo(Plan);
671 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
672 if (TruncI) {
673 Type *TruncTy = TruncI->getType();
674 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
675 "Not truncating.");
676 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
677 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
678 ResultTy = TruncTy;
679 }
680
681 // Truncate step if needed.
682 Type *StepTy = TypeInfo.inferScalarType(Step);
683 if (ResultTy != StepTy) {
684 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
685 "Not truncating.");
686 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
687 auto *VecPreheader =
689 VPBuilder::InsertPointGuard Guard(Builder);
690 Builder.setInsertPoint(VecPreheader);
691 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
692 }
693 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
694 &Plan.getVF(), DL);
695}
696
697/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
698/// recipe, if it exists.
699static void removeRedundantCanonicalIVs(VPlan &Plan) {
700 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
701 VPRegionValue *CanonicalIV = LoopRegion->getCanonicalIV();
702 auto *WidenNewIV = vputils::findUserOf<VPWidenCanonicalIVRecipe>(CanonicalIV);
703
704 if (!WidenNewIV)
705 return;
706
707 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
708 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
709 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
710
711 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
712 continue;
713
714 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
715 // everything WidenNewIV's users need. That is, WidenOriginalIV will
716 // generate a vector phi or all users of WidenNewIV demand the first lane
717 // only.
718 if (Plan.hasScalarVFOnly() ||
719 !vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
720 vputils::onlyFirstLaneUsed(WidenNewIV)) {
721 // We are replacing a wide canonical iv with a suitable wide induction.
722 // This is used to compute header mask, hence all lanes will be used and
723 // we need to drop wrap flags only applying to lanes guaranteed to execute
724 // in the original scalar loop.
725 WidenOriginalIV->dropPoisonGeneratingFlags();
726 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
727 WidenNewIV->eraseFromParent();
728 return;
729 }
730 }
731
732 if (!vputils::onlyFirstLaneUsed(WidenNewIV) && !Plan.hasScalarVFOnly()) {
734 "Lanes other than first lane being used should imply that not just "
735 "scalars are used");
736 return;
737 }
738
739 // Replace the wide canonical IV with a scalar-iv-steps over the canonical
740 // IV.
741 Type *CanonicalIVTy = LoopRegion->getCanonicalIVType();
742 VPBuilder Builder(WidenNewIV);
743 WidenNewIV->replaceAllUsesWith(createScalarIVSteps(
744 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
745 nullptr, Plan.getZero(CanonicalIVTy),
746 Plan.getConstantInt(CanonicalIVTy, 1), CanonicalIV->getDebugLoc(),
747 Builder));
748 WidenNewIV->eraseFromParent();
749}
750
752 VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI,
754 const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
755 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
756 if (!LoopRegion || Plan.hasScalarVFOnly())
757 return;
758
759 VPValue *CanonicalIV = LoopRegion->getCanonicalIV();
760 auto *WideCanIV = vputils::findUserOf<VPWidenCanonicalIVRecipe>(CanonicalIV);
761 if (!WideCanIV || vputils::onlyScalarValuesUsed(WideCanIV))
762 return;
763
764 // Introduce a new VPWidenIntOrFpInductionRecipe if profitable.
765 Type *CanIVTy = LoopRegion->getCanonicalIVType();
766 auto *VecTy = VectorType::get(CanIVTy, VF);
767 InstructionCost BroadcastCost = TTI.getShuffleCost(
769 InstructionCost PHICost = TTI.getCFInstrCost(Instruction::PHI, CostKind);
770 if (PHICost > BroadcastCost)
771 return;
772
773 // Bail out if the additional wide induction phi increases the expected spill
774 // cost.
775 VPRegisterUsage UnrolledBase =
776 calculateRegisterUsageForPlan(Plan, VF, TTI, ValuesToIgnore)[0];
777 for (unsigned &NumUsers : make_second_range(UnrolledBase.MaxLocalUsers))
778 NumUsers *= UF;
779 unsigned RegClass = TTI.getRegisterClassForType(/*Vector=*/true, VecTy);
780 VPRegisterUsage Projected = UnrolledBase;
781 Projected.MaxLocalUsers[RegClass] += TTI.getRegUsageForType(VecTy);
782 if (Projected.spillCost(TTI, CostKind) >
783 UnrolledBase.spillCost(TTI, CostKind))
784 return;
785
788 VPValue *StepV = Plan.getConstantInt(CanIVTy, 1);
789 auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
790 /*IV=*/nullptr, Plan.getZero(CanIVTy), StepV, &Plan.getVF(), ID,
791 VPIRFlags::WrapFlagsTy(/*HasNUW=*/LoopRegion->hasCanonicalIVNUW(),
792 /*HasNSW=*/false),
793 WideCanIV->getDebugLoc());
794 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
795 NewWideIV->insertBefore(&*Header->getFirstNonPhi());
796 WideCanIV->replaceAllUsesWith(NewWideIV);
797 WideCanIV->eraseFromParent();
798}
799
800/// Returns true if \p R is dead and can be removed.
801static bool isDeadRecipe(VPRecipeBase &R) {
802 // Do remove conditional assume instructions as their conditions may be
803 // flattened.
804 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
805 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
807 if (IsConditionalAssume)
808 return true;
809
810 if (R.mayHaveSideEffects())
811 return false;
812
813 // Recipe is dead if no user keeps the recipe alive.
814 return all_of(R.definedValues(),
815 [](VPValue *V) { return V->getNumUsers() == 0; });
816}
817
820 Plan.getEntry());
822 // The recipes in the block are processed in reverse order, to catch chains
823 // of dead recipes.
824 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
825 if (isDeadRecipe(R)) {
826 R.eraseFromParent();
827 continue;
828 }
829
830 // Check if R is a dead VPPhi <-> update cycle and remove it.
831 VPValue *Start, *Incoming;
832 if (!match(&R, m_VPPhi(m_VPValue(Start), m_VPValue(Incoming))))
833 continue;
834 auto *PhiR = cast<VPPhi>(&R);
835 VPUser *PhiUser = PhiR->getSingleUser();
836 if (!PhiUser)
837 continue;
838 if (PhiUser != Incoming->getDefiningRecipe() ||
839 Incoming->getNumUsers() != 1)
840 continue;
841 PhiR->replaceAllUsesWith(Start);
842 PhiR->eraseFromParent();
843 Incoming->getDefiningRecipe()->eraseFromParent();
844 }
845 }
846}
847
850 for (unsigned I = 0; I != Users.size(); ++I) {
852 for (VPValue *V : Cur->definedValues())
853 Users.insert_range(V->users());
854 }
855 return Users.takeVector();
856}
857
858/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
859/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
860/// generates scalar values.
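/// For example, a pointer induction with start %p and step 8 becomes
///   %steps = scalar-steps(0, 8); %next.gep = ptradd %p, %steps
/// which yields the per-lane scalar addresses without materializing a vector
/// of pointers.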
861static VPValue *
862scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV,
863 VPlan &Plan, VPBuilder &Builder) {
864 const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
865 VPIRValue *StartV = Plan.getZero(ID.getStep()->getType());
866 VPValue *StepV = PtrIV->getOperand(1);
867 VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
868 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
869 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
870
871 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
872 PtrIV->getDebugLoc(), "next.gep");
873}
874
875/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
876/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
877/// VPWidenPointerInductionRecipe will generate vectors only. If some users
878/// require vectors while others require scalars, the scalar uses need to extract
879/// the scalars from the generated vectors (Note that this is different to how
880/// int/fp inductions are handled). Legalize extract-from-ends using uniform
881/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
882/// the correct end value is available. Also optimize
883/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
884/// providing them scalar steps built on the canonical scalar IV and update the
885/// original IV's users. This is an optional optimization to reduce the need
886/// for vector extracts.
887void VPlanTransforms::legalizeAndOptimizeInductions(VPlan &Plan) {
888 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
889 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
890 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
891 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
892 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
893 if (!PhiR)
894 continue;
895
896 // Try to narrow wide and replicating recipes to uniform recipes, based on
897 // VPlan analysis.
898 // TODO: Apply to all recipes in the future, to replace legacy uniformity
899 // analysis.
900 auto Users = collectUsersRecursively(PhiR);
901 for (VPUser *U : reverse(Users)) {
902 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
903 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
904 // Skip recipes that shouldn't be narrowed.
905 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
906 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
907 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
908 continue;
909
910 // Skip recipes that may have other lanes than their first used.
912 continue;
913
914 // TODO: Support scalarizing ExtractValue.
915 if (match(Def,
917 continue;
918
919 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
920 Def->operands(), /*IsUniform*/ true,
921 /*Mask*/ nullptr, /*Flags*/ *Def);
922 Clone->insertAfter(Def);
923 Def->replaceAllUsesWith(Clone);
924 }
925
926 // Replace wide pointer inductions which have only their scalars used by
927 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
928 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
929 if (!Plan.hasScalarVFOnly() &&
930 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
931 continue;
932
933 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
934 PtrIV->replaceAllUsesWith(PtrAdd);
935 continue;
936 }
937
938 // Replace widened induction with scalar steps for users that only use
939 // scalars.
940 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
941 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
942 return U->usesScalars(WideIV);
943 }))
944 continue;
945
946 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
947 VPValue *Steps = createScalarIVSteps(
948 Plan, ID.getKind(), ID.getInductionOpcode(),
949 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
950 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
951 WideIV->getDebugLoc(), Builder);
952
953 // Update scalar users of IV to use Step instead.
954 if (!HasOnlyVectorVFs) {
955 assert(!Plan.hasScalableVF() &&
956 "plans containing a scalar VF cannot also include scalable VFs");
957 WideIV->replaceAllUsesWith(Steps);
958 } else {
959 bool HasScalableVF = Plan.hasScalableVF();
960 WideIV->replaceUsesWithIf(Steps,
961 [WideIV, HasScalableVF](VPUser &U, unsigned) {
962 if (HasScalableVF)
963 return U.usesFirstLaneOnly(WideIV);
964 return U.usesScalars(WideIV);
965 });
966 }
967 }
968}
969
970/// Check if \p VPV is an untruncated wide induction, either before or after the
971/// increment. If so return the header IV (before the increment), otherwise
972/// return null.
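/// For example, for a wide induction %iv with step %s, both %iv itself and
/// its increment %iv.next = add %iv, %s return the recipe defining %iv, while
/// truncated inductions and unrelated values return nullptr.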
973static VPWidenInductionRecipe *
974getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE) {
975 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
976 if (WideIV) {
977 // VPV itself is a wide induction, separately compute the end value for exit
978 // users if it is not a truncated IV.
979 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
980 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
981 }
982
983 // Check if VPV is an optimizable induction increment.
984 VPRecipeBase *Def = VPV->getDefiningRecipe();
985 if (!Def || Def->getNumOperands() != 2)
986 return nullptr;
987 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
988 if (!WideIV)
989 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
990 if (!WideIV)
991 return nullptr;
992
993 auto IsWideIVInc = [&]() {
994 auto &ID = WideIV->getInductionDescriptor();
995
996 // Check if VPV increments the induction by the induction step.
997 VPValue *IVStep = WideIV->getStepValue();
998 switch (ID.getInductionOpcode()) {
999 case Instruction::Add:
1000 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
1001 case Instruction::FAdd:
1002 return match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
1003 case Instruction::FSub:
1004 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
1005 m_Specific(IVStep)));
1006 case Instruction::Sub: {
1007 // IVStep will be the negated step of the subtraction. Check if Step == -1
1008 // * IVStep.
1009 VPValue *Step;
1010 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
1011 return false;
1012 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
1013 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
1014 ScalarEvolution &SE = *PSE.getSE();
1015 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
1016 !isa<SCEVCouldNotCompute>(StepSCEV) &&
1017 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
1018 }
1019 default:
1020 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
1021 match(VPV, m_GetElementPtr(m_Specific(WideIV),
1022 m_Specific(WideIV->getStepValue())));
1023 }
1024 llvm_unreachable("should have been covered by switch above");
1025 };
1026 return IsWideIVInc() ? WideIV : nullptr;
1027}
1028
1029/// Attempts to optimize the induction variable exit values for users in the
1030/// early exit block.
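/// For example, for a canonical IV the scalar exit value is computed as
///   CanonicalIV + FirstActiveLane(Mask)
/// (plus one if the user reads the incremented IV); non-canonical inductions
/// additionally derive Start + Index * Step from that index.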
1031static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
1032 VPTypeAnalysis &TypeInfo,
1033 VPBlockBase *PredVPBB,
1034 VPValue *Op,
1035 PredicatedScalarEvolution &PSE) {
1036 VPValue *Incoming, *Mask;
1038 m_VPValue(Incoming))))
1039 return nullptr;
1040
1041 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
1042 if (!WideIV)
1043 return nullptr;
1044
1045 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1046 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1047 return nullptr;
1048
1049 // Calculate the final index.
1050 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1051 auto *CanonicalIV = LoopRegion->getCanonicalIV();
1052 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
1053 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
1054
1055 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
1056 VPValue *FirstActiveLane =
1057 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
1058 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
1059 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
1060 FirstActiveLaneType, DL);
1061 VPValue *EndValue = B.createAdd(CanonicalIV, FirstActiveLane, DL);
1062
1063 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1064 // changed it means the exit is using the incremented value, so we need to
1065 // add the step.
1066 if (Incoming != WideIV) {
1067 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
1068 EndValue = B.createAdd(EndValue, One, DL);
1069 }
1070
1071 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1072 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1073 VPIRValue *Start = WideIV->getStartValue();
1074 VPValue *Step = WideIV->getStepValue();
1075 EndValue = B.createDerivedIV(
1076 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1077 Start, EndValue, Step);
1078 }
1079
1080 return EndValue;
1081}
1082
1083/// Compute the end value for \p WideIV, unless it is truncated. Creates a
1084/// VPDerivedIVRecipe for non-canonical inductions.
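/// For example, an induction with start %start and step %step ends the vector
/// loop at %start + VectorTripCount * %step; for a canonical IV this is just
/// the vector trip count, truncated if the induction is narrower.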
1086 VPBuilder &VectorPHBuilder,
1087 VPTypeAnalysis &TypeInfo,
1088 VPValue *VectorTC) {
1089 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1090 // Truncated wide inductions resume from the last lane of their vector value
1091 // in the last vector iteration which is handled elsewhere.
1092 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1093 return nullptr;
1094
1095 VPIRValue *Start = WideIV->getStartValue();
1096 VPValue *Step = WideIV->getStepValue();
1097 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1098 VPValue *EndValue = VectorTC;
1099 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1100 EndValue = VectorPHBuilder.createDerivedIV(
1101 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1102 Start, VectorTC, Step);
1103 }
1104
1105 // EndValue is derived from the vector trip count (which has the same type as
1106 // the widest induction) and thus may be wider than the induction here.
1107 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
1108 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
1109 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
1110 ScalarTypeOfWideIV,
1111 WideIV->getDebugLoc());
1112 }
1113
1114 return EndValue;
1115}
1116
1117/// Attempts to optimize the induction variable exit values for users in the
1118/// exit block coming from the latch in the original scalar loop.
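/// For example, an exit user of the pre-incremented IV receives the cached
/// end value minus one step, while a user of the incremented IV can use the
/// end value directly.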
1119static VPValue *optimizeLatchExitInductionUser(
1120 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
1121 DenseMap<VPValue *, VPValue *> &EndValues, PredicatedScalarEvolution &PSE) {
1122 VPValue *Incoming;
1123 VPWidenInductionRecipe *WideIV = nullptr;
1125 WideIV = getOptimizableIVOf(Incoming, PSE);
1126
1127 if (!WideIV)
1128 return nullptr;
1129
1130 VPValue *EndValue = EndValues.lookup(WideIV);
1131 assert(EndValue && "Must have computed the end value up front");
1132
1133 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1134 // changed it means the exit is using the incremented value, so we don't
1135 // need to subtract the step.
1136 if (Incoming != WideIV)
1137 return EndValue;
1138
1139 // Otherwise, subtract the step from the EndValue.
1140 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
1141 VPValue *Step = WideIV->getStepValue();
1142 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
1143 if (ScalarTy->isIntegerTy())
1144 return B.createSub(EndValue, Step, DebugLoc::getUnknown(), "ind.escape");
1145 if (ScalarTy->isPointerTy()) {
1146 Type *StepTy = TypeInfo.inferScalarType(Step);
1147 auto *Zero = Plan.getZero(StepTy);
1148 return B.createPtrAdd(EndValue, B.createSub(Zero, Step),
1149 DebugLoc::getUnknown(), "ind.escape");
1150 }
1151 if (ScalarTy->isFloatingPointTy()) {
1152 const auto &ID = WideIV->getInductionDescriptor();
1153 return B.createNaryOp(
1154 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1155 ? Instruction::FSub
1156 : Instruction::FAdd,
1157 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1158 }
1159 llvm_unreachable("all possible induction types must be handled");
1160 return nullptr;
1161}
1162
1163void VPlanTransforms::optimizeInductionExitUsers(
1164 VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
1165 // Compute end values for all inductions.
1166 VPTypeAnalysis TypeInfo(Plan);
1167 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1168 auto *VectorPH = cast<VPBasicBlock>(VectorRegion->getSinglePredecessor());
1169 VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
1170 DenseMap<VPValue *, VPValue *> EndValues;
1171 VPValue *ResumeTC =
1172 FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
1173 for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
1174 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&Phi);
1175 if (!WideIV)
1176 continue;
1178 WideIV, VectorPHBuilder, TypeInfo, ResumeTC))
1179 EndValues[WideIV] = EndValue;
1180 }
1181
1182 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
1183 for (VPRecipeBase &R : make_early_inc_range(*MiddleVPBB)) {
1184 VPValue *Op;
1185 if (!match(&R, m_ExitingIVValue(m_VPValue(Op))))
1186 continue;
1187 auto *WideIV = cast<VPWidenInductionRecipe>(Op);
1188 if (VPValue *EndValue = EndValues.lookup(WideIV)) {
1189 R.getVPSingleValue()->replaceAllUsesWith(EndValue);
1190 R.eraseFromParent();
1191 }
1192 }
1193
1194 // Then, optimize exit block users.
1195 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1196 for (VPRecipeBase &R : ExitVPBB->phis()) {
1197 auto *ExitIRI = cast<VPIRPhi>(&R);
1198
1199 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1200 VPValue *Escape = nullptr;
1201 if (PredVPBB == MiddleVPBB)
1202 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
1203 ExitIRI->getOperand(Idx),
1204 EndValues, PSE);
1205 else
1206 Escape = optimizeEarlyExitInductionUser(
1207 Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), PSE);
1208 if (Escape)
1209 ExitIRI->setOperand(Idx, Escape);
1210 }
1211 }
1212 }
1213}
1214
1215/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1216/// them with already existing recipes expanding the same SCEV expression.
1217static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
1218 DenseMap<const SCEV *, VPValue *> SCEV2VPV;
1219
1220 for (VPRecipeBase &R :
1221 make_early_inc_range(*Plan.getEntry())) {
1222 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1223 if (!ExpR)
1224 continue;
1225
1226 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1227 if (Inserted)
1228 continue;
1229 ExpR->replaceAllUsesWith(V->second);
1230 ExpR->eraseFromParent();
1231 }
1232}
1233
1234static void recursivelyDeleteDeadRecipes(VPValue *V) {
1235 SmallVector<VPValue *> WorkList;
1236 SmallPtrSet<VPValue *, 8> Seen;
1237 WorkList.push_back(V);
1238
1239 while (!WorkList.empty()) {
1240 VPValue *Cur = WorkList.pop_back_val();
1241 if (!Seen.insert(Cur).second)
1242 continue;
1243 VPRecipeBase *R = Cur->getDefiningRecipe();
1244 if (!R)
1245 continue;
1246 if (!isDeadRecipe(*R))
1247 continue;
1248 append_range(WorkList, R->operands());
1249 R->eraseFromParent();
1250 }
1251}
1252
1253/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1254/// Returns an optional pair, where the first element indicates whether it is
1255/// an intrinsic ID.
1256static std::optional<std::pair<bool, unsigned>>
1258 return TypeSwitch<const VPSingleDefRecipe *,
1259 std::optional<std::pair<bool, unsigned>>>(R)
1262 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1263 .Case([](const VPWidenIntrinsicRecipe *I) {
1264 return std::make_pair(true, I->getVectorIntrinsicID());
1265 })
1266 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe, VPScalarIVStepsRecipe>(
1267 [](auto *I) {
1268 // For recipes that do not directly map to LLVM IR instructions,
1269 // assign opcodes after the last VPInstruction opcode (which is also
1270 // after the last IR Instruction opcode), based on the VPRecipeID.
1271 return std::make_pair(false, VPInstruction::OpsEnd + 1 +
1272 I->getVPRecipeID());
1273 })
1274 .Default([](auto *) { return std::nullopt; });
1275}
1276
1277/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1278/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1279/// Operands are foldable live-ins.
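/// For example, an add of two constant live-in operands (add 2, 3) folds to
/// the live-in constant 5, and a cast of a constant live-in folds to the
/// casted constant, without emitting any recipe.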
1280static VPValue *tryToFoldLiveIns(VPSingleDefRecipe &R,
1281 ArrayRef<VPValue *> Operands,
1282 const DataLayout &DL,
1283 VPTypeAnalysis &TypeInfo) {
1284 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1285 if (!OpcodeOrIID)
1286 return nullptr;
1287
1288 SmallVector<Value *, 4> Ops;
1289 for (VPValue *Op : Operands) {
1290 if (!match(Op, m_LiveIn()))
1291 return nullptr;
1292 Value *V = Op->getUnderlyingValue();
1293 if (!V)
1294 return nullptr;
1295 Ops.push_back(V);
1296 }
1297
1298 auto FoldToIRValue = [&]() -> Value * {
1299 InstSimplifyFolder Folder(DL);
1300 if (OpcodeOrIID->first) {
1301 if (R.getNumOperands() != 2)
1302 return nullptr;
1303 unsigned ID = OpcodeOrIID->second;
1304 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1305 TypeInfo.inferScalarType(&R));
1306 }
1307 unsigned Opcode = OpcodeOrIID->second;
1308 if (Instruction::isBinaryOp(Opcode))
1309 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1310 Ops[0], Ops[1]);
1311 if (Instruction::isCast(Opcode))
1312 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1313 TypeInfo.inferScalarType(R.getVPSingleValue()));
1314 switch (Opcode) {
1316 return Folder.FoldSelect(Ops[0], Ops[1],
1318 case VPInstruction::Not:
1319 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1321 case Instruction::Select:
1322 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1323 case Instruction::ICmp:
1324 case Instruction::FCmp:
1325 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1326 Ops[1]);
1327 case Instruction::GetElementPtr: {
1328 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1329 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1330 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1331 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1332 }
1335 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1336 Ops[0], Ops[1],
1337 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1338 // An extract of a live-in is an extract of a broadcast, so return the
1339 // broadcasted element.
1340 case Instruction::ExtractElement:
1341 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1342 return Ops[0];
1343 }
1344 return nullptr;
1345 };
1346
1347 if (Value *V = FoldToIRValue())
1348 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1349 return nullptr;
1350}
1351
1352/// Try to simplify VPSingleDefRecipe \p Def.
1353static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
1354 VPlan *Plan = Def->getParent()->getPlan();
1355
1356 // Simplification of live-in IR values for SingleDef recipes using
1357 // InstSimplifyFolder.
1358 const DataLayout &DL = Plan->getDataLayout();
1359 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1360 return Def->replaceAllUsesWith(V);
1361
1362 // Fold PredPHI LiveIn -> LiveIn.
1363 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1364 VPValue *Op = PredPHI->getOperand(0);
1365 if (isa<VPIRValue>(Op))
1366 PredPHI->replaceAllUsesWith(Op);
1367 }
1368
1369 VPBuilder Builder(Def);
1370
1371 // Avoid replacing VPInstructions with underlying values with new
1372 // VPInstructions, as we would fail to create widen/replicate recipes from the
1373 // new VPInstructions without an underlying value, and miss out on some
1374 // transformations that only apply to widened/replicated recipes later, by
1375 // doing so.
1376 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1377 // VPInstructions without underlying values, as those will get skipped during
1378 // cost computation.
1379 bool CanCreateNewRecipe =
1380 !isa<VPInstruction>(Def) || !Def->getUnderlyingValue();
1381
1382 VPValue *A;
1383 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1384 Type *TruncTy = TypeInfo.inferScalarType(Def);
1385 Type *ATy = TypeInfo.inferScalarType(A);
1386 if (TruncTy == ATy) {
1387 Def->replaceAllUsesWith(A);
1388 } else {
1389 // Don't replace a non-widened cast recipe with a widened cast.
1390 if (!isa<VPWidenCastRecipe>(Def))
1391 return;
1392 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1393
1394 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1395 ? Instruction::SExt
1396 : Instruction::ZExt;
1397 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1398 TruncTy);
1399 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1400 // UnderlyingExt has distinct return type, used to retain legacy cost.
1401 Ext->setUnderlyingValue(UnderlyingExt);
1402 }
1403 Def->replaceAllUsesWith(Ext);
1404 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1405 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1406 Def->replaceAllUsesWith(Trunc);
1407 }
1408 }
1409#ifndef NDEBUG
1410 // Verify that the cached type info for both A and its users is still
1411 // accurate by comparing it to freshly computed types.
1412 VPTypeAnalysis TypeInfo2(*Plan);
1413 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1414 for (VPUser *U : A->users()) {
1415 auto *R = cast<VPRecipeBase>(U);
1416 for (VPValue *VPV : R->definedValues())
1417 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1418 }
1419#endif
1420 }
1421
1422 // Simplify (X && Y) | (X && !Y) -> X.
1423 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1424 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1425 // recipes to be visited during simplification.
1426 VPValue *X, *Y, *Z;
1427 if (match(Def,
1430 Def->replaceAllUsesWith(X);
1431 Def->eraseFromParent();
1432 return;
1433 }
1434
1435 // x | AllOnes -> AllOnes
1436 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1437 return Def->replaceAllUsesWith(
1438 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1439
1440 // x | 0 -> x
1441 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1442 return Def->replaceAllUsesWith(X);
1443
1444 // x | !x -> AllOnes
1446 return Def->replaceAllUsesWith(
1447 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1448
1449 // x & 0 -> 0
1450 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1451 return Def->replaceAllUsesWith(
1452 Plan->getZero(TypeInfo.inferScalarType(Def)));
1453
1454 // x & AllOnes -> x
1455 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_AllOnes())))
1456 return Def->replaceAllUsesWith(X);
1457
1458 // x && false -> false
1459 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_False())))
1460 return Def->replaceAllUsesWith(Plan->getFalse());
1461
1462 // x && true -> x
1463 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_True())))
1464 return Def->replaceAllUsesWith(X);
1465
1466 // (x && y) | (x && z) -> x && (y | z)
1467 if (CanCreateNewRecipe &&
1470 // Simplify only if one of the operands has one use to avoid creating an
1471 // extra recipe.
1472 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1473 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1474 return Def->replaceAllUsesWith(
1475 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1476
1477 // x && (x && y) -> x && y
1478 if (match(Def, m_LogicalAnd(m_VPValue(X),
1480 return Def->replaceAllUsesWith(Def->getOperand(1));
1481
1482 // x && (y && x) -> x && y
1483 if (match(Def, m_LogicalAnd(m_VPValue(X),
1485 return Def->replaceAllUsesWith(Builder.createLogicalAnd(X, Y));
1486
1487 // x && !x -> 0
1489 return Def->replaceAllUsesWith(Plan->getFalse());
1490
1491 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1492 return Def->replaceAllUsesWith(X);
1493
1494 // select c, false, true -> not c
1495 VPValue *C;
1496 if (CanCreateNewRecipe &&
1497 match(Def, m_Select(m_VPValue(C), m_False(), m_True())))
1498 return Def->replaceAllUsesWith(Builder.createNot(C));
1499
1500 // select !c, x, y -> select c, y, x
1501 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1502 Def->setOperand(0, C);
1503 Def->setOperand(1, Y);
1504 Def->setOperand(2, X);
1505 return;
1506 }
1507
1508 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1509 return Def->replaceAllUsesWith(A);
1510
1511 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1512 return Def->replaceAllUsesWith(A);
1513
1514 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1515 return Def->replaceAllUsesWith(
1516 Plan->getZero(TypeInfo.inferScalarType(Def)));
1517
1518 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_AllOnes()))) {
1519 // Preserve nsw from the Mul on the new Sub.
1521 false, cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap()};
1522 return Def->replaceAllUsesWith(
1523 Builder.createSub(Plan->getZero(TypeInfo.inferScalarType(A)), A,
1524 Def->getDebugLoc(), "", NW));
1525 }
1526
1527 if (CanCreateNewRecipe &&
1529 // Preserve nsw from the Add and the Sub, if it's present on both, on the
1530 // new Sub.
1532 false,
1533 cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap() &&
1534 cast<VPRecipeWithIRFlags>(Def->getOperand(Def->getOperand(0) == X))
1535 ->hasNoSignedWrap()};
1536 return Def->replaceAllUsesWith(
1537 Builder.createSub(X, Y, Def->getDebugLoc(), "", NW));
1538 }
1539
1540 const APInt *APC;
1541 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) &&
1542 APC->isPowerOf2())
1543 return Def->replaceAllUsesWith(Builder.createNaryOp(
1544 Instruction::Shl,
1545 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1546 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1547
1548 if (CanCreateNewRecipe && match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) &&
1549 APC->isPowerOf2())
1550 return Def->replaceAllUsesWith(Builder.createNaryOp(
1551 Instruction::LShr,
1552 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1553 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1554
1555 if (match(Def, m_Not(m_VPValue(A)))) {
1556 if (match(A, m_Not(m_VPValue(A))))
1557 return Def->replaceAllUsesWith(A);
1558
1559 // Try to fold Not into compares by adjusting the predicate in-place.
1560 CmpPredicate Pred;
1561 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1562 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1563 if (all_of(Cmp->users(),
1565 m_Not(m_Specific(Cmp)),
1566 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1567 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1568 for (VPUser *U : to_vector(Cmp->users())) {
1569 auto *R = cast<VPSingleDefRecipe>(U);
1570 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1571 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1572 R->setOperand(1, Y);
1573 R->setOperand(2, X);
1574 } else {
1575 // not (cmp pred) -> cmp inv_pred
1576 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1577 R->replaceAllUsesWith(Cmp);
1578 }
1579 }
1580 // If Cmp doesn't have a debug location, use the one from the negation,
1581 // to preserve the location.
1582 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1583 Cmp->setDebugLoc(Def->getDebugLoc());
1584 }
1585 }
1586 }
1587
1588 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1589 // any-of (fcmp uno %A, %B), ...
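// Single-use operands of the form (fcmp uno %X, %X) are combined pairwise; an
// unpaired leftover compare is kept unchanged.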
1590 if (match(Def, m_AnyOf())) {
1592 VPRecipeBase *UnpairedCmp = nullptr;
1593 for (VPValue *Op : Def->operands()) {
1594 VPValue *X;
1595 if (Op->getNumUsers() > 1 ||
1597 m_Deferred(X)))) {
1598 NewOps.push_back(Op);
1599 } else if (!UnpairedCmp) {
1600 UnpairedCmp = Op->getDefiningRecipe();
1601 } else {
1602 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1603 UnpairedCmp->getOperand(0), X));
1604 UnpairedCmp = nullptr;
1605 }
1606 }
1607
1608 if (UnpairedCmp)
1609 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1610
1611 if (NewOps.size() < Def->getNumOperands()) {
1612 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1613 return Def->replaceAllUsesWith(NewAnyOf);
1614 }
1615 }
1616
1617 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1618 // This is useful for fmax/fmin without fast-math flags, where we need to
1619 // check if any operand is NaN.
1620 if (CanCreateNewRecipe &&
1622 m_Deferred(X)),
1624 m_Deferred(Y))))) {
1625 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1626 return Def->replaceAllUsesWith(NewCmp);
1627 }
1628
1629 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1630 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1631 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1632 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1633 TypeInfo.inferScalarType(Def))
1634 return Def->replaceAllUsesWith(Def->getOperand(1));
1635
1637 m_One()))) {
1638 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1639 if (TypeInfo.inferScalarType(X) != WideStepTy)
1640 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1641 Def->replaceAllUsesWith(X);
1642 return;
1643 }
1644
1645 // For i1 vp.merges produced by AnyOf reductions:
1646 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1648 m_VPValue(X), m_VPValue())) &&
1650 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1651 Def->setOperand(1, Def->getOperand(0));
1652 Def->setOperand(0, Y);
1653 return;
1654 }
1655
1656 // Simplify MaskedCond with no block mask to its single operand.
1658 !cast<VPInstruction>(Def)->isMasked())
1659 return Def->replaceAllUsesWith(Def->getOperand(0));
1660
1661 // Look through ExtractLastLane.
1662 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
1663 if (match(A, m_BuildVector())) {
1664 auto *BuildVector = cast<VPInstruction>(A);
1665 Def->replaceAllUsesWith(
1666 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1667 return;
1668 }
1669 if (Plan->hasScalarVFOnly())
1670 return Def->replaceAllUsesWith(A);
1671 }
1672
1673 // Look through ExtractPenultimateElement (BuildVector ...).
1675 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1676 Def->replaceAllUsesWith(
1677 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1678 return;
1679 }
1680
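// Replace an extract of a known lane from a BuildVector with the
// corresponding operand.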
1681 uint64_t Idx;
1683 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1684 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1685 return;
1686 }
1687
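// A BuildVector whose operands are all identical is just a broadcast of that
// value.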
1688 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1689 Def->replaceAllUsesWith(
1690 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1691 return;
1692 }
1693
1694 // Look through a broadcast of a single scalar when used as a select condition; in
1695 // that case the scalar condition can be used directly.
1696 if (match(Def,
1699 "broadcast operand must be single-scalar");
1700 Def->setOperand(0, C);
1701 return;
1702 }
1703
1705 if (Def->getNumOperands() == 1) {
1706 Def->replaceAllUsesWith(Def->getOperand(0));
1707 return;
1708 }
1709 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1710 if (all_equal(Phi->incoming_values()))
1711 Phi->replaceAllUsesWith(Phi->getOperand(0));
1712 }
1713 return;
1714 }
1715
1716 VPIRValue *IRV;
1717 if (Def->getNumOperands() == 1 &&
1719 return Def->replaceAllUsesWith(IRV);
1720
1721 // Some simplifications can only be applied after unrolling. Perform them
1722 // below.
1723 if (!Plan->isUnrolled())
1724 return;
1725
1726 // After unrolling, extract-lane may be used to extract values from multiple
1727 // scalar sources. Only simplify when extracting from a single scalar source.
1728 VPValue *LaneToExtract;
1729 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1730 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1732 return Def->replaceAllUsesWith(A);
1733
1734 // Simplify extract-lane with single source to extract-element.
1735 Def->replaceAllUsesWith(Builder.createNaryOp(
1736 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1737 return;
1738 }
1739
1740 // Look for cycles where Def is of the form:
1741 // X = phi(0, IVInc) ; used only by IVInc, or by IVInc and Inc = X + Y
1742 // IVInc = X + Step ; used by X and Def
1743 // Def = IVInc + Y
1744 // Fold the increment Y into the phi's start value, replace Def with IVInc,
1745 // and if Inc exists, replace it with X.
1746 if (match(Def, m_Add(m_Add(m_VPValue(X), m_VPValue()), m_VPValue(Y))) &&
1747 isa<VPIRValue>(Y) &&
1748 match(X, m_VPPhi(m_ZeroInt(), m_Specific(Def->getOperand(0))))) {
1749 auto *Phi = cast<VPPhi>(X);
1750 auto *IVInc = Def->getOperand(0);
1751 if (IVInc->getNumUsers() == 2) {
1752 // If Phi has a second user (besides IVInc's defining recipe), it must
1753 // be Inc = Phi + Y for the fold to apply.
1756 if (Phi->getNumUsers() == 1 || (Phi->getNumUsers() == 2 && Inc)) {
1757 Def->replaceAllUsesWith(IVInc);
1758 if (Inc)
1759 Inc->replaceAllUsesWith(Phi);
1760 Phi->setOperand(0, Y);
1761 return;
1762 }
1763 }
1764 }
1765
1766 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1767 // just the pointer operand.
1768 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1769 if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
1770 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1771
1772 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1773 // the start index is zero and only the first lane is demanded.
1774 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1775 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1776 Steps->replaceAllUsesWith(Steps->getOperand(0));
1777 return;
1778 }
1779 }
1780 // Simplify redundant ReductionStartVector recipes after unrolling.
1781 VPValue *StartV;
1783 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1784 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1785 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1786 return PhiR && PhiR->isInLoop();
1787 });
1788 return;
1789 }
1790
1792 Def->replaceAllUsesWith(A);
1793 return;
1794 }
1795
1796 if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
1799 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1800 all_of(A->users(),
1801 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1802 return Def->replaceAllUsesWith(A);
1803 }
1804
1805 if (Plan->getConcreteUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1806 return Def->replaceAllUsesWith(A);
1807}
1808
1811 Plan.getEntry());
1812 VPTypeAnalysis TypeInfo(Plan);
1814 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1815 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1816 simplifyRecipe(Def, TypeInfo);
1817 }
1818}
1819
1820/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1821/// header mask to be simplified further when tail folding, e.g. in
1822/// optimizeEVLMasks.
1823static void reassociateHeaderMask(VPlan &Plan) {
1824 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1825 if (!HeaderMask)
1826 return;
1827
1828 SmallVector<VPUser *> Worklist;
1829 for (VPUser *U : HeaderMask->users())
1830 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
1832
1833 while (!Worklist.empty()) {
1834 auto *R = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
1835 VPValue *X, *Y;
1836 if (!R || !match(R, m_LogicalAnd(
1837 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
1838 m_VPValue(Y))))
1839 continue;
1840 append_range(Worklist, R->users());
1841 VPBuilder Builder(R);
1842 R->replaceAllUsesWith(
1843 Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
1844 }
1845}
1846
1848 if (Plan.hasScalarVFOnly())
1849 return;
1850
1851 // Try to narrow wide and replicating recipes to single scalar recipes,
1852 // based on VPlan analysis. Only process blocks in the loop region for now,
1853 // without traversing into nested regions, as recipes in replicate regions
1854 // cannot be converted yet.
1857 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1859 continue;
1860 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1861 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1862 continue;
1863
1864 auto *RepOrWidenR = cast<VPRecipeWithIRFlags>(&R);
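// A replicating store to a single-scalar address only needs a single lane of
// the stored value; rewrite it as a single-scalar store of the extracted last
// lane.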
1865 if (RepR && RepR->getOpcode() == Instruction::Store &&
1866 vputils::isSingleScalar(RepR->getOperand(1))) {
1867 auto *Clone = new VPReplicateRecipe(
1868 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1869 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1870 *RepR /*Metadata*/, RepR->getDebugLoc());
1871 Clone->insertBefore(RepOrWidenR);
1872 VPBuilder Builder(Clone);
1873 VPValue *ExtractOp = Clone->getOperand(0);
1874 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1875 ExtractOp =
1876 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1877 ExtractOp =
1878 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1879 Clone->setOperand(0, ExtractOp);
1880 RepR->eraseFromParent();
1881 continue;
1882 }
1883
1884 // Skip recipes that aren't single scalars.
1885 if (!vputils::isSingleScalar(RepOrWidenR))
1886 continue;
1887
1888 // Predicate to check if a user of Op introduces extra broadcasts.
1889 auto IntroducesBCastOf = [](const VPValue *Op) {
1890 return [Op](const VPUser *U) {
1891 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1895 VPI->getOpcode()))
1896 return false;
1897 }
1898 return !U->usesScalars(Op);
1899 };
1900 };
1901
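// Bail if narrowing would force a broadcast of the result for some user
// without removing the need to broadcast any of the operands.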
1902 if (any_of(RepOrWidenR->users(), IntroducesBCastOf(RepOrWidenR)) &&
1903 none_of(RepOrWidenR->operands(), [&](VPValue *Op) {
1904 if (any_of(
1905 make_filter_range(Op->users(), not_equal_to(RepOrWidenR)),
1906 IntroducesBCastOf(Op)))
1907 return false;
1908 // Non-constant live-ins require broadcasts, while constants do not
1909 // need explicit broadcasts.
1910 auto *IRV = dyn_cast<VPIRValue>(Op);
1911 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1912 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1913 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1914 }))
1915 continue;
1916
1917 auto *Clone = new VPReplicateRecipe(
1918 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1919 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1920 Clone->insertBefore(RepOrWidenR);
1921 RepOrWidenR->replaceAllUsesWith(Clone);
1922 if (isDeadRecipe(*RepOrWidenR))
1923 RepOrWidenR->eraseFromParent();
1924 }
1925 }
1926}
1927
1928/// Try to see if all of \p Blend's masks share a common logically and'ed value
1929/// and, if so, remove it from the masks.
1931 if (Blend->isNormalized())
1932 return;
1933 VPValue *CommonEdgeMask;
1934 if (!match(Blend->getMask(0),
1935 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1936 return;
1937 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1938 if (!match(Blend->getMask(I),
1939 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1940 return;
1941 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1942 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1943}
1944
1945/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1946/// to make sure the masks are simplified.
1947static void simplifyBlends(VPlan &Plan) {
1950 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1951 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1952 if (!Blend)
1953 continue;
1954
1955 removeCommonBlendMask(Blend);
1956
1957 // Try to remove redundant blend recipes.
1958 SmallPtrSet<VPValue *, 4> UniqueValues;
1959 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1960 UniqueValues.insert(Blend->getIncomingValue(0));
1961 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1962 if (!match(Blend->getMask(I), m_False()))
1963 UniqueValues.insert(Blend->getIncomingValue(I));
1964
1965 if (UniqueValues.size() == 1) {
1966 Blend->replaceAllUsesWith(*UniqueValues.begin());
1967 Blend->eraseFromParent();
1968 continue;
1969 }
1970
1971 if (Blend->isNormalized())
1972 continue;
1973
1974 // Normalize the blend so its first incoming value is used as the initial
1975 // value with the others blended into it.
1976
1977 unsigned StartIndex = 0;
1978 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1979 // If a value's mask is used only by the blend then it can be deadcoded.
1980 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1981 // that's used by multiple blends where it can be removed from them all.
1982 VPValue *Mask = Blend->getMask(I);
1983 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1984 StartIndex = I;
1985 break;
1986 }
1987 }
1988
1989 SmallVector<VPValue *, 4> OperandsWithMask;
1990 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1991
1992 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1993 if (I == StartIndex)
1994 continue;
1995 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1996 OperandsWithMask.push_back(Blend->getMask(I));
1997 }
1998
1999 auto *NewBlend =
2000 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
2001 OperandsWithMask, *Blend, Blend->getDebugLoc());
2002 NewBlend->insertBefore(&R);
2003
2004 VPValue *DeadMask = Blend->getMask(StartIndex);
2005 Blend->replaceAllUsesWith(NewBlend);
2006 Blend->eraseFromParent();
2008
2009 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
2010 VPValue *NewMask;
2011 if (NewBlend->getNumOperands() == 3 &&
2012 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
2013 VPValue *Inc0 = NewBlend->getOperand(0);
2014 VPValue *Inc1 = NewBlend->getOperand(1);
2015 VPValue *OldMask = NewBlend->getOperand(2);
2016 NewBlend->setOperand(0, Inc1);
2017 NewBlend->setOperand(1, Inc0);
2018 NewBlend->setOperand(2, NewMask);
2019 if (OldMask->getNumUsers() == 0)
2020 cast<VPInstruction>(OldMask)->eraseFromParent();
2021 }
2022 }
2023 }
2024}
2025
2026/// Optimize the width of vector induction variables in \p Plan based on a known
2027/// constant trip count, \p BestVF and \p BestUF.
2029 ElementCount BestVF,
2030 unsigned BestUF) {
2031 // Only proceed if we have not completely removed the vector region.
2032 if (!Plan.getVectorLoopRegion())
2033 return false;
2034
2035 const APInt *TC;
2036 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
2037 return false;
2038
2039 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
2040 // and UF. Returns at least 8.
2041 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
2042 APInt AlignedTC =
2045 APInt MaxVal = AlignedTC - 1;
2046 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
2047 };
2048 unsigned NewBitWidth =
2049 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
2050
2051 LLVMContext &Ctx = Plan.getContext();
2052 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
2053
2054 bool MadeChange = false;
2055
2056 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2057 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2058 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2059
2060 // Currently only handle canonical IVs as it is trivial to replace the start
2061 // and stop values, and we currently only perform the optimization when the
2062 // IV has a single use.
2063 if (!WideIV || !WideIV->isCanonical() ||
2064 WideIV->hasMoreThanOneUniqueUser() ||
2065 NewIVTy == WideIV->getScalarType())
2066 continue;
2067
2068 // Currently only handle cases where the single user is a header-mask
2069 // comparison with the backedge-taken-count.
2070 VPUser *SingleUser = WideIV->getSingleUser();
2071 if (!SingleUser ||
2072 !match(SingleUser,
2073 m_ICmp(m_Specific(WideIV),
2075 continue;
2076
2077 // Update IV operands and comparison bound to use new narrower type.
2078 auto *NewStart = Plan.getZero(NewIVTy);
2079 WideIV->setStartValue(NewStart);
2080 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
2081 WideIV->setStepValue(NewStep);
2082
2083 auto *NewBTC = new VPWidenCastRecipe(
2084 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
2085 nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
2086 Plan.getVectorPreheader()->appendRecipe(NewBTC);
2087 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
2088 Cmp->setOperand(1, NewBTC);
2089
2090 MadeChange = true;
2091 }
2092
2093 return MadeChange;
2094}
2095
2096/// Return true if \p Cond is known to be true for the given \p BestVF and \p
2097/// BestUF.
2099 ElementCount BestVF, unsigned BestUF,
2102 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
2103 &PSE](VPValue *C) {
2104 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
2105 });
2106
2107 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
2110 m_c_Add(m_Specific(CanIV), m_Specific(&Plan.getVFxUF())),
2111 m_Specific(&Plan.getVectorTripCount()))))
2112 return false;
2113
2114 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
2115 // count is not conveniently available as SCEV so far, so we compare directly
2116 // against the original trip count. This is stricter than necessary, as we
2117 // will only return true if the trip count == vector trip count.
2118 const SCEV *VectorTripCount =
2120 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2121 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
2122 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2123 "Trip count SCEV must be computable");
2124 ScalarEvolution &SE = *PSE.getSE();
2125 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2126 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2127 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
2128}
2129
2130/// Try to replace multiple active lane masks used for control flow with
2131/// a single, wide active lane mask instruction followed by multiple
2132/// extract subvector intrinsics. This applies to the active lane mask
2133/// instructions both in the loop and in the preheader.
2134/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
2135/// new extracts from the first active lane mask, which has its last
2136/// operand (multiplier) set to UF.
2138 unsigned UF) {
2139 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
2140 return false;
2141
2142 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2143 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2144 auto *Term = &ExitingVPBB->back();
2145
2146 using namespace llvm::VPlanPatternMatch;
2148 m_VPValue(), m_VPValue(), m_VPValue())))))
2149 return false;
2150
2151 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2152 LLVMContext &Ctx = Plan.getContext();
2153
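// Helper extracting a VF-wide mask for each unroll part from a wide active
// lane mask via llvm.vector.extract.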
2154 auto ExtractFromALM = [&](VPInstruction *ALM,
2155 SmallVectorImpl<VPValue *> &Extracts) {
2156 DebugLoc DL = ALM->getDebugLoc();
2157 for (unsigned Part = 0; Part < UF; ++Part) {
2159 Ops.append({ALM, Plan.getConstantInt(64, VF.getKnownMinValue() * Part)});
2160 auto *Ext =
2161 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
2162 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
2163 Extracts[Part] = Ext;
2164 Ext->insertAfter(ALM);
2165 }
2166 };
2167
2168 // Create a list of each active lane mask phi, ordered by unroll part.
2170 for (VPRecipeBase &R : Header->phis()) {
2172 if (!Phi)
2173 continue;
2174 VPValue *Index = nullptr;
2175 match(Phi->getBackedgeValue(),
2177 assert(Index && "Expected index from ActiveLaneMask instruction");
2178
2179 uint64_t Part;
2180 if (match(Index,
2182 m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
2183 Phis[Part] = Phi;
2184 else {
2185 // Anything other than a CanonicalIVIncrementForPart is part 0
2186 assert(!match(
2187 Index,
2189 Phis[0] = Phi;
2190 }
2191 }
2192
2193 assert(all_of(Phis, not_equal_to(nullptr)) &&
2194 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2195
2196 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
2197 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
2198
2199 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2200 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2201 "Expected incoming values of Phi to be ActiveLaneMasks");
2202
2203 // When using wide lane masks, the get.active.lane.mask intrinsic returns a
2204 // mask of VF x UF lanes, as given by its last operand (the multiplier).
2205 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
2206 EntryALM->setOperand(2, ALMMultiplier);
2207 LoopALM->setOperand(2, ALMMultiplier);
2208
2209 // Create UF x extract vectors and insert into preheader.
2210 SmallVector<VPValue *> EntryExtracts(UF);
2211 ExtractFromALM(EntryALM, EntryExtracts);
2212
2213 // Create UF x extract vectors and insert before the loop compare & branch,
2214 // updating the compare to use the first extract.
2215 SmallVector<VPValue *> LoopExtracts(UF);
2216 ExtractFromALM(LoopALM, LoopExtracts);
2217 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2218 Not->setOperand(0, LoopExtracts[0]);
2219
2220 // Update the incoming values of active lane mask phis.
2221 for (unsigned Part = 0; Part < UF; ++Part) {
2222 Phis[Part]->setStartValue(EntryExtracts[Part]);
2223 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2224 }
2225
2226 return true;
2227}
2228
2229/// Try to simplify the branch condition of \p Plan. This may restrict the
2230/// resulting plan to \p BestVF and \p BestUF.
2232 unsigned BestUF,
2234 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2235 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2236 auto *Term = &ExitingVPBB->back();
2237 VPValue *Cond;
2238 auto m_CanIVInc = m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF()));
2239 // Check if the branch condition compares the canonical IV increment (for the
2240 // main loop), or the canonical IV increment plus an offset (for the epilogue loop).
2241 if (match(Term, m_BranchOnCount(
2242 m_CombineOr(m_CanIVInc, m_c_Add(m_CanIVInc, m_LiveIn())),
2243 m_VPValue())) ||
2245 m_VPValue(), m_VPValue(), m_VPValue()))))) {
2246 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2247 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2248 const SCEV *VectorTripCount =
2250 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2251 VectorTripCount =
2253 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2254 "Trip count SCEV must be computable");
2255 ScalarEvolution &SE = *PSE.getSE();
2256 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2257 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2258 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2259 return false;
2260 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2262 // For BranchOnCond, check if we can prove the condition to be true using VF
2263 // and UF.
2264 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2265 return false;
2266 } else {
2267 return false;
2268 }
2269
2270 // The vector loop region only executes once. Convert the terminator of the
2271 // exiting block to exit in the first iteration.
2272 if (match(Term, m_BranchOnTwoConds())) {
2273 Term->setOperand(1, Plan.getTrue());
2274 return true;
2275 }
2276
2277 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, Plan.getTrue(), {},
2278 {}, Term->getDebugLoc());
2279 ExitingVPBB->appendRecipe(BOC);
2280 Term->eraseFromParent();
2281
2282 return true;
2283}
2284
2285/// From the definition of llvm.experimental.get.vector.length,
2286/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
2290 vp_depth_first_deep(Plan.getEntry()))) {
2291 for (VPRecipeBase &R : *VPBB) {
2292 VPValue *AVL;
2293 if (!match(&R, m_EVL(m_VPValue(AVL))))
2294 continue;
2295
2296 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2297 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2298 continue;
2299 ScalarEvolution &SE = *PSE.getSE();
2300 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2301 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2302 continue;
2303
2305 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2306 R.getDebugLoc());
2307 if (Trunc != AVL) {
2308 auto *TruncR = cast<VPSingleDefRecipe>(Trunc);
2309 const DataLayout &DL = Plan.getDataLayout();
2310 VPTypeAnalysis TypeInfo(Plan);
2311 if (VPValue *Folded =
2312 tryToFoldLiveIns(*TruncR, TruncR->operands(), DL, TypeInfo))
2313 Trunc = Folded;
2314 }
2315 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2316 return true;
2317 }
2318 }
2319 return false;
2320}
2321
2323 unsigned BestUF,
2325 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2326 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2327
2328 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2329 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2330 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2331
2332 if (MadeChange) {
2333 Plan.setVF(BestVF);
2334 assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2335 }
2336}
2337
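// For add/mul/sub-style reductions, drop poison-generating flags from recipes
// using the reduction phi, since vectorization may reorder the reduction
// operations and the original wrap guarantees may no longer hold.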
2339 for (VPRecipeBase &R :
2341 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2342 if (!PhiR)
2343 continue;
2344 RecurKind RK = PhiR->getRecurrenceKind();
2345 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2347 continue;
2348
2349 for (VPUser *U : collectUsersRecursively(PhiR))
2350 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2351 RecWithFlags->dropPoisonGeneratingFlags();
2352 }
2353 }
2354}
2355
2356namespace {
2357struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2358 static bool isSentinel(const VPSingleDefRecipe *Def) {
2359 return Def == getEmptyKey() || Def == getTombstoneKey();
2360 }
2361
2362 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2363 /// return that source element type.
2364 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2365 // All VPInstructions that lower to GEPs must have the i8 source element
2366 // type (as they are PtrAdds), so we omit it.
2368 .Case([](const VPReplicateRecipe *I) -> Type * {
2369 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2370 return GEP->getSourceElementType();
2371 return nullptr;
2372 })
2373 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2374 [](auto *I) { return I->getSourceElementType(); })
2375 .Default([](auto *) { return nullptr; });
2376 }
2377
2378 /// Returns true if recipe \p Def can be safely handled for CSE.
2379 static bool canHandle(const VPSingleDefRecipe *Def) {
2380 // We can extend the list of handled recipes in the future,
2381 // provided we account for the data embedded in them while checking for
2382 // equality or hashing.
2383 auto C = getOpcodeOrIntrinsicID(Def);
2384
2385 // The issue with (Insert|Extract)Value is that the index of the
2386 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2387 // VPlan.
2388 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2389 C->second == Instruction::ExtractValue)))
2390 return false;
2391
2392 // During CSE, we can only handle recipes that don't read from memory: if
2393 // they read from memory, there could be an intervening write to memory
2394 // before the next instance is CSE'd, leading to an incorrect result.
2395 return !Def->mayReadFromMemory();
2396 }
2397
2398 /// Hash the underlying data of \p Def.
2399 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2400 const VPlan *Plan = Def->getParent()->getPlan();
2401 VPTypeAnalysis TypeInfo(*Plan);
2402 hash_code Result = hash_combine(
2403 Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2404 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2406 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2407 if (RFlags->hasPredicate())
2408 return hash_combine(Result, RFlags->getPredicate());
2409 if (auto *SIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Def))
2410 return hash_combine(Result, SIVSteps->getInductionOpcode());
2411 return Result;
2412 }
2413
2414 /// Check equality of underlying data of \p L and \p R.
2415 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2416 if (isSentinel(L) || isSentinel(R))
2417 return L == R;
2418 if (L->getVPRecipeID() != R->getVPRecipeID() ||
2420 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2422 !equal(L->operands(), R->operands()))
2423 return false;
2425 "must have valid opcode info for both recipes");
2426 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2427 if (LFlags->hasPredicate() &&
2428 LFlags->getPredicate() !=
2429 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2430 return false;
2431 if (auto *LSIV = dyn_cast<VPScalarIVStepsRecipe>(L))
2432 if (LSIV->getInductionOpcode() !=
2433 cast<VPScalarIVStepsRecipe>(R)->getInductionOpcode())
2434 return false;
2435 // Recipes in replicate regions implicitly depend on the region's predicate. If either
2436 // recipe is in a replicate region, only consider them equal if both have
2437 // the same parent.
2438 const VPRegionBlock *RegionL = L->getRegion();
2439 const VPRegionBlock *RegionR = R->getRegion();
2440 if (((RegionL && RegionL->isReplicator()) ||
2441 (RegionR && RegionR->isReplicator())) &&
2442 L->getParent() != R->getParent())
2443 return false;
2444 const VPlan *Plan = L->getParent()->getPlan();
2445 VPTypeAnalysis TypeInfo(*Plan);
2446 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2447 }
2448};
2449} // end anonymous namespace
2450
2451/// Perform common-subexpression elimination of VPSingleDefRecipes on the \p
2452/// Plan.
2454 VPDominatorTree VPDT(Plan);
2456
2458 Plan.getEntry());
2460 for (VPRecipeBase &R : *VPBB) {
2461 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2462 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2463 continue;
2464 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2465 // V must dominate Def for a valid replacement.
2466 if (!VPDT.dominates(V->getParent(), VPBB))
2467 continue;
2468 // Only keep flags present on both V and Def.
2469 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2470 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2471 Def->replaceAllUsesWith(V);
2472 continue;
2473 }
2474 CSEMap[Def] = Def;
2475 }
2476 }
2477}
2478
2479/// Move loop-invariant recipes out of the vector loop region in \p Plan.
2480static void licm(VPlan &Plan) {
2481 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2482
2483 // Hoist any loop invariant recipes from the vector loop region to the
2484 // preheader. Perform a shallow traversal of the vector loop region, to
2485 // exclude recipes in replicate regions. Since the top-level blocks in the
2486 // vector loop region are guaranteed to execute if the vector pre-header is,
2487 // we don't need to check speculation safety.
2488 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2489 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2490 "Expected vector prehader's successor to be the vector loop region");
2492 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2493 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2495 continue;
2496 if (any_of(R.operands(), [](VPValue *Op) {
2497 return !Op->isDefinedOutsideLoopRegions();
2498 }))
2499 continue;
2500 R.moveBefore(*Preheader, Preheader->end());
2501 }
2502 }
2503
2504#ifndef NDEBUG
2505 VPDominatorTree VPDT(Plan);
2506#endif
2507 // Sink recipes with no users inside the vector loop region if all users are
2508 // in the same exit block of the region.
2509 // TODO: Extend to sink recipes from inner loops.
2511 LoopRegion->getEntry());
2513 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
2514 if (vputils::cannotHoistOrSinkRecipe(R, /*Sinking=*/true))
2515 continue;
2516
2517 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
2518 assert(!RepR->isPredicated() &&
2519 "Expected prior transformation of predicated replicates to "
2520 "replicate regions");
2521 // narrowToSingleScalarRecipes should have already maximally narrowed
2522 // replicates to single-scalar replicates.
2523 // TODO: When unrolling, replicateByVF doesn't handle sunk
2524 // non-single-scalar replicates correctly.
2525 if (!RepR->isSingleScalar())
2526 continue;
2527 }
2528
2529 // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2530 // support recipes with multiple defined values (e.g., interleaved loads).
2531 auto *Def = cast<VPSingleDefRecipe>(&R);
2532
2533 // Cannot sink the recipe if the user is defined in a loop region or a
2534 // non-successor of the vector loop region. Cannot sink if the user is a phi
2535 // either.
2536 VPBasicBlock *SinkBB = nullptr;
2537 if (any_of(Def->users(), [&SinkBB, &LoopRegion](VPUser *U) {
2538 auto *UserR = cast<VPRecipeBase>(U);
2539 VPBasicBlock *Parent = UserR->getParent();
2540 // TODO: Support sinking when users are in multiple blocks.
2541 if (SinkBB && SinkBB != Parent)
2542 return true;
2543 SinkBB = Parent;
2544 // TODO: If the user is a PHI node, we should check the block of
2545 // incoming value. Support PHI node users if needed.
2546 return UserR->isPhi() || Parent->getEnclosingLoopRegion() ||
2547 Parent->getSinglePredecessor() != LoopRegion;
2548 }))
2549 continue;
2550
2551 if (!SinkBB)
2552 SinkBB = cast<VPBasicBlock>(LoopRegion->getSingleSuccessor());
2553
2554 // TODO: This will need to be a check instead of an assert after
2555 // conditional branches in vectorized loops are supported.
2556 assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2557 "Defining block must dominate sink block");
2558 // TODO: Clone the recipe if users are on multiple exit paths, instead of
2559 // just moving.
2560 Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
2561 }
2562 }
2563}
2564
2566 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2567 if (Plan.hasScalarVFOnly())
2568 return;
2569 // Keep track of created truncates, so they can be re-used. Note that we
2570 // cannot use RAUW after creating a new truncate, as this could make
2571 // other uses have different types for their operands, making them invalidly
2572 // typed.
2574 VPTypeAnalysis TypeInfo(Plan);
2575 VPBasicBlock *PH = Plan.getVectorPreheader();
2578 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2581 continue;
2582
2583 VPValue *ResultVPV = R.getVPSingleValue();
2584 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2585 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2586 if (!NewResSizeInBits)
2587 continue;
2588
2589 // If the value wasn't vectorized, we must maintain the original scalar
2590 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2591 // skip casts which do not need to be handled explicitly here, as
2592 // redundant casts will be removed during recipe simplification.
2594 continue;
2595
2596 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2597 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2598 assert(OldResTy->isIntegerTy() && "only integer types supported");
2599 (void)OldResSizeInBits;
2600
2601 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2602
2603 // Any wrapping introduced by shrinking this operation shouldn't be
2604 // considered undefined behavior. So, we can't unconditionally copy
2605 // arithmetic wrapping flags to VPW.
2606 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2607 VPW->dropPoisonGeneratingFlags();
2608
2609 if (OldResSizeInBits != NewResSizeInBits &&
2610 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2611 // Extend result to original width.
2612 auto *Ext = new VPWidenCastRecipe(
2613 Instruction::ZExt, ResultVPV, OldResTy, nullptr,
2614 VPIRFlags::getDefaultFlags(Instruction::ZExt));
2615 Ext->insertAfter(&R);
2616 ResultVPV->replaceAllUsesWith(Ext);
2617 Ext->setOperand(0, ResultVPV);
2618 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2619 } else {
2620 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2621 "Only ICmps should not need extending the result.");
2622 }
2623
2624 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2626 continue;
2627
2628 // Shrink operands by introducing truncates as needed.
2629 unsigned StartIdx =
2630 match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2631 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2632 auto *Op = R.getOperand(Idx);
2633 unsigned OpSizeInBits =
2635 if (OpSizeInBits == NewResSizeInBits)
2636 continue;
2637 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2638 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2639 if (!IterIsEmpty) {
2640 R.setOperand(Idx, ProcessedIter->second);
2641 continue;
2642 }
2643
2644 VPBuilder Builder;
2645 if (isa<VPIRValue>(Op))
2646 Builder.setInsertPoint(PH);
2647 else
2648 Builder.setInsertPoint(&R);
2649 VPWidenCastRecipe *NewOp =
2650 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2651 ProcessedIter->second = NewOp;
2652 R.setOperand(Idx, NewOp);
2653 }
2654
2655 }
2656 }
2657}
2658
2659void VPlanTransforms::removeBranchOnConst(VPlan &Plan, bool OnlyLatches) {
2660 std::optional<VPDominatorTree> VPDT;
2661 if (OnlyLatches)
2662 VPDT.emplace(Plan);
2663
2664 // Collect all blocks before modifying the CFG so we can identify unreachable
2665 // ones after constant branch removal.
2667
2668 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(AllBlocks)) {
2669 VPValue *Cond;
2670 // Skip blocks that are not terminated by BranchOnCond.
2671 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2672 continue;
2673
2674 if (OnlyLatches && !VPBlockUtils::isLatch(VPBB, *VPDT))
2675 continue;
2676
2677 assert(VPBB->getNumSuccessors() == 2 &&
2678 "Two successors expected for BranchOnCond");
2679 unsigned RemovedIdx;
2680 if (match(Cond, m_True()))
2681 RemovedIdx = 1;
2682 else if (match(Cond, m_False()))
2683 RemovedIdx = 0;
2684 else
2685 continue;
2686
2687 VPBasicBlock *RemovedSucc =
2688 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2689 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2690 "There must be a single edge between VPBB and its successor");
2691 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2692 // these recipes.
2693 for (VPRecipeBase &R : RemovedSucc->phis())
2694 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2695
2696 // Disconnect blocks and remove the terminator.
2697 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2698 VPBB->back().eraseFromParent();
2699 }
2700
2701 // Compute which blocks are still reachable from the entry after constant
2702 // branch removal.
2705
2706 // Detach all unreachable blocks from their successors, removing their recipes
2707 // and incoming values from phi recipes.
2708 VPSymbolicValue Tmp;
2709 for (VPBlockBase *B : AllBlocks) {
2710 if (Reachable.contains(B))
2711 continue;
2712 for (VPBlockBase *Succ : to_vector(B->successors())) {
2713 if (auto *SuccBB = dyn_cast<VPBasicBlock>(Succ))
2714 for (VPRecipeBase &R : SuccBB->phis())
2715 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(B);
2717 }
2718 for (VPBasicBlock *DeadBB :
2720 for (VPRecipeBase &R : make_early_inc_range(*DeadBB)) {
2721 for (VPValue *Def : R.definedValues())
2722 Def->replaceAllUsesWith(&Tmp);
2723 R.eraseFromParent();
2724 }
2725 }
2726 }
2727}
2728
2750
2751// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2752// the loop terminator with a branch-on-cond recipe with the negated
2753// active-lane-mask as operand. Note that this turns the loop into an
2754// uncountable one. Only the existing terminator is replaced; all other existing
2755// recipes/users remain unchanged, except for poison-generating flags being
2756// dropped from the canonical IV increment. Return the created
2757// VPActiveLaneMaskPHIRecipe.
2758//
2759// The function adds the following recipes:
2760//
2761// vector.ph:
2762// %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
2763// %EntryALM = active-lane-mask %EntryInc, TC
2764//
2765// vector.body:
2766// ...
2767// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2768// ...
2769// %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
2770// %ALM = active-lane-mask %InLoopInc, TC
2771// %Negated = Not %ALM
2772// branch-on-cond %Negated
2773//
2776 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2777 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2778 VPValue *StartV = Plan.getZero(TopRegion->getCanonicalIVType());
2779 auto *CanonicalIVIncrement = TopRegion->getOrCreateCanonicalIVIncrement();
2780 // TODO: Check if dropping the flags is needed.
2781 TopRegion->clearCanonicalIVNUW(CanonicalIVIncrement);
2782 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2783 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2784 // we have to take unrolling into account. Each part needs to start at
2785 // Part * VF
2786 auto *VecPreheader = Plan.getVectorPreheader();
2787 VPBuilder Builder(VecPreheader);
2788
2789 // Create the ActiveLaneMask instruction using the correct start values.
2790 VPValue *TC = Plan.getTripCount();
2791 VPValue *VF = &Plan.getVF();
2792
2793 auto *EntryIncrement = Builder.createOverflowingOp(
2794 VPInstruction::CanonicalIVIncrementForPart, {StartV, VF}, {false, false},
2795 DL, "index.part.next");
2796
2797 // Create the active lane mask instruction in the VPlan preheader.
2798 VPValue *ALMMultiplier =
2799 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2800 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2801 {EntryIncrement, TC, ALMMultiplier}, DL,
2802 "active.lane.mask.entry");
2803
2804 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2805 // preheader ActiveLaneMask instruction.
2806 auto *LaneMaskPhi =
2808 auto *HeaderVPBB = TopRegion->getEntryBasicBlock();
2809 LaneMaskPhi->insertBefore(*HeaderVPBB, HeaderVPBB->begin());
2810
2811 // Create the active lane mask for the next iteration of the loop before the
2812 // original terminator.
2813 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2814 Builder.setInsertPoint(OriginalTerminator);
2815 auto *InLoopIncrement = Builder.createOverflowingOp(
2817 {CanonicalIVIncrement, &Plan.getVF()}, {false, false}, DL);
2818 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2819 {InLoopIncrement, TC, ALMMultiplier}, DL,
2820 "active.lane.mask.next");
2821 LaneMaskPhi->addOperand(ALM);
2822
2823 // Replace the original terminator with BranchOnCond. We have to invert the
2824 // mask here because a true condition means jumping to the exit block.
2825 auto *NotMask = Builder.createNot(ALM, DL);
2826 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2827 OriginalTerminator->eraseFromParent();
2828 return LaneMaskPhi;
2829}
2830
2832 bool UseActiveLaneMaskForControlFlow) {
2833 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2834 auto *WideCanonicalIV = vputils::findUserOf<VPWidenCanonicalIVRecipe>(
2835 LoopRegion->getCanonicalIV());
2836 assert(WideCanonicalIV &&
2837 "Must have widened canonical IV when tail folding!");
2838 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
2839 VPSingleDefRecipe *LaneMask;
2840 if (UseActiveLaneMaskForControlFlow) {
2841 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
2842 } else {
2843 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2844 VPValue *ALMMultiplier =
2845 Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
2846 LaneMask =
2847 B.createNaryOp(VPInstruction::ActiveLaneMask,
2848 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2849 nullptr, "active.lane.mask");
2850 }
2851
2852 // Walk users of WideCanonicalIV and replace the header mask of the form
2853 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2854 // removing the old one to ensure there is always only a single header mask.
2855 HeaderMask->replaceAllUsesWith(LaneMask);
2856 HeaderMask->eraseFromParent();
2857}
2858
2859template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2860 Op0_t In;
2862
2863 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2864
2865 template <typename OpTy> bool match(OpTy *V) const {
2866 if (m_Specific(In).match(V)) {
2867 Out = nullptr;
2868 return true;
2869 }
2870 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
2871 }
2872};
2873
2874/// Match a specific mask \p In, or a logical-and of \p In and another value.
2875/// On a match, \p Out is set to the remaining part, or to nullptr if the value is exactly \p In.
2876template <typename Op0_t, typename Op1_t>
2877static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2878 Op1_t &Out) {
2879 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2880}
2881
2882/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2883/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2884/// recipe could be created.
2885/// \p HeaderMask Header mask.
2886/// \p CurRecipe The recipe to be transformed.
2887/// \p TypeInfo VPlan-based type analysis.
2888/// \p EVL The explicit vector length parameter of vector-predication
2889/// intrinsics.
2891 VPRecipeBase &CurRecipe,
2892 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
2893 VPlan *Plan = CurRecipe.getParent()->getPlan();
2894 DebugLoc DL = CurRecipe.getDebugLoc();
2895 VPValue *Addr, *Mask, *EndPtr;
2896
2897 /// Adjust any end pointers so that they point to the end of EVL lanes, not VF.
2898 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2899 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2900 EVLEndPtr->insertBefore(&CurRecipe);
2901 EVLEndPtr->setOperand(1, &EVL);
2902 return EVLEndPtr;
2903 };
2904
2905 auto GetVPReverse = [&CurRecipe, &EVL, &TypeInfo, Plan,
2907 if (!V)
2908 return nullptr;
2909 auto *Reverse = new VPWidenIntrinsicRecipe(
2910 Intrinsic::experimental_vp_reverse, {V, Plan->getTrue(), &EVL},
2911 TypeInfo.inferScalarType(V), {}, {}, DL);
2912 Reverse->insertBefore(&CurRecipe);
2913 return Reverse;
2914 };
2915
2916 if (match(&CurRecipe,
2917 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))))
2918 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
2919 EVL, Mask);
2920
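// Reversed masked load: adjust the end pointer to span only EVL lanes,
// reverse the mask via experimental.vp.reverse, emit an EVL load and reverse
// its result.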
2921 VPValue *ReversedVal;
2922 if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
2923 match(ReversedVal,
2924 m_MaskedLoad(m_VPValue(EndPtr),
2925 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
2926 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
2927 Mask = GetVPReverse(Mask);
2928 Addr = AdjustEndPtr(EndPtr);
2929 auto *LoadR = new VPWidenLoadEVLRecipe(
2930 *cast<VPWidenLoadRecipe>(ReversedVal), Addr, EVL, Mask);
2931 LoadR->insertBefore(&CurRecipe);
2932 return new VPWidenIntrinsicRecipe(
2933 Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
2934 TypeInfo.inferScalarType(LoadR), {}, {}, DL);
2935 }
2936
2937 VPValue *StoredVal;
2938 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
2939 m_RemoveMask(HeaderMask, Mask))))
2940 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
2941 StoredVal, EVL, Mask);
2942
2943 if (match(&CurRecipe,
2944 m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
2945 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
2946 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
2947 Mask = GetVPReverse(Mask);
2948 Addr = AdjustEndPtr(EndPtr);
2949 StoredVal = GetVPReverse(ReversedVal);
2950 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
2951 StoredVal, EVL, Mask);
2952 }
2953
2954 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
2955 if (Rdx->isConditional() &&
2956 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
2957 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
2958
2959 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
2960 if (Interleave->getMask() &&
2961 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
2962 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
2963
2964 VPValue *LHS, *RHS;
2965 if (match(&CurRecipe,
2966 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
2967 return new VPWidenIntrinsicRecipe(
2968 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
2969 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2970
2971 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
2972 m_VPValue(RHS))))
2973 return new VPWidenIntrinsicRecipe(
2974 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
2975 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2976
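// Under EVL tail folding, the last active lane of the header mask is simply
// EVL - 1.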
2977 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
2978 Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
2979 VPValue *ZExt = VPBuilder(&CurRecipe)
2981 &EVL, Ty, TypeInfo.inferScalarType(&EVL), DL);
2982 return new VPInstruction(
2983 Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, 1)},
2984 VPIRFlags::getDefaultFlags(Instruction::Sub), {}, DL);
2985 }
2986
2987 // lhs | (headermask && rhs) -> vp.merge rhs, true, lhs, evl
2988 if (match(&CurRecipe,
2990 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(RHS)))))
2991 return new VPWidenIntrinsicRecipe(
2992 Intrinsic::vp_merge, {RHS, Plan->getTrue(), LHS, &EVL},
2993 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2994
2995 return nullptr;
2996}
2997
2998/// Optimize away any EVL-based header masks, rewriting their users to VP-intrinsic-based recipes.
2999/// The transforms here need to preserve the original semantics.
3001 // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3002 VPValue *HeaderMask = nullptr, *EVL = nullptr;
3005 m_VPValue(EVL))) &&
3006 match(EVL, m_EVL(m_VPValue()))) {
3007 HeaderMask = R.getVPSingleValue();
3008 break;
3009 }
3010 }
3011 if (!HeaderMask)
3012 return;
3013
3014 VPTypeAnalysis TypeInfo(Plan);
3015 SmallVector<VPRecipeBase *> OldRecipes;
3016 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3018 if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, TypeInfo, *EVL)) {
3019 NewR->insertBefore(R);
3020 for (auto [Old, New] :
3021 zip_equal(R->definedValues(), NewR->definedValues()))
3022 Old->replaceAllUsesWith(New);
3023 OldRecipes.push_back(R);
3024 }
3025 }
3026
3027 // Replace remaining (HeaderMask && Mask) with vp.merge (True, Mask,
3028 // False, EVL)
3029 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3030 VPValue *Mask;
3031 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) {
3032 auto *LogicalAnd = cast<VPInstruction>(U);
3033 auto *Merge = new VPWidenIntrinsicRecipe(
3034 Intrinsic::vp_merge, {Plan.getTrue(), Mask, Plan.getFalse(), EVL},
3035 TypeInfo.inferScalarType(Mask), {}, {}, LogicalAnd->getDebugLoc());
3036 Merge->insertBefore(LogicalAnd);
3037 LogicalAnd->replaceAllUsesWith(Merge);
3038 OldRecipes.push_back(LogicalAnd);
3039 }
3040 }
3041
3042 // Erase old recipes at the end so we don't invalidate TypeInfo.
3043 for (VPRecipeBase *R : reverse(OldRecipes)) {
3044 SmallVector<VPValue *> PossiblyDead(R->operands());
3045 R->eraseFromParent();
3046 for (VPValue *Op : PossiblyDead)
3048 }
3049}
3050
3051/// After replacing the canonical IV with an EVL-based IV, fix up recipes that use
3052/// VF to use the EVL instead to avoid incorrect updates on the penultimate
3053/// iteration.
3054static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3055 VPTypeAnalysis TypeInfo(Plan);
3056 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3057 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3058
3059 assert(all_of(Plan.getVF().users(),
3062 "User of VF that we can't transform to EVL.");
3063 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3065 });
3066
3067 assert(all_of(Plan.getVFxUF().users(),
3069 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3070 m_Specific(&Plan.getVFxUF())),
3072 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3073 "increment of the canonical induction.");
3074 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3075 // Only replace uses in VPWidenPointerInductionRecipe; the increment of the
3076 // canonical induction must not be updated.
3078 });
3079
3080 // Create a scalar phi to track the previous EVL if the plan contains any
3081 // fixed-order recurrences.
3082 bool ContainsFORs =
3084 if (ContainsFORs) {
3085 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3086 VPValue *MaxEVL = &Plan.getVF();
3087 // Emit a VPScalarCastRecipe in the preheader if VF is not a 32-bit integer.
3088 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3089 MaxEVL = Builder.createScalarZExtOrTrunc(
3090 MaxEVL, Type::getInt32Ty(Plan.getContext()),
3091 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
3092
3093 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3094 VPValue *PrevEVL = Builder.createScalarPhi(
3095 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3096
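// Replace first-order recurrence splices with llvm.experimental.vp.splice,
// using the previous and current EVL.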
3099 for (VPRecipeBase &R : *VPBB) {
3100 VPValue *V1, *V2;
3101 if (!match(&R,
3103 m_VPValue(V1), m_VPValue(V2))))
3104 continue;
3105 VPValue *Imm = Plan.getOrAddLiveIn(
3108 Intrinsic::experimental_vp_splice,
3109 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3110 TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
3111 R.getDebugLoc());
3112 VPSplice->insertBefore(&R);
3113 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3114 }
3115 }
3116 }
3117
3118 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
3119 if (!HeaderMask)
3120 return;
3121
3122 // Ensure that any reduction that uses a select to mask off tail lanes does so
3123 // in the vector loop, not the middle block, since EVL tail folding can have
3124 // tail elements in the penultimate iteration.
3125 assert(all_of(*Plan.getMiddleBlock(), [&Plan, HeaderMask](VPRecipeBase &R) {
3126 if (match(&R, m_ComputeReductionResult(m_Select(m_Specific(HeaderMask),
3127 m_VPValue(), m_VPValue()))))
3128 return R.getOperand(0)->getDefiningRecipe()->getRegion() ==
3129 Plan.getVectorLoopRegion();
3130 return true;
3131 }));
3132
3133 // Replace header masks with a mask equivalent to predicating by EVL:
3134 //
3135 // icmp ule widen-canonical-iv backedge-taken-count
3136 // ->
3137 // icmp ult step-vector, EVL
3138 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3139 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3140 Type *EVLType = TypeInfo.inferScalarType(&EVL);
3141 VPValue *EVLMask = Builder.createICmp(
3143 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3144 HeaderMask->replaceAllUsesWith(EVLMask);
3145}
3146
3147/// Converts a tail-folded vector loop region to step by
3148/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3149/// iteration.
3150///
3151/// - Adds a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3152/// replaces all uses of the canonical IV except for the canonical IV
3153/// increment with a VPCurrentIterationPHIRecipe. The canonical IV is used
3154/// only for counting loop iterations after this transformation.
3155///
3156/// - The header mask is replaced with a header mask based on the EVL.
3157///
3158/// - Plans with FORs have a new phi added to keep track of the EVL of the
3159/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3160/// @llvm.vp.splice.
3161///
3162/// The function uses the following definitions:
3163/// %StartV is the canonical induction start value.
3164///
3165/// The function adds the following recipes:
3166///
3167/// vector.ph:
3168/// ...
3169///
3170/// vector.body:
3171/// ...
3172/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3173/// [ %NextIter, %vector.body ]
3174/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3175/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3176/// ...
3177/// %OpEVL = cast i32 %VPEVL to IVSize
3178/// %NextIter = add IVSize %OpEVL, %CurrentIter
3179/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3180/// ...
3181///
3182/// If MaxSafeElements is provided, the function adds the following recipes:
3183/// vector.ph:
3184/// ...
3185///
3186/// vector.body:
3187/// ...
3188/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3189/// [ %NextIter, %vector.body ]
3190/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3191/// %cmp = cmp ult %AVL, MaxSafeElements
3192/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3193/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3194/// ...
3195/// %OpEVL = cast i32 %VPEVL to IVSize
3196/// %NextIter = add IVSize %OpEVL, %CurrentIter
3197/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3198/// ...
3199///
3201 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3202 if (Plan.hasScalarVFOnly())
3203 return;
3204 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3205 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3206
3207 auto *CanonicalIV = LoopRegion->getCanonicalIV();
3208 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3209 VPValue *StartV = Plan.getZero(CanIVTy);
3210 auto *CanonicalIVIncrement = LoopRegion->getOrCreateCanonicalIVIncrement();
3211
3212 // Create the CurrentIteration recipe in the vector loop.
3213 auto *CurrentIteration =
3214 new VPCurrentIterationPHIRecipe(StartV, DebugLoc::getCompilerGenerated());
3215 CurrentIteration->insertBefore(*Header, Header->begin());
3216 VPBuilder Builder(Header, Header->getFirstNonPhi());
3217 // Create the AVL (application vector length), starting from TC -> 0 in steps
3218 // of EVL.
3219 VPPhi *AVLPhi = Builder.createScalarPhi(
3220 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3221 VPValue *AVL = AVLPhi;
3222
3223 if (MaxSafeElements) {
3224 // Clamp the AVL to MaxSafeElements so each step respects the maximum safe dependence distance.
3225 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3226 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3227 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3228 "safe_avl");
3229 }
3230 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3231 DebugLoc::getUnknown(), "evl");
3232
3233 Builder.setInsertPoint(CanonicalIVIncrement);
3234 VPValue *OpVPEVL = VPEVL;
3235
3236 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3237 OpVPEVL = Builder.createScalarZExtOrTrunc(
3238 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3239
3240 auto *NextIter = Builder.createAdd(
3241 OpVPEVL, CurrentIteration, CanonicalIVIncrement->getDebugLoc(),
3242 "current.iteration.next", CanonicalIVIncrement->getNoWrapFlags());
3243 CurrentIteration->addOperand(NextIter);
3244
3245 VPValue *NextAVL =
3246 Builder.createSub(AVLPhi, OpVPEVL, DebugLoc::getCompilerGenerated(),
3247 "avl.next", {/*NUW=*/true, /*NSW=*/false});
3248 AVLPhi->addOperand(NextAVL);
3249
3250 fixupVFUsersForEVL(Plan, *VPEVL);
3251 removeDeadRecipes(Plan);
3252
3253 // Replace all uses of the canonical IV with VPCurrentIterationPHIRecipe
3254 // except for the canonical IV increment.
3255 CanonicalIV->replaceAllUsesWith(CurrentIteration);
3256 CanonicalIVIncrement->setOperand(0, CanonicalIV);
3257 // TODO: support unroll factor > 1.
3258 Plan.setUF(1);
3259}
3260
3262 // Find the vector loop entry by locating the VPCurrentIterationPHIRecipe.
3263 // There should be only one such recipe in the entire plan.
3264 VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
3265
3268 for (VPRecipeBase &R : VPBB->phis())
3269 if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
3270 assert(!CurrentIteration &&
3271 "Found multiple CurrentIteration. Only one expected");
3272 CurrentIteration = PhiR;
3273 }
3274
3275 // Return early if the loop does not use variable-length stepping.
3276 if (!CurrentIteration)
3277 return;
3278
3279 VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3280 VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3281
3282 // Convert CurrentIteration to concrete recipe.
3283 auto *ScalarR =
3284 VPBuilder(CurrentIteration)
3285 .createScalarPhi(
3286 {CurrentIteration->getStartValue(), CurrentIterationIncr},
3287 CurrentIteration->getDebugLoc(), "current.iteration.iv");
3288 CurrentIteration->replaceAllUsesWith(ScalarR);
3289 CurrentIteration->eraseFromParent();
3290
3291 // Replace CanonicalIVInc with CurrentIteration increment if it exists.
3292 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3293 if (auto *CanIVInc = vputils::findUserOf(
3294 CanonicalIV, m_c_Add(m_VPValue(), m_Specific(&Plan.getVFxUF())))) {
3295 cast<VPInstruction>(CanIVInc)->replaceAllUsesWith(CurrentIterationIncr);
3296 CanIVInc->eraseFromParent();
3297 }
3298}
3299
3301 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3302 if (!LoopRegion)
3303 return;
3304 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3305 if (Header->empty())
3306 return;
3307 // The EVL IV is always at the beginning.
3308 auto *EVLPhi = dyn_cast<VPCurrentIterationPHIRecipe>(&Header->front());
3309 if (!EVLPhi)
3310 return;
3311
3312 // Bail if this is not an EVL tail-folded loop.
3313 VPValue *AVL;
3314 if (!match(EVLPhi->getBackedgeValue(),
3315 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))))
3316 return;
3317
3318 // The AVL may be capped to a safe distance.
3319 VPValue *SafeAVL, *UnsafeAVL;
3320 if (match(AVL,
3322 m_VPValue(SafeAVL)),
3323 m_Deferred(UnsafeAVL), m_Deferred(SafeAVL))))
3324 AVL = UnsafeAVL;
3325
3326 VPValue *AVLNext;
3327 [[maybe_unused]] bool FoundAVLNext =
3329 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3330 assert(FoundAVLNext && "Didn't find AVL backedge?");
3331
3332 VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3333 auto *LatchBr = cast<VPInstruction>(Latch->getTerminator());
3334 if (match(LatchBr, m_BranchOnCond(m_True())))
3335 return;
3336
3337 VPValue *CanIVInc;
3338 [[maybe_unused]] bool FoundIncrement = match(
3339 LatchBr,
3341 m_Specific(&Plan.getVectorTripCount()))));
3342 assert(FoundIncrement &&
3343 match(CanIVInc, m_Add(m_Specific(LoopRegion->getCanonicalIV()),
3344 m_Specific(&Plan.getVFxUF()))) &&
3345 "Expected BranchOnCond with ICmp comparing CanIV + VFxUF with vector "
3346 "trip count");
3347
3348 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3349 VPBuilder Builder(LatchBr);
3350 LatchBr->setOperand(
3351 0, Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, Plan.getZero(AVLTy)));
3352}
3353
3355 VPlan &Plan, PredicatedScalarEvolution &PSE,
3356 const DenseMap<Value *, const SCEV *> &StridesMap) {
3357 // Replace VPValues for known constant strides guaranteed by predicated
3358 // scalar evolution.
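// For example (illustrative): if PSE guarantees that a stride %s equals 1,
// uses of %s in the vector loop (and its preheader) are replaced with the
// constant 1, and SCEV expansions in the entry block are rewritten to match.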
3359 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3360 auto *R = cast<VPRecipeBase>(&U);
3361 return R->getRegion() ||
3362 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3363 };
3364 ValueToSCEVMapTy RewriteMap;
3365 for (const SCEV *Stride : StridesMap.values()) {
3366 using namespace SCEVPatternMatch;
3367 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3368 const APInt *StrideConst;
3369 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3370 // Only handle constant strides for now.
3371 continue;
3372
3373 auto *CI = Plan.getConstantInt(*StrideConst);
3374 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3375 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3376
3377 // The versioned value may not be used in the loop directly but through a
3378 // sext/zext. Add new live-ins in those cases.
3379 for (Value *U : StrideV->users()) {
3381 continue;
3382 VPValue *StrideVPV = Plan.getLiveIn(U);
3383 if (!StrideVPV)
3384 continue;
3385 unsigned BW = U->getType()->getScalarSizeInBits();
3386 APInt C =
3387 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3388 VPValue *CI = Plan.getConstantInt(C);
3389 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3390 }
3391 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3392 }
3393
3394 for (VPRecipeBase &R : *Plan.getEntry()) {
3395 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3396 if (!ExpSCEV)
3397 continue;
3398 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3399 auto *NewSCEV =
3400 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3401 if (NewSCEV != ScevExpr) {
3402 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3403 ExpSCEV->replaceAllUsesWith(NewExp);
3404 if (Plan.getTripCount() == ExpSCEV)
3405 Plan.resetTripCount(NewExp);
3406 }
3407 }
3408}
3409
3411 VPlan &Plan,
3412 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3413 // Collect recipes in the backward slice of `Root` that may generate a poison
3414 // value that is used after vectorization.
3416 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3418 Worklist.push_back(Root);
3419
3420 // Traverse the backward slice of Root through its use-def chain.
3421 while (!Worklist.empty()) {
3422 VPRecipeBase *CurRec = Worklist.pop_back_val();
3423
3424 if (!Visited.insert(CurRec).second)
3425 continue;
3426
3427 // Prune search if we find another recipe generating a widen memory
3428 // instruction. Widen memory instructions involved in address computation
3429 // will lead to gather/scatter instructions, which don't need to be
3430 // handled.
3432 VPHeaderPHIRecipe>(CurRec))
3433 continue;
3434
3435 // This recipe contributes to the address computation of a widen
3436 // load/store. If the underlying instruction has poison-generating flags,
3437 // drop them directly.
3438 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3439 VPValue *A, *B;
3440 // Dropping disjoint from an OR may yield incorrect results, as some
3441 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3442 // for dependence analysis). Instead, replace it with an equivalent Add.
3443 // This is possible as all users of the disjoint OR only access lanes
3444 // where the operands are disjoint or poison otherwise.
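// For example (illustrative): an 'or disjoint %a, %b' feeding an address is
// rewritten to 'add %a, %b' rather than just dropping the disjoint flag.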
3445 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3446 RecWithFlags->isDisjoint()) {
3447 VPBuilder Builder(RecWithFlags);
3448 VPInstruction *New =
3449 Builder.createAdd(A, B, RecWithFlags->getDebugLoc());
3450 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3451 RecWithFlags->replaceAllUsesWith(New);
3452 RecWithFlags->eraseFromParent();
3453 CurRec = New;
3454 } else
3455 RecWithFlags->dropPoisonGeneratingFlags();
3456 } else {
3459 (void)Instr;
3460 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3461 "found instruction with poison generating flags not covered by "
3462 "VPRecipeWithIRFlags");
3463 }
3464
3465 // Add new definitions to the worklist.
3466 for (VPValue *Operand : CurRec->operands())
3467 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3468 Worklist.push_back(OpDef);
3469 }
3470 });
3471
3472 // Traverse all the recipes in the VPlan and collect the poison-generating
3473 // recipes in the backward slice starting at the address of a
3474 // VPWidenMemoryRecipe or VPInterleaveRecipe.
3475 auto Iter =
3478 for (VPRecipeBase &Recipe : *VPBB) {
3479 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3480 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3481 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3482 if (AddrDef && WidenRec->isConsecutive() &&
3483 BlockNeedsPredication(UnderlyingInstr.getParent()))
3484 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3485 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3486 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3487 if (AddrDef) {
3488 // Check if any member of the interleave group needs predication.
3489 const InterleaveGroup<Instruction> *InterGroup =
3490 InterleaveRec->getInterleaveGroup();
3491 bool NeedPredication = false;
3492 for (Instruction *Member : InterGroup->members())
3493 NeedPredication |= BlockNeedsPredication(Member->getParent());
3494
3495 if (NeedPredication)
3496 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3497 }
3498 }
3499 }
3500 }
3501}
3502
3504 VPlan &Plan,
3506 &InterleaveGroups,
3507 const bool &EpilogueAllowed) {
3508 if (InterleaveGroups.empty())
3509 return;
3510
3512 for (VPBasicBlock *VPBB :
3515 for (VPRecipeBase &R :
3517 auto &MemR = cast<VPWidenMemoryRecipe>(R);
3518 IRMemberToRecipe[&MemR.getIngredient()] = &MemR;
3519 }
3520
3521 // Interleave memory: for each Interleave Group we marked earlier as relevant
3522 // for this VPlan, replace the Recipes widening its memory instructions with a
3523 // single VPInterleaveRecipe at its insertion point.
3524 VPDominatorTree VPDT(Plan);
3525 for (const auto *IG : InterleaveGroups) {
3526 auto *Start = IRMemberToRecipe.lookup(IG->getMember(0));
3527 VPIRMetadata InterleaveMD(*Start);
3528 SmallVector<VPValue *, 4> StoredValues;
3529 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3530 StoredValues.push_back(StoreR->getStoredValue());
3531 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3532 Instruction *MemberI = IG->getMember(I);
3533 if (!MemberI)
3534 continue;
3535 VPWidenMemoryRecipe *MemoryR = IRMemberToRecipe.lookup(MemberI);
3536 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3537 StoredValues.push_back(StoreR->getStoredValue());
3538 InterleaveMD.intersect(*MemoryR);
3539 }
3540
3541 bool NeedsMaskForGaps =
3542 (IG->requiresScalarEpilogue() && !EpilogueAllowed) ||
3543 (!StoredValues.empty() && !IG->isFull());
3544
3545 Instruction *IRInsertPos = IG->getInsertPos();
3546 auto *InsertPos = IRMemberToRecipe.lookup(IRInsertPos);
3547
3549 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3550 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3551 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3552
3553 // Get or create the start address for the interleave group.
3554 VPValue *Addr = Start->getAddr();
3555 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3556 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3557 // We cannot re-use the address of member zero because it does not
3558 // dominate the insert position. Instead, use the address of the insert
3559 // position and create a PtrAdd adjusting it to the address of member
3560 // zero.
3561 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3562 // InsertPos or sink loads above zero members to join it.
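// For example (illustrative): if the insert position is member 2 of a group
// of i32 accesses, member zero's address is computed as a ptradd of the
// insert position's address and -8.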
3563 assert(IG->getIndex(IRInsertPos) != 0 &&
3564 "index of insert position shouldn't be zero");
3565 auto &DL = IRInsertPos->getDataLayout();
3566 APInt Offset(32,
3567 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3568 IG->getIndex(IRInsertPos),
3569 /*IsSigned=*/true);
3570 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3571 VPBuilder B(InsertPos);
3572 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3573 }
3574 // If the group is reverse, adjust the index to refer to the last vector
3575 // lane instead of the first. We adjust the index from the first vector
3576 // lane, rather than directly getting the pointer for lane VF - 1, because
3577 // the pointer operand of the interleaved access is supposed to be uniform.
3578 if (IG->isReverse()) {
3579 auto *ReversePtr = new VPVectorEndPointerRecipe(
3580 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3581 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3582 ReversePtr->insertBefore(InsertPos);
3583 Addr = ReversePtr;
3584 }
3585 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3586 InsertPos->getMask(), NeedsMaskForGaps,
3587 InterleaveMD, InsertPos->getDebugLoc());
3588 VPIG->insertBefore(InsertPos);
3589
3590 unsigned J = 0;
3591 for (unsigned i = 0; i < IG->getFactor(); ++i)
3592 if (Instruction *Member = IG->getMember(i)) {
3593 VPRecipeBase *MemberR = IRMemberToRecipe.lookup(Member);
3594 if (!Member->getType()->isVoidTy()) {
3595 VPValue *OriginalV = MemberR->getVPSingleValue();
3596 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3597 J++;
3598 }
3599 MemberR->eraseFromParent();
3600 }
3601 }
3602}
3603
3604/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3605/// value, phi and backedge value. In the following example:
3606///
3607/// vector.ph:
3608/// Successor(s): vector loop
3609///
3610/// <x1> vector loop: {
3611/// vector.body:
3612/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3613/// ...
3614/// EMIT branch-on-count ...
3615/// No successors
3616/// }
3617///
3618/// WIDEN-INDUCTION will get expanded to:
3619///
3620/// vector.ph:
3621/// ...
3622/// vp<%induction.start> = ...
3623/// vp<%induction.increment> = ...
3624///
3625/// Successor(s): vector loop
3626///
3627/// <x1> vector loop: {
3628/// vector.body:
3629/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3630/// ...
3631/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3632/// EMIT branch-on-count ...
3633/// No successors
3634/// }
3635static void
3637 VPTypeAnalysis &TypeInfo) {
3638 VPlan *Plan = WidenIVR->getParent()->getPlan();
3639 VPValue *Start = WidenIVR->getStartValue();
3640 VPValue *Step = WidenIVR->getStepValue();
3641 VPValue *VF = WidenIVR->getVFValue();
3642 DebugLoc DL = WidenIVR->getDebugLoc();
3643
3644 // The value from the original loop to which we are mapping the new induction
3645 // variable.
3646 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3647
3648 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3651 VPIRFlags Flags = *WidenIVR;
3652 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3653 AddOp = Instruction::Add;
3654 MulOp = Instruction::Mul;
3655 } else {
3656 AddOp = ID.getInductionOpcode();
3657 MulOp = Instruction::FMul;
3658 }
3659
3660 // If the phi is truncated, truncate the start and step values.
3661 VPBuilder Builder(Plan->getVectorPreheader());
3662 Type *StepTy = TypeInfo.inferScalarType(Step);
3663 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3664 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3665 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3666 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3667 StepTy = Ty;
3668 }
3669
3670 // Construct the initial value of the vector IV in the vector loop preheader.
3671 Type *IVIntTy =
3673 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3674 if (StepTy->isFloatingPointTy())
3675 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3676
3677 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3678 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3679
3680 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3681 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3682 DebugLoc::getUnknown(), "induction");
3683
3684 // Create the widened phi of the vector IV.
3685 auto *WidePHI = VPBuilder(WidenIVR).createWidenPhi(
3686 Init, WidenIVR->getDebugLoc(), "vec.ind");
3687
3688 // Create the backedge value for the vector IV.
3689 VPValue *Inc;
3690 VPValue *Prev;
3691 // If unrolled, use the increment and prev value from the operands.
3692 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3693 Inc = SplatVF;
3694 Prev = WidenIVR->getLastUnrolledPartOperand();
3695 } else {
3696 if (VPRecipeBase *R = VF->getDefiningRecipe())
3697 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3698 // Multiply the vectorization factor by the step using integer or
3699 // floating-point arithmetic as appropriate.
3700 if (StepTy->isFloatingPointTy())
3701 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3702 DL);
3703 else
3704 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3705 TypeInfo.inferScalarType(VF), DL);
3706
3707 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3708 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3709 Prev = WidePHI;
3710 }
3711
3713 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3714 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3715 WidenIVR->getDebugLoc(), "vec.ind.next");
3716
3717 WidePHI->addOperand(Next);
3718
3719 WidenIVR->replaceAllUsesWith(WidePHI);
3720}
3721
3722/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3723/// initial value, phi and backedge value. In the following example:
3724///
3725/// <x1> vector loop: {
3726/// vector.body:
3727/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3728/// ...
3729/// EMIT branch-on-count ...
3730/// }
3731///
3732/// WIDEN-POINTER-INDUCTION will get expanded to:
3733///
3734/// <x1> vector loop: {
3735/// vector.body:
3736/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3737/// EMIT %mul = mul %stepvector, %step
3738/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3739/// ...
3740/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3741/// EMIT branch-on-count ...
3742/// }
3744 VPTypeAnalysis &TypeInfo) {
3745 VPlan *Plan = R->getParent()->getPlan();
3746 VPValue *Start = R->getStartValue();
3747 VPValue *Step = R->getStepValue();
3748 VPValue *VF = R->getVFValue();
3749
3750 assert(R->getInductionDescriptor().getKind() ==
3752 "Not a pointer induction according to InductionDescriptor!");
3753 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3754 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3755 "Recipe should have been replaced");
3756
3757 VPBuilder Builder(R);
3758 DebugLoc DL = R->getDebugLoc();
3759
3760 // Build a scalar pointer phi.
3761 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3762
3763 // Create actual address geps that use the pointer phi as base and a
3764 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3765 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3766 Type *StepTy = TypeInfo.inferScalarType(Step);
3767 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3768 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3769 VPValue *PtrAdd =
3770 Builder.createWidePtrAdd(ScalarPtrPhi, Offset, DL, "vector.gep");
3771 R->replaceAllUsesWith(PtrAdd);
3772
3773 // Create the backedge value for the scalar pointer phi.
3775 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3776 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3777 DL);
3778 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3779
3780 VPValue *InductionGEP =
3781 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3782 ScalarPtrPhi->addOperand(InductionGEP);
3783}
3784
3785/// Expand a VPDerivedIVRecipe into executable recipes.
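/// A derived IV computes Start + Index * Step. For example (an illustrative
/// sketch): an integer induction expands to an add of the start value and a
/// mul of the index and step, a pointer induction to a ptradd, and an FP
/// induction to an fadd/fsub carrying the original fast-math flags.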
3787 VPBuilder Builder(R);
3788 VPIRValue *Start = R->getStartValue();
3789 VPValue *Step = R->getStepValue();
3790 VPValue *Index = R->getIndex();
3791 Type *StepTy = TypeInfo.inferScalarType(Step);
3792 Type *IndexTy = TypeInfo.inferScalarType(Index);
3793 Index = StepTy->isIntegerTy()
3794 ? Builder.createScalarSExtOrTrunc(
3795 Index, StepTy, IndexTy, DebugLoc::getCompilerGenerated())
3796 : Builder.createScalarCast(Instruction::SIToFP, Index, StepTy,
3798 switch (R->getInductionKind()) {
3800 assert(TypeInfo.inferScalarType(Index) == TypeInfo.inferScalarType(Start) &&
3801 "Index type does not match StartValue type");
3802 return R->replaceAllUsesWith(Builder.createAdd(
3803 Start, Builder.createOverflowingOp(Instruction::Mul, {Index, Step})));
3804 }
3806 return R->replaceAllUsesWith(Builder.createPtrAdd(
3807 Start, Builder.createOverflowingOp(Instruction::Mul, {Index, Step})));
3809 assert(StepTy->isFloatingPointTy() && "Expected FP Step value");
3810 const FPMathOperator *FPBinOp = R->getFPBinOp();
3811 assert(FPBinOp &&
3812 (FPBinOp->getOpcode() == Instruction::FAdd ||
3813 FPBinOp->getOpcode() == Instruction::FSub) &&
3814 "Original BinOp should be defined for FP induction");
3815 FastMathFlags FMF = FPBinOp->getFastMathFlags();
3816 VPValue *FMul = Builder.createNaryOp(Instruction::FMul, {Step, Index}, FMF);
3817 return R->replaceAllUsesWith(
3818 Builder.createNaryOp(FPBinOp->getOpcode(), {Start, FMul}, FMF));
3819 }
3821 return;
3822 }
3823 llvm_unreachable("Unhandled induction kind");
3824}
3825
3827 // Replace loop regions with explicit CFG.
3828 SmallVector<VPRegionBlock *> LoopRegions;
3830 vp_depth_first_deep(Plan.getEntry()))) {
3831 if (!R->isReplicator())
3832 LoopRegions.push_back(R);
3833 }
3834 for (VPRegionBlock *R : LoopRegions)
3835 R->dissolveToCFGLoop();
3836}
3837
3840 // The transform runs after dissolving loop regions, so all VPBasicBlocks
3841 // terminated with BranchOnTwoConds are reached via a shallow traversal.
3844 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
3845 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
3846 }
3847
3848 // Expand BranchOnTwoConds instructions into explicit CFG with two new
3849 // single-condition branches:
3850 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
3851 // the first condition is true, and otherwise jumps to a new interim block.
3852 // 2. A branch that ends the interim block, jumps to the second successor if
3853 // the second condition is true, and otherwise jumps to the third
3854 // successor.
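// For example (an illustrative sketch):
//   bb: EMIT branch-on-two-conds %c0, %c1
//   Successor(s): succ0, succ1, succ2
// becomes
//   bb: EMIT branch-on-cond %c0
//   Successor(s): succ0, bb.interim
//   bb.interim: EMIT branch-on-cond %c1
//   Successor(s): succ1, succ2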
3855 for (VPInstruction *Br : WorkList) {
3856 assert(Br->getNumOperands() == 2 &&
3857 "BranchOnTwoConds must have exactly 2 conditions");
3858 DebugLoc DL = Br->getDebugLoc();
3859 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
3860 const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
3861 assert(Successors.size() == 3 &&
3862 "BranchOnTwoConds must have exactly 3 successors");
3863
3864 for (VPBlockBase *Succ : Successors)
3865 VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
3866
3867 VPValue *Cond0 = Br->getOperand(0);
3868 VPValue *Cond1 = Br->getOperand(1);
3869 VPBlockBase *Succ0 = Successors[0];
3870 VPBlockBase *Succ1 = Successors[1];
3871 VPBlockBase *Succ2 = Successors[2];
3872 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
3873 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
3874
3875 VPBasicBlock *InterimBB =
3876 Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
3877
3878 VPBuilder(BrOnTwoCondsBB)
3880 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
3881 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
3882
3884 VPBlockUtils::connectBlocks(InterimBB, Succ1);
3885 VPBlockUtils::connectBlocks(InterimBB, Succ2);
3886 Br->eraseFromParent();
3887 }
3888}
3889
3891 VPTypeAnalysis TypeInfo(Plan);
3894 vp_depth_first_deep(Plan.getEntry()))) {
3895 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3896 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3897 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3898 ToRemove.push_back(WidenIVR);
3899 continue;
3900 }
3901
3902 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3903 // If the recipe only generates scalars, scalarize it instead of
3904 // expanding it.
3905 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
3906 VPBuilder Builder(WidenIVR);
3907 VPValue *PtrAdd =
3908 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
3909 WidenIVR->replaceAllUsesWith(PtrAdd);
3910 ToRemove.push_back(WidenIVR);
3911 continue;
3912 }
3913 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3914 ToRemove.push_back(WidenIVR);
3915 continue;
3916 }
3917
3918 if (auto *DerivedIVR = dyn_cast<VPDerivedIVRecipe>(&R)) {
3919 expandVPDerivedIV(DerivedIVR, TypeInfo);
3920 ToRemove.push_back(DerivedIVR);
3921 continue;
3922 }
3923
3924 // Expand VPBlendRecipe into VPInstruction::Select.
3925 VPBuilder Builder(&R);
3926 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3927 VPValue *Select = Blend->getIncomingValue(0);
3928 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3929 Select = Builder.createSelect(Blend->getMask(I),
3930 Blend->getIncomingValue(I), Select,
3931 R.getDebugLoc(), "predphi", *Blend);
3932 Blend->replaceAllUsesWith(Select);
3933 ToRemove.push_back(Blend);
3934 }
3935
3936 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
3937 if (!VEPR->getOffset()) {
3938 assert(Plan.getConcreteUF() == 1 &&
3939 "Expected unroller to have materialized offset for UF != 1");
3940 VEPR->materializeOffset();
3941 }
3942 }
3943
3944 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3945 Expr->decompose();
3946 ToRemove.push_back(Expr);
3947 }
3948
3949 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
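// For example (illustrative): for a mask <1, 1, 1, 0>, first-active-lane of
// the inverted mask <0, 0, 0, 1> is 3, so the last active lane is 3 - 1 = 2.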
3950 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
3951 if (LastActiveL &&
3952 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
3953 // Create Not(Mask) for all operands.
3955 for (VPValue *Op : LastActiveL->operands()) {
3956 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
3957 NotMasks.push_back(NotMask);
3958 }
3959
3960 // Create FirstActiveLane on the inverted masks.
3961 VPValue *FirstInactiveLane = Builder.createNaryOp(
3963 LastActiveL->getDebugLoc(), "first.inactive.lane");
3964
3965 // Subtract 1 to get the last active lane.
3966 VPValue *One =
3967 Plan.getConstantInt(TypeInfo.inferScalarType(FirstInactiveLane), 1);
3968 VPValue *LastLane =
3969 Builder.createSub(FirstInactiveLane, One,
3970 LastActiveL->getDebugLoc(), "last.active.lane");
3971
3972 LastActiveL->replaceAllUsesWith(LastLane);
3973 ToRemove.push_back(LastActiveL);
3974 continue;
3975 }
3976
3977 // Lower MaskedCond with block mask to LogicalAnd.
3979 auto *VPI = cast<VPInstruction>(&R);
3980 assert(VPI->isMasked() &&
3981 "Unmasked MaskedCond should be simplified earlier");
3982 VPI->replaceAllUsesWith(Builder.createNaryOp(
3983 VPInstruction::LogicalAnd, {VPI->getMask(), VPI->getOperand(0)}));
3984 ToRemove.push_back(VPI);
3985 continue;
3986 }
3987
3988 // Lower CanonicalIVIncrementForPart to plain Add.
3989 if (match(
3990 &R,
3992 auto *VPI = cast<VPInstruction>(&R);
3993 VPValue *Add = Builder.createOverflowingOp(
3994 Instruction::Add, VPI->operands(), VPI->getNoWrapFlags(),
3995 VPI->getDebugLoc());
3996 VPI->replaceAllUsesWith(Add);
3997 ToRemove.push_back(VPI);
3998 continue;
3999 }
4000
4001 // Lower BranchOnCount to ICmp + BranchOnCond.
4002 VPValue *IV, *TC;
4003 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
4004 auto *BranchOnCountInst = cast<VPInstruction>(&R);
4005 DebugLoc DL = BranchOnCountInst->getDebugLoc();
4006 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
4007 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
4008 ToRemove.push_back(BranchOnCountInst);
4009 continue;
4010 }
4011
4012 VPValue *VectorStep;
4013 VPValue *ScalarStep;
4015 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
4016 continue;
4017
4018 // Expand WideIVStep.
4019 auto *VPI = cast<VPInstruction>(&R);
4020 Type *IVTy = TypeInfo.inferScalarType(VPI);
4021 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
4023 ? Instruction::UIToFP
4024 : Instruction::Trunc;
4025 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
4026 }
4027
4028 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
4029 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
4030 ScalarStep =
4031 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
4032 }
4033
4034 VPIRFlags Flags;
4035 unsigned MulOpc;
4036 if (IVTy->isFloatingPointTy()) {
4037 MulOpc = Instruction::FMul;
4038 Flags = VPI->getFastMathFlags();
4039 } else {
4040 MulOpc = Instruction::Mul;
4041 Flags = VPIRFlags::getDefaultFlags(MulOpc);
4042 }
4043
4044 VPInstruction *Mul = Builder.createNaryOp(
4045 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
4046 VectorStep = Mul;
4047 VPI->replaceAllUsesWith(VectorStep);
4048 ToRemove.push_back(VPI);
4049 }
4050 }
4051
4052 for (VPRecipeBase *R : ToRemove)
4053 R->eraseFromParent();
4054}
4055
4057 VPBasicBlock *HeaderVPBB,
4058 VPBasicBlock *LatchVPBB,
4059 VPBasicBlock *MiddleVPBB,
4060 UncountableExitStyle Style) {
4061 struct EarlyExitInfo {
4062 VPBasicBlock *EarlyExitingVPBB;
4063 VPIRBasicBlock *EarlyExitVPBB;
4064 VPValue *CondToExit;
4065 };
4066
4067 VPDominatorTree VPDT(Plan);
4068 VPBuilder Builder(LatchVPBB->getTerminator());
4070 for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
4071 for (VPBlockBase *Pred : to_vector(ExitBlock->getPredecessors())) {
4072 if (Pred == MiddleVPBB)
4073 continue;
4074 // Collect condition for this early exit.
4075 auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
4076 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
4077 VPValue *CondOfEarlyExitingVPBB;
4078 [[maybe_unused]] bool Matched =
4079 match(EarlyExitingVPBB->getTerminator(),
4080 m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
4081 assert(Matched && "Terminator must be BranchOnCond");
4082
4083 // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
4084 // the correct block mask.
4085 VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
4086 auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
4088 TrueSucc == ExitBlock
4089 ? CondOfEarlyExitingVPBB
4090 : EarlyExitingBuilder.createNot(CondOfEarlyExitingVPBB));
4091 assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
4092 !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
4093 VPDT.properlyDominates(
4094 CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
4095 LatchVPBB)) &&
4096 "exit condition must dominate the latch");
4097 Exits.push_back({
4098 EarlyExitingVPBB,
4099 ExitBlock,
4100 CondToEarlyExit,
4101 });
4102 }
4103 }
4104
4105 assert(!Exits.empty() && "must have at least one early exit");
4106 // Sort exits by RPO order to get correct program order. RPO gives a
4107 // topological ordering of the CFG, ensuring upstream exits are checked
4108 // before downstream exits in the dispatch chain.
4110 HeaderVPBB);
4112 for (const auto &[Num, VPB] : enumerate(RPOT))
4113 RPOIdx[VPB] = Num;
4114 llvm::sort(Exits, [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
4115 return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
4116 });
4117#ifndef NDEBUG
4118 // After RPO sorting, verify that for any pair where one exit dominates
4119 // another, the dominating exit comes first. This is guaranteed by RPO
4120 // (topological order) and is required for the dispatch chain correctness.
4121 for (unsigned I = 0; I + 1 < Exits.size(); ++I)
4122 for (unsigned J = I + 1; J < Exits.size(); ++J)
4123 assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
4124 Exits[I].EarlyExitingVPBB) &&
4125 "RPO sort must place dominating exits before dominated ones");
4126#endif
4127
4128 // Build the AnyOf condition for the latch terminator using logical OR
4129 // to avoid poison propagation from later exit conditions when an earlier
4130 // exit is taken.
4131 VPValue *Combined = Exits[0].CondToExit;
4132 for (const EarlyExitInfo &Info : drop_begin(Exits))
4133 Combined = Builder.createLogicalOr(Combined, Info.CondToExit);
4134
4135 VPValue *IsAnyExitTaken =
4136 Builder.createNaryOp(VPInstruction::AnyOf, {Combined});
4137
4139 "Early exit store masking not implemented");
4140
4141 // Create the vector.early.exit blocks.
4142 SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
4143 for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
4144 Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
4145 VPBasicBlock *VectorEarlyExitVPBB =
4146 Plan.createVPBasicBlock("vector.early.exit" + BlockSuffix);
4147 VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
4148 }
4149
4150 // Create the dispatch block (or reuse the single exit block if only one
4151 // exit). The dispatch block computes the first active lane of the combined
4152 // condition and, for multiple exits, chains through conditions to determine
4153 // which exit to take.
4154 VPBasicBlock *DispatchVPBB =
4155 Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
4156 : Plan.createVPBasicBlock("vector.early.exit.check");
4157 VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
4158 VPValue *FirstActiveLane =
4159 DispatchBuilder.createNaryOp(VPInstruction::FirstActiveLane, {Combined},
4160 DebugLoc::getUnknown(), "first.active.lane");
4161
4162 // For each early exit, disconnect the original exiting block
4163 // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
4164 // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
4165 // values at the first active lane:
4166 //
4167 // Input:
4168 // early.exiting.I:
4169 // ...
4170 // EMIT branch-on-cond vp<%cond.I>
4171 // Successor(s): in.loop.succ, ir-bb<exit.I>
4172 //
4173 // ir-bb<exit.I>:
4174 // IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
4175 //
4176 // Output:
4177 // early.exiting.I:
4178 // ...
4179 // Successor(s): in.loop.succ
4180 //
4181 // vector.early.exit.I:
4182 // EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
4183 // Successor(s): ir-bb<exit.I>
4184 //
4185 // ir-bb<exit.I>:
4186 // IR %phi = phi ... (extra operand: vp<%exit.val> from
4187 // vector.early.exit.I)
4188 //
4189 for (auto [Exit, VectorEarlyExitVPBB] :
4190 zip_equal(Exits, VectorEarlyExitVPBBs)) {
4191 auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
4192 // Adjust the phi nodes in EarlyExitVPBB.
4193 // 1. remove incoming values from EarlyExitingVPBB,
4194 // 2. extract the incoming value at FirstActiveLane
4195 // 3. add back the extracts as last operands for the phis
4196 // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
4197 // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
4198 // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
4199 // values from VectorEarlyExitVPBB.
4200 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
4201 auto *ExitIRI = cast<VPIRPhi>(&R);
4202 VPValue *IncomingVal =
4203 ExitIRI->getIncomingValueForBlock(EarlyExitingVPBB);
4204 VPValue *NewIncoming = IncomingVal;
4205 if (!isa<VPIRValue>(IncomingVal)) {
4206 VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
4207 NewIncoming = EarlyExitBuilder.createNaryOp(
4208 VPInstruction::ExtractLane, {FirstActiveLane, IncomingVal},
4209 DebugLoc::getUnknown(), "early.exit.value");
4210 }
4211 ExitIRI->removeIncomingValueFor(EarlyExitingVPBB);
4212 ExitIRI->addOperand(NewIncoming);
4213 }
4214
4215 EarlyExitingVPBB->getTerminator()->eraseFromParent();
4216 VPBlockUtils::disconnectBlocks(EarlyExitingVPBB, EarlyExitVPBB);
4217 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
4218 }
4219
4220 // Chain through exits: for each exit, check if its condition is true at
4221 // the first active lane. If so, take that exit; otherwise, try the next.
4222 // The last exit needs no check since it must be taken if all others fail.
4223 //
4224 // For 3 exits (cond.0, cond.1, cond.2), this creates:
4225 //
4226 // latch:
4227 // ...
4228 // EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
4229 // ...
4230 //
4231 // vector.early.exit.check:
4232 // EMIT vp<%first.lane> = first-active-lane vp<%combined>
4233 // EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
4234 // EMIT branch-on-cond vp<%at.cond.0>
4235 // Successor(s): vector.early.exit.0, vector.early.exit.check.0
4236 //
4237 // vector.early.exit.check.0:
4238 // EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
4239 // EMIT branch-on-cond vp<%at.cond.1>
4240 // Successor(s): vector.early.exit.1, vector.early.exit.2
4241 VPBasicBlock *CurrentBB = DispatchVPBB;
4242 for (auto [I, Exit] : enumerate(ArrayRef(Exits).drop_back())) {
4243 VPValue *LaneVal = DispatchBuilder.createNaryOp(
4244 VPInstruction::ExtractLane, {FirstActiveLane, Exit.CondToExit},
4245 DebugLoc::getUnknown(), "exit.cond.at.lane");
4246
4247 // For the last dispatch, branch directly to the last exit on false;
4248 // otherwise, create a new check block.
4249 bool IsLastDispatch = (I + 2 == Exits.size());
4250 VPBasicBlock *FalseBB =
4251 IsLastDispatch ? VectorEarlyExitVPBBs.back()
4252 : Plan.createVPBasicBlock(
4253 Twine("vector.early.exit.check.") + Twine(I));
4254
4255 DispatchBuilder.createNaryOp(VPInstruction::BranchOnCond, {LaneVal});
4256 CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
4257 VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
4258 FalseBB->setPredecessors({CurrentBB});
4259
4260 CurrentBB = FalseBB;
4261 DispatchBuilder.setInsertPoint(CurrentBB);
4262 }
4263
4264 // Replace the latch terminator with the new branching logic.
4265 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4266 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
4267 "Unexpected terminator");
4268 auto *IsLatchExitTaken =
4269 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
4270 LatchExitingBranch->getOperand(1));
4271
4272 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4273 LatchExitingBranch->eraseFromParent();
4274 Builder.setInsertPoint(LatchVPBB);
4275 Builder.createNaryOp(VPInstruction::BranchOnTwoConds,
4276 {IsAnyExitTaken, IsLatchExitTaken}, LatchDL);
4277 LatchVPBB->clearSuccessors();
4278 LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
4279 DispatchVPBB->setPredecessors({LatchVPBB});
4280}
4281
4282 /// This function tries to convert extended in-loop reductions to a
4283 /// VPExpressionRecipe and clamps the \p Range if that is beneficial and
4284 /// valid. The created recipe must be decomposed to its constituent
4285 /// recipes before execution.
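///
/// For example (an illustrative sketch, not actual output):
/// reduce.add(zext <16 x i8> %a to <16 x i32>) may be modelled as a single
/// extended reduction, whose TTI::getExtendedReductionCost is compared
/// against the combined cost of the separate extend and reduction.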
4286static VPExpressionRecipe *
4288 VFRange &Range) {
4289 Type *RedTy = Ctx.Types.inferScalarType(Red);
4290 VPValue *VecOp = Red->getVecOp();
4291
4292 assert(!Red->isPartialReduction() &&
4293 "This path does not support partial reductions");
4294
4295 // Clamp the range if using extended-reduction is profitable.
4296 auto IsExtendedRedValidAndClampRange =
4297 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4299 [&](ElementCount VF) {
4300 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4302
4304 InstructionCost ExtCost =
4305 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4306 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4307
4308 assert(!RedTy->isFloatingPointTy() &&
4309 "getExtendedReductionCost only supports integer types");
4310 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4311 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4312 Red->getFastMathFlags(), CostKind);
4313 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4314 },
4315 Range);
4316 };
4317
4318 VPValue *A;
4319 // Match reduce(ext(A)).
4321 IsExtendedRedValidAndClampRange(
4322 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4323 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
4324 Ctx.Types.inferScalarType(A)))
4325 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4326
4327 return nullptr;
4328}
4329
4330 /// This function tries to convert in-loop reductions over (extended)
4331 /// multiplies into a VPExpressionRecipe and clamps the \p Range if that is
4332 /// beneficial and valid. The created VPExpressionRecipe must be decomposed
4333 /// to its constituent recipes before execution. Patterns of the
4334 /// VPExpressionRecipe:
4335 /// reduce.add(mul(...)),
4336 /// reduce.add(mul(ext(A), ext(B))),
4337 /// reduce.add(ext(mul(ext(A), ext(B)))),
4338 /// reduce.fadd(fmul(ext(A), ext(B))).
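///
/// For example (an illustrative sketch, not actual output):
/// reduce.add(mul(zext i8 %a to i32, zext i8 %b to i32)) can be modelled as a
/// single multiply-accumulate reduction when TTI::getMulAccReductionCost
/// reports it cheaper than the separate extends, multiply and reduction.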
4339static VPExpressionRecipe *
4341 VPCostContext &Ctx, VFRange &Range) {
4342 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4343 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4344 Opcode != Instruction::FAdd)
4345 return nullptr;
4346
4347 assert(!Red->isPartialReduction() &&
4348 "This path does not support partial reductions");
4349 Type *RedTy = Ctx.Types.inferScalarType(Red);
4350
4351 // Clamp the range if using multiply-accumulate-reduction is profitable.
4352 auto IsMulAccValidAndClampRange =
4354 VPWidenCastRecipe *OuterExt) -> bool {
4356 [&](ElementCount VF) {
4358 Type *SrcTy =
4359 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
4360 InstructionCost MulAccCost;
4361
4362 // getMulAccReductionCost for in-loop reductions does not support
4363 // mixed or floating-point extends.
4364 if (Ext0 && Ext1 &&
4365 (Ext0->getOpcode() != Ext1->getOpcode() ||
4366 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4367 return false;
4368
4369 bool IsZExt =
4370 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4371 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4372 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4373 SrcVecTy, CostKind);
4374
4375 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4376 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4377 InstructionCost ExtCost = 0;
4378 if (Ext0)
4379 ExtCost += Ext0->computeCost(VF, Ctx);
4380 if (Ext1)
4381 ExtCost += Ext1->computeCost(VF, Ctx);
4382 if (OuterExt)
4383 ExtCost += OuterExt->computeCost(VF, Ctx);
4384
4385 return MulAccCost.isValid() &&
4386 MulAccCost < ExtCost + MulCost + RedCost;
4387 },
4388 Range);
4389 };
4390
4391 VPValue *VecOp = Red->getVecOp();
4392 VPRecipeBase *Sub = nullptr;
4393 VPValue *A, *B;
4394 VPValue *Tmp = nullptr;
4395
4396 if (RedTy->isFloatingPointTy())
4397 return nullptr;
4398
4399 // Sub reductions could have a sub between the add reduction and vec op.
4400 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4401 Sub = VecOp->getDefiningRecipe();
4402 VecOp = Tmp;
4403 }
4404
4405 // If ValB is a constant and can be safely extended, truncate it to the same
4406 // type as ExtA's operand, then extend it to the same type as ExtA. This
4407 // creates two uniform extends that can more easily be matched by the rest of
4408 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4409 // replaced with the new extend of the constant.
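// For example (illustrative): reduce.add(mul(zext i8 %a to i32, i32 42))
// becomes reduce.add(mul(zext i8 %a to i32, zext (trunc i32 42 to i8) to
// i32)), since 42 fits in i8 and zero-extends back to the same value.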
4410 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
4411 VPWidenCastRecipe *&ExtB,
4412 VPValue *&ValB, VPWidenRecipe *Mul) {
4413 if (!ExtA || ExtB || !isa<VPIRValue>(ValB))
4414 return;
4415 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
4416 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4417 const APInt *Const;
4418 if (!match(ValB, m_APInt(Const)) ||
4420 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4421 return;
4422 // The truncate ensures that the type of each extended operand is the
4423 // same, and it's been proven that the constant can be extended from
4424 // NarrowTy safely. Necessary since ExtA's extended operand would be
4425 // e.g. an i8, while the const will likely be an i32. This will be
4426 // elided by later optimisations.
4427 VPBuilder Builder(Mul);
4428 auto *Trunc =
4429 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4430 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
4431 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4432 Mul->setOperand(1, ExtB);
4433 };
4434
4435 // Try to match reduce.add(mul(...)).
4436 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4437 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(A);
4438 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(B);
4439 auto *Mul = cast<VPWidenRecipe>(VecOp);
4440
4441 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4442 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4443
4444 // Match reduce.add/sub(mul(ext, ext)).
4445 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4446 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4447 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4448 if (Sub)
4449 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4450 cast<VPWidenRecipe>(Sub), Red);
4451 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4452 }
4453 // TODO: Add an expression type for this variant with a negated mul
4454 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4455 return new VPExpressionRecipe(Mul, Red);
4456 }
4457 // TODO: Add an expression type for negated versions of other expression
4458 // variants.
4459 if (Sub)
4460 return nullptr;
4461
4462 // Match reduce.add(ext(mul(A, B))).
4463 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4464 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4465 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4466 auto *Ext0 = dyn_cast<VPWidenCastRecipe>(A);
4467 auto *Ext1 = dyn_cast<VPWidenCastRecipe>(B);
4468
4469 // reduce.add(ext(mul(ext, const)))
4470 // -> reduce.add(ext(mul(ext, ext(const))))
4471 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4472
4473 // reduce.add(ext(mul(ext(A), ext(B))))
4474 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4475 // The inner extends must either have the same opcode as the outer extend,
4476 // or be the same recipe, in which case the multiply can never result in a
4477 // negative value. Either way the outer extend can be folded away by doing
4478 // wider extends for the operands of the mul.
4479 if (Ext0 && Ext1 &&
4480 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4481 Ext0->getOpcode() == Ext1->getOpcode() &&
4482 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
4483 auto *NewExt0 = new VPWidenCastRecipe(
4484 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
4485 *Ext0, *Ext0, Ext0->getDebugLoc());
4486 NewExt0->insertBefore(Ext0);
4487
4488 VPWidenCastRecipe *NewExt1 = NewExt0;
4489 if (Ext0 != Ext1) {
4490 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4491 Ext->getResultType(), nullptr, *Ext1,
4492 *Ext1, Ext1->getDebugLoc());
4493 NewExt1->insertBefore(Ext1);
4494 }
4495 Mul->setOperand(0, NewExt0);
4496 Mul->setOperand(1, NewExt1);
4497 Red->setOperand(1, Mul);
4498 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4499 }
4500 }
4501 return nullptr;
4502}
4503
4504 /// This function tries to create abstract recipes from the reduction recipe
4505 /// for subsequent optimizations and cost estimation.
4507 VPCostContext &Ctx,
4508 VFRange &Range) {
4509 // Creation of VPExpressions for partial reductions is entirely handled in
4510 // transformToPartialReduction.
4511 assert(!Red->isPartialReduction() &&
4512 "This path does not support partial reductions");
4513
4514 VPExpressionRecipe *AbstractR = nullptr;
4515 auto IP = std::next(Red->getIterator());
4516 auto *VPBB = Red->getParent();
4517 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4518 AbstractR = MulAcc;
4519 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4520 AbstractR = ExtRed;
4521 // Cannot create abstract in-loop reduction recipes.
4522 if (!AbstractR)
4523 return;
4524
4525 AbstractR->insertBefore(*VPBB, IP);
4526 Red->replaceAllUsesWith(AbstractR);
4527}
4528
4539
4541 if (Plan.hasScalarVFOnly())
4542 return;
4543
4544#ifndef NDEBUG
4545 VPDominatorTree VPDT(Plan);
4546#endif
4547
4548 SmallVector<VPValue *> VPValues;
4549 if (VPValue *BTC = Plan.getBackedgeTakenCount())
4550 VPValues.push_back(BTC);
4551 append_range(VPValues, Plan.getLiveIns());
4552 for (VPRecipeBase &R : *Plan.getEntry())
4553 append_range(VPValues, R.definedValues());
4554
4555 auto *VectorPreheader = Plan.getVectorPreheader();
4556 for (VPValue *VPV : VPValues) {
4558 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
4559 continue;
4560
4561 // Add explicit broadcast at the insert point that dominates all users.
4562 VPBasicBlock *HoistBlock = VectorPreheader;
4563 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4564 for (VPUser *User : VPV->users()) {
4565 if (User->usesScalars(VPV))
4566 continue;
4567 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4568 HoistPoint = HoistBlock->begin();
4569 else
4570 assert(VPDT.dominates(VectorPreheader,
4571 cast<VPRecipeBase>(User)->getParent()) &&
4572 "All users must be in the vector preheader or dominated by it");
4573 }
4574
4575 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4576 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
4577 VPV->replaceUsesWithIf(Broadcast,
4578 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4579 return Broadcast != &U && !U.usesScalars(VPV);
4580 });
4581 }
4582}
4583
4585 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4586
4587 // Collect candidate loads with invariant addresses and noalias scope
4588 // metadata, as well as memory-writing recipes with noalias metadata.
4592 vp_depth_first_shallow(LoopRegion->getEntry()))) {
4593 for (VPRecipeBase &R : *VPBB) {
4594 // Only handle single-scalar replicated loads with invariant addresses.
4595 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4596 if (RepR->isPredicated() || !RepR->isSingleScalar() ||
4597 RepR->getOpcode() != Instruction::Load)
4598 continue;
4599
4600 VPValue *Addr = RepR->getOperand(0);
4601 if (Addr->isDefinedOutsideLoopRegions()) {
4603 if (!Loc.AATags.Scope)
4604 continue;
4605 CandidateLoads.push_back({RepR, Loc});
4606 }
4607 }
4608 if (R.mayWriteToMemory()) {
4610 if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
4611 return;
4612 Stores.push_back(*Loc);
4613 }
4614 }
4615 }
4616
4617 VPBasicBlock *Preheader = Plan.getVectorPreheader();
4618 for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
4619 // Hoist the load to the preheader if it doesn't alias with any stores
4620 // according to the noalias metadata. Other loads should have been hoisted
4621 // by earlier passes.
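// For example (illustrative): a single-scalar load from an invariant address
// annotated with !alias.scope can be executed once in the preheader when
// every store in the loop carries !noalias metadata covering its scopes.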
4622 const AAMDNodes &LoadAA = LoadLoc.AATags;
4623 if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
4625 LoadAA.Scope, StoreLoc.AATags.NoAlias);
4626 })) {
4627 LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
4628 }
4629 }
4630}
4631
4632// Collect common metadata from a group of replicate recipes by intersecting
4633// metadata from all recipes in the group.
4635 VPIRMetadata CommonMetadata = *Recipes.front();
4636 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4637 CommonMetadata.intersect(*Recipe);
4638 return CommonMetadata;
4639}
4640
4641template <unsigned Opcode>
4645 const Loop *L) {
4646 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4647 "Only Load and Store opcodes supported");
4648 constexpr bool IsLoad = (Opcode == Instruction::Load);
4649 VPTypeAnalysis TypeInfo(Plan);
4650
4651 // For each address, collect operations with the same or complementary masks.
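// For example (illustrative): two predicated loads of the same address under
// masks %m and not(%m) together cover all lanes, so the group can later be
// replaced by a single unpredicated load.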
4653 auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4654 return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
4655 };
4657 Plan, PSE, L,
4658 [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
4659 for (auto Recipes : Groups) {
4660 if (Recipes.size() < 2)
4661 continue;
4662
4663 // Collect groups with the same or complementary masks.
4664 for (VPReplicateRecipe *&RecipeI : Recipes) {
4665 if (!RecipeI)
4666 continue;
4667
4668 VPValue *MaskI = RecipeI->getMask();
4669 Type *TypeI = GetLoadStoreValueType(RecipeI);
4671 Group.push_back(RecipeI);
4672 RecipeI = nullptr;
4673
4674 // Find all operations with the same or complementary masks.
4675 bool HasComplementaryMask = false;
4676 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4677 if (!RecipeJ)
4678 continue;
4679
4680 VPValue *MaskJ = RecipeJ->getMask();
4681 Type *TypeJ = GetLoadStoreValueType(RecipeJ);
4682 if (TypeI == TypeJ) {
4683 // Check if any operation in the group has a complementary mask with
4684 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4685 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4686 match(MaskJ, m_Not(m_Specific(MaskI)));
4687 Group.push_back(RecipeJ);
4688 RecipeJ = nullptr;
4689 }
4690 }
4691
4692 if (HasComplementaryMask) {
4693 assert(Group.size() >= 2 && "must have at least 2 entries");
4694 AllGroups.push_back(std::move(Group));
4695 }
4696 }
4697 }
4698
4699 return AllGroups;
4700}
4701
4702// Find the recipe with minimum alignment in the group.
4703template <typename InstType>
4704static VPReplicateRecipe *
4706 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4707 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4708 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4709 });
4710}
4711
4714 const Loop *L) {
4715 auto Groups =
4717 if (Groups.empty())
4718 return;
4719
4720 // Process each group of loads.
4721 for (auto &Group : Groups) {
4722 // Try to use the earliest (most dominating) load to replace all others.
4723 VPReplicateRecipe *EarliestLoad = Group[0];
4724 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4725 VPBasicBlock *LastBB = Group.back()->getParent();
4726
4727 // Check that the load doesn't alias with stores between first and last.
4728 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4729 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
4730 continue;
4731
4732 // Collect common metadata from all loads in the group.
4733 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4734
4735 // Find the load with minimum alignment to use.
4736 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4737
4738 bool IsSingleScalar = EarliestLoad->isSingleScalar();
4739 assert(all_of(Group,
4740 [IsSingleScalar](VPReplicateRecipe *R) {
4741 return R->isSingleScalar() == IsSingleScalar;
4742 }) &&
4743 "all members in group must agree on IsSingleScalar");
4744
4745 // Create an unpredicated version of the earliest load with common
4746 // metadata.
4747 auto *UnpredicatedLoad = new VPReplicateRecipe(
4748 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4749 IsSingleScalar, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);
4750
4751 UnpredicatedLoad->insertBefore(EarliestLoad);
4752
4753 // Replace all loads in the group with the unpredicated load.
4754 for (VPReplicateRecipe *Load : Group) {
4755 Load->replaceAllUsesWith(UnpredicatedLoad);
4756 Load->eraseFromParent();
4757 }
4758 }
4759}
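// For illustration (hypothetical source loop): two predicated loads of the
// same address under complementary masks, e.g.
//   if (c)  x = A[i] + 1;
//   else    x = A[i] - 1;
// are replaced by a single unpredicated load of A[i] placed at the earliest
// load's position, using the intersected metadata and the smallest alignment
// of the group; all former loads are rewired to the new load.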
4760
4761static bool
4763 PredicatedScalarEvolution &PSE, const Loop &L,
4764 VPTypeAnalysis &TypeInfo) {
4765 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
4766 if (!StoreLoc || !StoreLoc->AATags.Scope)
4767 return false;
4768
4769 // When sinking a group of stores, all members of the group alias each other.
4770 // Skip them during the alias checks.
4771 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4772 StoresToSink.end());
4773
4774 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4775 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4776 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
4777 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
4778}
4779
4782 const Loop *L) {
4783 auto Groups =
4785 if (Groups.empty())
4786 return;
4787
4788 VPTypeAnalysis TypeInfo(Plan);
4789
4790 for (auto &Group : Groups) {
4791 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L, TypeInfo))
4792 continue;
4793
4794 // Use the last (most dominated) store's location for the unconditional
4795 // store.
4796 VPReplicateRecipe *LastStore = Group.back();
4797 VPBasicBlock *InsertBB = LastStore->getParent();
4798
4799 // Collect common alias metadata from all stores in the group.
4800 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4801
4802 // Build select chain for stored values.
4803 VPValue *SelectedValue = Group[0]->getOperand(0);
4804 VPBuilder Builder(InsertBB, LastStore->getIterator());
4805
4806 bool IsSingleScalar = Group[0]->isSingleScalar();
4807 for (unsigned I = 1; I < Group.size(); ++I) {
4808 assert(IsSingleScalar == Group[I]->isSingleScalar() &&
4809 "all members in group must agree on IsSingleScalar");
4810 VPValue *Mask = Group[I]->getMask();
4811 VPValue *Value = Group[I]->getOperand(0);
4812 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4813 Group[I]->getDebugLoc());
4814 }
4815
4816 // Find the store with minimum alignment to use.
4817 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4818
4819 // Create unconditional store with selected value and common metadata.
4820 auto *UnpredicatedStore = new VPReplicateRecipe(
4821 StoreWithMinAlign->getUnderlyingInstr(),
4822 {SelectedValue, LastStore->getOperand(1)}, IsSingleScalar,
4823 /*Mask=*/nullptr, *LastStore, CommonMetadata);
4824 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
4825
4826 // Remove all predicated stores from the group.
4827 for (VPReplicateRecipe *Store : Group)
4828 Store->eraseFromParent();
4829 }
4830}
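// For illustration (hypothetical source loop): two predicated stores to the
// same address under complementary masks, e.g.
//   if (c)  A[i] = x;
//   else    A[i] = y;
// become a select over the stored values followed by one unconditional store
// at the position of the last store, roughly A[i] = c ? x : y, using the
// intersected metadata and the smallest alignment of the group.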
4831
4833 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4835 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4836 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4837
4838 VPValue *TC = Plan.getTripCount();
4839 if (TC->getNumUsers() == 0)
4840 return;
4841
4842 // Skip cases for which the trip count may be non-trivial to materialize.
4843 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
4844 // tail is required.
4845 if (!Plan.hasScalarTail() ||
4847 Plan.getScalarPreheader() ||
4848 !isa<VPIRValue>(TC))
4849 return;
4850
4851   // Materialize the vector trip count early for constant trip counts, if it
4852   // can simply be computed as (Original TC / (VF * UF)) * (VF * UF).
4853 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4854 // tail-folded loops.
4855 ScalarEvolution &SE = *PSE.getSE();
4856 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
4857 if (!isa<SCEVConstant>(TCScev))
4858 return;
4859 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
4860 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
4861 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
4862 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4863}
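// Worked example (illustrative numbers): for a constant trip count of 17 and
// VF * UF = 8, the vector trip count folds to (17 udiv 8) * 8 = 16 and is set
// as the underlying constant here.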
4864
4866 VPBasicBlock *VectorPH) {
4868 if (BTC->getNumUsers() == 0)
4869 return;
4870
4871 VPBuilder Builder(VectorPH, VectorPH->begin());
4872 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4873 auto *TCMO =
4874 Builder.createSub(Plan.getTripCount(), Plan.getConstantInt(TCTy, 1),
4875 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
4876 BTC->replaceAllUsesWith(TCMO);
4877}
4878
4880 if (Plan.hasScalarVFOnly())
4881 return;
4882
4883 VPTypeAnalysis TypeInfo(Plan);
4884 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4885 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4887 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4888 vp_depth_first_shallow(LoopRegion->getEntry()));
4889   // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
4890   // VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
4891   // regions. Those are not yet materialized explicitly; their vector users are
4892   // still handled in VPReplicateRegion::execute(), via shouldPack().
4893 // TODO: materialize build vectors for replicating recipes in replicating
4894 // regions.
4895 for (VPBasicBlock *VPBB :
4896 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
4897 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4899 continue;
4900 auto *DefR = cast<VPSingleDefRecipe>(&R);
4901 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
4902 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4903 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
4904 };
4905 if ((isa<VPReplicateRecipe>(DefR) &&
4906 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
4907 (isa<VPInstruction>(DefR) &&
4909 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
4910 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
4911 continue;
4912
4913 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
4914 unsigned Opcode = ScalarTy->isStructTy()
4917 auto *BuildVector = new VPInstruction(Opcode, {DefR});
4918 BuildVector->insertAfter(DefR);
4919
4920 DefR->replaceUsesWithIf(
4921 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
4922 VPUser &U, unsigned) {
4923 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
4924 });
4925 }
4926 }
4927
4928 // Create explicit VPInstructions to convert vectors to scalars. The current
4929 // implementation is conservative - it may miss some cases that may or may not
4930 // be vector values. TODO: introduce Unpacks speculatively - remove them later
4931 // if they are known to operate on scalar values.
4932 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
4933 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4935 VPDerivedIVRecipe>(&R))
4936 continue;
4937 for (VPValue *Def : R.definedValues()) {
4938 // Skip recipes that are single-scalar or only have their first lane
4939 // used.
4940 // TODO: The Defs skipped here may or may not be vector values.
4941 // Introduce Unpacks, and remove them later, if they are guaranteed to
4942 // produce scalar values.
4944 continue;
4945
4946 // At the moment, we create unpacks only for scalar users outside
4947 // replicate regions. Recipes inside replicate regions still extract the
4948 // required lanes implicitly.
4949 // TODO: Remove once replicate regions are unrolled completely.
4950 auto IsCandidateUnpackUser = [Def](VPUser *U) {
4951 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4952 return U->usesScalars(Def) &&
4953 (!ParentRegion || !ParentRegion->isReplicator());
4954 };
4955 if (none_of(Def->users(), IsCandidateUnpackUser))
4956 continue;
4957
4958 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
4959 if (R.isPhi())
4960 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
4961 else
4962 Unpack->insertAfter(&R);
4963 Def->replaceUsesWithIf(Unpack,
4964 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
4965 return IsCandidateUnpackUser(&U);
4966 });
4967 }
4968 }
4969 }
4970}
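// For illustration (hypothetical recipes): a replicating (per-lane) recipe %s
// whose results feed a widened user gets an explicit %v = BuildVector(%s)
// inserted after it, and the vector user is rewired to %v; conversely, a
// widened definition %w with a scalar (per-lane) user outside a replicate
// region gets %u = Unpack(%w), and that user is rewired to %u.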
4971
4973 VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking,
4974 bool RequiresScalarEpilogue, VPValue *Step,
4975 std::optional<uint64_t> MaxRuntimeStep) {
4976 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
4977 // There's nothing to do if there are no users of the vector trip count or its
4978 // IR value has already been set.
4979 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
4980 return;
4981
4982 VPValue *TC = Plan.getTripCount();
4983 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
4984 VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
4985 if (auto *StepR = Step->getDefiningRecipe()) {
4986 assert(StepR->getParent() == VectorPHVPBB &&
4987 "Step must be defined in VectorPHVPBB");
4988 // Insert after Step's definition to maintain valid def-use ordering.
4989 InsertPt = std::next(StepR->getIterator());
4990 }
4991 VPBuilder Builder(VectorPHVPBB, InsertPt);
4992
4993 // For scalable steps, if TC is a constant and is divisible by the maximum
4994 // possible runtime step, then TC % Step == 0 for all valid vscale values
4995 // and the vector trip count equals TC directly.
4996 const APInt *TCVal;
4997 if (!RequiresScalarEpilogue && match(TC, m_APInt(TCVal)) && MaxRuntimeStep &&
4998 TCVal->getZExtValue() % *MaxRuntimeStep == 0) {
4999 VectorTC.replaceAllUsesWith(TC);
5000 return;
5001 }
5002
5003 // If the tail is to be folded by masking, round the number of iterations N
5004 // up to a multiple of Step instead of rounding down. This is done by first
5005 // adding Step-1 and then rounding down. Note that it's ok if this addition
5006 // overflows: the vector induction variable will eventually wrap to zero given
5007 // that it starts at zero and its Step is a power of two; the loop will then
5008 // exit, with the last early-exit vector comparison also producing all-true.
5009 if (TailByMasking) {
5010 TC = Builder.createAdd(
5011 TC, Builder.createSub(Step, Plan.getConstantInt(TCTy, 1)),
5012 DebugLoc::getCompilerGenerated(), "n.rnd.up");
5013 }
5014
5015 // Now we need to generate the expression for the part of the loop that the
5016 // vectorized body will execute. This is equal to N - (N % Step) if scalar
5017 // iterations are not required for correctness, or N - Step, otherwise. Step
5018 // is equal to the vectorization factor (number of SIMD elements) times the
5019 // unroll factor (number of SIMD instructions).
5020 VPValue *R =
5021 Builder.createNaryOp(Instruction::URem, {TC, Step},
5022 DebugLoc::getCompilerGenerated(), "n.mod.vf");
5023
5024 // There are cases where we *must* run at least one iteration in the remainder
5025 // loop. See the cost model for when this can happen. If the step evenly
5026 // divides the trip count, we set the remainder to be equal to the step. If
5027 // the step does not evenly divide the trip count, no adjustment is necessary
5028 // since there will already be scalar iterations. Note that the minimum
5029 // iterations check ensures that N >= Step.
5030 if (RequiresScalarEpilogue) {
5031 assert(!TailByMasking &&
5032            "requiring scalar epilogue is not supported with tail folding");
5033 VPValue *IsZero =
5034 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getZero(TCTy));
5035 R = Builder.createSelect(IsZero, Step, R);
5036 }
5037
5038 VPValue *Res =
5039 Builder.createSub(TC, R, DebugLoc::getCompilerGenerated(), "n.vec");
5040 VectorTC.replaceAllUsesWith(Res);
5041}
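// Worked example (illustrative numbers): with TC = 17 and Step = 8,
// n.mod.vf = 17 urem 8 = 1 and n.vec = 16, leaving one scalar iteration.
// With tail folding, TC is first rounded up to 24, so n.vec = 24. If a scalar
// epilogue is required and TC = 16, the remainder 0 is bumped to Step,
// giving n.vec = 8.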
5042
5044 ElementCount VFEC) {
5045 // If VF and VFxUF have already been materialized (no remaining users),
5046 // there's nothing more to do.
5047 if (Plan.getVF().isMaterialized()) {
5048 assert(Plan.getVFxUF().isMaterialized() &&
5049 "VF and VFxUF must be materialized together");
5050 return;
5051 }
5052
5053 VPBuilder Builder(VectorPH, VectorPH->begin());
5054 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
5055 VPValue &VF = Plan.getVF();
5056 VPValue &VFxUF = Plan.getVFxUF();
5057 // If there are no users of the runtime VF, compute VFxUF by constant folding
5058 // the multiplication of VF and UF.
5059 if (VF.getNumUsers() == 0) {
5060 VPValue *RuntimeVFxUF =
5061 Builder.createElementCount(TCTy, VFEC * Plan.getConcreteUF());
5062 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
5063 return;
5064 }
5065
5066 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
5067 // vscale) * UF.
5068 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
5070 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
5072 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
5073 }
5074 VF.replaceAllUsesWith(RuntimeVF);
5075
5076 VPValue *MulByUF = Builder.createOverflowingOp(
5077 Instruction::Mul,
5078 {RuntimeVF, Plan.getConstantInt(TCTy, Plan.getConcreteUF())},
5079 {true, false});
5080 VFxUF.replaceAllUsesWith(MulByUF);
5081}
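// For illustration (hypothetical factors): with VF = vscale x 4 and UF = 2,
// if the runtime VF has no users, VF * UF is emitted directly as the element
// count vscale x 8; otherwise VF is materialized as 4 * vscale and VFxUF as
// (4 * vscale) * 2 using a nuw multiply.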
5082
5085 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
5086
5087 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
5088 BasicBlock *EntryBB = Entry->getIRBasicBlock();
5089 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
5090 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5092 continue;
5093 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5094 if (!ExpSCEV)
5095 break;
5096 const SCEV *Expr = ExpSCEV->getSCEV();
5097 Value *Res =
5098 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
5099 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
5100 VPValue *Exp = Plan.getOrAddLiveIn(Res);
5101 ExpSCEV->replaceAllUsesWith(Exp);
5102 if (Plan.getTripCount() == ExpSCEV)
5103 Plan.resetTripCount(Exp);
5104 ExpSCEV->eraseFromParent();
5105 }
5107 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
5108 "before any VPIRInstructions");
5109   // Add IR instructions that are in the entry basic block but not yet in the
5110   // VPIRBasicBlock to the VPIRBasicBlock.
5111 auto EI = Entry->begin();
5112 for (Instruction &I : drop_end(*EntryBB)) {
5113 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
5114 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
5115 EI++;
5116 continue;
5117 }
5119 }
5120
5121 return ExpandedSCEVs;
5122}
5123
5124/// Returns true if \p OpV is a VPWidenLoadRecipe or VPInterleaveRecipe that
5125/// can be converted to a narrower recipe. \p OpV is used by a wide recipe that
5126/// feeds a store interleave group at index \p Idx; \p WideMember0 is the recipe
5127/// feeding the same interleave group at index 0. A VPWidenLoadRecipe can be
5128/// narrowed to an index-independent load if it feeds all wide ops at all
5129/// indices (\p OpV must equal the operand at index \p OpIdx of the lane-0
5130/// recipe, \p WideMember0). A VPInterleaveRecipe can be narrowed to a wide
5131/// load, if \p OpV is defined at \p Idx of a load interleave group.
5132static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5133 VPValue *OpV, unsigned Idx, bool IsScalable) {
5134 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
5135 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5136 if (!Member0OpR)
5137 return Member0Op == OpV;
5138 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
5139 // For scalable VFs, the narrowed plan processes vscale iterations at once,
5140 // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
5141 return !IsScalable && !W->getMask() && W->isConsecutive() &&
5142 Member0Op == OpV;
5143 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
5144 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
5145 return false;
5146}
5147
5148static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
5150 auto *WideMember0 = dyn_cast<VPSingleDefRecipe>(Ops[0]);
5151 if (!WideMember0)
5152 return false;
5153 for (VPValue *V : Ops) {
5155 return false;
5156 auto *R = cast<VPSingleDefRecipe>(V);
5157 if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
5158 return false;
5159 }
5160
5161 for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
5163 for (VPValue *Op : Ops)
5164 OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));
5165
5166 if (canNarrowOps(OpsI, IsScalable))
5167 continue;
5168
5169 if (any_of(enumerate(OpsI), [WideMember0, Idx, IsScalable](const auto &P) {
5170 const auto &[OpIdx, OpV] = P;
5171 return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
5172 }))
5173 return false;
5174 }
5175
5176 return true;
5177}
5178
5179/// Returns VF from \p VFs if \p InterleaveR is a full interleave group with
5180/// factor and number of members both equal to VF. The interleave group must
5181/// also access the full vector width.
5182static std::optional<ElementCount> isConsecutiveInterleaveGroup(
5184 VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
5185 if (!InterleaveR || InterleaveR->getMask())
5186 return std::nullopt;
5187
5188 Type *GroupElementTy = nullptr;
5189 if (InterleaveR->getStoredValues().empty()) {
5190 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
5191 if (!all_of(InterleaveR->definedValues(),
5192 [&TypeInfo, GroupElementTy](VPValue *Op) {
5193 return TypeInfo.inferScalarType(Op) == GroupElementTy;
5194 }))
5195 return std::nullopt;
5196 } else {
5197 GroupElementTy =
5198 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
5199 if (!all_of(InterleaveR->getStoredValues(),
5200 [&TypeInfo, GroupElementTy](VPValue *Op) {
5201 return TypeInfo.inferScalarType(Op) == GroupElementTy;
5202 }))
5203 return std::nullopt;
5204 }
5205
5206 auto IG = InterleaveR->getInterleaveGroup();
5207 if (IG->getFactor() != IG->getNumMembers())
5208 return std::nullopt;
5209
5210 auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
5211 TypeSize Size = TTI.getRegisterBitWidth(
5214 assert(Size.isScalable() == VF.isScalable() &&
5215 "if Size is scalable, VF must be scalable and vice versa");
5216 return Size.getKnownMinValue();
5217 };
5218
5219 for (ElementCount VF : VFs) {
5220 unsigned MinVal = VF.getKnownMinValue();
5221 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
5222 if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
5223 return {VF};
5224 }
5225 return std::nullopt;
5226}
5227
5228/// Returns true if \p VPValue is a narrow VPValue.
5229static bool isAlreadyNarrow(VPValue *VPV) {
5230 if (isa<VPIRValue>(VPV))
5231 return true;
5232 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
5233 return RepR && RepR->isSingleScalar();
5234}
5235
5236// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
5237// a narrow variant.
5238static VPValue *
5240 auto *R = V->getDefiningRecipe();
5241 if (!R || NarrowedOps.contains(V))
5242 return V;
5243
5244 if (isAlreadyNarrow(V))
5245 return V;
5246
5248 auto *WideMember0 = cast<VPSingleDefRecipe>(R);
5249 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
5250 WideMember0->setOperand(
5251 Idx,
5252 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
5253 return V;
5254 }
5255
5256 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
5257     // Narrow the interleave group to a wide load, as the transformed VPlan
5258     // will only process one original iteration.
5259 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
5260 auto *L = new VPWidenLoadRecipe(*LI, LoadGroup->getAddr(),
5261 LoadGroup->getMask(), /*Consecutive=*/true,
5262 {}, LoadGroup->getDebugLoc());
5263 L->insertBefore(LoadGroup);
5264 NarrowedOps.insert(L);
5265 return L;
5266 }
5267
5268 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
5269 assert(RepR->isSingleScalar() && RepR->getOpcode() == Instruction::Load &&
5270 "must be a single scalar load");
5271 NarrowedOps.insert(RepR);
5272 return RepR;
5273 }
5274
5275 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
5276 VPValue *PtrOp = WideLoad->getAddr();
5277 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
5278 PtrOp = VecPtr->getOperand(0);
5279   // Narrow the wide load to a uniform scalar load, as the transformed VPlan
5280   // will only process one original iteration.
5281 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
5282 /*IsUniform*/ true,
5283 /*Mask*/ nullptr, {}, *WideLoad);
5284 N->insertBefore(WideLoad);
5285 NarrowedOps.insert(N);
5286 return N;
5287}
5288
5289std::unique_ptr<VPlan>
5291 const TargetTransformInfo &TTI) {
5292 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5293
5294 if (!VectorLoop)
5295 return nullptr;
5296
5297 // Only handle single-block loops for now.
5298 if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
5299 return nullptr;
5300
5301 // Skip plans when we may not be able to properly narrow.
5302 VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
5303 if (!match(&Exiting->back(), m_BranchOnCount()))
5304 return nullptr;
5305
5306 assert(match(&Exiting->back(),
5308 m_Specific(&Plan.getVectorTripCount()))) &&
5309 "unexpected branch-on-count");
5310
5311 VPTypeAnalysis TypeInfo(Plan);
5313 std::optional<ElementCount> VFToOptimize;
5314 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
5317 continue;
5318
5319 // Bail out on recipes not supported at the moment:
5320 // * phi recipes other than the canonical induction
5321 // * recipes writing to memory except interleave groups
5322 // Only support plans with a canonical induction phi.
5323 if (R.isPhi())
5324 return nullptr;
5325
5326 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
5327 if (R.mayWriteToMemory() && !InterleaveR)
5328 return nullptr;
5329
5330 // All other ops are allowed, but we reject uses that cannot be converted
5331 // when checking all allowed consumers (store interleave groups) below.
5332 if (!InterleaveR)
5333 continue;
5334
5335 // Try to find a single VF, where all interleave groups are consecutive and
5336 // saturate the full vector width. If we already have a candidate VF, check
5337 // if it is applicable for the current InterleaveR, otherwise look for a
5338 // suitable VF across the Plan's VFs.
5340 VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
5341 : to_vector(Plan.vectorFactors());
5342 std::optional<ElementCount> NarrowedVF =
5343 isConsecutiveInterleaveGroup(InterleaveR, VFs, TypeInfo, TTI);
5344 if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
5345 return nullptr;
5346 VFToOptimize = NarrowedVF;
5347
5348 // Skip read interleave groups.
5349 if (InterleaveR->getStoredValues().empty())
5350 continue;
5351
5352 // Narrow interleave groups, if all operands are already matching narrow
5353 // ops.
5354 auto *Member0 = InterleaveR->getStoredValues()[0];
5355 if (isAlreadyNarrow(Member0) &&
5356 all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
5357 StoreGroups.push_back(InterleaveR);
5358 continue;
5359 }
5360
5361 // For now, we only support full interleave groups storing load interleave
5362 // groups.
5363 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
5364 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
5365 if (!DefR)
5366 return false;
5367 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
5368 return IR && IR->getInterleaveGroup()->isFull() &&
5369 IR->getVPValue(Op.index()) == Op.value();
5370 })) {
5371 StoreGroups.push_back(InterleaveR);
5372 continue;
5373 }
5374
5375     // Check if all values feeding InterleaveR are matching wide recipes whose
5376     // operands can be narrowed.
5377 if (!canNarrowOps(InterleaveR->getStoredValues(),
5378 VFToOptimize->isScalable()))
5379 return nullptr;
5380 StoreGroups.push_back(InterleaveR);
5381 }
5382
5383 if (StoreGroups.empty())
5384 return nullptr;
5385
5386 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5387 bool RequiresScalarEpilogue =
5388 MiddleVPBB->getNumSuccessors() == 1 &&
5389 MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
5390 // Bail out for tail-folding (middle block with a single successor to exit).
5391 if (MiddleVPBB->getNumSuccessors() != 2 && !RequiresScalarEpilogue)
5392 return nullptr;
5393
5394 // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
5395   // original Plan in two: a) a new clone which contains all VFs of Plan except
5396   // VFToOptimize, and b) the original Plan with VFToOptimize as its single VF.
5397 // TODO: Handle cases where only some interleave groups can be narrowed.
5398 std::unique_ptr<VPlan> NewPlan;
5399 if (size(Plan.vectorFactors()) != 1) {
5400 NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
5401 Plan.setVF(*VFToOptimize);
5402 NewPlan->removeVF(*VFToOptimize);
5403 }
5404
5405 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
5406 SmallPtrSet<VPValue *, 4> NarrowedOps;
5407 // Narrow operation tree rooted at store groups.
5408 for (auto *StoreGroup : StoreGroups) {
5409 VPValue *Res =
5410 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
5411 auto *SI =
5412 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
5413 auto *S = new VPWidenStoreRecipe(*SI, StoreGroup->getAddr(), Res, nullptr,
5414 /*Consecutive=*/true, {},
5415 StoreGroup->getDebugLoc());
5416 S->insertBefore(StoreGroup);
5417 StoreGroup->eraseFromParent();
5418 }
5419
5420 // Adjust induction to reflect that the transformed plan only processes one
5421 // original iteration.
5423 Type *CanIVTy = VectorLoop->getCanonicalIVType();
5424 VPBasicBlock *VectorPH = Plan.getVectorPreheader();
5425 VPBuilder PHBuilder(VectorPH, VectorPH->begin());
5426
5427 VPValue *UF = &Plan.getUF();
5428 VPValue *Step;
5429 if (VFToOptimize->isScalable()) {
5430 VPValue *VScale =
5431 PHBuilder.createElementCount(CanIVTy, ElementCount::getScalable(1));
5432 Step = PHBuilder.createOverflowingOp(Instruction::Mul, {VScale, UF},
5433 {true, false});
5434 Plan.getVF().replaceAllUsesWith(VScale);
5435 } else {
5436 Step = UF;
5437 Plan.getVF().replaceAllUsesWith(Plan.getConstantInt(CanIVTy, 1));
5438 }
5439 // Materialize vector trip count with the narrowed step.
5440 materializeVectorTripCount(Plan, VectorPH, /*TailByMasking=*/false,
5441 RequiresScalarEpilogue, Step);
5442
5443 CanIVInc->setOperand(1, Step);
5444 Plan.getVFxUF().replaceAllUsesWith(Step);
5445
5446 removeDeadRecipes(Plan);
5447 assert(none_of(*VectorLoop->getEntryBasicBlock(),
5449 "All VPVectorPointerRecipes should have been removed");
5450 return NewPlan;
5451}
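// For illustration (hypothetical loop): an array-of-structs copy such as
//   for (i) { dst[i].x = src[i].x; dst[i].y = src[i].y; }
// with factor-2 load and store interleave groups and VF = 2 is narrowed so the
// load group becomes a single consecutive wide load, the store group a single
// consecutive wide store, and the canonical IV steps by UF (or vscale * UF for
// scalable VFs) instead of VF * UF, i.e. one original iteration per vector
// iteration.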
5452
5453/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5454/// BranchOnCond recipe.
5456 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5457 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5458 auto *MiddleTerm =
5460 // Only add branch metadata if there is a (conditional) terminator.
5461 if (!MiddleTerm)
5462 return;
5463
5464 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5465 "must have a BranchOnCond");
5466   // Assume that `TripCount % VectorStep` is equally distributed.
5467 unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
5468 if (VF.isScalable() && VScaleForTuning.has_value())
5469 VectorStep *= *VScaleForTuning;
5470 assert(VectorStep > 0 && "trip count should not be zero");
5471 MDBuilder MDB(Plan.getContext());
5472 MDNode *BranchWeights =
5473 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
5474 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
5475}
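// Worked example (illustrative factors): with UF = 2 and VF = 4, VectorStep is
// 8 and the middle-block branch gets weights {1, 7}; for VF = vscale x 4 with
// VScaleForTuning = 2, VectorStep is 16 and the weights are {1, 15}.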
5476
5478 VFRange &Range) {
5479 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5480 auto *MiddleVPBB = Plan.getMiddleBlock();
5481 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5482 VPTypeAnalysis TypeInfo(Plan);
5483
5484 auto IsScalableOne = [](ElementCount VF) -> bool {
5485 return VF == ElementCount::getScalable(1);
5486 };
5487
5488 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5489 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5490 if (!FOR)
5491 continue;
5492
5493 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5494 "Cannot handle loops with uncountable early exits");
5495
5496 // Find the existing splice for this FOR, created in
5497 // createHeaderPhiRecipes. All uses of FOR have already been replaced with
5498 // RecurSplice there; only RecurSplice itself still references FOR.
5499 auto *RecurSplice =
5501 assert(RecurSplice && "expected FirstOrderRecurrenceSplice");
5502
5503 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5504 // penultimate value of the recurrence. Instead we rely on the existing
5505 // extract of the last element from the result of
5506 // VPInstruction::FirstOrderRecurrenceSplice.
5507 // TODO: Consider vscale_range info and UF.
5508 if (any_of(RecurSplice->users(),
5509 [](VPUser *U) { return !cast<VPRecipeBase>(U)->getRegion(); }) &&
5511 Range))
5512 return;
5513
5514 // This is the second phase of vectorizing first-order recurrences, creating
5515 // extracts for users outside the loop. An overview of the transformation is
5516 // described below. Suppose we have the following loop with some use after
5517 // the loop of the last a[i-1],
5518 //
5519 // for (int i = 0; i < n; ++i) {
5520 // t = a[i - 1];
5521 // b[i] = a[i] - t;
5522 // }
5523 // use t;
5524 //
5525 // There is a first-order recurrence on "a". For this loop, the shorthand
5526 // scalar IR looks like:
5527 //
5528 // scalar.ph:
5529 // s.init = a[-1]
5530 // br scalar.body
5531 //
5532 // scalar.body:
5533 // i = phi [0, scalar.ph], [i+1, scalar.body]
5534 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5535 // s2 = a[i]
5536 // b[i] = s2 - s1
5537 // br cond, scalar.body, exit.block
5538 //
5539 // exit.block:
5540 // use = lcssa.phi [s1, scalar.body]
5541 //
5542     // In this example, s1 is a recurrence because its value depends on the
5543 // previous iteration. In the first phase of vectorization, we created a
5544 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5545 // for users in the scalar preheader and exit block.
5546 //
5547 // vector.ph:
5548 // v_init = vector(..., ..., ..., a[-1])
5549 // br vector.body
5550 //
5551 // vector.body
5552 // i = phi [0, vector.ph], [i+4, vector.body]
5553 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5554 // v2 = a[i, i+1, i+2, i+3]
5555 // v1' = splice(v1(3), v2(0, 1, 2))
5556 // b[i, i+1, i+2, i+3] = v2 - v1'
5557 // br cond, vector.body, middle.block
5558 //
5559 // middle.block:
5560 // vector.recur.extract.for.phi = v2(2)
5561 // vector.recur.extract = v2(3)
5562 // br cond, scalar.ph, exit.block
5563 //
5564 // scalar.ph:
5565 // scalar.recur.init = phi [vector.recur.extract, middle.block],
5566 // [s.init, otherwise]
5567 // br scalar.body
5568 //
5569 // scalar.body:
5570 // i = phi [0, scalar.ph], [i+1, scalar.body]
5571 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5572 // s2 = a[i]
5573 // b[i] = s2 - s1
5574 // br cond, scalar.body, exit.block
5575 //
5576 // exit.block:
5577 // lo = lcssa.phi [s1, scalar.body],
5578 // [vector.recur.extract.for.phi, middle.block]
5579 //
5580 // Update extracts of the splice in the middle block: they extract the
5581 // penultimate element of the recurrence.
5583 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
5584 if (!match(&R, m_ExtractLastLaneOfLastPart(m_Specific(RecurSplice))))
5585 continue;
5586
5587 auto *ExtractR = cast<VPInstruction>(&R);
5588 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5589 VPInstruction::ExtractPenultimateElement, RecurSplice->getOperand(1),
5590 {}, "vector.recur.extract.for.phi");
5591 for (VPUser *ExitU : to_vector(ExtractR->users())) {
5592 if (auto *ExitPhi = dyn_cast<VPIRPhi>(ExitU))
5593 ExitPhi->replaceUsesOfWith(ExtractR, PenultimateElement);
5594 }
5595 }
5596 }
5597}
5598
5599/// Check if \p V is a binary expression of a widened IV and a loop-invariant
5600/// value. Returns the widened IV if found, nullptr otherwise.
5602 auto *BinOp = dyn_cast<VPWidenRecipe>(V);
5603 if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()) ||
5604 Instruction::isIntDivRem(BinOp->getOpcode()))
5605 return nullptr;
5606
5607 VPValue *WidenIVCandidate = BinOp->getOperand(0);
5608 VPValue *InvariantCandidate = BinOp->getOperand(1);
5609 if (!isa<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate))
5610 std::swap(WidenIVCandidate, InvariantCandidate);
5611
5612 if (!InvariantCandidate->isDefinedOutsideLoopRegions())
5613 return nullptr;
5614
5615 return dyn_cast<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate);
5616}
5617
5618/// Create a scalar version of \p BinOp, with its \p WidenIV operand replaced
5619/// by \p ScalarIV, and place it after \p ScalarIV's defining recipe.
5623 BinOp->getNumOperands() == 2 && "BinOp must have 2 operands");
5624 auto *ClonedOp = BinOp->clone();
5625 if (ClonedOp->getOperand(0) == WidenIV) {
5626 ClonedOp->setOperand(0, ScalarIV);
5627 } else {
5628 assert(ClonedOp->getOperand(1) == WidenIV && "one operand must be WideIV");
5629 ClonedOp->setOperand(1, ScalarIV);
5630 }
5631 ClonedOp->insertAfter(ScalarIV->getDefiningRecipe());
5632 return ClonedOp;
5633}
5634
5637 Loop &L) {
5638 ScalarEvolution &SE = *PSE.getSE();
5639 VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();
5640
5641 // Helper lambda to check if the IV range excludes the sentinel value. Try
5642 // signed first, then unsigned. Return an excluded sentinel if found,
5643 // otherwise return std::nullopt.
5644 auto CheckSentinel = [&SE](const SCEV *IVSCEV,
5645 bool UseMax) -> std::optional<APSInt> {
5646 unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
5647 for (bool Signed : {true, false}) {
5648 APSInt Sentinel = UseMax ? APSInt::getMinValue(BW, /*Unsigned=*/!Signed)
5649 : APSInt::getMaxValue(BW, /*Unsigned=*/!Signed);
5650
5651 ConstantRange IVRange =
5652 Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
5653 if (!IVRange.contains(Sentinel))
5654 return Sentinel;
5655 }
5656 return std::nullopt;
5657 };
5658
5659 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
5660 for (VPRecipeBase &Phi :
5661 make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
5662 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
5664 PhiR->getRecurrenceKind()))
5665 continue;
5666
5667 Type *PhiTy = VPTypeAnalysis(Plan).inferScalarType(PhiR);
5668 if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
5669 continue;
5670
5671 // If there's a header mask, the backedge select will not be the find-last
5672 // select.
5673 VPValue *BackedgeVal = PhiR->getBackedgeValue();
5674 auto *FindLastSelect = cast<VPSingleDefRecipe>(BackedgeVal);
5675 if (HeaderMask &&
5676 !match(BackedgeVal,
5677 m_Select(m_Specific(HeaderMask),
5678 m_VPSingleDefRecipe(FindLastSelect), m_Specific(PhiR))))
5679 llvm_unreachable("expected header mask select");
5680
5681 // Get the find-last expression from the find-last select of the reduction
5682 // phi. The find-last select should be a select between the phi and the
5683 // find-last expression.
5684 VPValue *Cond, *FindLastExpression;
5685 if (!match(FindLastSelect, m_Select(m_VPValue(Cond), m_Specific(PhiR),
5686 m_VPValue(FindLastExpression))) &&
5687 !match(FindLastSelect,
5688 m_Select(m_VPValue(Cond), m_VPValue(FindLastExpression),
5689 m_Specific(PhiR))))
5690 continue;
5691
5692 // Check if FindLastExpression is a simple expression of a widened IV. If
5693 // so, we can track the underlying IV instead and sink the expression.
5694 auto *IVOfExpressionToSink = getExpressionIV(FindLastExpression);
5695 const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(
5696 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression, PSE,
5697 &L);
5698 const SCEV *Step;
5699 if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
5700 assert(!match(vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L),
5702 "IVOfExpressionToSink not being an AddRec must imply "
5703 "FindLastExpression not being an AddRec.");
5704 continue;
5705 }
5706
5707 // Determine direction from SCEV step.
5708 if (!SE.isKnownNonZero(Step))
5709 continue;
5710
5711 // Positive step means we need UMax/SMax to find the last IV value, and
5712 // UMin/SMin otherwise.
5713 bool UseMax = SE.isKnownPositive(Step);
5714 std::optional<APSInt> SentinelVal = CheckSentinel(IVSCEV, UseMax);
5715 bool UseSigned = SentinelVal && SentinelVal->isSigned();
5716
5717     // Sinking an expression will disable epilogue vectorization. Only use it
5718     // if FindLastExpression cannot be vectorized via a sentinel. Sinking may
5719 // also prevent vectorizing using a sentinel (e.g., if the expression is a
5720 // multiply or divide by large constant, respectively), which also makes
5721 // sinking undesirable.
5722 if (IVOfExpressionToSink) {
5723 const SCEV *FindLastExpressionSCEV =
5724 vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L);
5725 if (match(FindLastExpressionSCEV,
5726 m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
5727 bool NewUseMax = SE.isKnownPositive(Step);
5728 if (auto NewSentinel =
5729 CheckSentinel(FindLastExpressionSCEV, NewUseMax)) {
5730 // The original expression already has a sentinel, so prefer not
5731 // sinking to keep epilogue vectorization possible.
5732 SentinelVal = *NewSentinel;
5733 UseSigned = NewSentinel->isSigned();
5734 UseMax = NewUseMax;
5735 IVSCEV = FindLastExpressionSCEV;
5736 IVOfExpressionToSink = nullptr;
5737 }
5738 }
5739 }
5740
5741 // If no sentinel was found, fall back to a boolean AnyOf reduction to track
5742 // if the condition was ever true. Requires the IV to not wrap, otherwise we
5743 // cannot use min/max.
5744 if (!SentinelVal) {
5745 auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
5746 if (AR->hasNoSignedWrap())
5747 UseSigned = true;
5748 else if (AR->hasNoUnsignedWrap())
5749 UseSigned = false;
5750 else
5751 continue;
5752 }
5753
5755 BackedgeVal,
5757
5758 VPValue *NewFindLastSelect = BackedgeVal;
5759 VPValue *SelectCond = Cond;
5760 if (!SentinelVal || IVOfExpressionToSink) {
5761 // When we need to create a new select, normalize the condition so that
5762 // PhiR is the last operand and include the header mask if needed.
5763 DebugLoc DL = FindLastSelect->getDefiningRecipe()->getDebugLoc();
5764 VPBuilder LoopBuilder(FindLastSelect->getDefiningRecipe());
5765 if (FindLastSelect->getDefiningRecipe()->getOperand(1) == PhiR)
5766 SelectCond = LoopBuilder.createNot(SelectCond);
5767
5768 // When tail folding, mask the condition with the header mask to prevent
5769 // propagating poison from inactive lanes in the last vector iteration.
5770 if (HeaderMask)
5771 SelectCond = LoopBuilder.createLogicalAnd(HeaderMask, SelectCond);
5772
5773 if (SelectCond != Cond || IVOfExpressionToSink) {
5774 NewFindLastSelect = LoopBuilder.createSelect(
5775 SelectCond,
5776 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression,
5777 PhiR, DL);
5778 }
5779 }
5780
5781 // Create the reduction result in the middle block using sentinel directly.
5782 RecurKind MinMaxKind =
5783 UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
5784 : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
5785 VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
5786 FastMathFlags());
5787 DebugLoc ExitDL = RdxResult->getDebugLoc();
5788 VPBuilder MiddleBuilder(RdxResult);
5789 VPValue *ReducedIV =
5791 NewFindLastSelect, Flags, ExitDL);
5792
5793   // If IVOfExpressionToSink is set, sink the expression now.
5794 VPValue *VectorRegionExitingVal = ReducedIV;
5795 if (IVOfExpressionToSink)
5796 VectorRegionExitingVal =
5797 cloneBinOpForScalarIV(cast<VPWidenRecipe>(FindLastExpression),
5798 ReducedIV, IVOfExpressionToSink);
5799
5800 VPValue *NewRdxResult;
5801 VPValue *StartVPV = PhiR->getStartValue();
5802 if (SentinelVal) {
5803 // Sentinel-based approach: reduce IVs with min/max, compare against
5804 // sentinel to detect if condition was ever true, select accordingly.
5805 VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
5806 auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
5807 Sentinel, ExitDL);
5808 NewRdxResult = MiddleBuilder.createSelect(Cmp, VectorRegionExitingVal,
5809 StartVPV, ExitDL);
5810 StartVPV = Sentinel;
5811 } else {
5812 // Introduce a boolean AnyOf reduction to track if the condition was ever
5813 // true in the loop. Use it to select the initial start value, if it was
5814 // never true.
5815 auto *AnyOfPhi = new VPReductionPHIRecipe(
5816 /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
5817 RdxUnordered{1}, {}, /*HasUsesOutsideReductionChain=*/false);
5818 AnyOfPhi->insertAfter(PhiR);
5819
5820 VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
5821 VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, SelectCond);
5822 AnyOfPhi->setOperand(1, OrVal);
5823
5824 NewRdxResult = MiddleBuilder.createAnyOfReduction(
5825 OrVal, VectorRegionExitingVal, StartVPV, ExitDL);
5826
5827 // Initialize the IV reduction phi with the neutral element, not the
5828 // original start value, to ensure correct min/max reduction results.
5829 StartVPV = Plan.getOrAddLiveIn(
5830 getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
5831 }
5832 RdxResult->replaceAllUsesWith(NewRdxResult);
5833 RdxResult->eraseFromParent();
5834
5835 auto *NewPhiR = new VPReductionPHIRecipe(
5836 cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
5837 *NewFindLastSelect, RdxUnordered{1}, {},
5838 PhiR->hasUsesOutsideReductionChain());
5839 NewPhiR->insertBefore(PhiR);
5840 PhiR->replaceAllUsesWith(NewPhiR);
5841 PhiR->eraseFromParent();
5842 }
5843}
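// For illustration (hypothetical source loop): a conditional last-index loop
//   for (i = 0; i < n; i++)
//     if (a[i] == x) last = i;
// selects the IV (or a sunk expression of it) under the condition and reduces
// it with s/umax for a positive step (s/umin for a negative step). If the IV
// range excludes a sentinel, the phi starts at the sentinel and the middle
// block compares the reduced IV against it to pick between the reduced value
// and the original start; otherwise a boolean any-of reduction tracks whether
// the condition was ever true.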
5844
5845namespace {
5846
5847using ExtendKind = TTI::PartialReductionExtendKind;
5848struct ReductionExtend {
5849 Type *SrcType = nullptr;
5850 ExtendKind Kind = ExtendKind::PR_None;
5851};
5852
5853/// Describes the extends used to compute the extended reduction operand.
5854/// ExtendB is optional. If ExtendB is present, ExtendsUser is a binary
5855/// operation.
5856struct ExtendedReductionOperand {
5857 /// The recipe that consumes the extends.
5858 VPWidenRecipe *ExtendsUser = nullptr;
5859 /// Extend descriptions (inputs to getPartialReductionCost).
5860 ReductionExtend ExtendA, ExtendB;
5861};
5862
5863/// A chain of recipes that form a partial reduction. Matches either
5864/// reduction_bin_op (extended op, accumulator), or
5865/// reduction_bin_op (accumulator, extended op).
5866/// The possible forms of the "extended op" are listed in
5867/// matchExtendedReductionOperand.
5868struct VPPartialReductionChain {
5869 /// The top-level binary operation that forms the reduction to a scalar
5870 /// after the loop body.
5871 VPWidenRecipe *ReductionBinOp = nullptr;
5872 /// The user of the extends that is then reduced.
5873 ExtendedReductionOperand ExtendedOp;
5874 /// The recurrence kind for the entire partial reduction chain.
5875   /// This allows distinguishing between Sub and AddWithSub recurrences
5876   /// when the ReductionBinOp is an Instruction::Sub.
5877 RecurKind RK;
5878 /// The index of the accumulator operand of ReductionBinOp. The extended op
5879 /// is `1 - AccumulatorOpIdx`.
5880 unsigned AccumulatorOpIdx;
5881 unsigned ScaleFactor;
5882};
5883
5884static VPSingleDefRecipe *
5885optimizeExtendsForPartialReduction(VPSingleDefRecipe *Op,
5886 VPTypeAnalysis &TypeInfo) {
5887 // reduce.add(mul(ext(A), C))
5888 // -> reduce.add(mul(ext(A), ext(trunc(C))))
5889 const APInt *Const;
5890 if (match(Op, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(Const)))) {
5891 auto *ExtA = cast<VPWidenCastRecipe>(Op->getOperand(0));
5892 Instruction::CastOps ExtOpc = ExtA->getOpcode();
5893 Type *NarrowTy = TypeInfo.inferScalarType(ExtA->getOperand(0));
5894 if (!Op->hasOneUse() ||
5896 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
5897 return Op;
5898
5899 VPBuilder Builder(Op);
5900 auto *Trunc = Builder.createWidenCast(Instruction::CastOps::Trunc,
5901 Op->getOperand(1), NarrowTy);
5902 Type *WideTy = TypeInfo.inferScalarType(ExtA);
5903 Op->setOperand(1, Builder.createWidenCast(ExtOpc, Trunc, WideTy));
5904 return Op;
5905 }
5906
5907 // reduce.add(abs(sub(ext(A), ext(B))))
5908 // -> reduce.add(ext(absolute-difference(A, B)))
5909 VPValue *X, *Y;
5912 auto *Sub = Op->getOperand(0)->getDefiningRecipe();
5913 auto *Ext = cast<VPWidenCastRecipe>(Sub->getOperand(0));
5914 assert(Ext->getOpcode() ==
5915 cast<VPWidenCastRecipe>(Sub->getOperand(1))->getOpcode() &&
5916 "Expected both the LHS and RHS extends to be the same");
5917 bool IsSigned = Ext->getOpcode() == Instruction::SExt;
5918 VPBuilder Builder(Op);
5919 Type *SrcTy = TypeInfo.inferScalarType(X);
5920 auto *FreezeX = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {X}));
5921 auto *FreezeY = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {Y}));
5922 auto *Max = Builder.insert(
5923 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smax : Intrinsic::umax,
5924 {FreezeX, FreezeY}, SrcTy));
5925 auto *Min = Builder.insert(
5926 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smin : Intrinsic::umin,
5927 {FreezeX, FreezeY}, SrcTy));
5928 auto *AbsDiff =
5929 Builder.insert(new VPWidenRecipe(Instruction::Sub, {Max, Min}));
5930 return Builder.createWidenCast(Instruction::CastOps::ZExt, AbsDiff,
5931 TypeInfo.inferScalarType(Op));
5932 }
5933
5934 // reduce.add(ext(mul(ext(A), ext(B))))
5935 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
5936 // TODO: Support this optimization for float types.
5938 m_ZExtOrSExt(m_VPValue()))))) {
5939 auto *Ext = cast<VPWidenCastRecipe>(Op);
5940 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
5941 auto *MulLHS = cast<VPWidenCastRecipe>(Mul->getOperand(0));
5942 auto *MulRHS = cast<VPWidenCastRecipe>(Mul->getOperand(1));
5943 if (!Mul->hasOneUse() ||
5944 (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
5945 MulLHS->getOpcode() != MulRHS->getOpcode())
5946 return Op;
5947 VPBuilder Builder(Mul);
5948 Mul->setOperand(0, Builder.createWidenCast(MulLHS->getOpcode(),
5949 MulLHS->getOperand(0),
5950 Ext->getResultType()));
5951 Mul->setOperand(1, MulLHS == MulRHS
5952 ? Mul->getOperand(0)
5953 : Builder.createWidenCast(MulRHS->getOpcode(),
5954 MulRHS->getOperand(0),
5955 Ext->getResultType()));
5956 return Mul;
5957 }
5958
5959 return Op;
5960}
5961
5962static VPExpressionRecipe *
5963createPartialReductionExpression(VPReductionRecipe *Red) {
5964 VPValue *VecOp = Red->getVecOp();
5965
5966 // reduce.[f]add(ext(op))
5967 // -> VPExpressionRecipe(op, red)
5968 if (match(VecOp, m_WidenAnyExtend(m_VPValue())))
5969 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
5970
5971 // reduce.[f]add([f]mul(ext(a), ext(b)))
5972 // -> VPExpressionRecipe(a, b, mul, red)
5973 if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue()))) ||
5974 match(VecOp,
5976 auto *Mul = cast<VPWidenRecipe>(VecOp);
5977 auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
5978 auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
5979 return new VPExpressionRecipe(ExtA, ExtB, Mul, Red);
5980 }
5981
5982 // reduce.add(neg(mul(ext(a), ext(b))))
5983 // -> VPExpressionRecipe(a, b, mul, sub, red)
5985 m_ZExtOrSExt(m_VPValue()))))) {
5986 auto *Sub = cast<VPWidenRecipe>(VecOp);
5987 auto *Mul = cast<VPWidenRecipe>(Sub->getOperand(1));
5988 auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
5989 auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
5990 return new VPExpressionRecipe(ExtA, ExtB, Mul, Sub, Red);
5991 }
5992
5993 llvm_unreachable("Unsupported expression");
5994}
5995
5996// Helper to transform a partial reduction chain into a partial reduction
5997// recipe. Assumes profitability has been checked.
5998static void transformToPartialReduction(const VPPartialReductionChain &Chain,
5999 VPTypeAnalysis &TypeInfo, VPlan &Plan,
6000 VPReductionPHIRecipe *RdxPhi) {
6001 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
6002 assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
6003
6004 VPValue *Accumulator = WidenRecipe->getOperand(Chain.AccumulatorOpIdx);
6005 auto *ExtendedOp = cast<VPSingleDefRecipe>(
6006 WidenRecipe->getOperand(1 - Chain.AccumulatorOpIdx));
6007
6008 // Sub-reductions can be implemented in two ways:
6009 // (1) negate the operand in the vector loop (the default way).
6010 // (2) subtract the reduced value from the init value in the middle block.
6011 // Both ways keep the reduction itself as an 'add' reduction.
6012 //
6013 // The ISD nodes for partial reductions don't support folding the
6014 // sub/negation into its operands because the following is not a valid
6015 // transformation:
6016 // sub(0, mul(ext(a), ext(b)))
6017 // -> mul(ext(a), ext(sub(0, b)))
6018 //
6019 // It's therefore better to choose option (2) such that the partial
6020 // reduction is always positive (starting at '0') and to do a final
6021 // subtract in the middle block.
6022 if (WidenRecipe->getOpcode() == Instruction::Sub &&
6023 Chain.RK != RecurKind::Sub) {
6024 VPBuilder Builder(WidenRecipe);
6025 Type *ElemTy = TypeInfo.inferScalarType(ExtendedOp);
6026 auto *Zero = Plan.getZero(ElemTy);
6027 auto *NegRecipe =
6028 new VPWidenRecipe(Instruction::Sub, {Zero, ExtendedOp}, VPIRFlags(),
6030 Builder.insert(NegRecipe);
6031 ExtendedOp = NegRecipe;
6032 }
6033
6034 // FIXME: Do these transforms before invoking the cost-model.
6035 ExtendedOp = optimizeExtendsForPartialReduction(ExtendedOp, TypeInfo);
6036
6037 // Check if WidenRecipe is the final result of the reduction. If so look
6038 // through selects for predicated reductions.
6039 VPValue *Cond = nullptr;
6041 WidenRecipe,
6042 m_Select(m_VPValue(Cond), m_Specific(WidenRecipe), m_Specific(RdxPhi))));
6043 bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
6044 RdxPhi->getBackedgeValue() == ExitValue;
6045 assert((!ExitValue || IsLastInChain) &&
6046 "if we found ExitValue, it must match RdxPhi's backedge value");
6047
6048 Type *PhiType = TypeInfo.inferScalarType(RdxPhi);
6049 RecurKind RdxKind =
6051 auto *PartialRed = new VPReductionRecipe(
6052 RdxKind,
6053 RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlags()
6054 : FastMathFlags(),
6055 WidenRecipe->getUnderlyingInstr(), Accumulator, ExtendedOp, Cond,
6056 RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
6057 PartialRed->insertBefore(WidenRecipe);
6058
6059 if (Cond)
6060 ExitValue->replaceAllUsesWith(PartialRed);
6061 WidenRecipe->replaceAllUsesWith(PartialRed);
6062
6063 // For cost-model purposes, fold this into a VPExpression.
6064 VPExpressionRecipe *E = createPartialReductionExpression(PartialRed);
6065 E->insertBefore(WidenRecipe);
6066 PartialRed->replaceAllUsesWith(E);
6067
6068 // We only need to update the PHI node once, which is when we find the
6069 // last reduction in the chain.
6070 if (!IsLastInChain)
6071 return;
6072
6073 // Scale the PHI and ReductionStartVector by the VFScaleFactor
6074 assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
6075 RdxPhi->setVFScaleFactor(Chain.ScaleFactor);
6076
6077 auto *StartInst = cast<VPInstruction>(RdxPhi->getStartValue());
6078 assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
6079 auto *NewScaleFactor = Plan.getConstantInt(32, Chain.ScaleFactor);
6080 StartInst->setOperand(2, NewScaleFactor);
6081
6082 // If this is the last value in a sub-reduction chain, then update the PHI
6083 // node to start at `0` and update the reduction-result to subtract from
6084 // the PHI's start value.
6085 if (Chain.RK != RecurKind::Sub)
6086 return;
6087
6088 VPValue *OldStartValue = StartInst->getOperand(0);
6089 StartInst->setOperand(0, StartInst->getOperand(1));
6090
6091 // Replace reduction_result by 'sub (startval, reductionresult)'.
6093 assert(RdxResult && "Could not find reduction result");
6094
6095 VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult);
6096 constexpr unsigned SubOpc = Instruction::BinaryOps::Sub;
6097 VPInstruction *NewResult = Builder.createNaryOp(
6098 SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc),
6099 RdxPhi->getDebugLoc());
6100 RdxResult->replaceUsesWithIf(
6101 NewResult,
6102 [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
6103}
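// For illustration (hypothetical source loop): a dot-product style reduction
//   for (i) sum += (int)a[i] * (int)b[i];
// becomes a partial reduction whose phi is scaled down by the chain's
// ScaleFactor relative to the input VF. For sum -= ... with a non-Sub
// recurrence the extended operand is negated in the loop; for a pure Sub
// recurrence the accumulation instead starts from the add identity and the
// reduced value is subtracted from the original start value in the middle
// block.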
6104
6105/// Returns the cost of a link in a partial-reduction chain for a given VF.
6106static InstructionCost
6107getPartialReductionLinkCost(VPCostContext &CostCtx,
6108 const VPPartialReductionChain &Link,
6109 ElementCount VF) {
6110 Type *RdxType = CostCtx.Types.inferScalarType(Link.ReductionBinOp);
6111 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6112 std::optional<unsigned> BinOpc = std::nullopt;
6113 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6114 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6115 BinOpc = ExtendedOp.ExtendsUser->getOpcode();
6116
6117 std::optional<llvm::FastMathFlags> Flags;
6118 if (RdxType->isFloatingPointTy())
6119 Flags = Link.ReductionBinOp->getFastMathFlags();
6120
6121 unsigned Opcode = Link.RK == RecurKind::Sub
6122 ? (unsigned)Instruction::Add
6123 : Link.ReductionBinOp->getOpcode();
6124 return CostCtx.TTI.getPartialReductionCost(
6125 Opcode, ExtendedOp.ExtendA.SrcType, ExtendedOp.ExtendB.SrcType, RdxType,
6126 VF, ExtendedOp.ExtendA.Kind, ExtendedOp.ExtendB.Kind, BinOpc,
6127 CostCtx.CostKind, Flags);
6128}
6129
6130static ExtendKind getPartialReductionExtendKind(VPWidenCastRecipe *Cast) {
6132}
6133
6134/// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
6135/// operand. This is an operand where the source of the value (e.g. a load) has
6136/// been extended (sext, zext, or fpext) before it is used in the reduction.
6137///
6138/// Possible forms matched by this function:
6139/// - UpdateR(PrevValue, ext(...))
6140/// - UpdateR(PrevValue, mul(ext(...), ext(...)))
6141/// - UpdateR(PrevValue, mul(ext(...), Constant))
6142/// - UpdateR(PrevValue, neg(mul(ext(...), ext(...))))
6143/// - UpdateR(PrevValue, neg(mul(ext(...), Constant)))
6144/// - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
6145/// - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
6146/// - UpdateR(PrevValue, abs(sub(ext(...), ext(...)))
6147///
6148/// Note: The second operand of UpdateR corresponds to \p Op in the examples.
6149static std::optional<ExtendedReductionOperand>
6150matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op,
6151 VPTypeAnalysis &TypeInfo) {
6152 assert(is_contained(UpdateR->operands(), Op) &&
6153 "Op should be operand of UpdateR");
6154
6155 // Try matching an absolute difference operand of the form
6156 // `abs(sub(ext(A), ext(B)))`. This will be later transformed into
6157 // `ext(absolute-difference(A, B))`. This allows us to perform the absolute
6158 // difference on a wider type and get the extend for "free" from the partial
6159 // reduction.
6160 VPValue *X, *Y;
6161 if (Op->hasOneUse() &&
6165 auto *Abs = cast<VPWidenIntrinsicRecipe>(Op);
6166 auto *Sub = cast<VPWidenRecipe>(Abs->getOperand(0));
6167 auto *LHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(0));
6168 auto *RHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(1));
6169 Type *LHSInputType = TypeInfo.inferScalarType(X);
6170 Type *RHSInputType = TypeInfo.inferScalarType(Y);
6171 if (LHSInputType != RHSInputType ||
6172 LHSExt->getOpcode() != RHSExt->getOpcode())
6173 return std::nullopt;
6174 // Note: This is essentially the same as matching ext(...) as we will
6175 // rewrite this operand to ext(absolute-difference(A, B)).
6176 return ExtendedReductionOperand{
6177 Sub,
6178 /*ExtendA=*/{LHSInputType, getPartialReductionExtendKind(LHSExt)},
6179 /*ExtendB=*/{}};
6180 }
6181
6182 std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
6184 auto *CastRecipe = cast<VPWidenCastRecipe>(Op);
6185 VPValue *CastSource = CastRecipe->getOperand(0);
6186 OuterExtKind = getPartialReductionExtendKind(CastRecipe);
6187 if (match(CastSource, m_Mul(m_VPValue(), m_VPValue())) ||
6188 match(CastSource, m_FMul(m_VPValue(), m_VPValue()))) {
6189 // Match: ext(mul(...))
6190 // Record the outer extend kind and set `Op` to the mul. We can then match
6191 // this as a binary operation. Note: We can optimize out the outer extend
6192 // by widening the inner extends to match it. See
6193 // optimizeExtendsForPartialReduction.
6194 Op = CastSource;
6195 // FIXME: createPartialReductionExpression can't handle sub(ext(mul(...)))
6196 if (UpdateR->getOpcode() == Instruction::Sub)
6197 return std::nullopt;
6198 } else if (UpdateR->getOpcode() == Instruction::Add ||
6199 UpdateR->getOpcode() == Instruction::FAdd) {
6200 // Match: UpdateR(PrevValue, ext(...))
6201 // TODO: Remove the add/fadd restriction (we should be able to handle this
6202 // case for sub reductions too).
6203 return ExtendedReductionOperand{
6204 UpdateR,
6205 /*ExtendA=*/{TypeInfo.inferScalarType(CastSource), *OuterExtKind},
6206 /*ExtendB=*/{}};
6207 }
6208 }
6209
6210 if (!Op->hasOneUse())
6211 return std::nullopt;
6212
6213 auto *MulOp = dyn_cast<VPWidenRecipe>(Op);
6214 if (!MulOp ||
6215 !is_contained({Instruction::Mul, Instruction::FMul}, MulOp->getOpcode()))
6216 return std::nullopt;
6217
6218 // The rest of the matching assumes `Op` is a (possibly extended/negated)
6219 // binary operation.
6220
6221 VPValue *LHS = MulOp->getOperand(0);
6222 VPValue *RHS = MulOp->getOperand(1);
6223
6224 // The LHS of the operation must always be an extend.
6226 return std::nullopt;
6227
6228 auto *LHSCast = cast<VPWidenCastRecipe>(LHS);
6229 Type *LHSInputType = TypeInfo.inferScalarType(LHSCast->getOperand(0));
6230 ExtendKind LHSExtendKind = getPartialReductionExtendKind(LHSCast);
6231
6232 // The RHS of the operation can be an extend or a constant integer.
6233 const APInt *RHSConst = nullptr;
6234 VPWidenCastRecipe *RHSCast = nullptr;
6236 RHSCast = cast<VPWidenCastRecipe>(RHS);
6237 else if (!match(RHS, m_APInt(RHSConst)) ||
6238 !canConstantBeExtended(RHSConst, LHSInputType, LHSExtendKind))
6239 return std::nullopt;
6240
6241 // The outer extend kind must match the inner extends for folding.
6242 for (VPWidenCastRecipe *Cast : {LHSCast, RHSCast})
6243 if (Cast && OuterExtKind &&
6244 getPartialReductionExtendKind(Cast) != OuterExtKind)
6245 return std::nullopt;
6246
6247 Type *RHSInputType = LHSInputType;
6248 ExtendKind RHSExtendKind = LHSExtendKind;
6249 if (RHSCast) {
6250 RHSInputType = TypeInfo.inferScalarType(RHSCast->getOperand(0));
6251 RHSExtendKind = getPartialReductionExtendKind(RHSCast);
6252 }
6253
6254 return ExtendedReductionOperand{
6255 MulOp, {LHSInputType, LHSExtendKind}, {RHSInputType, RHSExtendKind}};
6256}
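// Illustrative sketch (not part of this file): source-level shape of the
// abs(sub(ext(...), ext(...))) form matched above, using a hypothetical
// helper. Both extends come from i8, so the absolute difference can be
// computed on the narrower type and the extend obtained from the partial
// reduction itself.
#include <cstdint>
#include <cstdlib>
static int32_t sad_i8(const int8_t *A, const int8_t *B, int N) {
  int32_t Sum = 0;
  for (int I = 0; I < N; ++I)
    Sum += std::abs(int32_t(A[I]) - int32_t(B[I])); // add(phi, abs(sub(sext, sext)))
  return Sum;
}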
6257
6258/// Examines each operation in the reduction chain corresponding to \p RedPhiR,
6259/// and determines if the target can use a cheaper operation with a wider
6260/// per-iteration input VF and narrower PHI VF. If successful, returns the chain
6261/// of operations in the reduction.
6262static std::optional<SmallVector<VPPartialReductionChain>>
6263getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPCostContext &CostCtx,
6264 VFRange &Range) {
6265 // Get the backedge value from the reduction PHI and find the
6266 // ComputeReductionResult that uses it (directly or through a select for
6267 // predicated reductions).
6268 auto *RdxResult = vputils::findComputeReductionResult(RedPhiR);
6269 if (!RdxResult)
6270 return std::nullopt;
6271 VPValue *ExitValue = RdxResult->getOperand(0);
6272 match(ExitValue, m_Select(m_VPValue(), m_VPValue(ExitValue), m_VPValue()));
6273
6274 VPTypeAnalysis &TypeInfo = CostCtx.Types;
6275 SmallVector<VPPartialReductionChain> Chain;
6276 RecurKind RK = RedPhiR->getRecurrenceKind();
6277 Type *PhiType = TypeInfo.inferScalarType(RedPhiR);
6278 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6279
6280 // Work backwards from the ExitValue examining each reduction operation.
6281 VPValue *CurrentValue = ExitValue;
6282 while (CurrentValue != RedPhiR) {
6283 auto *UpdateR = dyn_cast<VPWidenRecipe>(CurrentValue);
6284 if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
6285 return std::nullopt;
6286
6287 VPValue *Op = UpdateR->getOperand(1);
6288 VPValue *PrevValue = UpdateR->getOperand(0);
6289
6290 // Find the extended operand. The other operand (PrevValue) is the next link
6291 // in the reduction chain.
6292 std::optional<ExtendedReductionOperand> ExtendedOp =
6293 matchExtendedReductionOperand(UpdateR, Op, TypeInfo);
6294 if (!ExtendedOp) {
6295 ExtendedOp = matchExtendedReductionOperand(UpdateR, PrevValue, TypeInfo);
6296 if (!ExtendedOp)
6297 return std::nullopt;
6298 std::swap(Op, PrevValue);
6299 }
6300
6301 Type *ExtSrcType = ExtendedOp->ExtendA.SrcType;
6302 TypeSize ExtSrcSize = ExtSrcType->getPrimitiveSizeInBits();
6303 if (!PHISize.hasKnownScalarFactor(ExtSrcSize))
6304 return std::nullopt;
6305
6306 // Record this link of the chain. Whether the target supports it for the
6307 // given VF range (i.e. it does not have an invalid cost) and whether it
6308 // is profitable is checked later, once all chains have been collected.
6309 VPPartialReductionChain Link(
6310 {UpdateR, *ExtendedOp, RK,
6311 PrevValue == UpdateR->getOperand(0) ? 0U : 1U,
6312 static_cast<unsigned>(PHISize.getKnownScalarFactor(ExtSrcSize))});
6313 Chain.push_back(Link);
6314 CurrentValue = PrevValue;
6315 }
6316
6317 // The chain links were collected by traversing backwards from the exit value.
6318 // Reverse the chain so its links are in program order.
6319 std::reverse(Chain.begin(), Chain.end());
6320 return Chain;
6321}
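// Illustrative sketch (not part of this file): how the scale factor recorded
// above relates the reduction PHI type to the extend source type. With an i32
// PHI and i8 extend sources, each PHI lane absorbs 32 / 8 = 4 input elements
// per vector iteration. The hypothetical helper below mirrors
// getKnownScalarFactor for plain scalar bit widths.
#include <cassert>
static unsigned phiToInputScaleFactor(unsigned PhiBits, unsigned SrcBits) {
  assert(SrcBits != 0 && PhiBits % SrcBits == 0 && "no known scalar factor");
  return PhiBits / SrcBits; // e.g. 32 / 8 == 4
}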
6322} // namespace
6323
6325 VPCostContext &CostCtx,
6326 VFRange &Range) {
6327 // Find all possible valid partial reductions, grouping chains by their PHI.
6328 // This grouping allows invalidating the whole chain, if any link is not a
6329 // valid partial reduction.
6331 ChainsByPhi;
6332 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
6333 for (VPRecipeBase &R : HeaderVPBB->phis()) {
6334 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
6335 if (!RedPhiR)
6336 continue;
6337
6338 if (auto Chains = getScaledReductions(RedPhiR, CostCtx, Range))
6339 ChainsByPhi.try_emplace(RedPhiR, std::move(*Chains));
6340 }
6341
6342 if (ChainsByPhi.empty())
6343 return;
6344
6345 // Build set of partial reduction operations for extend user validation and
6346 // a map of reduction bin ops to their scale factors for scale validation.
6347 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
6348 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
6349 for (const auto &[_, Chains] : ChainsByPhi)
6350 for (const VPPartialReductionChain &Chain : Chains) {
6351 PartialReductionOps.insert(Chain.ExtendedOp.ExtendsUser);
6352 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
6353 }
6354
6355 // A partial reduction is invalid if any of its extends are used by
6356 // something that isn't another partial reduction. This is because the
6357 // extends are intended to be lowered along with the reduction itself.
6358 auto ExtendUsersValid = [&](VPValue *Ext) {
6359 return !isa<VPWidenCastRecipe>(Ext) || all_of(Ext->users(), [&](VPUser *U) {
6360 return PartialReductionOps.contains(cast<VPRecipeBase>(U));
6361 });
6362 };
6363
6364 auto IsProfitablePartialReductionChainForVF =
6365 [&](ArrayRef<VPPartialReductionChain> Chain, ElementCount VF) -> bool {
6366 InstructionCost PartialCost = 0, RegularCost = 0;
6367
6368 // The chain is profitable if handling the entire chain with partial
6369 // reductions is cheaper than handling it with regular (widened)
6370 // reductions.
6371 for (const VPPartialReductionChain &Link : Chain) {
6372 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6373 InstructionCost LinkCost = getPartialReductionLinkCost(CostCtx, Link, VF);
6374 if (!LinkCost.isValid())
6375 return false;
6376
6377 PartialCost += LinkCost;
6378 RegularCost += Link.ReductionBinOp->computeCost(VF, CostCtx);
6379 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6380 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6381 RegularCost += ExtendedOp.ExtendsUser->computeCost(VF, CostCtx);
6382 for (VPValue *Op : ExtendedOp.ExtendsUser->operands())
6383 if (auto *Extend = dyn_cast<VPWidenCastRecipe>(Op))
6384 RegularCost += Extend->computeCost(VF, CostCtx);
6385 }
6386 return PartialCost.isValid() && PartialCost < RegularCost;
6387 };
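// Illustrative sketch (not part of this file): the shape of the decision made
// by the lambda above, with hypothetical per-link costs. The regular-cost side
// charges the reduction add plus the multiply and extends that the partial
// reduction would absorb (Mul/ExtB are zero for single-extend links).
#include <cstdint>
struct LinkCosts {
  uint32_t Partial;              // cost of the partial-reduction form
  uint32_t Add, Mul, ExtA, ExtB; // costs of the recipes it would replace
};
static bool chainIsProfitable(const LinkCosts *Links, unsigned NumLinks) {
  uint64_t Partial = 0, Regular = 0;
  for (unsigned I = 0; I != NumLinks; ++I) {
    Partial += Links[I].Partial;
    Regular += Links[I].Add + Links[I].Mul + Links[I].ExtA + Links[I].ExtB;
  }
  return Partial < Regular; // keep the chain only if strictly cheaper
}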
6388
6389 // Validate chains: check that extends are only used by partial reductions,
6390 // and that reduction bin ops are only used by other partial reductions with
6391 // matching scale factors, by users outside the loop region, or by the
6392 // select introduced by tail-folding. Otherwise we would create users of
6393 // scaled reductions where the types of the other operands don't match.
6394 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
6395 for (const VPPartialReductionChain &Chain : Chains) {
6396 if (!all_of(Chain.ExtendedOp.ExtendsUser->operands(), ExtendUsersValid)) {
6397 Chains.clear();
6398 break;
6399 }
6400 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
6401 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
6402 return PhiR == RedPhiR;
6403 auto *R = cast<VPSingleDefRecipe>(U);
6404 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
6406 m_Specific(Chain.ReductionBinOp))) ||
6407 match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
6408 m_Specific(RedPhiR)));
6409 };
6410 if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
6411 Chains.clear();
6412 break;
6413 }
6414
6415 // Check if the compute-reduction-result is used by a sunk store.
6416 // TODO: Also form partial reductions in those cases.
6417 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
6418 if (any_of(RdxResult->users(), [](VPUser *U) {
6419 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
6420 return RepR && RepR->getOpcode() == Instruction::Store;
6421 })) {
6422 Chains.clear();
6423 break;
6424 }
6425 }
6426 }
6427
6428 // Clear the chain if it is not profitable.
6430 [&, &Chains = Chains](ElementCount VF) {
6431 return IsProfitablePartialReductionChainForVF(Chains, VF);
6432 },
6433 Range))
6434 Chains.clear();
6435 }
6436
6437 for (auto &[Phi, Chains] : ChainsByPhi)
6438 for (const VPPartialReductionChain &Chain : Chains)
6439 transformToPartialReduction(Chain, CostCtx.Types, Plan, Phi);
6440}
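// Illustrative sketch (not part of this file): a scalar model of what a formed
// partial reduction computes for a dot product with a scale factor of 4. The
// reduction PHI keeps 4 lanes while each vector iteration consumes 16 narrow
// inputs; only the final sum is observable, not the per-lane assignment of
// products. The hypothetical helper ignores any scalar tail for brevity.
#include <cstdint>
static int32_t dot_partial_model(const int8_t *A, const int8_t *B, int N) {
  int32_t Acc[4] = {0, 0, 0, 0};        // models one <4 x i32> reduction PHI
  for (int I = 0; I + 16 <= N; I += 16) // one vector iteration: 16 i8 inputs
    for (int E = 0; E != 16; ++E)       // four products folded into each lane
      Acc[E / 4] += int32_t(A[I + E]) * int32_t(B[I + E]);
  return Acc[0] + Acc[1] + Acc[2] + Acc[3]; // final reduction of the PHI
}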
6441
6443 VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder) {
6444 // Collect all loads/stores first. We will handle those with simpler
6445 // decisions first, followed by more complex ones whose decisions may be
6446 // guided by, or depend on, the simpler ones.
6447 SmallVector<VPInstruction *> MemOps;
6448 for (VPBasicBlock *VPBB :
6451 for (VPRecipeBase &R : *VPBB) {
6452 auto *VPI = dyn_cast<VPInstruction>(&R);
6453 if (VPI && VPI->getUnderlyingValue() &&
6454 is_contained({Instruction::Load, Instruction::Store},
6455 VPI->getOpcode()))
6456 MemOps.push_back(VPI);
6457 }
6458 }
6459
6460 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
6461 VPBuilder FinalRedStoresBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
6462
6463 for (VPInstruction *VPI : MemOps) {
6464 auto ReplaceWith = [&](VPRecipeBase *New) {
6465 New->insertBefore(VPI);
6466 if (VPI->getOpcode() == Instruction::Load)
6467 VPI->replaceAllUsesWith(New->getVPSingleValue());
6468 VPI->eraseFromParent();
6469 };
6470
6471 // Note: we must do this for the scalar VPlan as well.
6472 if (RecipeBuilder.replaceWithFinalIfReductionStore(VPI,
6473 FinalRedStoresBuilder))
6474 continue;
6475
6476 // Filter out scalar VPlan for the remaining memory operations.
6478 [](ElementCount VF) { return VF.isScalar(); }, Range))
6479 continue;
6480
6481 if (VPHistogramRecipe *Histogram = RecipeBuilder.widenIfHistogram(VPI)) {
6482 ReplaceWith(Histogram);
6483 continue;
6484 }
6485
6486 VPRecipeBase *Recipe = RecipeBuilder.tryToWidenMemory(VPI, Range);
6487 if (!Recipe)
6488 Recipe = RecipeBuilder.handleReplication(VPI, Range);
6489
6490 ReplaceWith(Recipe);
6491 }
6492}
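// Illustrative sketch (not part of this file): the load -> update -> store
// pattern that widenIfHistogram above recognizes, as a hypothetical helper.
// Bucket indices may repeat within a single vector iteration, so a plain
// widened gather/scatter would lose increments; the histogram recipe keeps the
// colliding updates correct.
#include <cstdint>
static void histogram(uint32_t *Counts, const uint16_t *Buckets, int N) {
  for (int I = 0; I < N; ++I)
    Counts[Buckets[I]] += 1; // load Counts[b], add 1, store Counts[b]
}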
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
#define _
iv Induction Variable Users
Definition IVUsers.cpp:48
iv users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:81
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool sinkScalarOperands(VPlan &Plan)
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static VPValue * cloneBinOpForScalarIV(VPWidenRecipe *BinOp, VPValue *ScalarIV, VPWidenIntOrFpInductionRecipe *WidenIV)
Create a scalar version of BinOp, with its WidenIV operand replaced by ScalarIV, and place it after S...
static VPWidenIntOrFpInductionRecipe * getExpressionIV(VPValue *V)
Check if V is a binary expression of a widened IV and a loop-invariant value.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with a EVL-based IV, fixup recipes that use VF to use the EVL instea...
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx, bool IsScalable)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static std::optional< ElementCount > isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ArrayRef< ElementCount > VFs, VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI)
Returns VF from VFs if IR is a full interleave group with factor and number of members both equal to ...
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectGroupedReplicateMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L, function_ref< bool(VPReplicateRecipe *)> FilterFn)
Collect either replicated Loads or Stores grouped by their address SCEV, in a deep-traversal of the v...
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute the end value for WideIV, unless it is truncated.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static void expandVPDerivedIV(VPDerivedIVRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPDerivedIVRecipe into executable recipes.
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static void recursivelyDeleteDeadRecipes(VPValue *V)
static void reassociateHeaderMask(VPlan &Plan)
Reassociate (headermask && x) && y -> headermask && (x && y) to allow the header mask to be simplifie...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations using scoped noalias metadata,...
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPRegionBlock *ParentRegion, VPlan &Plan)
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool canNarrowOps(ArrayRef< VPValue * > Ops, bool IsScalable)
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static void narrowToSingleScalarRecipes(VPlan &Plan)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1054
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1535
APInt abs() const
Get the absolute value.
Definition APInt.h:1818
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:1027
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getMinValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the minimum integer value with the given bit width and signedness.
Definition APSInt.h:310
static APSInt getMaxValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the maximum integer value with the given bit width and signedness.
Definition APSInt.h:302
@ NoAlias
The two locations do not alias at all.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
iterator begin() const
Definition ArrayRef.h:130
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This class represents a range of values.
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:215
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:333
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
static LLVM_ABI InductionDescriptor getCanonicalIntInduction(Type *Ty, ScalarEvolution &SE)
Returns the canonical integer induction for type Ty with start = 0 and step = 1.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
The group of interleaved loads/stores sharing the same stride and close to each other.
auto members() const
Return an iterator range over the non-null members of this group, in index order.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1686
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1080
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
bool empty() const
Definition MapVector.h:77
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:116
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
Post-order traversal of a graph.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(SCEVUse SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getUDivExpr(SCEVUse LHS, SCEVUse RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
LLVM_ABI bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
static LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
@ SK_Broadcast
Broadcast element 0 to all other elements.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
op_range operands()
Definition User.h:267
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3787
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4154
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4229
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4181
iterator end()
Definition VPlan.h:4191
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4189
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4242
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:233
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:566
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:645
const VPRecipeBase & back() const
Definition VPlan.h:4203
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2775
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2811
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2801
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2817
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2797
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:97
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccss as successor of this VPBlockBase.
Definition VPlan.h:318
VPRegionBlock * getParent()
Definition VPlan.h:189
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:203
size_t getNumSuccessors() const
Definition VPlan.h:240
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:309
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:225
VPlan * getPlan()
Definition VPlan.cpp:178
const std::string & getName() const
Definition VPlan.h:180
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:328
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:236
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:183
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:282
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:230
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:214
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:313
static bool isLatch(const VPBlockBase *VPB, const VPDominatorTree &VPDT)
Returns true if VPB is a loop latch, using isHeader().
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:214
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:232
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:250
static auto blocksOnly(T &&Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:286
static void transferSuccessors(VPBlockBase *Old, VPBlockBase *New)
Transfer successors from Old to New. New must have no successors.
Definition VPlanUtils.h:270
static SmallVector< VPBasicBlock * > blocksInSingleSuccessorChainBetween(VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Returns the blocks between FirstBB and LastBB, where FirstBB to LastBB forms a single-sucessor chain.
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3271
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createAnyOfReduction(VPValue *ChainOp, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown())
Create an AnyOf reduction pattern: or-reduce ChainOp, freeze the result, then select between TrueVal ...
Definition VPlan.cpp:1671
VPInstruction * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step)
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRMetadata &Metadata={})
VPWidenPHIRecipe * createWidenPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
A recipe for generating the phi node tracking the current scalar iteration index.
Definition VPlan.h:3819
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:498
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:471
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:483
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:493
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3903
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3316
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2295
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2337
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2326
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition VPlan.h:2040
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4307
Class to record and manage LLVM IR flags.
Definition VPlan.h:687
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \I , if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1167
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1222
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1319
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1262
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1313
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1257
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1254
@ CanonicalIVIncrementForPart
Definition VPlan.h:1238
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1265
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2912
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2904
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2933
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2985
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2943
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3458
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:405
VPBasicBlock * getParent()
Definition VPlan.h:479
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:553
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPHistogramRecipe * widenIfHistogram(VPInstruction *VPI)
If VPI represents a histogram operation (as determined by LoopVectorizationLegality) make that safe f...
VPRecipeBase * tryToWidenMemory(VPInstruction *VPI, VFRange &Range)
Check if the load or store instruction VPI should widened for Range.Start and potentially masked.
bool replaceWithFinalIfReductionStore(VPInstruction *VPI, VPBuilder &FinalRedStoresBuilder)
If VPI is a store of a reduction into an invariant address, delete it.
VPReplicateRecipe * handleReplication(VPInstruction *VPI, VFRange &Range)
Build a VPReplicationRecipe for VPI.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3145
A recipe for handling reduction phis.
Definition VPlan.h:2681
void setVFScaleFactor(unsigned ScaleFactor)
Set the VFScaleFactor for this reduction phi.
Definition VPlan.h:2728
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2721
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2739
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3036
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4364
const VPBlockBase * getEntry() const
Definition VPlan.h:4408
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4440
VPInstruction * getOrCreateCanonicalIVIncrement()
Get the canonical IV increment instruction if it exists.
Definition VPlan.cpp:881
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4425
Type * getCanonicalIVType() const
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4484
bool hasCanonicalIVNUW() const
Indicates if NUW is set for the canonical IV increment, for loop regions.
Definition VPlan.h:4489
void clearCanonicalIVNUW(VPInstruction *Increment)
Unsets NUW for the canonical IV increment Increment, for loop regions.
Definition VPlan.h:4492
VPRegionValue * getCanonicalIV()
Return the canonical induction variable of the region, null for replicating regions.
Definition VPlan.h:4476
const VPBlockBase * getExiting() const
Definition VPlan.h:4420
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4433
VPValues defined by a VPRegionBlock, like the canonical IV.
Definition VPlanValue.h:209
DebugLoc getDebugLoc() const
Returns the debug location of the VPRegionValue.
Definition VPlanValue.h:228
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3190
bool isSingleScalar() const
Definition VPlan.h:3231
bool isPredicated() const
Definition VPlan.h:3233
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3255
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3974
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:605
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:672
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:329
operand_range operands()
Definition VPlanValue.h:397
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:373
unsigned getNumOperands() const
Definition VPlanValue.h:367
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:368
void addOperand(VPValue *Operand)
Definition VPlanValue.h:362
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:49
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:138
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1496
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:128
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:74
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:202
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1499
unsigned getNumUsers() const
Definition VPlanValue.h:113
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1505
user_range users()
Definition VPlanValue.h:155
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2146
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1832
Instruction::CastOps getOpcode() const
Definition VPlan.h:1870
A recipe for handling GEP instructions.
Definition VPlan.h:2082
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2361
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2389
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2392
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2412
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2443
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2490
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2494
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2521
A recipe for widening vector intrinsics.
Definition VPlan.h:1884
A common base class for widening memory operations.
Definition VPlan.h:3501
A recipe for widened phis.
Definition VPlan.h:2579
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1776
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
VPWidenRecipe * clone() override
Clone the current recipe.
Definition VPlan.h:1796
unsigned getOpcode() const
Definition VPlan.h:1813
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition VPlan.h:4512
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4828
bool hasVF(ElementCount VF) const
Definition VPlan.h:4726
const DataLayout & getDataLayout() const
Definition VPlan.h:4708
LLVMContext & getContext() const
Definition VPlan.h:4704
VPBasicBlock * getEntry()
Definition VPlan.h:4604
bool hasScalableVF() const
Definition VPlan.h:4727
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4663
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4684
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4733
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:4799
VPSymbolicValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4702
VPIRValue * getAllOnesValue(Type *Ty)
Return a VPIRValue wrapping the AllOnes value of type Ty.
Definition VPlan.h:4805
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4877
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4831
bool hasUF(unsigned UF) const
Definition VPlan.h:4751
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4653
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4692
VPValue * getBackedgeTakenCount() const
Definition VPlan.h:4689
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4776
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:4802
void setVF(ElementCount VF)
Definition VPlan.h:4714
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4767
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1096
unsigned getConcreteUF() const
Returns the concrete UF of the plan, after unrolling.
Definition VPlan.h:4754
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4677
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4629
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4854
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:4796
VPBasicBlock * getVectorPreheader() const
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4609
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4699
bool hasScalarVFOnly() const
Definition VPlan.h:4744
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4643
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4695
void setUF(unsigned UF)
Definition VPlan.h:4759
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop, i.e.
Definition VPlan.h:4909
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1244
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4810
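A hedged, in-tree sketch of the live-in helpers above (the function getLiveInConstant is illustrative): wrap an IR constant as a VPlan live-in. getOrAddLiveIn deduplicates, so repeated calls for the same Value return the same VPIRValue; getConstantInt is the documented shorthand for the integer case.
// Illustrative only; assumes llvm/IR/Constants.h and the internal VPlan.h.
static VPValue *getLiveInConstant(VPlan &Plan, Type *Ty, uint64_t Val) {
  Value *C = ConstantInt::get(Ty, Val); // IR-level constant
  return Plan.getOrAddLiveIn(C);        // reused if already a live-in
}
// Equivalent shorthand for integers: Plan.getConstantInt(Ty, Val).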
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:261
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
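A self-contained example of the FixedOrScalableQuantity helpers above, using the public llvm/Support/TypeSize.h API (the function name is illustrative):
#include "llvm/Support/TypeSize.h"
#include <cassert>

static void elementCountExample() {
  llvm::ElementCount Fixed8 = llvm::ElementCount::getFixed(8);
  llvm::ElementCount Scal4 = llvm::ElementCount::getScalable(4);

  assert(!Fixed8.isScalable() && Scal4.isScalable());
  assert(Scal4.getKnownMinValue() == 4);

  // <vscale x 8 x ...> is <vscale x 4 x ...> scaled by a known factor of 2.
  llvm::ElementCount Scal8 = Scal4.multiplyCoefficientBy(2);
  assert(Scal8.hasKnownScalarFactor(Scal4));
  assert(Scal8.getKnownScalarFactor(Scal4) == 2);
  assert(llvm::ElementCount::isKnownLT(Scal4, Scal8));
}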
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2814
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
auto m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ? R : false.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
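A self-contained example of the IR-level llvm::PatternMatch combinators listed above (the helper name is illustrative): recognise "(X + C) * X", binding X and the constant C.
#include "llvm/ADT/APInt.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

static bool matchAddTimesSameValue(llvm::Value *V, llvm::Value *&X,
                                   const llvm::APInt *&C) {
  using namespace llvm::PatternMatch;
  // m_Value(X) binds the add's first operand; m_Deferred(X) then requires
  // the second multiplicand to be that same value.
  return match(V, m_Mul(m_Add(m_Value(X), m_APInt(C)), m_Deferred(X)));
}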
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
bool match(const SCEV *S, const Pattern &P)
SCEVAffineAddRec_match< Op0_t, Op1_t, match_isa< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
auto m_WidenAnyExtend(const Op0_t &Op0)
match_bind< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
auto m_VPPhi(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
auto m_WidenIntrinsic(const T &...Ops)
VPInstruction_match< VPInstruction::ExitingIVValue, Op0_t > m_ExitingIVValue(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
match_bind< VPSingleDefRecipe > m_VPSingleDefRecipe(VPSingleDefRecipe *&V)
Match a VPSingleDefRecipe, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
auto m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, without matching its operands, as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
match_bind< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
auto m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
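A hedged, in-tree sketch of the VPlanPatternMatch helpers above (assuming the internal VPlanPatternMatch.h header, as this file does; the helper name is illustrative): test whether a VPValue is produced by a Broadcast, without inspecting its operand.
// Illustrative only.
static bool isBroadcast(VPValue *V) {
  using namespace llvm::VPlanPatternMatch;
  // m_VPValue() matches any operand and ignores it.
  return match(V, m_Broadcast(m_VPValue()));
}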
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
bool cannotHoistOrSinkRecipe(const VPRecipeBase &R, bool Sinking=false)
Return true if we do not know how to (mechanically) hoist or sink R.
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
VPInstruction * findCanonicalIVIncrement(VPlan &Plan)
Find the canonical IV increment of Plan's vector loop region.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:115
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
static VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
Definition VPlanUtils.h:136
bool isUniformAcrossVFsAndUFs(const VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
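A hedged, in-tree sketch (the helper name is illustrative; assumes the internal VPlanUtils.h header and this file's using namespace llvm): the vputils predicates above are typically used as guards before narrowing a definition to a single scalar lane.
static bool needsOnlyFirstLane(const VPValue *Def) {
  // Either all lanes carry the same value, or only lane 0 is ever read.
  return vputils::isSingleScalar(Def) || vputils::onlyFirstLaneUsed(Def);
}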
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
@ Offset
Definition DWP.cpp:557
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2179
void stable_sort(R &&Range)
Definition STLExtras.h:2115
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2077
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1668
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:840
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
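A self-contained example of the range helpers referenced above, all_of and enumerate (the function names are illustrative):
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

static bool allPositive(const llvm::SmallVectorImpl<int> &Vals) {
  return llvm::all_of(Vals, [](int V) { return V > 0; });
}

static unsigned countPositiveAtEvenIndex(const llvm::SmallVectorImpl<int> &Vals) {
  unsigned Count = 0;
  // enumerate pairs each element with its index, avoiding a manual counter.
  for (auto [Idx, V] : llvm::enumerate(Vals))
    if (Idx % 2 == 0 && V > 0)
      ++Count;
  return Count;
}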
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:265
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
SmallVector< VPRegisterUsage, 8 > calculateRegisterUsageForPlan(VPlan &Plan, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Estimate the register usage for Plan and vectorization factors in VFs by calculating the highest numb...
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1151
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
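A self-contained example of the arithmetic helpers referenced in this list, PowerOf2Ceil here and RoundingUDiv earlier (the function name is illustrative):
#include "llvm/ADT/APInt.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

static void roundingExample() {
  // The power of two greater than or equal to 17 is 32.
  assert(llvm::PowerOf2Ceil(17) == 32);

  // 10 unsigned-divided by 4, rounded up, is 3.
  llvm::APInt A(/*numBits=*/32, 10), B(/*numBits=*/32, 4);
  assert(llvm::APIntOps::RoundingUDiv(A, B, llvm::APInt::Rounding::UP) == 3);
}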
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
UncountableExitStyle
Different methods of handling early exits.
Definition VPlan.h:82
@ ReadOnly
No side effects to worry about, so we can process any uncountable exits in the loop and branch either...
Definition VPlan.h:87
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:551
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant C can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1891
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2011
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2087
ArrayRef(const T &OneElt) -> ArrayRef< T >
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1408
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2165
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2145
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:789
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2663
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
TargetTransformInfo::TargetCostKind CostKind
VPTypeAnalysis Types
const TargetTransformInfo & TTI
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:240
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:142
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
InstructionCost spillCost(const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, unsigned OverrideMaxNumRegs=0) const
Calculate the estimated cost of any spills due to using more registers than the number available for ...
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:280
bool isMaterialized() const
Returns true if this symbolic value has been materialized.
Definition VPlanValue.h:288
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3620
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3580
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3704
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3661
static void handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, UncountableExitStyle Style)
Update Plan to account for uncountable early exits by introducing appropriate branching logic in the ...
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder)
Convert load/store VPInstructions in Plan into widened or replicate recipes.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, const bool &EpilogueAllowed)
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL is known to be <= VF,...
static void removeBranchOnConst(VPlan &Plan, bool OnlyLatches=false)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void replaceWideCanonicalIVWithWideIV(VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, ElementCount VF, unsigned UF, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Replace a VPWidenCanonicalIVRecipe if it is present in Plan, with a VPWidenIntOrFpInductionRecipe,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step, std::optional< uint64_t > MaxRuntimeStep=std::nullopt)
Materialize vector trip count computations to a set of VPInstructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static bool mergeBlocksIntoPredecessors(VPlan &Plan)
Remove redundant VPBasicBlocks by merging them into their single predecessor if the latter has a sing...
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replaces all uses of the canonical ...
static void adjustFirstOrderRecurrenceMiddleUsers(VPlan &Plan, VFRange &Range)
Adjust first-order recurrence users in the middle block: create penultimate element extracts for LCSS...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...
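A hedged, in-tree sketch built only from the entry points documented above (the ordering and the helper name runBasicCleanup are illustrative, not the actual pipeline): a minimal simplify-then-cleanup sequence over a VPlan.
// Illustrative only; assumes VPlanTransforms.h as included by this file.
static void runBasicCleanup(VPlan &Plan) {
  VPlanTransforms::simplifyRecipes(Plan);              // instcombine-like rewrites
  VPlanTransforms::cse(Plan);                          // common-subexpression elimination
  VPlanTransforms::removeDeadRecipes(Plan);            // drop now-unused recipes
  VPlanTransforms::removeBranchOnConst(Plan);          // fold constant branches
  VPlanTransforms::mergeBlocksIntoPredecessors(Plan);  // clean up the resulting CFG
}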