LLVM 23.0.0git
VPlanTransforms.cpp
Go to the documentation of this file.
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/TypeSwitch.h"
32#include "llvm/Analysis/Loads.h"
39#include "llvm/IR/Intrinsics.h"
40#include "llvm/IR/MDBuilder.h"
41#include "llvm/IR/Metadata.h"
46
47using namespace llvm;
48using namespace VPlanPatternMatch;
49using namespace SCEVPatternMatch;
50
52 VPlan &Plan, const TargetLibraryInfo &TLI) {
// Replaces VPPhi / VPInstruction ingredients that carry an underlying IR value
// with widened or replicating recipes. Returns false as soon as a call is seen
// that has no vector intrinsic ID or that is a noalias.scope.decl; returns
// true otherwise.
// NOTE(review): the declaration line, the header of the block traversal loop
// and the definitions of `Inst` and `GEP` fall on lines that are not part of
// this listing -- confirm against the full file.
53
55 Plan.getVectorLoopRegion());
57 // A block without a parent is outside any region; stop at the first one.
58 if (!VPBB->getParent())
59 break;
60 VPRecipeBase *Term = VPBB->getTerminator();
61 auto EndIter = Term ? Term->getIterator() : VPBB->end();
62 // Introduce each ingredient into VPlan. The terminator, if any, is excluded
// from the range and therefore left untouched.
63 for (VPRecipeBase &Ingredient :
64 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
65
66 VPValue *VPV = Ingredient.getVPSingleValue();
// Ingredients without an underlying IR value are kept as they are.
67 if (!VPV->getUnderlyingValue())
68 continue;
69
71
72 VPRecipeBase *NewRecipe = nullptr;
73 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
74 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
75 NewRecipe = new VPWidenPHIRecipe(PhiR->operands(), PhiR->getDebugLoc(),
76 Phi->getName());
77 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
78 assert(!isa<PHINode>(Inst) && "phis should be handled above");
79 // Create VPWidenMemoryRecipe for loads and stores. Both are created
// unmasked and non-consecutive.
80 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
81 NewRecipe = new VPWidenLoadRecipe(
82 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
83 false /*Consecutive*/, *VPI, Ingredient.getDebugLoc());
84 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
85 NewRecipe = new VPWidenStoreRecipe(
86 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
87 nullptr /*Mask*/, false /*Consecutive*/, *VPI,
88 Ingredient.getDebugLoc());
90 NewRecipe = new VPWidenGEPRecipe(GEP->getSourceElementType(),
91 Ingredient.operands(), *VPI,
92 Ingredient.getDebugLoc(), GEP);
93 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
94 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
// Calls without a vector intrinsic ID make the whole conversion fail.
95 if (VectorID == Intrinsic::not_intrinsic)
96 return false;
97
98 // The noalias.scope.decl intrinsic declares a noalias scope that
99 // is valid for a single iteration. Emitting it as a single-scalar
100 // replicate would incorrectly extend the scope across multiple
101 // original iterations packed into one vector iteration.
102 // FIXME: If we want to vectorize this loop, then we have to drop
103 // all the associated !alias.scope and !noalias.
104 if (VectorID == Intrinsic::experimental_noalias_scope_decl)
105 return false;
106
107 // These intrinsics are recognized by getVectorIntrinsicIDForCall
108 // but are not widenable. Emit them as replicate instead of widening.
109 if (VectorID == Intrinsic::assume ||
110 VectorID == Intrinsic::lifetime_end ||
111 VectorID == Intrinsic::lifetime_start ||
112 VectorID == Intrinsic::sideeffect ||
113 VectorID == Intrinsic::pseudoprobe) {
114 // If the operand of llvm.assume holds before vectorization, it will
115 // also hold per lane.
116 // llvm.pseudoprobe requires to be duplicated per lane for accurate
117 // sample count.
// Hence only assume and pseudoprobe are replicated per lane; the
// remaining intrinsics of this group become single-scalar replicates.
118 const bool IsSingleScalar = VectorID != Intrinsic::assume &&
119 VectorID != Intrinsic::pseudoprobe;
120 NewRecipe = new VPReplicateRecipe(CI, Ingredient.operands(),
121 /*IsSingleScalar=*/IsSingleScalar,
122 /*Mask=*/nullptr, *VPI, *VPI,
123 Ingredient.getDebugLoc());
124 } else {
// The last operand of the ingredient is dropped when building the
// widened intrinsic (presumably the called function -- confirm).
125 NewRecipe = new VPWidenIntrinsicRecipe(
126 *CI, VectorID, drop_end(Ingredient.operands()), CI->getType(),
127 VPIRFlags(*CI), *VPI, CI->getDebugLoc());
128 }
129 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
130 NewRecipe = new VPWidenCastRecipe(
131 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
132 VPIRFlags(*CI), VPIRMetadata(*CI));
133 } else {
// Everything else is widened generically.
134 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
135 *VPI, Ingredient.getDebugLoc());
136 }
137 } else {
// Neither a VPPhi nor a VPInstruction: leave the ingredient in place.
139 "inductions must be created earlier");
140 continue;
141 }
142
// Swap the ingredient for the new recipe, redirecting users when the new
// recipe defines a value.
143 NewRecipe->insertBefore(&Ingredient);
144 if (NewRecipe->getNumDefinedValues() == 1)
145 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
146 else
147 assert(NewRecipe->getNumDefinedValues() == 0 &&
148 "Only recpies with zero or one defined values expected");
149 Ingredient.eraseFromParent();
150 }
151 }
152 return true;
153}
154
155/// Helper for extra no-alias checks via known-safe recipe and SCEV.
/// NOTE(review): the line opening this definition and the declaration of
/// `ExcludeRecipes` are not part of this listing.
158 VPReplicateRecipe &GroupLeader;
// PSE and L stay null when the object is built from a group leader only.
159 PredicatedScalarEvolution *PSE = nullptr;
160 const Loop *L = nullptr;
161
162 // Return true if \p A and \p B are known to not alias for all VFs in the
163 // plan, checked via the distance between the accesses
164 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
// Only store/store pairs are handled.
165 if (A->getOpcode() != Instruction::Store ||
166 B->getOpcode() != Instruction::Store)
167 return false;
168
// Without PSE or a loop no distance can be computed; the result is then
// true only when A and B are the same recipe.
169 if (!PSE || !L)
170 return A == B;
171
// Operand 1 of a store is used as its address here.
172 VPValue *AddrA = A->getOperand(1);
173 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, *PSE, L);
174 VPValue *AddrB = B->getOperand(1);
175 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, *PSE, L);
177 return false;
178
// The difference of the two addresses must be a compile-time constant.
179 const APInt *Distance;
180 ScalarEvolution &SE = *PSE->getSE();
181 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
182 return false;
183
184 const DataLayout &DL = SE.getDataLayout();
185 Type *TyA = A->getOperand(0)->getScalarType();
186 uint64_t SizeA = DL.getTypeStoreSize(TyA);
187 Type *TyB = B->getOperand(0)->getScalarType();
188 uint64_t SizeB = DL.getTypeStoreSize(TyB);
189
190 // Use the maximum store size to ensure no overlap from either direction.
191 // Currently only handles fixed sizes, as it is only used for
192 // replicating VPReplicateRecipes.
193 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
194
195 auto VFs = B->getParent()->getPlan()->vectorFactors();
// NOTE(review): `MaxVF` is defined on a line missing from this listing;
// presumably the largest element of VFs -- confirm.
197 if (MaxVF.isScalable())
198 return false;
// No-alias holds when |Distance| >= MaxVF * MaxStoreSize.
199 return Distance->abs().uge(
200 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
201 }
202
203public:
206 const Loop &L)
207 : ExcludeRecipes(ExcludeRecipes.begin(), ExcludeRecipes.end()),
208 GroupLeader(GroupLeader), PSE(&PSE), L(&L) {}
209
// Variant without SCEV information: PSE and L keep their null defaults.
210 SinkStoreInfo(VPReplicateRecipe &GroupLeader) : GroupLeader(GroupLeader) {}
211
212 /// Return true if \p R should be skipped during alias checking, either
213 /// because it's in the exclude set or because no-alias can be proven via
214 /// SCEV.
215 bool shouldSkip(VPRecipeBase &R) const {
// Store is null when R is not a VPReplicateRecipe; the distance check is
// only attempted for replicate recipes.
216 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
217 return ExcludeRecipes.contains(Store) ||
218 (Store && isNoAliasViaDistance(Store, &GroupLeader));
219 }
220};
221
222/// Check if a memory operation doesn't alias with memory operations using
223/// scoped noalias metadata, in blocks in the single-successor chain between \p
224/// FirstBB and \p LastBB. If \p SinkInfo is std::nullopt, only recipes that may
225/// write to memory are checked (for load hoisting). Otherwise recipes that
226/// read or write memory are checked, and SCEV is used to prove no-alias
227/// between the group leader and other replicate recipes (for store sinking).
/// Returns false conservatively when a checked recipe has no memory location.
/// NOTE(review): the line carrying the function name, the block range of the
/// outer loop, and the lines computing `Loc` and performing the alias test
/// are not part of this listing.
228static bool
230 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
231 std::optional<SinkStoreInfo> SinkInfo = {}) {
// Reads only matter when sinking stores, i.e. when SinkInfo is provided.
232 bool CheckReads = SinkInfo.has_value();
233 for (VPBasicBlock *VPBB :
235 for (VPRecipeBase &R : *VPBB) {
236 if (SinkInfo && SinkInfo->shouldSkip(R))
237 continue;
238
239 // Skip recipes that don't need checking: those that do not write and,
// when reads are checked, do not read either.
240 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
241 continue;
242
244 if (!Loc)
245 // Conservatively assume aliasing for memory operations without
246 // location.
247 return false;
248
250 return false;
251 }
252 }
253 return true;
254}
255
256/// Get the value type of the replicate load or store. \p IsLoad indicates
257/// whether it is a load. For a load the scalar type of the recipe itself is
/// returned; for a store the scalar type of operand 0.
/// NOTE(review): the signature line is not part of this listing.
259 return (IsLoad ? R : R->getOperand(0))->getScalarType();
260}
261
262/// Collect either replicated Loads or Stores grouped by their address SCEV and
263/// their load-store type, in a deep-traversal of the vector loop region in \p
264/// Plan. Only recipes accepted by \p FilterFn are considered, recipes whose
/// address SCEV cannot be computed are dropped, and each returned group is
/// sorted by dominance.
/// NOTE(review): the line carrying the return type and function name, the
/// declaration of the map type, the block loop header and the sort call are
/// not part of this listing.
265template <unsigned Opcode>
268 VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
269 function_ref<bool(VPReplicateRecipe *)> FilterFn) {
270 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
271 "Only Load and Store opcodes supported");
272 constexpr bool IsLoad = (Opcode == Instruction::Load);
275 RecipesByAddressAndType;
278 for (VPRecipeBase &R : *VPBB) {
// Keep only replicate recipes with the requested opcode that pass the
// caller's filter.
279 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
280 if (!RepR || RepR->getOpcode() != Opcode || !FilterFn(RepR))
281 continue;
282
283 // For loads, operand 0 is address; for stores, operand 1 is address.
284 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
285 const Type *LoadStoreTy = getLoadStoreValueType(RepR, IsLoad);
286 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
// Recipes whose address has no computable SCEV are not grouped.
287 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
288 RecipesByAddressAndType[{AddrSCEV, LoadStoreTy}].push_back(RepR);
289 }
290 }
291 auto Groups = to_vector(RecipesByAddressAndType.values());
292 VPDominatorTree VPDT(Plan);
293 for (auto &Group : Groups) {
294 // Sort mem ops by dominance order, with earliest (most dominating) first.
296 return VPDT.properlyDominates(A, B);
297 });
298 }
299 return Groups;
300}
301
// Sinks replicate / scalar-IV-steps recipes that feed recipes inside replicate
// regions into the block of their users, transitively following operands of
// sunk recipes. Returns true if at least one recipe was moved.
// NOTE(review): the declaration of `WorkList`, the candidate-kind check, the
// region loop header and the start of the clone construction fall on lines
// that are not part of this listing.
302static bool sinkScalarOperands(VPlan &Plan) {
303 auto Iter = vp_depth_first_deep(Plan.getEntry());
304 bool ScalarVFOnly = Plan.hasScalarVFOnly();
305 bool Changed = false;
306
// Queues the recipe defining Op for sinking into SinkTo, unless one of the
// checks below rejects it.
308 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
309 VPBasicBlock *SinkTo, VPValue *Op) {
310 auto *Candidate =
311 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
312 if (!Candidate)
313 return;
314
315 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
316 // for now.
318 return;
319
// Nothing to do if the candidate already lives in SinkTo, or if it must
// not be sunk.
320 if (Candidate->getParent() == SinkTo ||
321 vputils::cannotHoistOrSinkRecipe(*Candidate, /*Sinking=*/true))
322 return;
323
// Single-scalar replicate recipes are only sunk in scalar-VF-only plans.
324 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
325 if (!ScalarVFOnly && RepR->isSingleScalar())
326 return;
327
328 WorkList.insert({SinkTo, Candidate});
329 };
330
331 // First, collect the operands of all recipes in replicate blocks as seeds for
332 // sinking.
334 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
335 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
336 continue;
// Only handle regions whose first entry successor leads directly to the
// region's exiting block.
337 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
338 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
339 continue;
340 for (auto &Recipe : *VPBB)
341 for (VPValue *Op : Recipe.operands())
342 InsertIfValidSinkCandidate(VPBB, Op);
343 }
344
345 // Try to sink each replicate or scalar IV steps recipe in the worklist. The
// worklist may grow while it is processed, hence the index-based loop.
346 for (unsigned I = 0; I != WorkList.size(); ++I) {
347 VPBasicBlock *SinkTo;
348 VPSingleDefRecipe *SinkCandidate;
349 std::tie(SinkTo, SinkCandidate) = WorkList[I];
350
351 // All recipe users of SinkCandidate must be in the same block SinkTo or all
352 // users outside of SinkTo must only use the first lane of SinkCandidate. In
353 // the latter case, we need to duplicate SinkCandidate.
354 auto UsersOutsideSinkTo =
355 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
356 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
357 });
358 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
359 return !U->usesFirstLaneOnly(SinkCandidate);
360 }))
361 continue;
362 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
363
364 if (NeedsDuplicating) {
// Duplication is not attempted for scalar-VF-only plans.
365 if (ScalarVFOnly)
366 continue;
367 VPSingleDefRecipe *Clone;
368 if (auto *SinkCandidateRepR =
369 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
370 // TODO: Handle converting to uniform recipes as separate transform,
371 // then cloning should be sufficient here.
373 SinkCandidateRepR->getOpcode(), SinkCandidate->operands(),
374 /*Mask=*/nullptr, *SinkCandidateRepR, *SinkCandidateRepR,
375 SinkCandidate->getDebugLoc(), SinkCandidate->getUnderlyingInstr());
376 // TODO: add ".cloned" suffix to name of Clone's VPValue.
377 } else {
378 Clone = SinkCandidate->clone();
379 }
380
// The clone stays at the original position and serves all users outside
// SinkTo.
381 Clone->insertBefore(SinkCandidate);
382 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
383 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
384 });
385 }
// Move the candidate and queue its operands as further candidates.
386 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
387 for (VPValue *Op : SinkCandidate->operands())
388 InsertIfValidSinkCandidate(SinkTo, Op);
389 Changed = true;
390 }
391 return Changed;
392}
393
394/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
395/// the mask. Returns nullptr when the entry is not a VPBasicBlock holding
/// exactly one recipe, or when that recipe is not a VPBranchOnMaskRecipe.
/// NOTE(review): the signature line is not part of this listing.
397 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
398 if (!EntryBB || EntryBB->size() != 1 ||
399 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
400 return nullptr;
401
// The mask is operand 0 of the branch-on-mask recipe.
402 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
403}
404
405/// If \p R is a triangle region, return the 'then' block of the triangle.
/// Returns nullptr if the entry block does not have two VPBasicBlock
/// successors of which exactly one has the other as its single successor.
/// NOTE(review): the signature line is not part of this listing.
407 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
408 if (EntryBB->getNumSuccessors() != 2)
409 return nullptr;
410
411 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
412 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
413 if (!Succ0 || !Succ1)
414 return nullptr;
415
// Between them, the two successors must have exactly one outgoing edge.
416 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
417 return nullptr;
// The 'then' block is the successor that flows into the other one.
418 if (Succ0->getSingleSuccessor() == Succ1)
419 return Succ0;
420 if (Succ1->getSingleSuccessor() == Succ0)
421 return Succ1;
422 return nullptr;
423}
424
425// Merge replicate regions in their successor region, if a replicate region
426// is connected to a successor replicate region with the same predicate by a
427// single, empty VPBasicBlock. Returns true if at least one region was merged.
// NOTE(review): the signature line, the declaration of `WorkList` and the
// header of the collection loop are not part of this listing.
429 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
430
431 // Collect replicate regions followed by an empty block, followed by another
432 // replicate region with matching masks to process up front. This is to avoid
433 // iterator invalidation issues while merging regions.
436 vp_depth_first_deep(Plan.getEntry()))) {
437 if (!Region1->isReplicator())
438 continue;
439 auto *MiddleBasicBlock =
440 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
441 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
442 continue;
443
444 auto *Region2 =
445 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
446 if (!Region2 || !Region2->isReplicator())
447 continue;
448
// Both regions must be guarded by the very same mask value.
449 VPValue *Mask1 = getPredicatedMask(Region1);
450 VPValue *Mask2 = getPredicatedMask(Region2);
451 if (!Mask1 || Mask1 != Mask2)
452 continue;
453
454 assert(Mask1 && Mask2 && "both region must have conditions");
455 WorkList.push_back(Region1);
456 }
457
458 // Move recipes from Region1 to its successor region, if both are triangles.
459 for (VPRegionBlock *Region1 : WorkList) {
460 if (TransformedRegions.contains(Region1))
461 continue;
462 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
463 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
464
465 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
466 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
467 if (!Then1 || !Then2)
468 continue;
469
470 // Note: No fusion-preventing memory dependencies are expected in either
471 // region. Such dependencies should be rejected during earlier dependence
472 // checks, which guarantee accesses can be re-ordered for vectorization.
473 //
474 // Move recipes to the successor region. Iterating in reverse while always
// inserting at the first non-phi position keeps their relative order.
475 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
476 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
477
478 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
479 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
480
481 // Move VPPredInstPHIRecipes from the merge block to the successor region's
482 // merge block. Update all users inside the successor region to use the
483 // original values.
484 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
485 VPValue *PredInst1 =
486 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
487 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
488 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
489 return cast<VPRecipeBase>(&U)->getParent() == Then2;
490 });
491
492 // Remove phi recipes that are unused after merging the regions.
493 if (Phi1ToMove.getVPSingleValue()->user_empty()) {
494 Phi1ToMove.eraseFromParent();
495 continue;
496 }
497 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
498 }
499
500 // Remove the dead recipes in Region1's entry block.
501 for (VPRecipeBase &R :
502 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
503 R.eraseFromParent();
504
505 // Finally, remove the first region: reconnect its predecessors to the
// middle block and detach the region from it.
506 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
507 VPBlockUtils::disconnectBlocks(Pred, Region1);
508 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
509 }
510 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
511 TransformedRegions.insert(Region1);
512 }
513
514 return !TransformedRegions.empty();
515}
516
518 VPRegionBlock *ParentRegion,
519 VPlan &Plan) {
// Builds a "pred.<opcode>" replicate region (entry / if / continue blocks) for
// the predicated recipe PredRecipe, replaces PredRecipe by an unmasked copy
// inside the region, erases PredRecipe and returns the new region.
// NOTE(review): the first line of the signature and the line declaring
// `Region` are not part of this listing.
520 Instruction *Instr = PredRecipe->getUnderlyingInstr();
521 // Build the triangular if-then region.
522 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
523 assert(Instr->getParent() && "Predicated instruction not in any basic block");
524 auto *BlockInMask = PredRecipe->getMask();
// The branch-on-mask takes the debug location of the mask's defining recipe,
// if there is one.
525 auto *MaskDef = BlockInMask->getDefiningRecipe();
526 auto *BOMRecipe = new VPBranchOnMaskRecipe(
527 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
528 auto *Entry =
529 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
530
531 // Replace predicated replicate recipe with a replicate recipe without a
532 // mask but in the replicate region.
533 auto *RecipeWithoutMask = new VPReplicateRecipe(
534 PredRecipe->getUnderlyingInstr(), PredRecipe->operandsWithoutMask(),
535 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
536 PredRecipe->getDebugLoc());
537 auto *Pred =
538 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
539 auto *Exiting = Plan.createVPBasicBlock(Twine(RegionName) + ".continue");
541 Plan.createReplicateRegion(Entry, Exiting, RegionName);
542
543 // Note: first set Entry as region entry and then connect successors starting
544 // from it in order, to propagate the "parent" of each VPBasicBlock.
545 Region->setParent(ParentRegion);
546 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
547 VPBlockUtils::connectBlocks(Pred, Exiting);
548
// If the predicated value has users, expose it through a phi in the exiting
// block and redirect the users to that phi.
549 if (!PredRecipe->user_empty()) {
550 auto *PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
551 RecipeWithoutMask->getDebugLoc());
552 Exiting->appendRecipe(PHIRecipe);
553 PredRecipe->replaceAllUsesWith(PHIRecipe);
554 }
555 PredRecipe->eraseFromParent();
556 return Region;
557}
558
// Wraps every predicated VPReplicateRecipe of the plan in its own replicate
// region, splitting the containing block at the recipe.
// NOTE(review): the declaration of `WorkList`, the header of the collection
// loop and the lines declaring `Region` / inserting it are not part of this
// listing.
559static void addReplicateRegions(VPlan &Plan) {
562 vp_depth_first_deep(Plan.getEntry()))) {
// Collect the predicated replicate recipes first; the blocks are split in
// the second loop.
563 for (VPRecipeBase &R : *VPBB)
564 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
565 if (RepR->isPredicated())
566 WorkList.push_back(RepR);
567 }
568 }
569
570 unsigned BBNum = 0;
571 for (VPReplicateRecipe *RepR : WorkList) {
572 VPBasicBlock *CurrentBlock = RepR->getParent();
573 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
574
// Name the split-off block after the original IR block plus a running
// number; it stays unnamed if the IR block has no name.
575 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
576 SplitBlock->setName(
577 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
578 // Record predicated instructions for above packing optimizations.
580 createReplicateRegion(RepR, CurrentBlock->getParent(), Plan);
582
// If the block that was split used to be the exiting block of its parent
// region, the split-off tail takes over that role.
583 VPRegionBlock *ParentRegion = Region->getParent();
584 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
585 ParentRegion->setExiting(SplitBlock);
586 }
587}
588
592 vp_depth_first_deep(Plan.getEntry()))) {
// Folds each VPBasicBlock inside a region into its single predecessor when
// that predecessor is a non-IR VPBasicBlock with exactly one successor.
// Returns true if any block was folded.
// NOTE(review): the signature, the declaration of `WorkList` and the start of
// the loop header above are not part of this listing.
593 // Don't fold the blocks in the skeleton of the Plan into their single
594 // predecessors for now.
595 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
596 if (!VPBB->getParent())
597 continue;
598 auto *PredVPBB =
599 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
600 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
601 isa<VPIRBasicBlock>(PredVPBB))
602 continue;
603 WorkList.push_back(VPBB);
604 }
605
606 for (VPBasicBlock *VPBB : WorkList) {
607 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
// Append all recipes of VPBB to the end of the predecessor.
608 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
609 R.moveBefore(*PredVPBB, PredVPBB->end());
610 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
// Keep the parent region's exiting block valid if VPBB was that block.
611 auto *ParentRegion = VPBB->getParent();
612 if (ParentRegion && ParentRegion->getExiting() == VPBB)
613 ParentRegion->setExiting(PredVPBB);
614 VPBlockUtils::transferSuccessors(VPBB, PredVPBB);
615 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
616 }
617 return !WorkList.empty();
618}
619
// NOTE(review): the signature line and the statement following the comment
// below are not part of this listing.
621 // Convert masked VPReplicateRecipes to if-then region blocks.
623
// Re-run the three simplifications until a full round reports no change; each
// one may expose new opportunities for the others.
624 bool ShouldSimplify = true;
625 while (ShouldSimplify) {
626 ShouldSimplify = sinkScalarOperands(Plan);
627 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
628 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
629 }
630}
631
632/// Remove redundant casts of inductions.
633///
634/// Such redundant casts are casts of induction variables that can be ignored,
635/// because we already proved that the casted phi is equal to the uncasted phi
636/// in the vectorized loop. There is no need to vectorize the cast - the same
637/// value can be used for both the phi and casts in the vector loop.
/// NOTE(review): the signature line and the line defining `IV` are not part
/// of this listing.
639 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
// Phis that are not such an IV, or that carry a truncate instruction, are
// left alone.
641 if (!IV || IV->getTruncInst())
642 continue;
643
644 // A sequence of IR Casts has potentially been recorded for IV, which
645 // *must be bypassed* when the IV is vectorized, because the vectorized IV
646 // will produce the desired casted value. This sequence forms a def-use
647 // chain and is provided in reverse order, ending with the cast that uses
648 // the IV phi. Search for the recipe of the last cast in the chain and
649 // replace it with the original IV. Note that only the final cast is
650 // expected to have users outside the cast-chain and the dead casts left
651 // over will be cleaned up later.
652 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
653 VPValue *FindMyCast = IV;
654 for (Instruction *IRCast : reverse(Casts)) {
// Look for the user of the current value whose underlying IR value is the
// next cast of the chain.
655 VPSingleDefRecipe *FoundUserCast = nullptr;
656 for (auto *U : FindMyCast->users()) {
657 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
658 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
659 FoundUserCast = UserCast;
660 break;
661 }
662 }
663 // A cast recipe in the chain may have been removed by earlier DCE.
664 if (!FoundUserCast)
665 break;
666 FindMyCast = FoundUserCast;
667 }
// Only rewrite uses if at least one cast of the chain was found.
668 if (FindMyCast != IV)
669 FindMyCast->replaceAllUsesWith(IV);
670 }
671}
672
675 Instruction::BinaryOps InductionOpcode,
676 FPMathOperator *FPBinOp, Instruction *TruncI,
677 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
678 VPBuilder &Builder) {
// Creates a derived IV from the canonical IV of the vector loop region,
// truncates it (when TruncI is given) and the step (when their types differ),
// and returns the scalar-IV-steps recipe built on top of them.
// NOTE(review): the first lines of the signature and the initializer of
// `VecPreheader` are not part of this listing.
679 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
680 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
681 VPValue *CanonicalIV = LoopRegion->getCanonicalIV();
682 VPSingleDefRecipe *BaseIV =
683 Builder.createDerivedIV(Kind, FPBinOp, StartV, CanonicalIV, Step);
684
685 // Truncate base induction if needed.
686 Type *ResultTy = BaseIV->getScalarType();
687 if (TruncI) {
688 Type *TruncTy = TruncI->getType();
689 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
690 "Not truncating.");
691 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
692 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
693 ResultTy = TruncTy;
694 }
695
696 // Truncate step if needed.
697 Type *StepTy = Step->getScalarType();
698 if (ResultTy != StepTy) {
699 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
700 "Not truncating.");
701 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
702 auto *VecPreheader =
// The truncated step is created at VecPreheader rather than at the current
// insert point of Builder.
704 VPBuilder::InsertPointGuard Guard(Builder);
705 Builder.setInsertPoint(VecPreheader);
706 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
707 }
708 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
709 &Plan.getVF(), DL);
710}
711
713 VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI,
715 const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
// Replaces the wide canonical IV of the vector loop region by, in order of
// preference: scalar IV steps, an existing canonical wide induction in the
// header, or a newly created wide induction when the cost checks below allow.
// NOTE(review): the function name, the remaining parameters (including `VF`,
// `UF` and `CostKind`), the initializer of `WideCanIV`, the declaration of
// `WidenIV`, the shuffle-cost arguments and the definition of `ID` fall on
// lines that are not part of this listing.
716 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
717 if (!LoopRegion)
718 return;
719
720 auto *WideCanIV =
722 if (!WideCanIV)
723 return;
724
725 Type *CanIVTy = LoopRegion->getCanonicalIVType();
726
727 // Replace the wide canonical IV with a scalar-iv-steps over the canonical
728 // IV.
729 if (Plan.hasScalarVFOnly() || vputils::onlyFirstLaneUsed(WideCanIV)) {
730 VPBuilder Builder(WideCanIV);
731 WideCanIV->replaceAllUsesWith(createScalarIVSteps(
732 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
733 nullptr, Plan.getZero(CanIVTy), Plan.getConstantInt(CanIVTy, 1),
734 WideCanIV->getDebugLoc(), Builder));
735 WideCanIV->eraseFromParent();
736 return;
737 }
738
// Nothing more to do when only scalar values of the wide IV are used.
739 if (vputils::onlyScalarValuesUsed(WideCanIV))
740 return;
741
742 // If a canonical VPWidenIntOrFpInductionRecipe already produces vector lanes
743 // in the header, reuse it instead of introducing another wide induction phi.
744 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
745 for (VPRecipeBase &Phi : Header->phis()) {
747 if (!match(&Phi, m_CanonicalWidenIV(WidenIV)))
748 continue;
749 // The reused wide IV feeds the header mask, whose lanes may extend past
750 // the trip count; drop flags that only hold inside the scalar loop.
751 WidenIV->dropPoisonGeneratingFlags();
752 WideCanIV->replaceAllUsesWith(WidenIV);
753 WideCanIV->eraseFromParent();
754 return;
755 }
756
757 // Introduce a new VPWidenIntOrFpInductionRecipe if profitable: bail out when
// the phi is more expensive than the broadcast.
758 auto *VecTy = VectorType::get(CanIVTy, VF);
759 InstructionCost BroadcastCost = TTI.getShuffleCost(
761 InstructionCost PHICost = TTI.getCFInstrCost(Instruction::PHI, CostKind);
762 if (PHICost > BroadcastCost)
763 return;
764
765 // Bail out if the additional wide induction phi increases the expected spill
766 // cost. The per-class register usage is scaled by UF before the extra phi is
// added on top.
767 VPRegisterUsage UnrolledBase =
768 calculateRegisterUsageForPlan(Plan, VF, TTI, ValuesToIgnore)[0];
769 for (unsigned &NumUsers : make_second_range(UnrolledBase.MaxLocalUsers))
770 NumUsers *= UF;
771 unsigned RegClass = TTI.getRegisterClassForType(/*Vector=*/true, VecTy);
772 VPRegisterUsage Projected = UnrolledBase;
773 Projected.MaxLocalUsers[RegClass] += TTI.getRegUsageForType(VecTy);
774 if (Projected.spillCost(TTI, CostKind) >
775 UnrolledBase.spillCost(TTI, CostKind))
776 return;
777
// Create the new wide induction (start 0, step 1) and let it take over all
// uses of the wide canonical IV.
780 VPValue *StepV = Plan.getConstantInt(CanIVTy, 1);
781 auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
782 /*IV=*/nullptr, Plan.getZero(CanIVTy), StepV, &Plan.getVF(), ID,
783 WideCanIV->getNoWrapFlags(), WideCanIV->getDebugLoc());
784 NewWideIV->insertBefore(&*Header->getFirstNonPhi());
785 WideCanIV->replaceAllUsesWith(NewWideIV);
786 WideCanIV->eraseFromParent();
787}
788
789/// Returns true if \p R is dead and can be removed. Conditional assumes are
/// always considered dead; otherwise a recipe is dead when it has no side
/// effects and none of its defined values has a user.
/// NOTE(review): the second half of the `IsConditionalAssume` initializer is
/// on a line that is not part of this listing.
790static bool isDeadRecipe(VPRecipeBase &R) {
791 // Do remove conditional assume instructions as their conditions may be
792 // flattened.
793 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
794 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
796 if (IsConditionalAssume)
797 return true;
798
// Recipes with side effects must be kept even when their values are unused.
799 if (R.mayHaveSideEffects())
800 return false;
801
802 // Recipe is dead if no user keeps the recipe alive.
803 return all_of(R.definedValues(), [](VPValue *V) { return V->user_empty(); });
804}
805
808 Plan.getEntry());
// Erases dead recipes and dead two-recipe VPPhi <-> update cycles.
// NOTE(review): the signature, the start of the traversal set-up above and
// the header of the block loop are not part of this listing.
810 // The recipes in the block are processed in reverse order, to catch chains
811 // of dead recipes.
812 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
813 if (isDeadRecipe(R)) {
814 R.eraseFromParent();
815 continue;
816 }
817
818 // Check if R is a dead VPPhi <-> update cycle and remove it.
819 VPValue *Start, *Incoming;
820 if (!match(&R, m_VPPhi(m_VPValue(Start), m_VPValue(Incoming))))
821 continue;
822 auto *PhiR = cast<VPPhi>(&R);
823 VPUser *PhiUser = PhiR->getSingleUser();
824 if (!PhiUser)
825 continue;
// The cycle is dead only if the phi's single user is the recipe defining
// the incoming value, and that value has no user besides the phi.
826 if (PhiUser != Incoming->getDefiningRecipe() ||
827 Incoming->getNumUsers() != 1)
828 continue;
829 PhiR->replaceAllUsesWith(Start);
830 PhiR->eraseFromParent();
831 Incoming->getDefiningRecipe()->eraseFromParent();
832 }
833 }
834}
835
// Worklist traversal: `Users` grows while it is indexed, so users of users are
// visited as well; the accumulated users are returned as a vector.
// NOTE(review): the signature, the declaration/seeding of `Users` and the
// definition of `Cur` are on lines that are not part of this listing.
838 for (unsigned I = 0; I != Users.size(); ++I) {
840 for (VPValue *V : Cur->definedValues())
841 Users.insert_range(V->users());
842 }
843 return Users.takeVector();
844}
845
846/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
847/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
848/// generates scalar values. Returns the created PtrAdd; the caller is
/// responsible for redirecting the users of \p PtrIV to it.
/// NOTE(review): the line with the function name, the definition of `ID` and
/// the start of the statement defining `Steps` are not part of this listing.
849static VPValue *
851 VPlan &Plan, VPBuilder &Builder) {
// The scalar steps start at zero of the step's type and advance by operand 1
// of the pointer induction.
853 VPIRValue *StartV = Plan.getZero(ID.getStep()->getType());
854 VPValue *StepV = PtrIV->getOperand(1);
856 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
857 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
858
859 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
860 PtrIV->getDebugLoc(), "next.gep");
861}
862
863/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
864/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
865/// VPWidenPointerInductionRecipe will generate vectors only. If some users
866/// require vectors while others require scalars, the scalar uses need to extract
867/// the scalars from the generated vectors (Note that this is different to how
868/// int/fp inductions are handled). Legalize extract-from-ends using uniform
869/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
870/// the correct end value is available. Also optimize
871/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
872/// providing them scalar steps built on the canonical scalar IV and update the
873/// original IV's users. This is an optional optimization to reduce the needs of
874/// vector extracts.
877 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
878 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
879 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
880 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
881 if (!PhiR)
882 continue;
883
884 // Try to narrow wide and replicating recipes to uniform recipes, based on
885 // VPlan analysis.
886 // TODO: Apply to all recipes in the future, to replace legacy uniformity
887 // analysis.
888 auto Users = collectUsersRecursively(PhiR);
889 for (VPUser *U : reverse(Users)) {
890 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
891 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
892 // Skip recipes that shouldn't be narrowed.
893 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
894 Def->user_empty() || !Def->getUnderlyingValue() ||
895 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
896 continue;
897
898 // Skip recipes that may have other lanes than their first used.
900 continue;
901
902 // TODO: Support scalarizing ExtractValue.
903 if (match(Def,
905 continue;
906
908 Def->getUnderlyingInstr()->getOpcode(), Def->operands(),
909 /*Mask=*/nullptr, *Def, {}, DebugLoc::getUnknown(),
910 Def->getUnderlyingInstr());
911 Clone->insertAfter(Def);
912 Def->replaceAllUsesWith(Clone);
913 }
914
915 // Replace wide pointer inductions which have only their scalars used by
916 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
917 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
918 if (!Plan.hasScalarVFOnly() &&
919 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
920 continue;
921
922 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
923 PtrIV->replaceAllUsesWith(PtrAdd);
924 continue;
925 }
926
927 // Replace widened induction with scalar steps for users that only use
928 // scalars.
929 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
930 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
931 return U->usesScalars(WideIV);
932 }))
933 continue;
934
935 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
937 Plan, ID.getKind(), ID.getInductionOpcode(),
938 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
939 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
940 WideIV->getDebugLoc(), Builder);
941
942 // Update scalar users of IV to use Step instead.
943 if (!HasOnlyVectorVFs) {
944 assert(!Plan.hasScalableVF() &&
945 "plans containing a scalar VF cannot also include scalable VFs");
946 WideIV->replaceAllUsesWith(Steps);
947 } else {
948 bool HasScalableVF = Plan.hasScalableVF();
949 WideIV->replaceUsesWithIf(Steps,
950 [WideIV, HasScalableVF](VPUser &U, unsigned) {
951 if (HasScalableVF)
952 return U.usesFirstLaneOnly(WideIV);
953 return U.usesScalars(WideIV);
954 });
955 }
956 }
957}
958
959/// Check if \p VPV is an untruncated wide induction, either before or after the
960/// increment. If so return the header IV (before the increment), otherwise
961/// return null.
/// Note: the truncation check below is only applied when \p VPV itself is the
/// wide induction; when \p VPV is the increment, the induction found among its
/// operands is returned without checking for a truncate instruction here.
964 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
965 if (WideIV) {
966 // VPV itself is a wide induction, separately compute the end value for exit
967 // users if it is not a truncated IV.
968 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
969 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
970 }
971
972 // Check if VPV is an optimizable induction increment. Only values defined by
// a recipe with exactly two operands are considered.
973 VPRecipeBase *Def = VPV->getDefiningRecipe();
974 if (!Def || Def->getNumOperands() != 2)
975 return nullptr;
// Look for a wide induction among the two operands, trying operand 0 first.
976 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
977 if (!WideIV)
978 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
979 if (!WideIV)
980 return nullptr;
981
// Returns true if VPV has the increment shape expected for the induction
// opcode of WideIV.
982 auto IsWideIVInc = [&]() {
983 auto &ID = WideIV->getInductionDescriptor();
984
985 // Check if VPV increments the induction by the induction step.
986 VPValue *IVStep = WideIV->getStepValue();
987 switch (ID.getInductionOpcode()) {
988 case Instruction::Add:
989 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
990 case Instruction::FAdd:
991 return match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
992 case Instruction::FSub:
993 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
994 m_Specific(IVStep)));
995 case Instruction::Sub: {
996 // IVStep will be the negated step of the subtraction. Check if Step == -1
997 // * IVStep. The comparison is done on SCEV expressions and fails if either
// expression could not be computed.
998 VPValue *Step;
999 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
1000 return false;
1001 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
1002 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
1003 ScalarEvolution &SE = *PSE.getSE();
1004 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
1005 !isa<SCEVCouldNotCompute>(StepSCEV) &&
1006 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
1007 }
1008 default:
// Any other opcode: only a pointer induction advanced by a GEP of the
// induction and its step qualifies.
1009 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
1010 match(VPV, m_GetElementPtr(m_Specific(WideIV),
1011 m_Specific(WideIV->getStepValue())));
1012 }
// Every switch case above returns.
1013 llvm_unreachable("should have been covered by switch above");
1014 };
1015 return IsWideIVInc() ? WideIV : nullptr;
1016}
1017
1018/// Attempts to optimize the induction variable exit values for users in the
1019/// early exit block.
/// Returns the newly computed end value, or nullptr if the operand does not
/// match the expected pattern, is not an optimizable induction, or the
/// induction is truncated.
1022 VPValue *Incoming, *Mask;
1024 m_VPValue(Incoming))))
1025 return nullptr;
1026
1027 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
1028 if (!WideIV)
1029 return nullptr;
1030
// Truncated int/fp inductions are not handled.
1031 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1032 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1033 return nullptr;
1034
1035 // Calculate the final index.
1036 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1037 auto *CanonicalIV = LoopRegion->getCanonicalIV();
1038 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
// New recipes are created via a builder constructed from the recipe defining
// Op.
1039 auto *ExtractR = cast<VPInstruction>(Op);
1040 VPBuilder B(ExtractR);
1041
// Index = canonical IV + first active lane of Mask, with the lane converted
// to the canonical IV type.
1042 DebugLoc DL = ExtractR->getDebugLoc();
1043 VPValue *FirstActiveLane = B.createFirstActiveLane(Mask, DL);
1044 FirstActiveLane = B.createScalarZExtOrTrunc(
1045 FirstActiveLane, CanonicalIVType, FirstActiveLane->getScalarType(), DL);
1046 VPValue *EndValue = B.createAdd(CanonicalIV, FirstActiveLane, DL);
1047
1048 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1049 // changed it means the exit is using the incremented value, so we need to
1050 // advance the index by one. For non-canonical inductions the index is
// combined with the induction's start and step by the derived IV below.
1051 if (Incoming != WideIV) {
1052 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
1053 EndValue = B.createAdd(EndValue, One, DL);
1054 }
1055
// For non-canonical inductions, derive the IV value at the computed index
// from the induction's start and step.
1056 if (!match(WideIV, m_CanonicalWidenIV())) {
1057 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1058 VPIRValue *Start = WideIV->getStartValue();
1059 VPValue *Step = WideIV->getStepValue();
1060 EndValue = B.createDerivedIV(
1061 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1062 Start, EndValue, Step);
1063 }
1064
1065 return EndValue;
1066}
1067
1068/// Compute the end value for \p WideIV, unless it is truncated. Creates a
1069/// VPDerivedIVRecipe for non-canonical inductions.
/// Returns nullptr for truncated int/fp inductions. Otherwise returns
/// \p VectorTC for canonical wide inductions or the derived value for other
/// inductions, truncated to the scalar type of \p WideIV if the types differ.
1071 VPBuilder &VectorPHBuilder,
1072 VPValue *VectorTC) {
1073 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1074 // Truncated wide inductions resume from the last lane of their vector value
1075 // in the last vector iteration which is handled elsewhere.
1076 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1077 return nullptr;
1078
1079 VPIRValue *Start = WideIV->getStartValue();
1080 VPValue *Step = WideIV->getStepValue();
// Canonical wide IVs use VectorTC directly; all other inductions derive their
// end value from Start, VectorTC and Step.
1082 VPValue *EndValue = VectorTC;
1083 if (!match(WideIV, m_CanonicalWidenIV())) {
1084 EndValue = VectorPHBuilder.createDerivedIV(
1085 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1086 Start, VectorTC, Step);
1087 }
1088
1089 // EndValue is derived from the vector trip count (which has the same type as
1090 // the widest induction) and thus may be wider than the induction here.
1091 Type *ScalarTypeOfWideIV = WideIV->getScalarType();
1092 if (ScalarTypeOfWideIV != EndValue->getScalarType()) {
1093 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
1094 ScalarTypeOfWideIV,
1095 WideIV->getDebugLoc());
1096 }
1097
1098 return EndValue;
1099}
1100
1101/// Attempts to optimize the induction variable exit values for users in the
1102/// exit block coming from the latch in the original scalar loop.
/// Returns the value to use for the exit, or nullptr if the operand is not
/// recognized or is not an optimizable induction.
1103static VPValue *
1107 VPValue *Incoming;
1108 if (!match(Op,
1110 VPValue *Mask;
1112 m_VPValue(Incoming))) ||
1113 Mask != vputils::findHeaderMask(Plan))
1114 return nullptr;
1115 }
1116
1117 VPWidenInductionRecipe *WideIV = getOptimizableIVOf(Incoming, PSE);
1118 if (!WideIV)
1119 return nullptr;
1120
// NOTE(review): the assert requires an EndValues entry for every induction
// returned by getOptimizableIVOf(); confirm that a truncated induction (for
// which tryToComputeEndValueForInduction() returns nullptr) cannot reach this
// point through the increment path.
1121 VPValue *EndValue = EndValues.lookup(WideIV);
1122 assert(EndValue && "Must have computed the end value up front");
1123
1124 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1125 // changed it means the exit is using the incremented value, so we don't
1126 // need to subtract the step.
1127 if (Incoming != WideIV)
1128 return EndValue;
1129
1130 // Otherwise, subtract the step from the EndValue.
1131 auto *ExtractR = cast<VPInstruction>(Op);
1132 VPBuilder B(ExtractR);
1133 VPValue *Step = WideIV->getStepValue();
1134 Type *ScalarTy = WideIV->getScalarType();
// Integer induction: EndValue - Step.
1135 if (ScalarTy->isIntegerTy())
1136 return B.createSub(EndValue, Step, DebugLoc::getUnknown(), "ind.escape");
// Pointer induction: pointer-add the negated step (0 - Step) to EndValue.
1137 if (ScalarTy->isPointerTy()) {
1138 Type *StepTy = Step->getScalarType();
1139 auto *Zero = Plan.getZero(StepTy);
1140 return B.createPtrAdd(EndValue, B.createSub(Zero, Step),
1141 DebugLoc::getUnknown(), "ind.escape");
1142 }
// Floating-point induction: apply FSub if the induction uses FAdd and FAdd
// otherwise, reusing the fast-math flags of the induction's binary operator.
1143 if (ScalarTy->isFloatingPointTy()) {
1144 const auto &ID = WideIV->getInductionDescriptor();
1145 return B.createNaryOp(
1146 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1147 ? Instruction::FSub
1148 : Instruction::FAdd,
1149 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1150 }
1151 llvm_unreachable("all possible induction types must be handled");
1152 return nullptr;
1153}
1154
1156 VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
1157 // Compute end values for all inductions. The recipes computing them are
// created through a builder positioned at the start of the block preceding
// the vector loop region.
1158 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1159 auto *VectorPH = cast<VPBasicBlock>(VectorRegion->getSinglePredecessor());
1160 VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
// With tail folding the end values are based on the plan's trip count,
// otherwise on its vector trip count.
1162 VPValue *ResumeTC =
1163 FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
1164 for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
1165 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&Phi);
1166 if (!WideIV)
1167 continue;
// Inductions for which no end value can be computed get no entry.
1168 if (VPValue *EndValue =
1169 tryToComputeEndValueForInduction(WideIV, VectorPHBuilder, ResumeTC))
1170 EndValues[WideIV] = EndValue;
1171 }
1172
// Replace exiting-IV-value recipes in the middle block by the precomputed end
// value of their induction, if there is one, and erase them. The early-inc
// range allows erasing the current recipe while iterating.
1173 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
1174 for (VPRecipeBase &R : make_early_inc_range(*MiddleVPBB)) {
1175 VPValue *Op;
1176 if (!match(&R, m_ExitingIVValue(m_VPValue(Op))))
1177 continue;
1178 auto *WideIV = cast<VPWidenInductionRecipe>(Op);
1179 if (VPValue *EndValue = EndValues.lookup(WideIV)) {
1180 R.getVPSingleValue()->replaceAllUsesWith(EndValue);
1181 R.eraseFromParent();
1182 }
1183 }
1184
1185 // Then, optimize exit block users.
1186 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1187 for (VPRecipeBase &R : ExitVPBB->phis()) {
1188 auto *ExitIRI = cast<VPIRPhi>(&R);
1189
// Operand Idx of the exit phi is paired with predecessor Idx of the exit
// block. Operands coming from the middle block are handled differently
// from operands coming from any other predecessor.
1190 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1191 VPValue *Escape = nullptr;
1192 if (PredVPBB == MiddleVPBB)
1194 Plan, ExitIRI->getOperand(Idx), EndValues, PSE);
1195 else
1197 Plan, ExitIRI->getOperand(Idx), PSE);
// Only update the phi operand if an optimized value was produced.
1198 if (Escape)
1199 ExitIRI->setOperand(Idx, Escape);
1200 }
1201 }
1202 }
1203}
1204
1205/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1206/// them with already existing recipes expanding the same SCEV expression.
/// If a removed recipe was the plan's trip count, the trip count is reset to
/// the recipe that replaces it.
1209
1210 for (VPRecipeBase &R :
1212 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1213 if (!ExpR)
1214 continue;
1215
// try_emplace only inserts if no recipe was recorded for this SCEV yet; the
// first recipe seen for a SCEV is the one that is kept.
1216 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1217 if (Inserted)
1218 continue;
1219
// Duplicate: forward all users to the recorded recipe before erasing.
1220 ExpR->replaceAllUsesWith(V->second);
1221 if (ExpR == Plan.getTripCount())
1222 Plan.resetTripCount(V->second);
1223
1224 ExpR->eraseFromParent();
1225 }
1226}
1227
// Worklist-driven deletion starting at V: erase the defining recipe of each
// visited value if it is dead, then visit the operands of the erased recipe.
1229 SmallVector<VPValue *> WorkList;
1231 WorkList.push_back(V);
1232
1233 while (!WorkList.empty()) {
1234 VPValue *Cur = WorkList.pop_back_val();
// Process each value at most once.
1235 if (!Seen.insert(Cur).second)
1236 continue;
// Values without a defining recipe have nothing to erase.
1237 VPRecipeBase *R = Cur->getDefiningRecipe();
1238 if (!R)
1239 continue;
1240 if (!isDeadRecipe(*R))
1241 continue;
// Queue the operands before erasing R so that they are checked as well.
1242 append_range(WorkList, R->operands());
1243 R->eraseFromParent();
1244 }
1245}
1246
1247/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1248/// Returns an optional pair, where the first element indicates whether it is
1249/// an intrinsic ID.
/// The second element holds the opcode or intrinsic ID. std::nullopt is
/// returned for recipe kinds that are not handled by any case below.
1250static std::optional<std::pair<bool, unsigned>>
1252 return TypeSwitch<const VPSingleDefRecipe *,
1253 std::optional<std::pair<bool, unsigned>>>(R)
1256 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
// Widened intrinsic calls report their vector intrinsic ID.
1257 .Case([](const VPWidenIntrinsicRecipe *I) {
1258 return std::make_pair(true, I->getVectorIntrinsicID());
1259 })
1260 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe, VPScalarIVStepsRecipe>(
1261 [](auto *I) {
1262 // For recipes that do not directly map to LLVM IR instructions,
1263 // assign opcodes after the last VPInstruction opcode (which is also
1264 // after the last IR Instruction opcode), based on the VPRecipeID.
1265 return std::make_pair(false, VPInstruction::OpsEnd + 1 +
1266 I->getVPRecipeID());
1267 })
1268 .Default([](auto *) { return std::nullopt; });
1269}
1270
1271/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1272/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1273/// Operands are foldable live-ins.
/// Returns nullptr if \p R has no opcode or intrinsic ID, if any operand is
/// not a live-in with an underlying IR value (looking through a broadcast), if
/// the opcode is not handled, or if the folder produces no value.
1275 ArrayRef<VPValue *> Operands,
1276 const DataLayout &DL) {
1277 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1278 if (!OpcodeOrIID)
1279 return nullptr;
1280
// Collect the underlying IR values of all operands.
1282 for (VPValue *Op : Operands) {
// Candidate defaults to Op and is rebound by the match below;
// presumably only when Op is a broadcast.
1283 VPValue *Candidate = Op;
1284 match(Op, m_Broadcast(m_VPValue(Candidate)));
1285 if (!match(Candidate, m_LiveIn()))
1286 return nullptr;
1287 Value *V = Candidate->getUnderlyingValue();
1288 if (!V)
1289 return nullptr;
1290 Ops.push_back(V);
1291 }
1292
1293 VPlan &Plan = *R.getParent()->getPlan();
// Dispatches to the InstSimplifyFolder entry point matching the opcode or
// intrinsic ID. Returns nullptr for opcodes that are not handled.
1294 auto FoldToIRValue = [&]() -> Value * {
1295 InstSimplifyFolder Folder(DL);
// Intrinsics: forward the fast-math flags if R carries IR flags, otherwise
// use default-constructed flags.
1296 if (OpcodeOrIID->first) {
1297 auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(&R);
1298 return Folder.FoldIntrinsic(OpcodeOrIID->second, Ops, R.getScalarType(),
1299 RFlags ? RFlags->getFastMathFlagsOrNone()
1300 : FastMathFlags());
1301 }
1302 unsigned Opcode = OpcodeOrIID->second;
1303 if (Instruction::isBinaryOp(Opcode))
1304 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1305 Ops[0], Ops[1]);
// Casts are folded to the scalar type of R's result.
1306 if (Instruction::isCast(Opcode))
1307 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1308 R.getVPSingleValue()->getScalarType());
1309 switch (Opcode) {
1310 case VPInstruction::Not:
1311 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1313 case Instruction::Select:
1314 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1315 case Instruction::ICmp:
1316 case Instruction::FCmp:
1317 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1318 Ops[1]);
1319 case Instruction::GetElementPtr: {
// The source element type is taken from the underlying IR GEP.
1320 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1321 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1322 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1323 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1324 }
1327 return Folder.FoldGEP(IntegerType::getInt8Ty(Plan.getContext()), Ops[0],
1328 Ops[1],
1329 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1330 // An extract of a live-in is an extract of a broadcast, so return the
1331 // broadcasted element.
1332 case Instruction::ExtractElement:
1333 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1334 return Ops[0];
1335 }
1336 return nullptr;
1337 };
1338
// Register a successfully folded IR value as a live-in of the plan.
1339 if (Value *V = FoldToIRValue())
1340 return Plan.getOrAddLiveIn(V);
1341 return nullptr;
1342}
1343
1344/// Try to simplify logical and bitwise recipes in \p Def.
/// Returns true if one of the patterns below applied (the users of \p Def were
/// redirected or its operands were updated), and false otherwise.
1346 bool CanCreateNewRecipe) {
1347 VPlan *Plan = Def->getParent()->getPlan();
1348
1349 // Simplify (X && Y) | (X && !Y) -> X.
1350 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1351 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1352 // recipes to be visited during simplification.
1353 VPValue *X, *Y, *Z;
1354 if (match(Def,
1357 Def->replaceAllUsesWith(X);
1358 Def->eraseFromParent();
1359 return true;
1360 }
1361
1362 // x | AllOnes -> AllOnes
1363 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) {
1364 Def->replaceAllUsesWith(Plan->getAllOnesValue(Def->getScalarType()));
1365 return true;
1366 }
1367
1368 // x | 0 -> x
1369 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt()))) {
1370 Def->replaceAllUsesWith(X);
1371 return true;
1372 }
1373
1374 // x | !x -> AllOnes
1375 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_Not(m_Deferred(X))))) {
1376 Def->replaceAllUsesWith(Plan->getAllOnesValue(Def->getScalarType()));
1377 return true;
1378 }
1379
1380 // x & 0 -> 0
1381 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt()))) {
1382 Def->replaceAllUsesWith(Plan->getZero(Def->getScalarType()));
1383 return true;
1384 }
1385
1386 // x & AllOnes -> x
1387 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_AllOnes()))) {
1388 Def->replaceAllUsesWith(X);
1389 return true;
1390 }
1391
1392 // x && false -> false
1393 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_False()))) {
1394 Def->replaceAllUsesWith(Plan->getFalse());
1395 return true;
1396 }
1397
1398 // x && true -> x
1399 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_True()))) {
1400 Def->replaceAllUsesWith(X);
1401 return true;
1402 }
1403
1404 // (x && y) | (x && z) -> x && (y | z)
1405 if (CanCreateNewRecipe &&
1408 // Simplify only if one of the operands has one use to avoid creating an
1409 // extra recipe.
1410 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1411 !Def->getOperand(1)->hasMoreThanOneUniqueUser())) {
1412 Def->replaceAllUsesWith(
1413 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1414 return true;
1415 }
1416
1417 // x && (x && y) -> x && y
1418 if (match(Def, m_LogicalAnd(m_VPValue(X),
1420 Def->replaceAllUsesWith(Def->getOperand(1));
1421 return true;
1422 }
1423
1424 // x && (y && x) -> x && y
1425 if (match(Def, m_LogicalAnd(m_VPValue(X),
1427 Def->replaceAllUsesWith(Builder.createLogicalAnd(X, Y));
1428 return true;
1429 }
1430
1431 // x && !x -> 0
1432 if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) {
1433 Def->replaceAllUsesWith(Plan->getFalse());
1434 return true;
1435 }
1436
// select c, x, x -> x
1437 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X)))) {
1438 Def->replaceAllUsesWith(X);
1439 return true;
1440 }
1441
1442 // select c, false, true -> not c
1443 VPValue *C;
1444 if (CanCreateNewRecipe &&
1445 match(Def, m_Select(m_VPValue(C), m_False(), m_True()))) {
1446 Def->replaceAllUsesWith(Builder.createNot(C));
1447 return true;
1448 }
1449
1450 // select !c, x, y -> select c, y, x
// This rewrite updates the operands of Def in place instead of replacing it.
1451 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1452 Def->setOperand(0, C);
1453 Def->setOperand(1, Y);
1454 Def->setOperand(2, X);
1455 return true;
1456 }
1457
1458 // select x, (i1 y | z), y -> y | (x && z)
1459 if (CanCreateNewRecipe &&
1460 match(Def, m_Select(m_VPValue(X),
1462 m_Deferred(Y))) &&
1463 Y->getScalarType()->isIntegerTy(1)) {
1464 Def->replaceAllUsesWith(
1465 Builder.createOr(Y, Builder.createLogicalAnd(X, Z)));
1466 return true;
1467 }
1468
1469 return false;
1470}
1471
1472/// Try to simplify VPSingleDefRecipe \p Def.
/// The simplifications are tried in the order written below. Most of them
/// return as soon as one applies; the ones listed after the isUnrolled() check
/// are only attempted for plans that report being unrolled.
1474 VPlan *Plan = Def->getParent()->getPlan();
1475
1476 // Simplification of live-in IR values for SingleDef recipes using
1477 // InstSimplifyFolder.
1478 const DataLayout &DL = Plan->getDataLayout();
1479 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL))
1480 return Def->replaceAllUsesWith(V);
1481
1482 // Fold PredPHI LiveIn -> LiveIn.
1483 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1484 VPValue *Op = PredPHI->getOperand(0);
1485 if (isa<VPIRValue>(Op))
1486 PredPHI->replaceAllUsesWith(Op);
1487 }
1488
1489 // Drop the mask of a predicated store masked by the header mask (which is
1490 // guaranteed to be true at least for the first lane) and both the stored
1491 // value and the address are uniform across VF and UF.
1492 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
1493 RepR && RepR->isPredicated() && RepR->getOpcode() == Instruction::Store &&
1494 all_of(RepR->operandsWithoutMask(), vputils::isUniformAcrossVFsAndUFs) &&
1495 vputils::isHeaderMask(RepR->getMask(), *Plan)) {
1496 auto *Unmasked = new VPReplicateRecipe(
1497 RepR->getUnderlyingInstr(), RepR->operandsWithoutMask(),
1498 RepR->isSingleScalar(), /*Mask=*/nullptr, *RepR, *RepR,
1499 RepR->getDebugLoc());
1500 Unmasked->insertBefore(RepR);
1501 RepR->replaceAllUsesWith(Unmasked);
1502 RepR->eraseFromParent();
1503 return;
1504 }
1505
1506 VPBuilder Builder(Def);
1507
1508 // Avoid replacing VPInstructions with underlying values with new
1509 // VPInstructions, as we would fail to create widen/replicate recipes from the
1510 // new VPInstructions without an underlying value, and miss out on some
1511 // transformations that only apply to widened/replicated recipes later, by
1512 // doing so.
1513 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1514 // VPInstructions without underlying values, as those will get skipped during
1515 // cost computation.
1516 bool CanCreateNewRecipe =
1517 !isa<VPInstruction>(Def) || !Def->getUnderlyingValue();
1518
// trunc (zext/sext A): use A directly if the types match; otherwise, for
// widened casts only, re-extend A if it is narrower than the truncated type or
// truncate A if it is wider.
1519 VPValue *A;
1520 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1521 Type *TruncTy = Def->getScalarType();
1522 Type *ATy = A->getScalarType();
1523 if (TruncTy == ATy) {
1524 Def->replaceAllUsesWith(A);
1525 } else {
1526 // Don't replace a non-widened cast recipe with a widened cast.
1527 if (!isa<VPWidenCastRecipe>(Def))
1528 return;
1529 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1530
1531 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1532 ? Instruction::SExt
1533 : Instruction::ZExt;
1534 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1535 TruncTy);
1536 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1537 // UnderlyingExt has distinct return type, used to retain legacy cost.
1538 Ext->setUnderlyingValue(UnderlyingExt);
1539 }
1540 Def->replaceAllUsesWith(Ext);
1541 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1542 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1543 Def->replaceAllUsesWith(Trunc);
1544 }
1545 }
1546 }
1547
1548 if (simplifyLogicalRecipe(Def, Builder, CanCreateNewRecipe))
1549 return;
1550
1551 VPValue *X, *Y, *C;
// A + 0 -> A
1552 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1553 return Def->replaceAllUsesWith(A);
1554
// A * 1 -> A
1555 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1556 return Def->replaceAllUsesWith(A);
1557
// A * 0 -> 0
1558 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1559 return Def->replaceAllUsesWith(Plan->getZero(Def->getScalarType()));
1560
// A * AllOnes -> 0 - A
1561 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_AllOnes()))) {
1562 // Preserve nsw from the Mul on the new Sub.
1564 false, cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap()};
1565 return Def->replaceAllUsesWith(Builder.createSub(
1566 Plan->getZero(A->getScalarType()), A, Def->getDebugLoc(), "", NW));
1567 }
1568
1569 if (CanCreateNewRecipe &&
1571 // Preserve nsw from the Add and the Sub, if it's present on both, on the
1572 // new Sub.
1574 false,
1575 cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap() &&
1576 cast<VPRecipeWithIRFlags>(Def->getOperand(Def->getOperand(0) == X))
1577 ->hasNoSignedWrap()};
1578 return Def->replaceAllUsesWith(
1579 Builder.createSub(X, Y, Def->getDebugLoc(), "", NW));
1580 }
1581
// A * 2^k -> A << k. nuw is carried over from the Mul; nsw is carried over
// only if k is not BitWidth - 1.
1582 const APInt *APC;
1583 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) &&
1584 APC->isPowerOf2()) {
1585 auto *MulR = cast<VPRecipeWithIRFlags>(Def);
1586 unsigned ShiftAmt = APC->exactLogBase2();
1587 VPIRFlags::WrapFlagsTy NW(MulR->hasNoUnsignedWrap(),
1588 MulR->hasNoSignedWrap() &&
1589 ShiftAmt != APC->getBitWidth() - 1);
1590 return Def->replaceAllUsesWith(Builder.createNaryOp(
1591 Instruction::Shl,
1592 {A, Plan->getConstantInt(APC->getBitWidth(), ShiftAmt)}, NW,
1593 Def->getDebugLoc()));
1594 }
1595
// A udiv 2^k -> A lshr k, passing on the IR flags of Def.
1596 if (CanCreateNewRecipe && match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) &&
1597 APC->isPowerOf2())
1598 return Def->replaceAllUsesWith(Builder.createNaryOp(
1599 Instruction::LShr,
1600 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1601 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1602
1603 if (match(Def, m_Not(m_VPValue(A)))) {
// not (not A) -> A
1604 if (match(A, m_Not(m_VPValue(A))))
1605 return Def->replaceAllUsesWith(A);
1606
1607 // Try to fold Not into compares by adjusting the predicate in-place.
1608 CmpPredicate Pred;
1609 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1610 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1611 if (all_of(Cmp->users(),
1613 m_Not(m_Specific(Cmp)),
1614 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1615 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
// Iterate over a copy of the user list, as the loop body modifies uses of
// Cmp.
1616 for (VPUser *U : to_vector(Cmp->users())) {
1617 auto *R = cast<VPSingleDefRecipe>(U);
1618 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1619 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1620 R->setOperand(1, Y);
1621 R->setOperand(2, X);
1622 } else {
1623 // not (cmp pred) -> cmp inv_pred
1624 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1625 R->replaceAllUsesWith(Cmp);
1626 }
1627 }
1628 // If Cmp doesn't have a debug location, use the one from the negation,
1629 // to preserve the location.
1630 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1631 Cmp->setDebugLoc(Def->getDebugLoc());
1632 }
1633 }
1634 }
1635
1636 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1637 // any-of (fcmp uno %A, %B), ...
1638 if (match(Def, m_AnyOf())) {
1640 VPRecipeBase *UnpairedCmp = nullptr;
1641 for (VPValue *Op : Def->operands()) {
1642 VPValue *X;
1643 if (Op->getNumUsers() > 1 ||
1645 m_Deferred(X)))) {
1646 NewOps.push_back(Op);
1647 } else if (!UnpairedCmp) {
1648 UnpairedCmp = Op->getDefiningRecipe();
1649 } else {
1650 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1651 UnpairedCmp->getOperand(0), X));
1652 UnpairedCmp = nullptr;
1653 }
1654 }
1655
// A compare left without a partner is kept as an operand unchanged.
1656 if (UnpairedCmp)
1657 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1658
// Only create a new any-of if the number of operands was reduced.
1659 if (NewOps.size() < Def->getNumOperands()) {
1660 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1661 return Def->replaceAllUsesWith(NewAnyOf);
1662 }
1663 }
1664
1665 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1666 // This is useful for fmax/fmin without fast-math flags, where we need to
1667 // check if any operand is NaN.
1668 if (CanCreateNewRecipe &&
1670 m_Deferred(X)),
1672 m_Deferred(Y))))) {
1673 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1674 return Def->replaceAllUsesWith(NewCmp);
1675 }
1676
1677 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1678 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1679 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1680 Def->getOperand(1)->getScalarType() == Def->getScalarType())
1681 return Def->replaceAllUsesWith(Def->getOperand(1));
1682
1684 m_One()))) {
1685 Type *WideStepTy = Def->getScalarType();
1686 if (X->getScalarType() != WideStepTy)
1687 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1688 Def->replaceAllUsesWith(X);
1689 return;
1690 }
1691
1692 // For i1 vp.merges produced by AnyOf reductions:
1693 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1695 m_VPValue(X), m_VPValue())) &&
1697 Def->getScalarType()->isIntegerTy(1)) {
1698 Def->setOperand(1, Def->getOperand(0));
1699 Def->setOperand(0, Y);
1700 return;
1701 }
1702
1703 // Simplify MaskedCond with no block mask to its single operand.
1705 !cast<VPInstruction>(Def)->isMasked())
1706 return Def->replaceAllUsesWith(Def->getOperand(0));
1707
1708 // Look through ExtractLastLane.
1709 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
// The last lane of a BuildVector is its last operand.
1710 if (match(A, m_BuildVector())) {
1711 auto *BuildVector = cast<VPInstruction>(A);
1712 Def->replaceAllUsesWith(
1713 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1714 return;
1715 }
1716
// The last lane of a broadcast is the broadcast operand.
1717 if (match(A, m_Broadcast(m_VPValue(X))))
1718 return Def->replaceAllUsesWith(X);
1719
1721 return Def->replaceAllUsesWith(A);
1722
1723 if (Plan->hasScalarVFOnly())
1724 return Def->replaceAllUsesWith(A);
1725 }
1726
1727 // Look through ExtractPenultimateElement (BuildVector ....).
1729 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1730 Def->replaceAllUsesWith(
1731 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1732 return;
1733 }
1734
1735 uint64_t Idx;
1737 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1738 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1739 return;
1740 }
1741
// A BuildVector whose operands are all equal is replaced by a Broadcast of
// that operand.
1742 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1743 Def->replaceAllUsesWith(
1744 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1745 return;
1746 }
1747
1748 // Replace uses of a BuildVector by users that only use its first lane with
1749 // its first operand directly.
1750 if (match(Def, m_BuildVector())) {
1751 Def->replaceUsesWithIf(Def->getOperand(0), [Def](VPUser &U, unsigned) {
1752 return U.usesFirstLaneOnly(Def);
1753 });
1754 }
1755
1756 // Look through broadcast of single-scalar when used as select conditions; in
1757 // that case the scalar condition can be used directly.
1758 if (match(Def,
1761 "broadcast operand must be single-scalar");
1762 Def->setOperand(0, C);
1763 return;
1764 }
1765
// Users of a broadcast that only use scalars can use the broadcast operand
// directly.
1766 if (match(Def, m_Broadcast(m_VPValue(X))))
1767 return Def->replaceUsesWithIf(
1768 X, [Def](const VPUser &U, unsigned) { return U.usesScalars(Def); });
1769
1771 if (Def->getNumOperands() == 1) {
1772 Def->replaceAllUsesWith(Def->getOperand(0));
1773 return;
1774 }
1775 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1776 if (all_equal(Phi->incoming_values()))
1777 Phi->replaceAllUsesWith(Phi->getOperand(0));
1778 }
1779 return;
1780 }
1781
1782 VPIRValue *IRV;
1783 if (Def->getNumOperands() == 1 &&
1785 return Def->replaceAllUsesWith(IRV);
1786
1787 // Some simplifications can only be applied after unrolling. Perform them
1788 // below.
1789 if (!Plan->isUnrolled())
1790 return;
1791
1792 // After unrolling, extract-lane may be used to extract values from multiple
1793 // scalar sources. Only simplify when extracting from a single scalar source.
1794 VPValue *LaneToExtract;
1795 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1796 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1798 return Def->replaceAllUsesWith(A);
1799
1800 // Replace extract-lane(0, canonical-WIDEN-INDUCTION) with the region's
1801 // scalar canonical IV.
1803 if (match(LaneToExtract, m_ZeroInt()) &&
1804 match(A, m_CanonicalWidenIV(WidenIV)))
1805 return Def->replaceAllUsesWith(WidenIV->getRegion()->getCanonicalIV());
1806
1807 // Simplify extract-lane with single source to extract-element.
1808 Def->replaceAllUsesWith(Builder.createNaryOp(
1809 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1810 return;
1811 }
1812
1813 // Look for cycles where Def is of the form:
1814 // X = phi(0, IVInc) ; used only by IVInc, or by IVInc and Inc = X + Y
1815 // IVInc = X + Step ; used by X and Def
1816 // Def = IVInc + Y
1817 // Fold the increment Y into the phi's start value, replace Def with IVInc,
1818 // and if Inc exists, replace it with X.
1819 if (match(Def, m_Add(m_Add(m_VPValue(X), m_VPValue()), m_VPValue(Y))) &&
1820 isa<VPIRValue>(Y) &&
1821 match(X, m_VPPhi(m_ZeroInt(), m_Specific(Def->getOperand(0))))) {
1822 auto *Phi = cast<VPPhi>(X);
1823 auto *IVInc = Def->getOperand(0);
1824 if (IVInc->getNumUsers() == 2) {
1825 // If Phi has a second user (besides IVInc's defining recipe), it must
1826 // be Inc = Phi + Y for the fold to apply.
1828 findUserOf(Phi, m_Add(m_Specific(Phi), m_Specific(Y))));
1829 if (Phi->getNumUsers() == 1 || (Phi->getNumUsers() == 2 && Inc)) {
1830 Def->replaceAllUsesWith(IVInc);
1831 if (Inc)
1832 Inc->replaceAllUsesWith(Phi);
1833 Phi->setOperand(0, Y);
1834 return;
1835 }
1836 }
1837 }
1838
1839 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1840 // just the pointer operand.
1841 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1842 if (!VPR->getVFxPart() || match(VPR->getVFxPart(), m_ZeroInt()))
1843 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1844
1845 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1846 // the start index is zero and only the first lane 0 is demanded.
1847 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1848 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1849 Steps->replaceAllUsesWith(Steps->getOperand(0));
1850 return;
1851 }
1852 }
1853 // Simplify redundant ReductionStartVector recipes after unrolling. Only uses
// by in-loop reduction phis are replaced with the start value.
1854 VPValue *StartV;
1856 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1857 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1858 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1859 return PhiR && PhiR->isInLoop();
1860 });
1861 return;
1862 }
1863
// With a concrete UF of 1, extract-last-part of A is replaced by A.
1864 if (Plan->getConcreteUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1865 return Def->replaceAllUsesWith(A);
1866}
1867
1877
// Cancel double reversals: every recipe matching reverse(reverse(X)) has all
// uses of its single defined value replaced by X.
1879 VPValue *X;
1882 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1883 if (match(&R, m_Reverse(m_Reverse(m_VPValue(X)))))
1884 R.getVPSingleValue()->replaceAllUsesWith(X);
1885}
1886
1887/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1888/// header mask to be simplified further when tail folding, e.g. in
1889/// optimizeEVLMasks.
1890static void reassociateHeaderMask(VPlan &Plan) {
1891 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
// Nothing to reassociate if the plan has no header mask.
1892 if (!HeaderMask)
1893 return;
1894
1895 SmallVector<VPUser *> Worklist;
// Start from the direct users of the header mask that have the form
// (headermask && x).
1896 for (VPUser *U : HeaderMask->users())
1897 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
1899
1900 while (!Worklist.empty()) {
1901 auto *R = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
1902 VPValue *X, *Y;
// Only single-def recipes of the form (headermask && X) && Y are rewritten.
1903 if (!R || !match(R, m_LogicalAnd(
1904 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
1905 m_VPValue(Y))))
1906 continue;
// Queue the users of R so that they are visited as well.
1907 append_range(Worklist, R->users());
// Create headermask && (X && Y) with a builder positioned at R and redirect
// all uses of R to it.
1908 VPBuilder Builder(R);
1909 R->replaceAllUsesWith(
1910 Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
1911 }
1912}
1913
/// Map a masked div/rem intrinsic ID (masked_udiv, masked_sdiv, masked_urem,
/// masked_srem) to the matching binary opcode (UDiv, SDiv, URem, SRem).
/// Returns an empty optional for any other ID.
1914static std::optional<Instruction::BinaryOps>
1916 switch (ID) {
1917 case Intrinsic::masked_udiv:
1918 return Instruction::UDiv;
1919 case Intrinsic::masked_sdiv:
1920 return Instruction::SDiv;
1921 case Intrinsic::masked_urem:
1922 return Instruction::URem;
1923 case Intrinsic::masked_srem:
1924 return Instruction::SRem;
1925 default:
1926 return {};
1927 }
1928}
1929
// Nothing to do when the plan only has a scalar VF.
1931 if (Plan.hasScalarVFOnly())
1932 return;
1933
1935 vp_depth_first_deep(Plan.getEntry()))) {
// Recipes of each block are visited in reverse order.
1936 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1939 continue;
1940 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
// Replicates that are already single-scalar, or that are predicated, are
// left untouched.
1941 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1942 continue;
1943
1944 auto *RepOrWidenR = cast<VPRecipeWithIRFlags>(&R);
// Replicated store whose operand 1 is a single scalar: emit one single-scalar
// store of the last lane of operand 0 (taken from the last part first when
// operand 1 is uniform across VFs and UFs) and erase the original store.
1945 if (RepR && RepR->getOpcode() == Instruction::Store &&
1946 vputils::isSingleScalar(RepR->getOperand(1))) {
1947 auto *Clone = new VPReplicateRecipe(
1948 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1949 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1950 *RepR /*Metadata*/, RepR->getDebugLoc());
1951 Clone->insertBefore(RepOrWidenR);
1952 VPBuilder Builder(Clone);
1953 VPValue *ExtractOp = Clone->getOperand(0);
1954 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1955 ExtractOp =
1956 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1957 ExtractOp =
1958 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1959 Clone->setOperand(0, ExtractOp);
1960 RepR->eraseFromParent();
1961 continue;
1962 }
1963
1964 // Narrow llvm.masked.{u,s}{div,rem} intrinsics with a safe divisor.
1965 if (auto *IntrR = dyn_cast<VPWidenIntrinsicRecipe>(RepOrWidenR)) {
1966 if (!vputils::onlyFirstLaneUsed(IntrR))
1967 continue;
1968 auto Opc = getUnmaskedDivRemOpcode(IntrR->getVectorIntrinsicID());
1969 if (!Opc)
1970 continue;
1971 VPBuilder Builder(IntrR);
// The divisor becomes select(operand 2, operand 1, 1); operand 2 is
// presumably the intrinsic's mask -- TODO confirm.
1972 VPValue *SafeDivisor = Builder.createSelect(
1973 IntrR->getOperand(2), IntrR->getOperand(1),
1974 Plan.getConstantInt(IntrR->getScalarType(), 1));
1975 VPValue *Clone = Builder.createNaryOp(
1976 *Opc, {IntrR->getOperand(0), SafeDivisor},
1977 VPIRFlags::getDefaultFlags(*Opc), IntrR->getDebugLoc());
1978 IntrR->replaceAllUsesWith(Clone);
1979 IntrR->eraseFromParent();
1980 continue;
1981 }
1982
1983 // Skip recipes that aren't single scalars.
1984 if (!vputils::isSingleScalar(RepOrWidenR))
1985 continue;
1986
1987 // Predicate to check if a user of Op introduces extra broadcasts.
1988 auto IntroducesBCastOf = [](const VPValue *Op) {
1989 return [Op](const VPUser *U) {
1990 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1994 VPI->getOpcode()))
1995 return false;
1996 }
1997 return !U->usesScalars(Op);
1998 };
1999 };
2000
// Skip the recipe when one of its users would introduce a broadcast of it and
// none of its operands satisfies the lambda below: no other user of the
// operand introduces a broadcast of it, and the operand is either a
// non-constant live-in or a single-scalar replicate.
2001 if (any_of(RepOrWidenR->users(), IntroducesBCastOf(RepOrWidenR)) &&
2002 none_of(RepOrWidenR->operands(), [&](VPValue *Op) {
2003 if (any_of(
2004 make_filter_range(Op->users(), not_equal_to(RepOrWidenR)),
2005 IntroducesBCastOf(Op)))
2006 return false;
2007 // Non-constant live-ins require broadcasts, while constants do not
2008 // need explicit broadcasts.
2009 auto *IRV = dyn_cast<VPIRValue>(Op);
2010 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
2011 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
2012 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
2013 }))
2014 continue;
2015
// Replace the recipe with a single-scalar clone placed right before it; the
// original is erased if it is dead afterwards.
2016 auto *Clone = VPBuilder::createSingleScalarOp(
2017 getOpcodeOrIntrinsicID(RepOrWidenR)->second, RepOrWidenR->operands(),
2018 /*Mask=*/nullptr, *RepOrWidenR, {}, DebugLoc::getUnknown(),
2019 RepOrWidenR->getUnderlyingInstr());
2020 Clone->insertBefore(RepOrWidenR);
2021 RepOrWidenR->replaceAllUsesWith(Clone);
2022 if (isDeadRecipe(*RepOrWidenR))
2023 RepOrWidenR->eraseFromParent();
2024 }
2025 }
2026}
2027
2028/// Try to see if all of \p Blend's masks share a common value logically and'ed
2029/// and remove it from the masks.
// Normalized blends are left unchanged.
2031 if (Blend->isNormalized())
2032 return;
// The first operand of mask 0's logical-and is the candidate common mask.
2033 VPValue *CommonEdgeMask;
2034 if (!match(Blend->getMask(0),
2035 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
2036 return;
// Bail out unless every mask is a logical-and whose first operand is the
// candidate.
2037 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
2038 if (!match(Blend->getMask(I),
2039 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
2040 return;
// Strip the common mask: each mask is replaced by operand 1 of its defining
// logical-and.
2041 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
2042 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
2043}
2044
2045/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
2046/// to make sure the masks are simplified.
2047static void simplifyBlends(VPlan &Plan) {
2050 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2051 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
2052 if (!Blend)
2053 continue;
2054
2055 removeCommonBlendMask(Blend);
2056
2057 // Try to remove redundant blend recipes.
// Collect the distinct incoming values whose mask is not known to be false
// (incoming value 0 of a normalized blend is always included). If only one
// value remains, the blend is replaced by that value.
2058 SmallPtrSet<VPValue *, 4> UniqueValues;
2059 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
2060 UniqueValues.insert(Blend->getIncomingValue(0));
2061 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
2062 if (!match(Blend->getMask(I), m_False()))
2063 UniqueValues.insert(Blend->getIncomingValue(I));
2064
2065 if (UniqueValues.size() == 1) {
2066 Blend->replaceAllUsesWith(*UniqueValues.begin());
2067 Blend->eraseFromParent();
2068 continue;
2069 }
2070
2071 if (Blend->isNormalized())
2072 continue;
2073
2074 // Normalize the blend so its first incoming value is used as the initial
2075 // value with the others blended into it.
2076
2077 unsigned StartIndex = 0;
2078 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
2079 // If a value's mask is used only by the blend then it can be deadcoded.
2080 // TODO: Find the most expensive mask that can be deadcoded, or a mask
2081 // that's used by multiple blends where it can be removed from them all.
2082 VPValue *Mask = Blend->getMask(I);
2083 if (Mask->hasOneUse() && !match(Mask, m_False())) {
2084 StartIndex = I;
2085 break;
2086 }
2087 }
2088
// New operand list: the StartIndex value first (without a mask), followed by
// the (value, mask) pairs of all other incoming values.
2089 SmallVector<VPValue *, 4> OperandsWithMask;
2090 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
2091
2092 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
2093 if (I == StartIndex)
2094 continue;
2095 OperandsWithMask.push_back(Blend->getIncomingValue(I));
2096 OperandsWithMask.push_back(Blend->getMask(I));
2097 }
2098
2099 auto *NewBlend =
2100 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
2101 OperandsWithMask, *Blend, Blend->getDebugLoc());
2102 NewBlend->insertBefore(&R);
2103
// The mask of the StartIndex value is no longer an operand of the new blend.
2104 VPValue *DeadMask = Blend->getMask(StartIndex);
2105 Blend->replaceAllUsesWith(NewBlend);
2106 Blend->eraseFromParent();
2108
2109 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
2110 VPValue *NewMask;
2111 if (NewBlend->getNumOperands() == 3 &&
2112 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
2113 VPValue *Inc0 = NewBlend->getOperand(0);
2114 VPValue *Inc1 = NewBlend->getOperand(1);
2115 VPValue *OldMask = NewBlend->getOperand(2);
2116 NewBlend->setOperand(0, Inc1);
2117 NewBlend->setOperand(1, Inc0);
2118 NewBlend->setOperand(2, NewMask);
// Erase the negated mask if the swap left it without users.
2119 if (OldMask->user_empty())
2120 cast<VPInstruction>(OldMask)->eraseFromParent();
2121 }
2122 }
2123 }
2124}
2125
2126/// Optimize the width of vector induction variables in \p Plan based on a known
2127/// constant Trip Count, \p BestVF and \p BestUF.
/// Returns true if at least one induction was narrowed.
2129 ElementCount BestVF,
2130 unsigned BestUF) {
2131 // Only proceed if we have not completely removed the vector region.
2132 if (!Plan.getVectorLoopRegion())
2133 return false;
2134
// Requires a fixed VF and a trip count that matches a constant APInt.
2135 const APInt *TC;
2136 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
2137 return false;
2138
2139 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
2140 // and UF. Returns at least 8.
2141 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
2142 APInt AlignedTC =
2145 APInt MaxVal = AlignedTC - 1;
2146 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
2147 };
2148 unsigned NewBitWidth =
2149 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
2150
2151 LLVMContext &Ctx = Plan.getContext();
2152 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
2153
2154 bool MadeChange = false;
2155
2156 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2157 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2158 // Currently only handle canonical IVs as it is trivial to replace the start
2159 // and stop values, and we currently only perform the optimization when the
2160 // IV has a single use.
2162 if (!match(&Phi, m_CanonicalWidenIV(WideIV)))
2163 continue;
// Skip IVs with more than one unique user and IVs that already have the
// narrow type.
2164 if (WideIV->hasMoreThanOneUniqueUser() ||
2165 NewIVTy == WideIV->getScalarType())
2166 continue;
2167
2168 // Currently only handle cases where the single user is a header-mask
2169 // comparison with the backedge-taken-count.
2170 VPUser *SingleUser = WideIV->getSingleUser();
2171 if (!SingleUser ||
2172 !match(SingleUser,
2173 m_ICmp(m_Specific(WideIV),
2175 continue;
2176
2177 // Update IV operands and comparison bound to use new narrower type.
2178 assert(!WideIV->getTruncInst() &&
2179 "canonical IV is not expected to have a truncation");
// The narrow IV starts at zero with step 1 in the new type and is inserted
// right before the wide IV.
2180 auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
2181 WideIV->getPHINode(), Plan.getZero(NewIVTy),
2182 Plan.getConstantInt(NewIVTy, 1), WideIV->getVFValue(),
2183 WideIV->getInductionDescriptor(), *WideIV, WideIV->getDebugLoc());
2184 NewWideIV->insertBefore(WideIV);
2185
// Truncate the backedge-taken count to the new type in the vector preheader
// and replace the compare by one of the narrow IV against the truncated bound.
2186 auto *NewBTC = new VPWidenCastRecipe(
2187 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
2188 nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
2189 Plan.getVectorPreheader()->appendRecipe(NewBTC);
2190 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
2191 Cmp->replaceAllUsesWith(
2192 VPBuilder(Cmp).createICmp(Cmp->getPredicate(), NewWideIV, NewBTC));
2193
2194 MadeChange = true;
2195 }
2196
2197 return MadeChange;
2198}
2199
2200/// Return true if \p Cond is known to be true for given \p BestVF and \p
2201/// BestUF.
2203 ElementCount BestVF, unsigned BestUF,
// Recurse into the operands of Cond's defining recipe; one operand that is
// known to be true suffices.
2206 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
2207 &PSE](VPValue *C) {
2208 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
2209 });
2210
2211 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
2214 m_c_Add(m_Specific(CanIV), m_Specific(&Plan.getVFxUF())),
2215 m_Specific(&Plan.getVectorTripCount()))))
2216 return false;
2217
2218 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
2219 // count is not conveniently available as SCEV so far, so we compare directly
2220 // against the original trip count. This is stricter than necessary, as we
2221 // will only return true if the trip count == vector trip count.
2222 const SCEV *VectorTripCount =
// Fall back to the SCEV of the original trip count if the first query could
// not be computed.
2224 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2225 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
2226 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2227 "Trip count SCEV must be computable");
// The condition is known to be true if SCEV proves the trip count equals
// BestVF * BestUF elements.
2228 ScalarEvolution &SE = *PSE.getSE();
2229 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2230 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2231 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
2232}
2233
2234/// Try to replace multiple active lane masks used for control flow with
2235/// a single, wide active lane mask instruction followed by multiple
2236/// extract subvector intrinsics. This applies to the active lane mask
2237/// instructions both in the loop and in the preheader.
2238/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
2239/// new extracts from the first active lane mask, which has its last
2240/// operand (multiplier) set to UF.
/// Returns true if the plan was changed.
2242 unsigned UF) {
// Requires the feature to be enabled, a vector VF and UF other than 1.
2243 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
2244 return false;
2245
2246 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2247 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2248 auto *Term = &ExitingVPBB->back();
2249
2250 using namespace llvm::VPlanPatternMatch;
2252 m_VPValue(), m_VPValue(), m_VPValue())))))
2253 return false;
2254
2255 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2256 LLVMContext &Ctx = Plan.getContext();
2257
// Emit one vector_extract of ALM per unroll part, at element offset
// VF.getKnownMinValue() * Part, insert it after ALM and record it in
// Extracts[Part].
2258 auto ExtractFromALM = [&](VPInstruction *ALM,
2259 SmallVectorImpl<VPValue *> &Extracts) {
2260 DebugLoc DL = ALM->getDebugLoc();
2261 for (unsigned Part = 0; Part < UF; ++Part) {
2263 Ops.append({ALM, Plan.getConstantInt(64, VF.getKnownMinValue() * Part)});
2264 auto *Ext =
2265 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
2266 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
2267 Extracts[Part] = Ext;
2268 Ext->insertAfter(ALM);
2269 }
2270 };
2271
2272 // Create a list of each active lane mask phi, ordered by unroll part.
2274 for (VPRecipeBase &R : Header->phis()) {
2276 if (!Phi)
2277 continue;
2278 VPValue *Index = nullptr;
2279 match(Phi->getBackedgeValue(),
2281 assert(Index && "Expected index from ActiveLaneMask instruction");
2282
2283 uint64_t Part;
2284 if (match(Index,
2286 m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
2287 Phis[Part] = Phi;
2288 else {
2289 // Anything other than a CanonicalIVIncrementForPart is part 0
2290 assert(!match(
2291 Index,
2293 Phis[0] = Phi;
2294 }
2295 }
2296
2297 assert(all_of(Phis, not_equal_to(nullptr)) &&
2298 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2299
// Only the active lane masks feeding the part-0 phi are kept and widened.
2300 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
2301 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
2302
2303 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2304 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2305 "Expected incoming values of Phi to be ActiveLaneMasks");
2306
2307 // When using wide lane masks, the return type of the get.active.lane.mask
2308 // intrinsic is VF x UF (last operand).
2309 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
2310 EntryALM->setOperand(2, ALMMultiplier);
2311 LoopALM->setOperand(2, ALMMultiplier);
2312
2313 // Create UF x extract vectors and insert into preheader.
2314 SmallVector<VPValue *> EntryExtracts(UF);
2315 ExtractFromALM(EntryALM, EntryExtracts);
2316
2317 // Create UF x extract vectors and insert before the loop compare & branch,
2318 // updating the compare to use the first extract.
2319 SmallVector<VPValue *> LoopExtracts(UF);
2320 ExtractFromALM(LoopALM, LoopExtracts);
2321 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2322 Not->setOperand(0, LoopExtracts[0]);
2323
2324 // Update the incoming values of active lane mask phis.
2325 for (unsigned Part = 0; Part < UF; ++Part) {
2326 Phis[Part]->setStartValue(EntryExtracts[Part]);
2327 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2328 }
2329
2330 return true;
2331}
2332
2333/// Try to simplify the branch condition of \p Plan. This may restrict the
2334/// resulting plan to \p BestVF and \p BestUF.
/// Returns true if the terminator of the exiting block was changed.
2336 unsigned BestUF,
2338 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2339 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2340 auto *Term = &ExitingVPBB->back();
2341 VPValue *Cond;
2342 auto m_CanIVInc = m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF()));
2343 // Check if the branch condition compares the canonical IV increment (for main
2344 // loop), or the canonical IV increment plus an offset (for epilog loop).
2345 if (match(Term, m_BranchOnCount(
2346 m_CombineOr(m_CanIVInc, m_c_Add(m_CanIVInc, m_LiveIn())),
2347 m_VPValue())) ||
2349 m_VPValue(), m_VPValue(), m_VPValue()))))) {
2350 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2351 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2352 const SCEV *VectorTripCount =
2354 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2355 VectorTripCount =
2357 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2358 "Trip count SCEV must be computable");
2359 ScalarEvolution &SE = *PSE.getSE();
2360 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2361 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
// Give up unless SCEV proves the trip count is at most BestVF * BestUF.
2362 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2363 return false;
2364 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2366 // For BranchOnCond, check if we can prove the condition to be true using VF
2367 // and UF.
2368 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2369 return false;
2370 } else {
2371 return false;
2372 }
2373
2374 // The vector loop region only executes once. Convert terminator of the
2375 // exiting block to exit in the first iteration.
2376 if (match(Term, m_BranchOnTwoConds())) {
2377 Term->setOperand(1, Plan.getTrue());
2378 return true;
2379 }
2380
// Otherwise append a BranchOnCond on the constant true (keeping the old
// terminator's debug location) and erase the old terminator.
2381 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, Plan.getTrue(), {},
2382 {}, Term->getDebugLoc());
2383 ExitingVPBB->appendRecipe(BOC);
2384 Term->eraseFromParent();
2385
2386 return true;
2387}
2388
2389/// From the definition of llvm.experimental.get.vector.length,
2390/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
/// Returns true after replacing the first such recipe found, false if none
/// was replaced.
2394 vp_depth_first_deep(Plan.getEntry()))) {
2395 for (VPRecipeBase &R : *VPBB) {
2396 VPValue *AVL;
2397 if (!match(&R, m_EVL(m_VPValue(AVL))))
2398 continue;
2399
// Only simplify when the SCEV of AVL is computable and provably <= VF
// (unsigned).
2400 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2401 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2402 continue;
2403 ScalarEvolution &SE = *PSE.getSE();
2404 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2405 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2406 continue;
2407
2409 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2410 R.getDebugLoc());
// If the conversion produced a value other than AVL itself, try to fold it
// using live-in operands.
2411 if (Trunc != AVL) {
2412 auto *TruncR = cast<VPSingleDefRecipe>(Trunc);
2413 const DataLayout &DL = Plan.getDataLayout();
2414 if (VPValue *Folded = tryToFoldLiveIns(*TruncR, TruncR->operands(), DL))
2415 Trunc = Folded;
2416 }
2417 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2418 return true;
2419 }
2420 }
2421 return false;
2422}
2423
2425 unsigned BestUF,
2427 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2428 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2429
// Run the VF/UF-specific simplifications; each reports whether it changed the
// plan.
2430 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2431 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2432 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2433
// If anything changed, the plan is restricted to BestVF.
2434 if (MadeChange) {
2435 Plan.setVF(BestVF);
2436 assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2437 }
2438}
2439
// For each reduction phi whose recurrence kind passes the filter below, drop
// the poison-generating flags of every VPRecipeWithIRFlags among the users
// collected recursively from the phi.
2441 for (VPRecipeBase &R :
2443 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2444 if (!PhiR)
2445 continue;
2446 RecurKind RK = PhiR->getRecurrenceKind();
2447 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2449 continue;
2450
2451 for (VPUser *U : collectUsersRecursively(PhiR))
2452 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2453 RecWithFlags->dropPoisonGeneratingFlags();
2454 }
2455 }
2456}
2457
2458namespace {
/// DenseMapInfo for VPSingleDefRecipe pointers that hashes and compares
/// recipes by their underlying data (see getHashValue and isEqual below).
2459struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2460 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2461 /// return that source element type.
2462 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2463 // All VPInstructions that lower to GEPs must have the i8 source element
2464 // type (as they are PtrAdds), so we omit it.
2466 .Case([](const VPReplicateRecipe *I) -> Type * {
2467 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2468 return GEP->getSourceElementType();
2469 return nullptr;
2470 })
2471 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2472 [](auto *I) { return I->getSourceElementType(); })
2473 .Default([](auto *) { return nullptr; });
2474 }
2475
2476 /// Returns true if recipe \p Def can be safely handed for CSE.
2477 static bool canHandle(const VPSingleDefRecipe *Def) {
2478 // We can extend the list of handled recipes in the future,
2479 // provided we account for the data embedded in them while checking for
2480 // equality or hashing.
2481 auto C = getOpcodeOrIntrinsicID(Def);
2482
2483 // The issue with (Insert|Extract)Value is that the index of the
2484 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2485 // VPlan.
2486 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2487 C->second == Instruction::ExtractValue)))
2488 return false;
2489
2490 // During CSE, we can only handle recipes that don't read from memory: if
2491 // they read from memory, there could be an intervening write to memory
2492 // before the next instance is CSE'd, leading to an incorrect result.
2493 return !Def->mayReadFromMemory();
2494 }
2495
2496 /// Hash the underlying data of \p Def.
2497 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2498 hash_code Result = hash_combine(
2499 Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2500 getGEPSourceElementType(Def), Def->getScalarType(),
// Also mix in the compare predicate or the induction opcode where present.
2502 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2503 if (RFlags->hasPredicate())
2504 return hash_combine(Result, RFlags->getPredicate());
2505 if (auto *SIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Def))
2506 return hash_combine(Result, SIVSteps->getInductionOpcode());
2507 return Result;
2508 }
2509
2510 /// Check equality of underlying data of \p L and \p R.
2511 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2512 if (L->getVPRecipeID() != R->getVPRecipeID() ||
2514 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2516 !equal(L->operands(), R->operands()))
2517 return false;
2519 "must have valid opcode info for both recipes");
// Recipes with a predicate must agree on it.
2520 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2521 if (LFlags->hasPredicate() &&
2522 LFlags->getPredicate() !=
2523 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2524 return false;
// Scalar IV steps must agree on the induction opcode.
2525 if (auto *LSIV = dyn_cast<VPScalarIVStepsRecipe>(L))
2526 if (LSIV->getInductionOpcode() !=
2527 cast<VPScalarIVStepsRecipe>(R)->getInductionOpcode())
2528 return false;
2529 // Recipes in replicate regions implicitly depend on predicate. If either
2530 // recipe is in a replicate region, only consider them equal if both have
2531 // the same parent.
2532 const VPRegionBlock *RegionL = L->getRegion();
2533 const VPRegionBlock *RegionR = R->getRegion();
2534 if (((RegionL && RegionL->isReplicator()) ||
2535 (RegionR && RegionR->isReplicator())) &&
2536 L->getParent() != R->getParent())
2537 return false;
2538 return L->getScalarType() == R->getScalarType();
2539 }
2540};
2541} // end anonymous namespace
2542
2543/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2544/// Plan.
2546 VPDominatorTree VPDT(Plan);
2548
2550 Plan.getEntry());
2552 for (VPRecipeBase &R : *VPBB) {
2553 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
// Only single-def recipes accepted by VPCSEDenseMapInfo::canHandle take part.
2554 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2555 continue;
2556 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2557 // V must dominate Def for a valid replacement.
2558 if (!VPDT.dominates(V->getParent(), VPBB))
2559 continue;
2560 // Only keep flags present on both V and Def.
2561 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2562 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2563 Def->replaceAllUsesWith(V);
2564 continue;
2565 }
// No entry found for Def: record Def itself in the map.
2566 CSEMap[Def] = Def;
2567 }
2568 }
2569}
2570
2571/// Return true if we do not know how to (mechanically) hoist or sink a
2572/// non-memory or memory recipe \p R out of a loop region. When sinking, passing
2573/// \p Sinking = true ensures that assumes aren't sunk.
2575 VPBasicBlock *LastBB,
2576 bool Sinking = false) {
// Recipes that are not replicates, or that do not access memory, are
// delegated to vputils::cannotHoistOrSinkRecipe.
2577 if (!isa<VPReplicateRecipe>(R) || !R.mayReadOrWriteMemory() ||
2579 return vputils::cannotHoistOrSinkRecipe(R, Sinking);
2580
2581 // Check that the memory operation doesn't alias between FirstBB and LastBB.
2582 auto MemLoc = vputils::getMemoryLocation(R);
2583
2584 // TODO: Could make use of SinkStoreInfo::isNoAliasViaDistance by collecting
2585 // stores upfront, and constructing a full SinkStoreInfo.
2586 auto SinkInfo =
2587 Sinking ? std::make_optional(SinkStoreInfo(cast<VPReplicateRecipe>(R)))
2588 : std::nullopt;
2589
// Without a known memory location the recipe is treated as not movable.
2590 return !MemLoc ||
2591 !canHoistOrSinkWithNoAliasCheck(*MemLoc, FirstBB, LastBB, SinkInfo);
2592}
2593
2594/// Move loop-invariant recipes out of the vector loop region in \p Plan.
2595static void licm(VPlan &Plan) {
2596 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2597
2598 // Hoist any loop invariant recipes from the vector loop region to the
2599 // preheader. Perform a shallow traversal of the vector loop region, to
2600 // exclude recipes in replicate regions. Since the top-level blocks in the
2601 // vector loop region are guaranteed to execute if the vector pre-header is,
2602 // we don't need to check speculation safety.
2603 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2604 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2605 "Expected vector prehader's successor to be the vector loop region");
2607 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2608 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2609 if (cannotHoistOrSinkRecipe(R, LoopRegion->getEntryBasicBlock(),
2610 LoopRegion->getExitingBasicBlock()))
2611 continue;
// Every operand must be defined outside of loop regions.
2612 if (any_of(R.operands(), [](VPValue *Op) {
2613 return !Op->isDefinedOutsideLoopRegions();
2614 }))
2615 continue;
2616 R.moveBefore(*Preheader, Preheader->end());
2617 }
2618 }
2619
2620#ifndef NDEBUG
2621 VPDominatorTree VPDT(Plan);
2622#endif
2623 // Sink recipes with no users inside the vector loop region if all users are
2624 // in the same exit block of the region.
2625 // TODO: Extend to sink recipes from inner loops.
2627 LoopRegion->getEntry());
2629 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
2630 if (cannotHoistOrSinkRecipe(R, LoopRegion->getEntryBasicBlock(),
2631 LoopRegion->getExitingBasicBlock(),
2632 /*Sinking=*/true))
2633 continue;
2634
2635 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
2636 assert(!RepR->isPredicated() &&
2637 "Expected prior transformation of predicated replicates to "
2638 "replicate regions");
2639 // narrowToSingleScalarRecipes should have already maximally narrowed
2640 // replicates to single-scalar replicates.
2641 // TODO: When unrolling, replicateByVF doesn't handle sunk
2642 // non-single-scalar replicates correctly.
2643 if (!RepR->isSingleScalar())
2644 continue;
2645
2646 // The pointer operand of stores must be loop-invariant.
2647 if (RepR->getOpcode() == Instruction::Store &&
2648 !RepR->getOperand(1)->isDefinedOutsideLoopRegions())
2649 continue;
2650 }
2651
2652 [[maybe_unused]] auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
2653 assert((!R.mayWriteToMemory() ||
2654 (RepR && RepR->getOpcode() == Instruction::Store &&
2655 RepR->getOperand(1)->isDefinedOutsideLoopRegions())) &&
2656 "The only recipes that may write to memory are expected to be "
2657 "stores with invariant pointer-operand");
2658
2659 // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2660 // support recipes with multiple defined values (e.g., interleaved loads).
2661 auto *Def = cast<VPSingleDefRecipe>(&R);
2662
2663 // Cannot sink the recipe if the user is defined in a loop region or a
2664 // non-successor of the vector loop region. Cannot sink if user is a phi
2665 // either.
2666 VPBasicBlock *SinkBB = nullptr;
2667 if (any_of(Def->users(), [&SinkBB, &LoopRegion](VPUser *U) {
2668 auto *UserR = cast<VPRecipeBase>(U);
2669 VPBasicBlock *Parent = UserR->getParent();
2670 // TODO: Support sinking when users are in multiple blocks.
2671 if (SinkBB && SinkBB != Parent)
2672 return true;
2673 SinkBB = Parent;
2674 // TODO: If the user is a PHI node, we should check the block of
2675 // incoming value. Support PHI node users if needed.
2676 return UserR->isPhi() || Parent->getEnclosingLoopRegion() ||
2677 Parent->getSinglePredecessor() != LoopRegion;
2678 }))
2679 continue;
2680
// A recipe without users is sunk to the single successor of the loop region.
2681 if (!SinkBB)
2682 SinkBB = cast<VPBasicBlock>(LoopRegion->getSingleSuccessor());
2683
2684 // TODO: This will need to be a check instead of an assert after
2685 // conditional branches in vectorized loops are supported.
2686 assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2687 "Defining block must dominate sink block");
2688 // TODO: Clone the recipe if users are on multiple exit paths, instead of
2689 // just moving.
2690 Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
2691 }
2692 }
2693}
2694
2696 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2697 if (Plan.hasScalarVFOnly())
2698 return;
2699 // Keep track of created truncates, so they can be re-used. Note that we
2700 // cannot use RAUW after creating a new truncate, as this could make
2701 // other uses have different types for their operands, making them invalidly
2702 // typed.
2704 VPBasicBlock *PH = Plan.getVectorPreheader();
2707 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2710 continue;
2711
2712 VPValue *ResultVPV = R.getVPSingleValue();
2713 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2714 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2715 if (!NewResSizeInBits)
2716 continue;
2717
2718 // If the value wasn't vectorized, we must maintain the original scalar
2719 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2720 // skip casts which do not need to be handled explicitly here, as
2721 // redundant casts will be removed during recipe simplification.
2723 continue;
2724
2725 Type *OldResTy = ResultVPV->getScalarType();
2726 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2727 assert(OldResTy->isIntegerTy() && "only integer types supported");
2728 (void)OldResSizeInBits;
2729
2730 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2731
2732 // Any wrapping introduced by shrinking this operation shouldn't be
2733 // considered undefined behavior. So, we can't unconditionally copy
2734 // arithmetic wrapping flags to VPW.
2735 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2736 VPW->dropPoisonGeneratingFlags();
2737
2738 assert((OldResSizeInBits != NewResSizeInBits ||
2739 match(&R, m_ICmp(m_VPValue(), m_VPValue()))) &&
2740 "Only ICmps should not need extending the result.");
2741 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2742
2743 // For loads/intrinsics we don't recreate the recipe; just wrap the
2744 // original wide result in a ZExt to OldResTy.
2746 if (OldResSizeInBits != NewResSizeInBits) {
2748 Instruction::ZExt, ResultVPV, OldResTy);
2749 ResultVPV->replaceAllUsesWith(Ext);
2750 Ext->setOperand(0, ResultVPV);
2751 }
2752 continue;
2753 }
2754
2755 // Shrink operands by introducing truncates as needed.
2756 unsigned StartIdx =
2757 match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2758 SmallVector<VPValue *> NewOperands(R.operands());
2759 for (VPValue *&Op : drop_begin(NewOperands, StartIdx)) {
2760 unsigned OpSizeInBits = Op->getScalarType()->getScalarSizeInBits();
2761 if (OpSizeInBits == NewResSizeInBits)
2762 continue;
2763 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2764 auto [ProcessedIter, Inserted] = ProcessedTruncs.try_emplace(Op);
2765 if (Inserted) {
2766 VPBuilder Builder;
2767 if (isa<VPIRValue>(Op))
2768 Builder.setInsertPoint(PH);
2769 else
2770 Builder.setInsertPoint(&R);
2771 ProcessedIter->second =
2772 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2773 }
2774 Op = ProcessedIter->second;
2775 }
2776
2777 auto *NWR = cast<VPWidenRecipe>(&R)->cloneWithOperands(NewOperands);
2778 NWR->insertBefore(&R);
2779
2780 // Wrap NWR in a ZExt to preserve the original wide type for downstream
2781 // users (unless this is an ICmp, which produces i1 regardless).
2782 VPValue *Replacement = NWR->getVPSingleValue();
2783 if (OldResSizeInBits != NewResSizeInBits)
2784 Replacement =
2786 .createWidenCast(Instruction::ZExt, Replacement, OldResTy)
2787 ->getVPSingleValue();
2788 ResultVPV->replaceAllUsesWith(Replacement);
2789 R.eraseFromParent();
2790 }
2791 }
2792}
2793
/// Remove BranchOnCond terminators in \p Plan whose condition matches m_True()
/// or m_False(), disconnecting the successor that is no longer taken, and then
/// clean up blocks that are not reachable any more.
/// \p OnlyLatches When set, only blocks for which VPBlockUtils::isLatch()
///                holds are considered.
/// \returns true if a disconnected successor contained at least one phi recipe
///          (i.e. an incoming value was removed from a phi).
2794bool VPlanTransforms::removeBranchOnConst(VPlan &Plan, bool OnlyLatches) {
// The dominator tree is only constructed when it is needed for the latch
// query below.
2795 std::optional<VPDominatorTree> VPDT;
2796 if (OnlyLatches)
2797 VPDT.emplace(Plan);
2798
2799 // Collect all blocks before modifying the CFG so we can identify unreachable
2800 // ones after constant branch removal.
// NOTE(review): the line defining AllBlocks is missing from this listing.
2802
2803 bool SimplifiedPhi = false;
2804 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(AllBlocks)) {
2805 VPValue *Cond;
2806 // Skip blocks that are not terminated by BranchOnCond.
2807 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2808 continue;
2809
2810 if (OnlyLatches && !VPBlockUtils::isLatch(VPBB, *VPDT))
2811 continue;
2812
2813 assert(VPBB->getNumSuccessors() == 2 &&
2814 "Two successors expected for BranchOnCond");
// A true condition drops successor 1, a false condition drops successor 0;
// any other condition leaves the branch untouched.
2815 unsigned RemovedIdx;
2816 if (match(Cond, m_True()))
2817 RemovedIdx = 1;
2818 else if (match(Cond, m_False()))
2819 RemovedIdx = 0;
2820 else
2821 continue;
2822
2823 VPBasicBlock *RemovedSucc =
2824 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2825 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2826 "There must be a single edge between VPBB and its successor");
2827 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2828 // these recipes.
2829 auto Phis = RemovedSucc->phis();
2830 for (VPRecipeBase &R : Phis)
2831 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
// Record that at least one phi lost an incoming value; this is the function's
// return value.
2832 SimplifiedPhi |= !std::empty(Phis);
2833
2834 // Disconnect blocks and remove the terminator.
2835 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2836 VPBB->back().eraseFromParent();
2837 }
2838
2839 // Compute which blocks are still reachable from the entry after constant
2840 // branch removal.
// NOTE(review): the lines computing Reachable are missing from this listing.
2843
2844 // Detach all unreachable blocks from their successors, removing their recipes
2845 // and incoming values from phi recipes.
// Tmp is a placeholder value: remaining uses of values defined in dead blocks
// are redirected to it before the defining recipes are erased.
2846 VPSymbolicValue Tmp(nullptr);
2847 for (VPBlockBase *B : AllBlocks) {
2848 if (Reachable.contains(B))
2849 continue;
// Iterate over a copy of the successor list (to_vector).
// NOTE(review): the statement following the phi update is missing from this
// listing; presumably it disconnects B from Succ -- confirm against upstream.
2850 for (VPBlockBase *Succ : to_vector(B->successors())) {
2851 if (auto *SuccBB = dyn_cast<VPBasicBlock>(Succ))
2852 for (VPRecipeBase &R : SuccBB->phis())
2853 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(B);
2855 }
2856 for (VPBasicBlock *DeadBB :
2858 for (VPRecipeBase &R : make_early_inc_range(*DeadBB)) {
2859 for (VPValue *Def : R.definedValues())
2860 Def->replaceAllUsesWith(&Tmp);
2861 R.eraseFromParent();
2862 }
2863 }
2864 }
2865 return SimplifiedPhi;
2866}
2867
2888
2889// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2890// the loop terminator with a branch-on-cond recipe with the negated
2891// active-lane-mask as operand. Note that this turns the loop into an
2892// uncountable one. Only the existing terminator is replaced, all other existing
2893// recipes/users remain unchanged, except for poison-generating flags being
2894// dropped from the canonical IV increment. Return the created
2895// VPActiveLaneMaskPHIRecipe.
2896//
2897// The function adds the following recipes:
2898//
2899// vector.ph:
2900// %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
2901// %EntryALM = active-lane-mask %EntryInc, TC
2902//
2903// vector.body:
2904// ...
2905// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2906// ...
2907// %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
2908// %ALM = active-lane-mask %InLoopInc, TC
2909// %Negated = Not %ALM
2910// branch-on-cond %Negated
2911//
2914 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2915 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2916 VPValue *StartV = Plan.getZero(TopRegion->getCanonicalIVType());
2917 auto *CanonicalIVIncrement = TopRegion->getOrCreateCanonicalIVIncrement();
2918 // TODO: Check if dropping the flags is needed.
2919 TopRegion->clearCanonicalIVNUW(CanonicalIVIncrement);
2920 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2921 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2922 // we have to take unrolling into account. Each part needs to start at
2923 // Part * VF
2924 auto *VecPreheader = Plan.getVectorPreheader();
2925 VPBuilder Builder(VecPreheader);
2926
2927 // Create the ActiveLaneMask instruction using the correct start values.
2928 VPValue *TC = Plan.getTripCount();
2929 VPValue *VF = &Plan.getVF();
2930
2931 auto *EntryIncrement = Builder.createOverflowingOp(
2932 VPInstruction::CanonicalIVIncrementForPart, {StartV, VF}, {false, false},
2933 DL, "index.part.next");
2934
2935 // Create the active lane mask instruction in the VPlan preheader.
2936 VPValue *ALMMultiplier =
2937 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2938 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2939 {EntryIncrement, TC, ALMMultiplier}, DL,
2940 "active.lane.mask.entry");
2941
2942 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2943 // preheader ActiveLaneMask instruction.
2944 auto *LaneMaskPhi =
2946 auto *HeaderVPBB = TopRegion->getEntryBasicBlock();
2947 LaneMaskPhi->insertBefore(*HeaderVPBB, HeaderVPBB->begin());
2948
2949 // Create the active lane mask for the next iteration of the loop before the
2950 // original terminator.
2951 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2952 Builder.setInsertPoint(OriginalTerminator);
2953 auto *InLoopIncrement = Builder.createOverflowingOp(
2955 {CanonicalIVIncrement, &Plan.getVF()}, {false, false}, DL);
2956 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2957 {InLoopIncrement, TC, ALMMultiplier}, DL,
2958 "active.lane.mask.next");
2959 LaneMaskPhi->addBackedgeValue(ALM);
2960
2961 // Replace the original terminator with BranchOnCond. We have to invert the
2962 // mask here because a true condition means jumping to the exit block.
2963 auto *NotMask = Builder.createNot(ALM, DL);
2964 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2965 OriginalTerminator->eraseFromParent();
2966 return LaneMaskPhi;
2967}
2968
2970 bool UseActiveLaneMaskForControlFlow) {
2971 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2972 auto *WideCanonicalIV =
2974 assert(WideCanonicalIV &&
2975 "Must have widened canonical IV when tail folding!");
2976 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
2977 VPSingleDefRecipe *LaneMask;
2978 if (UseActiveLaneMaskForControlFlow) {
2979 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
2980 } else {
2981 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2982 VPValue *ALMMultiplier =
2983 Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
2984 LaneMask =
2985 B.createNaryOp(VPInstruction::ActiveLaneMask,
2986 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2987 nullptr, "active.lane.mask");
2988 }
2989
2990 // Walk users of WideCanonicalIV and replace the header mask of the form
2991 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2992 // removing the old one to ensure there is always only a single header mask.
2993 HeaderMask->replaceAllUsesWith(LaneMask);
2994 HeaderMask->eraseFromParent();
2995}
2996
/// Pattern matcher that recognizes the mask \p In either on its own or as the
/// first operand of m_LogicalAnd, and reports the other operand through Out.
2997template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
/// The mask value that has to be present in the matched value.
2998 Op0_t In;
3000
/// Bind the mask to look for (\p In) and the location that receives the
/// remaining operand (\p Out).
3001 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
3002
/// Returns true if \p V is exactly In, in which case Out is set to nullptr,
/// or if \p V matches m_LogicalAnd(m_Specific(In), m_VPValue(Out)), in which
/// case Out is bound by that sub-matcher. Returns false otherwise.
3003 template <typename OpTy> bool match(OpTy *V) const {
3004 if (m_Specific(In).match(V)) {
3005 Out = nullptr;
3006 return true;
3007 }
3008 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
3009 }
3010};
3010};
3011
3012/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
3013/// Returns the remaining part \p Out if so, or nullptr otherwise.
3014template <typename Op0_t, typename Op1_t>
3015static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
3016 Op1_t &Out) {
3017 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
3018}
3019
3020static std::optional<Intrinsic::ID> getVPDivRemIntrinsic(Intrinsic::ID IntrID) {
3021 switch (IntrID) {
3022 case Intrinsic::masked_udiv:
3023 return Intrinsic::vp_udiv;
3024 case Intrinsic::masked_sdiv:
3025 return Intrinsic::vp_sdiv;
3026 case Intrinsic::masked_urem:
3027 return Intrinsic::vp_urem;
3028 case Intrinsic::masked_srem:
3029 return Intrinsic::vp_srem;
3030 default:
3031 return std::nullopt;
3032 }
3033}
3034
3035/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
3036/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
3037/// recipe could be created.
3038/// \p HeaderMask Header Mask.
3039/// \p CurRecipe Recipe to be transformed.
3040/// \p EVL The explicit vector length parameter of vector-predication
3041/// intrinsics.
3043 VPRecipeBase &CurRecipe, VPValue &EVL) {
3044 VPlan *Plan = CurRecipe.getParent()->getPlan();
3045 DebugLoc DL = CurRecipe.getDebugLoc();
3046 VPValue *Addr, *Mask, *EndPtr;
3047
3048 /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
3049 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
3050 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
3051 EVLEndPtr->insertBefore(&CurRecipe);
3052 // Cast EVL (i32) to match the VF operand's type.
3053 VPValue *EVLAsVF = VPBuilder(EVLEndPtr).createScalarZExtOrTrunc(
3054 &EVL, EVLEndPtr->getOperand(1)->getScalarType(), EVL.getScalarType(),
3056 EVLEndPtr->setOperand(1, EVLAsVF);
3057 return EVLEndPtr;
3058 };
3059
3060 auto GetVPReverse = [&CurRecipe, &EVL, Plan,
3062 if (!V)
3063 return nullptr;
3064 auto *Reverse = new VPWidenIntrinsicRecipe(
3065 Intrinsic::experimental_vp_reverse, {V, Plan->getTrue(), &EVL},
3066 V->getScalarType(), {}, {}, DL);
3067 Reverse->insertBefore(&CurRecipe);
3068 return Reverse;
3069 };
3070
3071 if (match(&CurRecipe,
3072 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))))
3073 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
3074 EVL, Mask);
3075
3076 if (match(&CurRecipe,
3077 m_MaskedLoad(m_VPValue(EndPtr),
3078 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
3079 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
3080 Mask = GetVPReverse(Mask);
3081 Addr = AdjustEndPtr(EndPtr);
3082 auto *LoadR = new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
3083 Addr, EVL, Mask);
3084 LoadR->insertBefore(&CurRecipe);
3085 VPValue *Poison = Plan->getPoison(LoadR->getScalarType());
3086 return new VPWidenIntrinsicRecipe(Intrinsic::vector_splice_left,
3087 {Poison, LoadR, &EVL},
3088 LoadR->getScalarType(), {}, {}, DL);
3089 }
3090
3091 VPValue *Stride;
3093 m_VPValue(Addr), m_VPValue(Stride),
3094 m_RemoveMask(HeaderMask, Mask),
3095 m_TruncOrSelf(m_Specific(&Plan->getVF()))))) {
3096 if (!Mask)
3097 Mask = Plan->getTrue();
3098 auto *NewLoad = cast<VPWidenMemIntrinsicRecipe>(&CurRecipe)->clone();
3099 NewLoad->setOperand(2, Mask);
3100 NewLoad->setOperand(3, &EVL);
3101 return NewLoad;
3102 }
3103
3104 VPValue *StoredVal;
3105 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
3106 m_RemoveMask(HeaderMask, Mask))))
3107 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3108 StoredVal, EVL, Mask);
3109
3110 if (match(&CurRecipe,
3111 m_MaskedStore(m_VPValue(EndPtr), m_VPValue(StoredVal),
3112 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
3113 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
3114 Mask = GetVPReverse(Mask);
3115 Addr = AdjustEndPtr(EndPtr);
3116 VPValue *Poison = Plan->getPoison(StoredVal->getScalarType());
3117 auto *SpliceR = new VPWidenIntrinsicRecipe(
3118 Intrinsic::vector_splice_right, {StoredVal, Poison, &EVL},
3119 StoredVal->getScalarType(), {}, {}, DL);
3120 SpliceR->insertBefore(&CurRecipe);
3121 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3122 SpliceR, EVL, Mask);
3123 }
3124
3125 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
3126 if (Rdx->isConditional() &&
3127 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
3128 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
3129
3130 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
3131 if (Interleave->getMask() &&
3132 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
3133 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
3134
3135 VPValue *LHS, *RHS;
3136 if (match(&CurRecipe, m_SelectLike(m_RemoveMask(HeaderMask, Mask),
3138 return new VPWidenIntrinsicRecipe(
3139 Intrinsic::vp_merge, {Mask ? Mask : Plan->getTrue(), LHS, RHS, &EVL},
3140 LHS->getScalarType(), {}, {}, DL);
3141
3142 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
3143 Type *Ty = CurRecipe.getVPSingleValue()->getScalarType();
3144 VPValue *ZExt =
3145 VPBuilder(&CurRecipe)
3146 .createScalarZExtOrTrunc(&EVL, Ty, EVL.getScalarType(), DL);
3147 return new VPInstruction(
3148 Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, 1)},
3149 VPIRFlags::getDefaultFlags(Instruction::Sub), {}, DL);
3150 }
3151
3152 // lhs | (headermask && rhs) -> vp.merge rhs, true, lhs, evl
3153 if (match(&CurRecipe,
3155 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(RHS)))))
3156 return new VPWidenIntrinsicRecipe(Intrinsic::vp_merge,
3157 {RHS, Plan->getTrue(), LHS, &EVL},
3158 LHS->getScalarType(), {}, {}, DL);
3159
3160 if (auto *IntrR = dyn_cast<VPWidenIntrinsicRecipe>(&CurRecipe))
3161 if (auto VPID = getVPDivRemIntrinsic(IntrR->getVectorIntrinsicID()))
3162 if (match(IntrR->getOperand(2), m_RemoveMask(HeaderMask, Mask)))
3163 return new VPWidenIntrinsicRecipe(*VPID,
3164 {IntrR->getOperand(0),
3165 IntrR->getOperand(1),
3166 Mask ? Mask : Plan->getTrue(), &EVL},
3167 IntrR->getScalarType(), {}, {}, DL);
3168
3169 return nullptr;
3170}
3171
3172/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3173/// The transforms here need to preserve the original semantics.
3175 // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3176 VPValue *HeaderMask = nullptr, *EVL = nullptr;
3179 m_VPValue(EVL))) &&
3180 match(EVL, m_EVL(m_VPValue()))) {
3181 HeaderMask = R.getVPSingleValue();
3182 break;
3183 }
3184 }
3185 if (!HeaderMask)
3186 return;
3187
3188 SmallVector<VPRecipeBase *> OldRecipes;
3189 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3191 if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, *EVL)) {
3192 NewR->insertBefore(R);
3193 for (auto [Old, New] :
3194 zip_equal(R->definedValues(), NewR->definedValues()))
3195 Old->replaceAllUsesWith(New);
3196 OldRecipes.push_back(R);
3197 }
3198 }
3199
3200 // Replace remaining (HeaderMask && Mask) with vp.merge (True, Mask,
3201 // False, EVL)
3202 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3203 VPValue *Mask;
3204 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) {
3205 auto *LogicalAnd = cast<VPInstruction>(U);
3206 auto *Merge = new VPWidenIntrinsicRecipe(
3207 Intrinsic::vp_merge, {Plan.getTrue(), Mask, Plan.getFalse(), EVL},
3208 Mask->getScalarType(), {}, {}, LogicalAnd->getDebugLoc());
3209 Merge->insertBefore(LogicalAnd);
3210 LogicalAnd->replaceAllUsesWith(Merge);
3211 OldRecipes.push_back(LogicalAnd);
3212 }
3213 }
3214
3215 // Fold the following splice patterns:
3216 // splice.right(splice.left(poison, x, evl), poison, evl) -> x
3217 // vector.reverse(splice.left(poison, x, evl)) -> vp.reverse(x, true, evl)
3218 // splice.right(vector.reverse(x), poison, evl) -> vp.reverse(x, true, evl)
3219 for (VPUser *U : collectUsersRecursively(EVL)) {
3220 auto *R = cast<VPRecipeBase>(U);
3221 VPValue *X;
3224 m_Poison(), m_VPValue(X), m_Specific(EVL)),
3225 m_Poison(), m_Specific(EVL)))) {
3226 R->getVPSingleValue()->replaceAllUsesWith(X);
3227 OldRecipes.push_back(R);
3228 continue;
3229 }
3230
3231 if (!match(U,
3234 m_Poison(), m_VPValue(X), m_Specific(EVL))),
3236 m_Reverse(m_VPValue(X)), m_Poison(), m_Specific(EVL)))))
3237 continue;
3238
3239 auto *VPReverse = new VPWidenIntrinsicRecipe(
3240 Intrinsic::experimental_vp_reverse, {X, Plan.getTrue(), EVL},
3241 X->getScalarType(), {}, {}, R->getDebugLoc());
3242 VPReverse->insertBefore(R);
3243 R->getVPSingleValue()->replaceAllUsesWith(VPReverse);
3244 OldRecipes.push_back(R);
3245 }
3246
3247 for (VPRecipeBase *R : reverse(OldRecipes)) {
3248 SmallVector<VPValue *> PossiblyDead(R->operands());
3249 R->eraseFromParent();
3250 for (VPValue *Op : PossiblyDead)
3252 }
3253}
3254
3255/// After replacing the canonical IV with an EVL-based IV, fixup recipes that use
3256/// VF to use the EVL instead to avoid incorrect updates on the penultimate
3257/// iteration.
/// \p Plan The plan whose VF/VFxUF users and header mask are rewritten.
/// \p EVL  The explicit vector length value that replaces VF in those users and
///         that the new header mask compares against.
3258static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3259 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3260 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3261
3262 // EVL is i32 but VF/VFxUF are IdxTy. Convert as needed.
// NOTE(review): the initializer of EVLAsIdx is missing from this listing.
3263 VPValue *EVLAsIdx =
3267
// Every user of VF must be one of the recipe kinds listed below, either
// directly or through a trunc of VF.
3268 assert(all_of(Plan.getVF().users(),
3269 [&Plan](VPUser *U) {
3270 auto IsAllowedUser =
3271 IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
3272 VPWidenIntOrFpInductionRecipe,
3273 VPWidenMemIntrinsicRecipe>;
3274 if (match(U, m_Trunc(m_Specific(&Plan.getVF()))))
3275 return all_of(cast<VPSingleDefRecipe>(U)->users(),
3276 IsAllowedUser);
3277 return IsAllowedUser(U);
3278 }) &&
3279 "User of VF that we can't transform to EVL.");
// NOTE(review): the body of the predicate selecting which VF uses are
// replaced is missing from this listing.
3280 Plan.getVF().replaceUsesWithIf(EVLAsIdx, [](VPUser &U, unsigned Idx) {
3282 });
3283
3284 assert(all_of(Plan.getVFxUF().users(),
3286 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3287 m_Specific(&Plan.getVFxUF())),
3289 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3290 "increment of the canonical induction.");
3291 Plan.getVFxUF().replaceUsesWithIf(EVLAsIdx, [](VPUser &U, unsigned Idx) {
3292 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
3293 // canonical induction must not be updated.
3295 });
3296
3297 // Create a scalar phi to track the previous EVL if fixed-order recurrence is
3298 // contained.
3299 bool ContainsFORs =
3301 if (ContainsFORs) {
3302 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3303 VPValue *MaxEVL = &Plan.getVF();
3304 // Emit VPScalarCastRecipe in preheader if VF is not a 32-bit integer.
3305 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3306 MaxEVL = Builder.createScalarZExtOrTrunc(
3307 MaxEVL, Type::getInt32Ty(Plan.getContext()), MaxEVL->getScalarType(),
3309
// The phi is placed at the first non-phi position of the header; its operands
// are MaxEVL and EVL (presumably start value and backedge value, in that
// order -- confirm against createScalarPhi).
3310 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3311 VPValue *PrevEVL = Builder.createScalarPhi(
3312 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3313
// Matching recipes get their uses redirected to an experimental_vp_splice
// that takes both the previous and the current EVL.
3316 for (VPRecipeBase &R : *VPBB) {
3317 VPValue *V1, *V2;
3318 if (!match(&R,
3320 m_VPValue(V1), m_VPValue(V2))))
3321 continue;
3322 VPValue *Imm = Plan.getOrAddLiveIn(
3325 Intrinsic::experimental_vp_splice,
3326 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3327 R.getVPSingleValue()->getScalarType(), {}, {}, R.getDebugLoc());
3328 VPSplice->insertBefore(&R);
3329 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3330 }
3331 }
3332 }
3333
// Without a header mask there is nothing further to rewrite.
3334 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
3335 if (!HeaderMask)
3336 return;
3337
3338 // Ensure that any reduction that uses a select to mask off tail lanes does so
3339 // in the vector loop, not the middle block, since EVL tail folding can have
3340 // tail elements in the penultimate iteration.
3341 assert(all_of(*Plan.getMiddleBlock(), [&Plan, HeaderMask](VPRecipeBase &R) {
3342 if (match(&R, m_ComputeReductionResult(m_Select(m_Specific(HeaderMask),
3343 m_VPValue(), m_VPValue()))))
3344 return R.getOperand(0)->getDefiningRecipe()->getRegion() ==
3345 Plan.getVectorLoopRegion();
3346 return true;
3347 }));
3348
3349 // Replace header masks with a mask equivalent to predicating by EVL:
3350 //
3351 // icmp ule widen-canonical-iv backedge-taken-count
3352 // ->
3353 // icmp ult step-vector, EVL
// The new compare is inserted immediately after the recipe that defines EVL.
3354 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3355 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3356 Type *EVLType = EVL.getScalarType();
3357 VPValue *EVLMask = Builder.createICmp(
3359 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3360 HeaderMask->replaceAllUsesWith(EVLMask);
3361}
3362
3363/// Converts a tail folded vector loop region to step by
3364/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3365/// iteration.
3366///
3367/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3368/// replace all uses of the canonical IV except for the canonical IV
3369/// increment with a VPCurrentIterationPHIRecipe. The canonical IV is used
3370/// only for loop iterations counting after this transformation.
3371///
3372/// - The header mask is replaced with a header mask based on the EVL.
3373///
3374/// - Plans with FORs have a new phi added to keep track of the EVL of the
3375/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3376/// @llvm.vp.splice.
3377///
3378/// The function uses the following definitions:
3379/// %StartV is the canonical induction start value.
3380///
3381/// The function adds the following recipes:
3382///
3383/// vector.ph:
3384/// ...
3385///
3386/// vector.body:
3387/// ...
3388/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3389/// [ %NextIter, %vector.body ]
3390/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3391/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3392/// ...
3393/// %OpEVL = cast i32 %VPEVL to IVSize
3394/// %NextIter = add IVSize %OpEVL, %CurrentIter
3395/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3396/// ...
3397///
3398/// If MaxSafeElements is provided, the function adds the following recipes:
3399/// vector.ph:
3400/// ...
3401///
3402/// vector.body:
3403/// ...
3404/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3405/// [ %NextIter, %vector.body ]
3406/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3407/// %cmp = cmp ult %AVL, MaxSafeElements
3408/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3409/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3410/// ...
3411/// %OpEVL = cast i32 %VPEVL to IVSize
3412/// %NextIter = add IVSize %OpEVL, %CurrentIter
3413/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3414/// ...
3415///
3417 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3418 if (Plan.hasScalarVFOnly())
3419 return;
3420 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3421 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3422
3423 auto *CanonicalIV = LoopRegion->getCanonicalIV();
3424 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3425 VPValue *StartV = Plan.getZero(CanIVTy);
3426 auto *CanonicalIVIncrement = LoopRegion->getOrCreateCanonicalIVIncrement();
3427
3428 // Create the CurrentIteration recipe in the vector loop.
3429 auto *CurrentIteration =
3431 CurrentIteration->insertBefore(*Header, Header->begin());
3432 VPBuilder Builder(Header, Header->getFirstNonPhi());
3433 // Create the AVL (application vector length), starting from TC -> 0 in steps
3434 // of EVL.
3435 VPPhi *AVLPhi = Builder.createScalarPhi(
3436 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3437 VPValue *AVL = AVLPhi;
3438
3439 if (MaxSafeElements) {
3440 // Support for MaxSafeDist for correct loop emission.
3441 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3442 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3443 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3444 "safe_avl");
3445 }
3446 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3447 DebugLoc::getUnknown(), "evl");
3448
3449 Builder.setInsertPoint(CanonicalIVIncrement);
3450 VPValue *OpVPEVL = VPEVL;
3451
3452 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3453 OpVPEVL = Builder.createScalarZExtOrTrunc(
3454 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3455
3456 auto *NextIter = Builder.createAdd(
3457 OpVPEVL, CurrentIteration, CanonicalIVIncrement->getDebugLoc(),
3458 "current.iteration.next", CanonicalIVIncrement->getNoWrapFlags());
3459 CurrentIteration->addBackedgeValue(NextIter);
3460
3461 VPValue *NextAVL =
3462 Builder.createSub(AVLPhi, OpVPEVL, DebugLoc::getCompilerGenerated(),
3463 "avl.next", {/*NUW=*/true, /*NSW=*/false});
3464 AVLPhi->addIncoming(NextAVL);
3465
3466 fixupVFUsersForEVL(Plan, *VPEVL);
3467 removeDeadRecipes(Plan);
3468
3469 // Replace all uses of the canonical IV with VPCurrentIterationPHIRecipe
3470 // except for the canonical IV increment.
3471 CanonicalIV->replaceUsesWithIf(CurrentIteration,
3472 [CanonicalIVIncrement](VPUser &U, unsigned) {
3473 return &U != CanonicalIVIncrement;
3474 });
3475 // TODO: support unroll factor > 1.
3476 Plan.setUF(1);
3477}
3478
3480 // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
3481 // There should be only one VPCurrentIteration in the entire plan.
3482 VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
3483
3486 for (VPRecipeBase &R : VPBB->phis())
3487 if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
3488 assert(!CurrentIteration &&
3489 "Found multiple CurrentIteration. Only one expected");
3490 CurrentIteration = PhiR;
3491 }
3492
3493 // Early return if it is not variable-length stepping.
3494 if (!CurrentIteration)
3495 return;
3496
3497 VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3498 VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3499
3500 // Convert CurrentIteration to concrete recipe.
3501 auto *ScalarR =
3502 VPBuilder(CurrentIteration)
3504 {CurrentIteration->getStartValue(), CurrentIterationIncr},
3505 CurrentIteration->getDebugLoc(), "current.iteration.iv");
3506 CurrentIteration->replaceAllUsesWith(ScalarR);
3507 CurrentIteration->eraseFromParent();
3508
3509 // Replace CanonicalIVInc with CurrentIteration increment if it exists.
3510 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3511 if (auto *CanIVInc = findUserOf(
3512 CanonicalIV, m_c_Add(m_VPValue(), m_Specific(&Plan.getVFxUF())))) {
3513 cast<VPInstruction>(CanIVInc)->replaceAllUsesWith(CurrentIterationIncr);
3514 CanIVInc->eraseFromParent();
3515 }
3516}
3517
3519 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3520 if (!LoopRegion)
3521 return;
3522 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3523 if (Header->empty())
3524 return;
3525 // The EVL IV is always at the beginning.
3526 auto *EVLPhi = dyn_cast<VPCurrentIterationPHIRecipe>(&Header->front());
3527 if (!EVLPhi)
3528 return;
3529
3530 // Bail if not an EVL tail folded loop.
3531 VPValue *AVL;
3532 if (!match(EVLPhi->getBackedgeValue(),
3533 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))))
3534 return;
3535
3536 // The AVL may be capped to a safe distance.
3537 VPValue *SafeAVL, *UnsafeAVL;
3538 if (match(AVL,
3540 m_VPValue(SafeAVL)),
3541 m_Deferred(UnsafeAVL), m_Deferred(SafeAVL))))
3542 AVL = UnsafeAVL;
3543
3544 VPValue *AVLNext;
3545 [[maybe_unused]] bool FoundAVLNext =
3547 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3548 assert(FoundAVLNext && "Didn't find AVL backedge?");
3549
3550 VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3551 auto *LatchBr = cast<VPInstruction>(Latch->getTerminator());
3552 if (match(LatchBr, m_BranchOnCond(m_True())))
3553 return;
3554
3555 VPValue *CanIVInc;
3556 [[maybe_unused]] bool FoundIncrement = match(
3557 LatchBr,
3559 m_Specific(&Plan.getVectorTripCount()))));
3560 assert(FoundIncrement &&
3561 match(CanIVInc, m_Add(m_Specific(LoopRegion->getCanonicalIV()),
3562 m_Specific(&Plan.getVFxUF()))) &&
3563 "Expected BranchOnCond with ICmp comparing CanIV + VFxUF with vector "
3564 "trip count");
3565
3566 Type *AVLTy = AVLNext->getScalarType();
3567 VPBuilder Builder(LatchBr);
3568 LatchBr->setOperand(
3569 0, Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, Plan.getZero(AVLTy)));
3570}
3571
// Replaces symbolic strides that PSE resolves to a constant: live-ins for the
// stride value and for its sext/zext users are replaced by the constant in
// users dominated by the vector preheader, and VPExpandSCEVRecipes in the
// entry block are re-created from the rewritten SCEV.
// NOTE(review): the line naming this function (listing line 3572) is missing
// from this view.
3573 VPlan &Plan, PredicatedScalarEvolution &PSE,
3574 const DenseMap<Value *, const SCEV *> &StridesMap,
3575 const VPDominatorTree &VPDT) {
3576 // Replace VPValues for strides that predicated scalar evolution knows to be
3577 // constant. Only uses guarded by the runtime checks, that is, uses in blocks
3578 // dominated by the vector preheader, are replaced.
3579 assert(!Plan.getVectorLoopRegion() &&
3580 "expected to run before loop regions are created");
// Successor 1 of the entry block is treated as the vector preheader.
3581 VPBlockBase *Preheader = Plan.getEntry()->getSuccessors()[1];
// Filter for replaceUsesWithIf: accept only users whose parent block is
// dominated by the preheader.
3582 auto CanUseVersionedStride = [&VPDT, Preheader](VPUser &U, unsigned) {
3583 auto *R = cast<VPRecipeBase>(&U);
3584 VPBlockBase *Parent = R->getParent();
3585 return VPDT.dominates(Preheader, Parent);
3586 };
3587 ValueToSCEVMapTy RewriteMap;
3588 for (const SCEV *Stride : StridesMap.values()) {
3589 using namespace SCEVPatternMatch;
3590 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3591 const APInt *StrideConst;
3592 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3593 // Only handle constant strides for now.
3594 continue;
3595
3596 auto *CI = Plan.getConstantInt(*StrideConst);
3597 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3598 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3599
3600 // The versioned value may not be used in the loop directly but through a
3601 // sext/zext. Add new live-ins in those cases.
3602 for (Value *U : StrideV->users()) {
// NOTE(review): the condition guarding this `continue` (listing line 3603) is
// missing from this view; given the isa<SExtInst> test below it presumably
// skips users that are neither sext nor zext -- confirm.
3604 continue;
3605 VPValue *StrideVPV = Plan.getLiveIn(U);
3606 if (!StrideVPV)
3607 continue;
// Extend the constant to the user's bit width with the same signedness as
// the extension instruction.
3608 unsigned BW = U->getType()->getScalarSizeInBits();
3609 APInt C =
3610 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3611 VPValue *CI = Plan.getConstantInt(C);
3612 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3613 }
// Remember the stride's SCEV under PSE so the expand-SCEV recipes below can
// be rewritten with it.
3614 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3615 }
3616
// Rewrite every VPExpandSCEVRecipe in the entry block whose expression
// changes once the recorded strides are substituted.
3617 for (VPRecipeBase &R : *Plan.getEntry()) {
3618 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3619 if (!ExpSCEV)
3620 continue;
3621 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3622 auto *NewSCEV =
3623 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3624 if (NewSCEV != ScevExpr) {
3625 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3626 ExpSCEV->replaceAllUsesWith(NewExp);
// Keep the plan's trip count pointing at the rewritten value.
3627 if (Plan.getTripCount() == ExpSCEV)
3628 Plan.resetTripCount(NewExp);
3629 }
3630 }
3631}
3632
// For consecutive widened memory accesses and interleave groups whose mask is
// present and is not the header mask, walks the backward slice of the address
// and drops poison-generating flags from the recipes found there (a disjoint
// OR is replaced by an Add instead).
// NOTE(review): the signature line (listing line 3633) and several interior
// lines are missing from this view; see the gaps in the embedded numbering.
3634 // Collect recipes in the backward slice of `Root` that may generate a poison
3635 // value that is used after vectorization.
3637 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3639 Worklist.push_back(Root);
3640
3641 // Traverse the backward slice of Root through its use-def chain.
3642 while (!Worklist.empty()) {
3643 VPRecipeBase *CurRec = Worklist.pop_back_val();
3644
// Each recipe is processed at most once.
3645 if (!Visited.insert(CurRec).second)
3646 continue;
3647
3648 // Prune search if we find another recipe generating a widen memory
3649 // instruction. Widen memory instructions involved in address computation
3650 // will lead to gather/scatter instructions, which don't need to be
3651 // handled.
3653 VPHeaderPHIRecipe>(CurRec))
3654 continue;
3655
3656 // This recipe contributes to the address computation of a widen
3657 // load/store. If the underlying instruction has poison-generating flags,
3658 // drop them directly.
3659 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3660 VPValue *A, *B;
3661 // Dropping disjoint from an OR may yield incorrect results, as some
3662 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3663 // for dependence analysis). Instead, replace it with an equivalent Add.
3664 // This is possible as all users of the disjoint OR only access lanes
3665 // where the operands are disjoint or poison otherwise.
3666 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3667 RecWithFlags->isDisjoint()) {
3668 VPBuilder Builder(RecWithFlags);
3669 VPInstruction *New =
3670 Builder.createAdd(A, B, RecWithFlags->getDebugLoc());
3671 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3672 RecWithFlags->replaceAllUsesWith(New);
3673 RecWithFlags->eraseFromParent();
// The old recipe is gone; continue the walk from the replacement's operands.
3674 CurRec = New;
3675 } else
3676 RecWithFlags->dropPoisonGeneratingFlags();
3677 } else {
// NOTE(review): the definition of `Instr` (listing lines 3678-3679) is
// missing from this view. It is only used by the assertion below.
3680 (void)Instr;
3681 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3682 "found instruction with poison generating flags not covered by "
3683 "VPRecipeWithIRFlags");
3684 }
3685
3686 // Add new definitions to the worklist.
3687 for (VPValue *Operand : CurRec->operands())
3688 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3689 Worklist.push_back(OpDef);
3690 }
3691 });
3692
3693 // We want to exclude the tail folding case, as we don't need to drop flags
3694 // for operations computing the first lane in this case: the first lane of the
3695 // header mask must always be true.
3696 auto IsNotHeaderMask = [&Plan](VPValue *Mask) {
3697 return Mask && !vputils::isHeaderMask(Mask, Plan);
3698 };
3699
3700 // Traverse all the recipes in the VPlan and collect the poison-generating
3701 // recipes in the backward slice starting at the address of a
3702 // VPWidenMemoryRecipe or VPInterleaveRecipe.
3703 auto Iter =
3706 for (VPRecipeBase &Recipe : *VPBB) {
3707 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
// Only consecutive accesses with a non-header mask and a recipe-defined
// address are processed.
3708 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3709 if (AddrDef && WidenRec->isConsecutive() &&
3710 IsNotHeaderMask(WidenRec->getMask()))
3711 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3712 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3713 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3714 if (AddrDef && IsNotHeaderMask(InterleaveRec->getMask()))
3715 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3716 }
3717 }
3718 }
3719}
3720
// Replaces the widened memory recipes of each interleave group in
// InterleaveGroups with a single VPInterleaveRecipe inserted at the recipe of
// the group's insert position. Does nothing when InterleaveGroups is empty.
// EpilogueAllowed feeds into whether the new recipe needs a mask for gaps.
// NOTE(review): the signature line (listing line 3721) and a few interior
// lines are missing from this view; see the gaps in the embedded numbering.
3722 VPlan &Plan,
3724 &InterleaveGroups,
3725 const bool &EpilogueAllowed) {
3726 if (InterleaveGroups.empty())
3727 return;
3728
// Map each IR memory instruction to the widened memory recipe created for it.
3730 for (VPBasicBlock *VPBB :
3733 for (VPRecipeBase &R : make_filter_range(*VPBB, [](VPRecipeBase &R) {
3734 return isa<VPWidenMemoryRecipe>(&R);
3735 })) {
3736 auto *MemR = cast<VPWidenMemoryRecipe>(&R);
3737 IRMemberToRecipe[&MemR->getIngredient()] = MemR;
3738 }
3739
3740 // Interleave memory: for each Interleave Group we marked earlier as relevant
3741 // for this VPlan, replace the Recipes widening its memory instructions with a
3742 // single VPInterleaveRecipe at its insertion point.
3743 VPDominatorTree VPDT(Plan);
3744 for (const auto *IG : InterleaveGroups) {
3745 // Skip interleave groups where members don't have recipes. This can happen
3746 // when removeDeadRecipes removes recipes that are part of interleave groups
3747 // but have no users.
3748 if (llvm::any_of(IG->members(), [&IRMemberToRecipe](Instruction *Member) {
3749 return !IRMemberToRecipe.contains(Member);
3750 }))
3751 continue;
3752
// Start from member 0, then gather the stored values of all present members
// and intersect their metadata into InterleaveMD.
3753 auto *Start = IRMemberToRecipe.lookup(IG->getMember(0));
3754 VPIRMetadata InterleaveMD(*Start);
3755 SmallVector<VPValue *, 4> StoredValues;
3756 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start->getAsRecipe()))
3757 StoredValues.push_back(StoreR->getStoredValue());
3758 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3759 Instruction *MemberI = IG->getMember(I);
3760 if (!MemberI)
3761 continue;
3762 VPWidenMemoryRecipe *MemoryR = IRMemberToRecipe.lookup(MemberI);
3763 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR->getAsRecipe()))
3764 StoredValues.push_back(StoreR->getStoredValue());
3765 InterleaveMD.intersect(*MemoryR);
3766 }
3767
// A gap mask is needed when the group requires a scalar epilogue that is not
// allowed, or when a store group is not full.
3768 bool NeedsMaskForGaps =
3769 (IG->requiresScalarEpilogue() && !EpilogueAllowed) ||
3770 (!StoredValues.empty() && !IG->isFull());
3771
3772 Instruction *IRInsertPos = IG->getInsertPos();
3773 auto *InsertPos = IRMemberToRecipe.lookup(IRInsertPos);
3774 VPRecipeBase *InsertPosR = InsertPos->getAsRecipe();
3775
// Use the no-wrap flags of the insert position's GEP, minus nuw.
// NOTE(review): the declaration of NW (listing line 3776) is missing from this
// view; nuw is presumably dropped because the address adjustments below use
// negative offsets -- confirm.
3777 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3778 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3779 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3780
3781 // Get or create the start address for the interleave group.
3782 VPValue *Addr = Start->getAddr();
3783 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3784 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPosR)) {
3785 // We cannot re-use the address of member zero because it does not
3786 // dominate the insert position. Instead, use the address of the insert
3787 // position and create a PtrAdd adjusting it to the address of member
3788 // zero.
3789 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3790 // InsertPos or sink loads above zero members to join it.
3791 assert(IG->getIndex(IRInsertPos) != 0 &&
3792 "index of insert position shouldn't be zero");
3793 auto &DL = IRInsertPos->getDataLayout();
// Byte distance from member zero to the insert position, as a signed 32-bit
// constant; it is negated below to step back to member zero.
3794 APInt Offset(32,
3795 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3796 IG->getIndex(IRInsertPos),
3797 /*IsSigned=*/true);
3798 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3799 VPBuilder B(InsertPosR);
3800 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3801 }
3802 // If the group is reverse, adjust the index to refer to the last vector
3803 // lane instead of the first. We adjust the index from the first vector
3804 // lane, rather than directly getting the pointer for lane VF - 1, because
3805 // the pointer operand of the interleaved access is supposed to be uniform.
3806 if (IG->isReverse()) {
3807 auto *ReversePtr = new VPVectorEndPointerRecipe(
3808 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3809 -(int64_t)IG->getFactor(), NW, InsertPosR->getDebugLoc());
3810 ReversePtr->insertBefore(InsertPosR);
3811 Addr = ReversePtr;
3812 }
3813 auto *VPIG = new VPInterleaveRecipe(
3814 IG, Addr, StoredValues, InsertPos->getMask(), NeedsMaskForGaps,
3815 InterleaveMD, InsertPosR->getDebugLoc());
3816 VPIG->insertBefore(InsertPosR);
3817
// Redirect users of each value-producing member to the matching result of the
// interleave recipe, then erase the member recipes. J counts only members
// with a non-void type.
3818 unsigned J = 0;
3819 for (unsigned i = 0; i < IG->getFactor(); ++i)
3820 if (Instruction *Member = IG->getMember(i)) {
3821 VPRecipeBase *MemberR = IRMemberToRecipe.lookup(Member)->getAsRecipe();
3822 if (!Member->getType()->isVoidTy()) {
3823 VPValue *OriginalV = MemberR->getVPSingleValue();
3824 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3825 J++;
3826 }
3827 MemberR->eraseFromParent();
3828 }
3829 }
3830}
3831
3832/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3833/// value, phi and backedge value. In the following example:
3834///
3835/// vector.ph:
3836/// Successor(s): vector loop
3837///
3838/// <x1> vector loop: {
3839/// vector.body:
3840/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3841/// ...
3842/// EMIT branch-on-count ...
3843/// No successors
3844/// }
3845///
3846/// WIDEN-INDUCTION will get expanded to:
3847///
3848/// vector.ph:
3849/// ...
3850/// vp<%induction.start> = ...
3851/// vp<%induction.increment> = ...
3852///
3853/// Successor(s): vector loop
3854///
3855/// <x1> vector loop: {
3856/// vector.body:
3857/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3858/// ...
3859/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3860/// EMIT branch-on-count ...
3861/// No successors
3862/// }
///
/// The recipe's uses are replaced by the new wide phi; the recipe itself is
/// not erased here.
// NOTE(review): the line with the function name and parameter (listing line
// 3864) and a few interior lines are missing from this view.
3863static void
3865 VPlan *Plan = WidenIVR->getParent()->getPlan();
3866 VPValue *Start = WidenIVR->getStartValue();
3867 VPValue *Step = WidenIVR->getStepValue();
3868 VPValue *VF = WidenIVR->getVFValue();
3869 DebugLoc DL = WidenIVR->getDebugLoc();
3870
3871 // Scalar type of the induction being expanded; start and step are truncated
3872 // to it below if it is narrower than the step type.
3873 Type *Ty = WidenIVR->getScalarType();
3874
3875 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3878 VPIRFlags Flags = *WidenIVR;
// Integer inductions use Add/Mul; otherwise the descriptor's induction opcode
// is paired with FMul.
3879 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3880 AddOp = Instruction::Add;
3881 MulOp = Instruction::Mul;
3882 } else {
3883 AddOp = ID.getInductionOpcode();
3884 MulOp = Instruction::FMul;
3885 }
3886
3887 // If the phi is truncated, truncate the start and step values.
3888 VPBuilder Builder(Plan->getVectorPreheader());
3889 Type *StepTy = Step->getScalarType();
3890 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3891 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3892 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3893 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3894 StepTy = Ty;
3895 }
3896
3897 // Construct the initial value of the vector IV in the vector loop preheader.
// Init = Start (splat) AddOp (StepVector MulOp Step (splat)); the step vector
// is converted to FP first when the step is floating point.
3898 Type *IVIntTy =
3900 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3901 if (StepTy->isFloatingPointTy())
3902 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3903
3904 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3905 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3906
3907 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3908 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3909 DebugLoc::getUnknown(), "induction");
3910
3911 // Create the widened phi of the vector IV.
3912 auto *WidePHI = VPBuilder(WidenIVR).createWidenPhi(
3913 Init, WidenIVR->getDebugLoc(), "vec.ind");
3914
3915 // Create the backedge value for the vector IV.
3916 VPValue *Inc;
3917 VPValue *Prev;
3918 // If unrolled, use the increment and prev value from the operands.
3919 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3920 Inc = SplatVF;
3921 Prev = WidenIVR->getLastUnrolledPartOperand();
3922 } else {
3923 // Move the insertion point after the VF definition when the VF is defined
3924 // inside a loop, such as for EVL tail-folding.
3925 if (VPRecipeBase *R = VF->getDefiningRecipe())
3926 if (R->getParent()->getEnclosingLoopRegion())
3927 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3928
3929 // Multiply the vectorization factor by the step using integer or
3930 // floating-point arithmetic as appropriate.
3931 if (StepTy->isFloatingPointTy())
3932 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3933 DL);
3934 else
3935 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, VF->getScalarType(), DL);
3936
3937 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3938 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3939 Prev = WidePHI;
3940 }
3941
// Emit the increment just before the terminator of ExitingBB.
// NOTE(review): the definition of ExitingBB (listing line 3942) is missing
// from this view.
3943 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3944 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3945 WidenIVR->getDebugLoc(), "vec.ind.next");
3946
3947 WidePHI->addIncoming(Next);
3948
3949 WidenIVR->replaceAllUsesWith(WidePHI);
3950}
3951
3952/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3953/// initial value, phi and backedge value. In the following example:
3954///
3955/// <x1> vector loop: {
3956/// vector.body:
3957/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3958/// ...
3959/// EMIT branch-on-count ...
3960/// }
3961///
3962/// WIDEN-POINTER-INDUCTION will get expanded to:
3963///
3964/// <x1> vector loop: {
3965/// vector.body:
3966/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3967/// EMIT %mul = mul %stepvector, %step
3968/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3969/// ...
3970/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3971/// EMIT branch-on-count ...
3972/// }
///
/// The recipe's uses are replaced by the wide pointer add; the recipe itself
/// is not erased here.
// NOTE(review): the signature line (listing line 3973) and listing lines 3980
// and 4003 are missing from this view.
3974 VPlan *Plan = R->getParent()->getPlan();
3975 VPValue *Start = R->getStartValue();
3976 VPValue *Step = R->getStepValue();
3977 VPValue *VF = R->getVFValue();
3978
3979 assert(R->getInductionDescriptor().getKind() ==
3981 "Not a pointer induction according to InductionDescriptor!");
3982 assert(R->getScalarType()->isPointerTy() && "Unexpected type.");
3983 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3984 "Recipe should have been replaced");
3985
3986 VPBuilder Builder(R);
3987 DebugLoc DL = R->getDebugLoc();
3988
3989 // Build a scalar pointer phi.
3990 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3991
3992 // Create actual address geps that use the pointer phi as base and a
3993 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3994 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3995 Type *StepTy = Step->getScalarType();
3996 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3997 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3998 VPValue *PtrAdd =
3999 Builder.createWidePtrAdd(ScalarPtrPhi, Offset, DL, "vector.gep");
4000 R->replaceAllUsesWith(PtrAdd);
4001
4002 // Create the backedge value for the scalar pointer phi.
// The phi is advanced by Step * VF, with VF first converted to the step type.
4004 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
4005 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, VF->getScalarType(), DL);
4006 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
4007
4008 VPValue *InductionGEP =
4009 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
4010 ScalarPtrPhi->addIncoming(InductionGEP);
4011}
4012
4013/// Expand a VPDerivedIVRecipe into executable recipes.
///
/// The index is first converted to the step type. The recipe's uses are then
/// replaced by Start combined with Index * Step, using the arithmetic that
/// matches the induction kind. The recipe itself is not erased here.
// NOTE(review): the signature line (listing line 4014) and the switch's case
// labels (listing lines 4027, 4033, 4036, 4048) are missing from this view;
// each arm below is described from its body only.
4015 VPBuilder Builder(R);
4016 VPIRValue *Start = R->getStartValue();
4017 VPValue *Step = R->getStepValue();
4018 VPValue *Index = R->getIndex();
4019 Type *StepTy = Step->getScalarType();
4020 Type *IndexTy = Index->getScalarType();
// Integer step: sign-extend or truncate the index to the step type.
// Otherwise: convert the index with SIToFP.
4021 Index = StepTy->isIntegerTy()
4022 ? Builder.createScalarSExtOrTrunc(
4023 Index, StepTy, IndexTy, DebugLoc::getCompilerGenerated())
4024 : Builder.createScalarCast(Instruction::SIToFP, Index, StepTy,
4026 switch (R->getInductionKind()) {
// Arm 1: Start + Index * Step.
4028 assert(Index->getScalarType() == Start->getScalarType() &&
4029 "Index type does not match StartValue type");
4030 return R->replaceAllUsesWith(Builder.createAdd(
4031 Start, Builder.createOverflowingOp(Instruction::Mul, {Index, Step})));
4032 }
// Arm 2: pointer add of Start and Index * Step.
4034 return R->replaceAllUsesWith(Builder.createPtrAdd(
4035 Start, Builder.createOverflowingOp(Instruction::Mul, {Index, Step})));
// Arm 3: floating point; applies the original FAdd/FSub to Start and
// Step * Index, carrying over the original operation's fast-math flags.
4037 assert(StepTy->isFloatingPointTy() && "Expected FP Step value");
4038 const FPMathOperator *FPBinOp = R->getFPBinOp();
4039 assert(FPBinOp &&
4040 (FPBinOp->getOpcode() == Instruction::FAdd ||
4041 FPBinOp->getOpcode() == Instruction::FSub) &&
4042 "Original BinOp should be defined for FP induction");
4043 FastMathFlags FMF = FPBinOp->getFastMathFlags();
4044 VPValue *FMul = Builder.createNaryOp(Instruction::FMul, {Step, Index}, FMF);
4045 return R->replaceAllUsesWith(
4046 Builder.createNaryOp(FPBinOp->getOpcode(), {Start, FMul}, FMF));
4047 }
// Arm 4: nothing to expand.
4049 return;
4050 }
4051 llvm_unreachable("Unhandled induction kind");
4052}
4053
// NOTE(review): the signature line (listing line 4054) and the start of the
// loop header (listing line 4057) are missing from this view.
4055 // Replace loop regions with explicit CFG.
// Collect the regions first and dissolve them afterwards, so the plan is not
// modified while it is being traversed.
4056 SmallVector<VPRegionBlock *> LoopRegions;
4058 vp_depth_first_deep(Plan.getEntry()))) {
// Replicator regions are left untouched.
4059 if (!R->isReplicator())
4060 LoopRegions.push_back(R);
4061 }
4062 for (VPRegionBlock *R : LoopRegions)
4063 R->dissolveToCFGLoop();
4064}
4065
// Lowers every BranchOnTwoConds terminator into single-condition branches.
// NOTE(review): the signature (listing lines 4066-4067), the worklist setup
// (4070-4071) and the two statements creating the BranchOnCond recipes (4120,
// 4124) are missing from this view.
4068 // The transform runs after dissolving loop regions, so all VPBasicBlocks
4069 // terminated with BranchOnTwoConds are reached via a shallow traversal.
4072 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
4073 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
4074 }
4075
4076 // Expand BranchOnTwoConds instructions into explicit CFG with two new
4077 // single-condition branches:
4078 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
4079 // the first condition is true, and otherwise jumps to a new interim block.
4080 // 2. A branch that ends the interim block, jumps to the second successor if
4081 // the second condition is true, and otherwise jumps to the third
4082 // successor.
4083 for (VPInstruction *Br : WorkList) {
4084 assert(Br->getNumOperands() == 2 &&
4085 "BranchOnTwoConds must have exactly 2 conditions");
4086 DebugLoc DL = Br->getDebugLoc();
4087 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
// Copy the successor list, because the edges are disconnected right below.
4088 const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
4089 assert(Successors.size() == 3 &&
4090 "BranchOnTwoConds must have exactly 3 successors");
4091
4092 for (VPBlockBase *Succ : Successors)
4093 VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
4094
4095 VPValue *Cond0 = Br->getOperand(0);
4096 VPValue *Cond1 = Br->getOperand(1);
4097 VPBlockBase *Succ0 = Successors[0];
4098 VPBlockBase *Succ1 = Successors[1];
4099 VPBlockBase *Succ2 = Successors[2];
4100
4101 // If the successor block for both conditions is the same, then combine the
4102 // two conditions and plant a single conditional branch.
4103 if (Succ0 == Succ1) {
4104 VPBuilder Builder(Br);
4105 VPValue *Combined = Builder.createOr(Cond0, Cond1, DL);
4106 Builder.createNaryOp(VPInstruction::BranchOnCond, {Combined}, DL);
4107 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
4108 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ2);
4109 Br->eraseFromParent();
4110 continue;
4111 }
4112
4113 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
4114 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
4115
4116 VPBasicBlock *InterimBB =
4117 Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
4118
// Original block: branch to Succ0, otherwise fall to the interim block.
4119 VPBuilder(BrOnTwoCondsBB)
4121 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
4122 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
4123
// Interim block: branch to Succ1, otherwise to Succ2.
4125 VPBlockUtils::connectBlocks(InterimBB, Succ1);
4126 VPBlockUtils::connectBlocks(InterimBB, Succ2);
4127 Br->eraseFromParent();
4128 }
4129}
4130
// Walks every recipe of every VPBasicBlock reachable from the plan entry and
// lowers abstract recipes (widened inductions, derived IVs, widened canonical
// IV, blends, expression recipes, LastActiveLane, MaskedCond,
// CanonicalIVIncrementForPart, BranchOnCount, WideIVStep) into simpler
// recipes, erasing the original in each case. VPVectorEndPointerRecipes are
// kept and only get their offset materialized.
// NOTE(review): the signature (listing lines 4131-4132) and a few interior
// lines are missing from this view; see the gaps in the embedded numbering.
4133 vp_depth_first_deep(Plan.getEntry()))) {
// Early-inc iteration, because the current recipe is erased inside the loop.
4134 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4135 VPBuilder Builder(&R);
4136 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
4138 WidenIVR->eraseFromParent();
4139 continue;
4140 }
4141
4142 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
4143 // If the recipe only generates scalars, scalarize it instead of
4144 // expanding it.
4145 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
4146 VPValue *PtrAdd =
4147 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
4148 WidenIVR->replaceAllUsesWith(PtrAdd);
4149 WidenIVR->eraseFromParent();
4150 continue;
4151 }
4153 WidenIVR->eraseFromParent();
4154 continue;
4155 }
4156
4157 if (auto *DerivedIVR = dyn_cast<VPDerivedIVRecipe>(&R)) {
4158 expandVPDerivedIV(DerivedIVR);
4159 DerivedIVR->eraseFromParent();
4160 continue;
4161 }
4162
// Widened canonical IV: splat(CanIV) + (splat(Step) + StepVector).
4163 if (auto *WideCanIV = dyn_cast<VPWidenCanonicalIVRecipe>(&R)) {
4164 VPValue *CanIV = WideCanIV->getCanonicalIV();
4165 Type *CanIVTy = CanIV->getScalarType();
4166 VPValue *Step = WideCanIV->getStepValue();
// Without an explicit step (only expected for UF == 1), use zero.
4167 if (!Step) {
4168 assert(Plan.getConcreteUF() == 1 &&
4169 "Expected unroller to have materialized step for UF != 1");
4170 Step = Plan.getZero(CanIVTy);
4171 }
4172 CanIV = Builder.createNaryOp(VPInstruction::Broadcast, CanIV);
4173 Step = Builder.createNaryOp(VPInstruction::Broadcast, Step);
4174 Step = Builder.createAdd(
4175 Step, Builder.createNaryOp(VPInstruction::StepVector, {}, CanIVTy));
4176 VPValue *CanVecIV =
4177 Builder.createAdd(CanIV, Step, WideCanIV->getDebugLoc(), "vec.iv",
4178 WideCanIV->getNoWrapFlags());
4179 WideCanIV->replaceAllUsesWith(CanVecIV);
4180 WideCanIV->eraseFromParent();
4181 continue;
4182 }
4183
4184 // Expand VPBlendRecipe into VPInstruction::Select.
// The selects are chained: incoming value 0 is the initial result, and each
// later incoming value is chosen over the running result under its mask.
4185 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
4186 VPValue *Select = Blend->getIncomingValue(0);
4187 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
4188 Select = Builder.createSelect(Blend->getMask(I),
4189 Blend->getIncomingValue(I), Select,
4190 R.getDebugLoc(), "predphi", *Blend);
4191 Blend->replaceAllUsesWith(Select);
4192 Blend->eraseFromParent();
4193 continue;
4194 }
4195
// Vector end pointers stay in place; only a missing offset is materialized.
4196 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
4197 if (!VEPR->getOffset()) {
4198 assert(Plan.getConcreteUF() == 1 &&
4199 "Expected unroller to have materialized offset for UF != 1");
4200 VEPR->materializeOffset();
4201 }
4202 continue;
4203 }
4204
4205 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
4206 Expr->decompose();
4207 Expr->eraseFromParent();
4208 continue;
4209 }
4210
4211 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
4212 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
4213 if (LastActiveL &&
4214 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
4215 // Create Not(Mask) for all operands.
4217 for (VPValue *Op : LastActiveL->operands()) {
4218 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
4219 NotMasks.push_back(NotMask);
4220 }
4221
4222 // Create FirstActiveLane on the inverted masks.
4223 VPValue *FirstInactiveLane = Builder.createFirstActiveLane(
4224 NotMasks, LastActiveL->getDebugLoc(), "first.inactive.lane");
4225
4226 // Subtract 1 to get the last active lane.
4227 VPValue *One =
4228 Plan.getConstantInt(FirstInactiveLane->getScalarType(), 1);
4229 VPValue *LastLane =
4230 Builder.createSub(FirstInactiveLane, One,
4231 LastActiveL->getDebugLoc(), "last.active.lane");
4232
4233 LastActiveL->replaceAllUsesWith(LastLane);
4234 LastActiveL->eraseFromParent();
4235 continue;
4236 }
4237
4238 // Lower MaskedCond with block mask to LogicalAnd.
4240 auto *VPI = cast<VPInstruction>(&R);
4241 assert(VPI->isMasked() &&
4242 "Unmasked MaskedCond should be simplified earlier");
4243 VPI->replaceAllUsesWith(Builder.createNaryOp(
4244 VPInstruction::LogicalAnd, {VPI->getMask(), VPI->getOperand(0)}));
4245 VPI->eraseFromParent();
4246 continue;
4247 }
4248
4249 // Lower CanonicalIVIncrementForPart to plain Add.
4250 if (match(
4251 &R,
4253 auto *VPI = cast<VPInstruction>(&R);
4254 VPValue *Add = Builder.createOverflowingOp(
4255 Instruction::Add, VPI->operands(), VPI->getNoWrapFlags(),
4256 VPI->getDebugLoc());
4257 VPI->replaceAllUsesWith(Add);
4258 VPI->eraseFromParent();
4259 continue;
4260 }
4261
4262 // Lower BranchOnCount to ICmp + BranchOnCond.
4263 VPValue *IV, *TC;
4264 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
4265 auto *BranchOnCountInst = cast<VPInstruction>(&R);
4266 DebugLoc DL = BranchOnCountInst->getDebugLoc();
4267 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
4268 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
4269 BranchOnCountInst->eraseFromParent();
4270 continue;
4271 }
4272
// Everything that is not a WideIVStep is left unchanged from here on.
4273 VPValue *VectorStep;
4274 VPValue *ScalarStep;
4276 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
4277 continue;
4278
4279 // Expand WideIVStep.
// Both steps are cast to the IV type when needed, then multiplied.
4280 auto *VPI = cast<VPInstruction>(&R);
4281 Type *IVTy = VPI->getScalarType();
4282 if (VectorStep->getScalarType() != IVTy) {
4284 ? Instruction::UIToFP
4285 : Instruction::Trunc;
4286 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
4287 }
4288
4289 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
4290 if (ScalarStep->getScalarType() != IVTy) {
4291 ScalarStep =
4292 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
4293 }
4294
// FP IVs use FMul with the instruction's fast-math flags; integer IVs use Mul
// with the default flags for that opcode.
4295 VPIRFlags Flags;
4296 unsigned MulOpc;
4297 if (IVTy->isFloatingPointTy()) {
4298 MulOpc = Instruction::FMul;
4299 Flags = VPI->getFastMathFlagsOrNone();
4300 } else {
4301 MulOpc = Instruction::Mul;
4302 Flags = VPIRFlags::getDefaultFlags(MulOpc);
4303 }
4304
4305 VPInstruction *Mul = Builder.createNaryOp(
4306 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
4307 VectorStep = Mul;
4308 VPI->replaceAllUsesWith(VectorStep);
4309 VPI->eraseFromParent();
4310 }
4311 }
4312}
4313
4314/// Returns the VPValue representing the uncountable exit comparison used by
4315/// AnyOf if the recipes it depends on can be traced back to live-ins and
4316/// the addresses (in GEP/PtrAdd form) of any (non-masked) load used in
4317/// generating the values for the comparison. The recipes are stored in
4318/// \p Recipes.
///
/// Returns std::nullopt when the latch terminator does not match the expected
/// pattern, when a traversed value has more than one user, or when an
/// unsupported recipe is encountered.
// NOTE(review): the line with the function name and first parameter (listing
// line 4320) and a few interior lines are missing from this view.
4319static std::optional<VPValue *>
4321 VPBasicBlock *LatchVPBB) {
4322 // Given a plain CFG VPlan loop with countable latch exiting block
4323 // \p LatchVPBB, we're looking to match the recipes contributing to the
4324 // uncountable exit condition comparison (here, vp<%4>) back to either
4325 // live-ins or the address nodes for the load used as part of the uncountable
4326 // exit comparison so that we can either move them within the loop, or copy
4327 // them to the preheader depending on the chosen method for dealing with
4328 // stores in uncountable exit loops.
4329 //
4330 // Currently, the address of the load is restricted to a GEP with 2 operands
4331 // and a live-in base address. This constraint may be relaxed later.
4332 //
4333 // VPlan ' for UF>=1' {
4334 // Live-in vp<%0> = VF * UF
4335 // Live-in vp<%1> = vector-trip-count
4336 // Live-in ir<20> = original trip-count
4337 //
4338 // ir-bb<entry>:
4339 // Successor(s): scalar.ph, vector.ph
4340 //
4341 // vector.ph:
4342 // Successor(s): for.body
4343 //
4344 // for.body:
4345 // EMIT vp<%2> = phi ir<0>, vp<%index.next>
4346 // EMIT-SCALAR ir<%iv> = phi [ ir<0>, vector.ph ], [ ir<%iv.next>, for.inc ]
4347 // EMIT ir<%uncountable.addr> = getelementptr inbounds nuw ir<%pred>,ir<%iv>
4348 // EMIT ir<%uncountable.val> = load ir<%uncountable.addr>
4349 // EMIT ir<%uncountable.cond> = icmp sgt ir<%uncountable.val>, ir<500>
4350 // EMIT vp<%3> = masked-cond ir<%uncountable.cond>
4351 // Successor(s): for.inc
4352 //
4353 // for.inc:
4354 // EMIT ir<%iv.next> = add nuw nsw ir<%iv>, ir<1>
4355 // EMIT ir<%countable.cond> = icmp eq ir<%iv.next>, ir<20>
4356 // EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
4357 // EMIT vp<%4> = any-of vp<%3>
4358 // EMIT vp<%5> = icmp eq vp<%index.next>, vp<%1>
4359 // EMIT branch-on-two-conds vp<%4>, vp<%5>
4360 // Successor(s): middle.block, middle.block, for.body
4361 //
4362 // middle.block:
4363 // Successor(s): ir-bb<exit>, scalar.ph
4364 //
4365 // ir-bb<exit>:
4366 // No successors
4367 //
4368 // scalar.ph:
4369 // }
4370
4371 // Find the uncountable loop exit condition.
4372 VPValue *UncountableCondition = nullptr;
4373 if (!match(LatchVPBB->getTerminator(),
4374 m_BranchOnTwoConds(m_AnyOf(m_VPValue(UncountableCondition)),
4375 m_VPValue())))
4376 return std::nullopt;
4377
// Walk the use-def chain backwards from the condition.
4379 Worklist.push_back(UncountableCondition);
4380 while (!Worklist.empty()) {
4381 VPValue *V = Worklist.pop_back_val();
4382
4383 // Any value defined outside the loop does not need to be copied.
4384 if (V->isDefinedOutsideLoopRegions())
4385 continue;
4386
4387 // FIXME: Remove the single user restriction; it's here because we're
4388 // starting with the simplest set of loops we can, and multiple
4389 // users means needing to add PHI nodes in the transform.
4390 if (V->getNumUsers() > 1)
4391 return std::nullopt;
4392
4393 VPValue *Op1, *Op2;
4394 // Walk back through recipes until we find at least one load from memory.
4395 if (match(V, m_ICmp(m_VPValue(Op1), m_VPValue(Op2)))) {
4396 Worklist.push_back(Op1);
4397 Worklist.push_back(Op2);
4398 Recipes.push_back(cast<VPInstruction>(V->getDefiningRecipe()));
4399 } else if (match(V, m_VPInstruction<Instruction::Load>(m_VPValue(Op1)))) {
// NOTE(review): GepR is dereferenced below without a null check. If the load
// address can be a value without a defining recipe (e.g. a live-in), this
// would crash -- confirm that callers rule this out.
4400 VPRecipeBase *GepR = Op1->getDefiningRecipe();
4401 // Only matching base + single offset term for now.
4402 if (GepR->getNumOperands() != 2)
4403 return std::nullopt;
4404 // Matching a GEP with a loop-invariant base ptr.
4406 m_LiveIn(), m_VPValue())))
4407 return std::nullopt;
// The walk stops at the address: the load and its GEP are recorded, but the
// GEP's operands are not pushed onto the worklist.
4408 Recipes.push_back(cast<VPInstruction>(V->getDefiningRecipe()));
4409 Recipes.push_back(cast<VPInstruction>(GepR));
4411 m_VPValue(Op1)))) {
4412 Worklist.push_back(Op1);
4413 Recipes.push_back(cast<VPInstruction>(V->getDefiningRecipe()));
4414 } else
4415 return std::nullopt;
4416 }
4417
4418 // If we couldn't match anything, don't return the condition. It may be
4419 // defined outside the loop.
4420 if (Recipes.empty() || none_of(Recipes, [](VPInstruction *I) {
4422 }))
4423 return std::nullopt;
4424
4425 return UncountableCondition;
4426}
4427
4433
4434/// Update \p Plan to mask memory operations in the loop based on whether the
4435/// early exit is taken or not.
4436///
4437/// We're currently expecting to find a loop with properties similar to the
4438/// following:
4439///
4440/// for.body:
4441/// ir<%indvars.iv> = WIDEN-INDUCTION nuw nsw ir<0>, ir<1>, vp<%0>
4442/// EMIT ir<%arrayidx> = getelementptr inbounds nuw ir<@c>, ir<%indvars.iv>
4443/// EMIT-SCALAR ir<%0> = load ir<%arrayidx>
4444/// EMIT ir<%cmp1> = icmp sgt ir<%0>, ir<5>
4445/// EMIT vp<%1> = masked-cond ir<%cmp1>
4446/// Successor(s): if.end
4447///
4448/// if.end:
4449/// EMIT ir<%arrayidx3> = getelementptr inbounds nuw ir<@src>, ir<%indvars.iv>
4450/// EMIT-SCALAR ir<%2> = load ir<%arrayidx3>
4451/// EMIT ir<%add> = add nsw ir<%2>, ir<42>
4452/// EMIT ir<%arrayidx5> = getelementptr inbounds nuw ir<@dst>, ir<%indvars.iv>
4453/// EMIT store ir<%add>, ir<%arrayidx5>
4454/// EMIT ir<%indvars.iv.next> = add nuw nsw ir<%indvars.iv>, ir<1>
4455/// EMIT vp<%3> = any-of ir<%1>
4456/// EMIT ir<%exitcond.not> = icmp eq ir<%indvars.iv.next>, ir<10000>
4457/// EMIT branch-on-two-conds vp<%3>, ir<%exitcond.not>
4458/// Successor(s): middle.block, middle.block, for.body
4459///
4460/// We currently expect LoopVectorizationLegality to ensure that:
4461/// * There must also be a counted exit. We will need to support speculative
4462/// or first-faulting loads before we can remove this restriction.
4463/// * Any stores within the loop must not alias with the load used for the
4464/// uncountable exit. We can relax this a bit with runtime aliasing checks.
4465/// * Other memory operations in the loop can take place before or after the
4466/// uncountable exit, but must also be unconditional. We need to support
4467/// combining the conditions in VPlanPredicator.
4468/// * The loop must have a single unconditional load contributing to the
4469/// uncountable exit comparison, and the other term must be loop-invariant.
4470/// Improving upon this requires work in getRecipesForUncountableExit to
4471/// handle more complex recipe graphs.
4474 VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB,
4475 Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT,
4476 AssumptionCache *AC) {
4477
4478 // Disconnect early exiting blocks from successors, remove branches. We
4479 // currently don't support multiple uses for recipes involved in creating
4480 // the uncountable exit condition.
4481 for (auto &Exit : Exits) {
4482 if (Exit.EarlyExitingVPBB == LatchVPBB)
4483 continue;
4484
4485 for (VPRecipeBase &R : Exit.EarlyExitVPBB->phis())
4486 cast<VPIRPhi>(&R)->removeIncomingValueFor(Exit.EarlyExitingVPBB);
4487 Exit.EarlyExitingVPBB->getTerminator()->eraseFromParent();
4488 VPBlockUtils::disconnectBlocks(Exit.EarlyExitingVPBB, Exit.EarlyExitVPBB);
4489 }
4490
4491 VPDominatorTree VPDT(Plan);
4492
4493 // We can abandon a VPlan entirely if we return false here, so we shouldn't
4494 // crash if some earlier assumptions on scalar IR don't hold for the vplan
4495 // version of the loop.
4496 SmallVector<VPInstruction *, 8> ConditionRecipes;
4497
4498 std::optional<VPValue *> Cond =
4499 getRecipesForUncountableExit(ConditionRecipes, LatchVPBB);
4500 if (!Cond)
4501 return false;
4502
4503 // Find load contributing to condition.
4504 // At the moment LoopVectorizationLegality only supports a single
4505 // early-exit expression with a compare and a single load that must
4506 // be unconditional.
4507 // TODO: Support more than one load.
4508 auto *Load =
4509 find_singleton<VPInstruction>(ConditionRecipes, [](auto *I, bool _) {
4511 ? I
4512 : nullptr;
4513 });
4514 assert(Load && "Couldn't find exactly one load");
4515 // TODO: Support conditional loads for uncountable exits.
4516 assert(VPDT.dominates(Load->getParent(), LatchVPBB) &&
4517 "Uncountable exit condition load is conditional.");
4518 VPInstruction *Ptr = cast<VPInstruction>(Load->getOperand(0));
4519
4520 // Ensure that we are guaranteed to be able to dereference the memory used
4521 // for determining the uncountable exit for the maximum possible number of
4522 // scalar iterations of the loop.
4523 //
4524 // TODO: Support first-faulting loads in cases where we don't know whether
4525 // all possible addresses are dereferenceable.
4526 {
4528 const SCEV *PtrSCEV = vputils::getSCEVExprForVPValue(Ptr, PSE, TheLoop);
4529 const DataLayout &DL = Plan.getDataLayout();
4530 APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getScalarType()),
4531 DL.getTypeStoreSize(Load->getScalarType()).getFixedValue());
4533 PtrSCEV, cast<LoadInst>(Load->getUnderlyingInstr())->getAlign(),
4534 PSE.getSE()->getConstant(EltSize), TheLoop, *PSE.getSE(), DT, AC,
4535 &Predicates))
4536 return false;
4537 }
4538
4539 // Check for a single GEP for the condition load to see if we can link it to
4540 // a widen IV recipe with a step of 1; we're only interested in contiguous
4541 // accesses for the condition load right now.
4542 auto *IV = cast<VPWidenInductionRecipe>(&HeaderVPBB->front());
4543 if (!match(IV->getStartValue(), m_SpecificInt(0)) ||
4544 !match(IV->getStepValue(), m_SpecificInt(1)))
4545 return false;
4547 m_Specific(IV))))
4548 return false;
4549
4550 // We want to guarantee that the uncountable exit condition (and the mask
4551 // we will generate from it) are available for all operations in the loop
4552 // that need to be masked. If the condition recipes are not already the first
4553 // recipes in the header after the last phi, move them there.
4554 auto InsertIt = HeaderVPBB->getFirstNonPhi();
4555 while (InsertIt != HeaderVPBB->end() &&
4556 is_contained(ConditionRecipes, &*InsertIt)) {
4557 erase(ConditionRecipes, &*InsertIt);
4558 InsertIt++;
4559 }
4560 for (auto *Recipe : reverse(ConditionRecipes))
4561 Recipe->moveBefore(*HeaderVPBB, InsertIt);
4562
4563 // Create a mask to represent all lanes that fully execute in the vector loop,
4564 // stopping short of any early exit.
4565 VPBuilder MaskBuilder(HeaderVPBB, InsertIt);
4566 VPValue *FirstActive = MaskBuilder.createFirstActiveLane(*Cond);
4567 Type *IVScalarTy = IV->getScalarType();
4568 Type *FirstActiveTy = FirstActive->getScalarType();
4569 VPValue *ALMMultiplier = Plan.getConstantInt(IVScalarTy, 1);
4570 VPValue *Zero = Plan.getZero(IVScalarTy);
4571 FirstActive = MaskBuilder.createScalarZExtOrTrunc(FirstActive, IVScalarTy,
4572 FirstActiveTy, DebugLoc());
4574 {Zero, FirstActive, ALMMultiplier},
4575 DebugLoc(), "uncountable.exit.mask");
4576
4577 // Convert all other memory operations to use the mask.
4578 for (VPBasicBlock *VPBB : vp_rpo_plain_cfg_loop_body(HeaderVPBB))
4579 for (VPRecipeBase &R : *VPBB)
4580 if (R.mayReadOrWriteMemory() && &R != Load) {
4581 // TODO: Handle conditional memory operations in the loop.
4582 if (!VPDT.dominates(R.getParent(), LatchVPBB))
4583 return false;
4584 cast<VPInstruction>(&R)->addMask(Mask);
4585 }
4586
4587 // Update middle block branch to compare (IV + however many lanes were active)
4588 // against the full trip count, since we may be exiting the vector loop early.
4589 // If we didn't take an early exit, we should get the equivalent of VF from
4590 // the FirstActiveLane.
4591 assert(match(MiddleVPBB->getTerminator(), m_BranchOnCond()) &&
4592 "Expected BranchOnCond terminator for MiddleVPBB");
4593 VPBuilder MiddleBuilder(MiddleVPBB->getTerminator());
4594 VPValue *ScalarIV = MiddleBuilder.createNaryOp(VPInstruction::ExtractLane,
4595 {Zero, IV}, DebugLoc());
4596 VPValue *ExitIV = MiddleBuilder.createAdd(ScalarIV, FirstActive);
4597 VPValue *FullTC =
4598 MiddleBuilder.createICmp(CmpInst::ICMP_EQ, ExitIV, Plan.getTripCount());
4599 MiddleVPBB->getTerminator()->setOperand(0, FullTC);
4600
4601 // Update resume phi in scalar.ph.
4602 VPBasicBlock *ScalarPH = Plan.getScalarPreheader();
4603 auto Phis = ScalarPH->phis();
4604 // TODO: Handle more than one Phi; re-derive from IV.
4605 // TODO: Handle reductions.
4606 if (range_size(Phis) != 1)
4607 return false;
4608 VPPhi *ContinueIV = cast<VPPhi>(Phis.begin());
4609 // Make sure we're referring to the same IV.
4610 assert(
4611 match(ContinueIV->getOperand(0),
4613 "Continuing from different IV");
4614 ContinueIV->setOperand(0, ExitIV);
4615 return true;
4616}
4617
4619 VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB,
4620 VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE,
4622#ifndef NDEBUG
4623 VPDominatorTree VPDT(Plan);
4624#endif
4625 VPBuilder LatchBuilder(LatchVPBB->getTerminator());
4627 for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
4628 for (VPBlockBase *Pred : to_vector(ExitBlock->getPredecessors())) {
4629 if (Pred == MiddleVPBB)
4630 continue;
4631 // Collect condition for this early exit.
4632 auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
4633 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
4634 VPValue *CondOfEarlyExitingVPBB;
4635 [[maybe_unused]] bool Matched =
4636 match(EarlyExitingVPBB->getTerminator(),
4637 m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
4638 assert(Matched && "Terminator must be BranchOnCond");
4639
4640 // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
4641 // the correct block mask.
4642 VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
4643 auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
4645 TrueSucc == ExitBlock
4646 ? CondOfEarlyExitingVPBB
4647 : EarlyExitingBuilder.createNot(CondOfEarlyExitingVPBB));
4648 assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
4649 !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
4650 VPDT.properlyDominates(
4651 CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
4652 LatchVPBB)) &&
4653 "exit condition must dominate the latch");
4654 Exits.push_back({
4655 EarlyExitingVPBB,
4656 ExitBlock,
4657 CondToEarlyExit,
4658 });
4659 }
4660 }
4661
4662 assert(!Exits.empty() && "must have at least one early exit");
4663 // Sort exits by RPO index to get correct program order. RPO gives a
4664 // topological ordering of the CFG, ensuring upstream exits are checked
4665 // before downstream exits in the dispatch chain.
4667 HeaderVPBB);
4669 for (const auto &[Num, VPB] : enumerate(RPOT))
4670 RPOIdx[VPB] = Num;
4671 llvm::sort(Exits, [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
4672 return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
4673 });
4674#ifndef NDEBUG
4675 // After RPO sorting, verify that for any pair where one exit dominates
4676 // another, the dominating exit comes first. This is guaranteed by RPO
4677 // (topological order) and is required for the dispatch chain correctness.
4678 for (unsigned I = 0; I + 1 < Exits.size(); ++I)
4679 for (unsigned J = I + 1; J < Exits.size(); ++J)
4680 assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
4681 Exits[I].EarlyExitingVPBB) &&
4682 "RPO sort must place dominating exits before dominated ones");
4683#endif
4684
4685 // Build the AnyOf condition for the latch terminator using logical OR
4686 // to avoid poison propagation from later exit conditions when an earlier
4687 // exit is taken.
4688 VPValue *Combined = Exits[0].CondToExit;
4689 for (const EarlyExitInfo &Info : drop_begin(Exits))
4690 Combined = LatchBuilder.createLogicalOr(Combined, Info.CondToExit);
4691
4692 VPValue *IsAnyExitTaken =
4693 LatchBuilder.createNaryOp(VPInstruction::AnyOf, {Combined});
4694
4695 // Create a comparison for the latch exit condition and replace the
4696 // BranchOnCond with a BranchOnTwoConds. The original BranchOnCond's condition
4697 // is used as the latch-exit condition; canonical IV recipes have not been
4698 // introduced yet, so there is no BranchOnCount to derive the condition from.
4699 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4700 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCond &&
4701 "Unexpected terminator");
4702 VPValue *IsLatchExitTaken = LatchExitingBranch->getOperand(0);
4703 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4704 LatchExitingBranch->eraseFromParent();
4705 LatchBuilder.setInsertPoint(LatchVPBB);
4707 {IsAnyExitTaken, IsLatchExitTaken}, LatchDL);
4708 LatchVPBB->clearSuccessors();
4709
4711 // If handling the exiting lane in the scalar loop, combine the exit
4712 // conditions into a single BranchOnCond.
4713 LatchVPBB->setSuccessors({MiddleVPBB, MiddleVPBB, HeaderVPBB});
4714 MiddleVPBB->clearPredecessors();
4715 MiddleVPBB->setPredecessors({LatchVPBB, LatchVPBB});
4717 Plan, Exits, HeaderVPBB, LatchVPBB, MiddleVPBB, TheLoop, PSE, DT, AC);
4718 }
4719
4720 // Create the vector.early.exit blocks.
4721 SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
4722 for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
4723 Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
4724 VPBasicBlock *VectorEarlyExitVPBB =
4725 Plan.createVPBasicBlock("vector.early.exit" + BlockSuffix);
4726 VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
4727 }
4728
4729 // Create the dispatch block (or reuse the single exit block if only one
4730 // exit). The dispatch block computes the first active lane of the combined
4731 // condition and, for multiple exits, chains through conditions to determine
4732 // which exit to take.
4733 VPBasicBlock *DispatchVPBB =
4734 Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
4735 : Plan.createVPBasicBlock("vector.early.exit.check");
4736 DispatchVPBB->setPredecessors({LatchVPBB});
4737 LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
4738 VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
4739 VPValue *FirstActiveLane = DispatchBuilder.createFirstActiveLane(
4740 {Combined}, DebugLoc::getUnknown(), "first.active.lane");
4741
4742 // For each early exit, disconnect the original exiting block
4743 // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
4744 // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
4745 // values at the first active lane:
4746 //
4747 // Input:
4748 // early.exiting.I:
4749 // ...
4750 // EMIT branch-on-cond vp<%cond.I>
4751 // Successor(s): in.loop.succ, ir-bb<exit.I>
4752 //
4753 // ir-bb<exit.I>:
4754 // IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
4755 //
4756 // Output:
4757 // early.exiting.I:
4758 // ...
4759 // Successor(s): in.loop.succ
4760 //
4761 // vector.early.exit.I:
4762 // EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
4763 // Successor(s): ir-bb<exit.I>
4764 //
4765 // ir-bb<exit.I>:
4766 // IR %phi = phi ... (extra operand: vp<%exit.val> from
4767 // vector.early.exit.I)
4768 //
4769 for (auto [Exit, VectorEarlyExitVPBB] :
4770 zip_equal(Exits, VectorEarlyExitVPBBs)) {
4771 auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
4772 // Adjust the phi nodes in EarlyExitVPBB.
4773 // 1. remove incoming values from EarlyExitingVPBB,
4774 // 2. extract the incoming value at FirstActiveLane
4775 // 3. add back the extracts as last operands for the phis
4776 // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
4777 // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
4778 // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
4779 // values from VectorEarlyExitVPBB.
4780 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
4781 auto *ExitIRI = cast<VPIRPhi>(&R);
4782 VPValue *IncomingVal =
4783 ExitIRI->getIncomingValueForBlock(EarlyExitingVPBB);
4784 VPValue *NewIncoming = IncomingVal;
4785 if (!isa<VPIRValue>(IncomingVal)) {
4786 VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
4787 NewIncoming = EarlyExitBuilder.createNaryOp(
4788 VPInstruction::ExtractLane, {FirstActiveLane, IncomingVal},
4789 DebugLoc::getUnknown(), "early.exit.value");
4790 }
4791 ExitIRI->removeIncomingValueFor(EarlyExitingVPBB);
4792 ExitIRI->addIncoming(NewIncoming);
4793 }
4794
4795 EarlyExitingVPBB->getTerminator()->eraseFromParent();
4796 VPBlockUtils::disconnectBlocks(EarlyExitingVPBB, EarlyExitVPBB);
4797 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
4798 }
4799
4800 // Chain through exits: for each exit, check if its condition is true at
4801 // the first active lane. If so, take that exit; otherwise, try the next.
4802 // The last exit needs no check since it must be taken if all others fail.
4803 //
4804 // For 3 exits (cond.0, cond.1, cond.2), this creates:
4805 //
4806 // latch:
4807 // ...
4808 // EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
4809 // ...
4810 //
4811 // vector.early.exit.check:
4812 // EMIT vp<%first.lane> = first-active-lane vp<%combined>
4813 // EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
4814 // EMIT branch-on-cond vp<%at.cond.0>
4815 // Successor(s): vector.early.exit.0, vector.early.exit.check.0
4816 //
4817 // vector.early.exit.check.0:
4818 // EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
4819 // EMIT branch-on-cond vp<%at.cond.1>
4820 // Successor(s): vector.early.exit.1, vector.early.exit.2
4821 VPBasicBlock *CurrentBB = DispatchVPBB;
4822 for (auto [I, Exit] : enumerate(ArrayRef(Exits).drop_back())) {
4823 VPValue *LaneVal = DispatchBuilder.createNaryOp(
4824 VPInstruction::ExtractLane, {FirstActiveLane, Exit.CondToExit},
4825 DebugLoc::getUnknown(), "exit.cond.at.lane");
4826
4827 // For the last dispatch, branch directly to the last exit on false;
4828 // otherwise, create a new check block.
4829 bool IsLastDispatch = (I + 2 == Exits.size());
4830 VPBasicBlock *FalseBB =
4831 IsLastDispatch ? VectorEarlyExitVPBBs.back()
4832 : Plan.createVPBasicBlock(
4833 Twine("vector.early.exit.check.") + Twine(I));
4834
4835 DispatchBuilder.createNaryOp(VPInstruction::BranchOnCond, {LaneVal});
4836 CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
4837 VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
4838 FalseBB->setPredecessors({CurrentBB});
4839
4840 CurrentBB = FalseBB;
4841 DispatchBuilder.setInsertPoint(CurrentBB);
4842 }
4843
4844 return true;
4845}
4846
4847/// This function tries to convert extended in-loop reductions to
4848/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
4849/// valid. The created recipe must be decomposed to its constituent
4850/// recipes before execution.
4851static VPExpressionRecipe *
4853 VFRange &Range) {
4854 Type *RedTy = Red->getScalarType();
4855 VPValue *VecOp = Red->getVecOp();
4856
4857 assert(!Red->isPartialReduction() &&
4858 "This path does not support partial reductions");
4859
4860 // Clamp the range if using extended-reduction is profitable.
4861 auto IsExtendedRedValidAndClampRange =
4862 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4864 [&](ElementCount VF) {
4865 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4867
4869 InstructionCost ExtCost =
4870 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4871 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4872
4873 assert(!RedTy->isFloatingPointTy() &&
4874 "getExtendedReductionCost only supports integer types");
4875 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4876 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4877 Red->getFastMathFlagsOrNone(), CostKind);
4878 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4879 },
4880 Range);
4881 };
4882
4883 VPValue *A;
4884 // Match reduce(ext(A)).
4886 IsExtendedRedValidAndClampRange(
4887 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4888 cast<VPWidenCastRecipe>(VecOp)->getOpcode(), A->getScalarType()))
4889 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4890
4891 return nullptr;
4892}
4893
4894/// This function tries to convert multiply-accumulate in-loop reductions to
4895/// VPExpressionRecipe and clamp the \p Range if it is beneficial
4896/// and valid. The created VPExpressionRecipe must be decomposed to its
4897/// constituent recipes before execution. Patterns of the
4898/// VPExpressionRecipe:
4899/// reduce.add(mul(...)),
4900/// reduce.add(mul(ext(A), ext(B))),
4901/// reduce.add(ext(mul(ext(A), ext(B)))),
4902/// reduce.fadd(fmul(ext(A), ext(B))).
4903static VPExpressionRecipe *
4905 VPCostContext &Ctx, VFRange &Range) {
4906 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4907 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4908 Opcode != Instruction::FAdd)
4909 return nullptr;
4910
4911 assert(!Red->isPartialReduction() &&
4912 "This path does not support partial reductions");
4913 Type *RedTy = Red->getScalarType();
4914
4915 // Clamp the range if using multiply-accumulate-reduction is profitable.
4916 auto IsMulAccValidAndClampRange =
4918 VPWidenCastRecipe *OuterExt) -> bool {
4920 [&](ElementCount VF) {
4922 Type *SrcTy = Ext0 ? Ext0->getOperand(0)->getScalarType() : RedTy;
4923 InstructionCost MulAccCost;
4924
4925 // getMulAccReductionCost for in-loop reductions does not support
4926 // mixed or floating-point extends.
4927 if (Ext0 && Ext1 &&
4928 (Ext0->getOpcode() != Ext1->getOpcode() ||
4929 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4930 return false;
4931
4932 bool IsZExt =
4933 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4934 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4935 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4936 SrcVecTy, CostKind);
4937
4938 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4939 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4940 InstructionCost ExtCost = 0;
4941 if (Ext0)
4942 ExtCost += Ext0->computeCost(VF, Ctx);
4943 if (Ext1)
4944 ExtCost += Ext1->computeCost(VF, Ctx);
4945 if (OuterExt)
4946 ExtCost += OuterExt->computeCost(VF, Ctx);
4947
4948 return MulAccCost.isValid() &&
4949 MulAccCost < ExtCost + MulCost + RedCost;
4950 },
4951 Range);
4952 };
4953
4954 VPValue *VecOp = Red->getVecOp();
4955 VPRecipeBase *Sub = nullptr;
4956 VPValue *A, *B;
4957 VPValue *Tmp = nullptr;
4958
4959 if (RedTy->isFloatingPointTy())
4960 return nullptr;
4961
4962 // Sub reductions could have a sub between the add reduction and vec op.
4963 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4964 Sub = VecOp->getDefiningRecipe();
4965 VecOp = Tmp;
4966 }
4967
4968 // If ValB is a constant and can be safely extended, truncate it to the same
4969 // type as ExtA's operand, then extend it to the same type as ExtA. This
4970 // creates two uniform extends that can more easily be matched by the rest of
4971 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4972 // replaced with the new extend of the constant.
4973 auto ExtendAndReplaceConstantOp = [](VPWidenCastRecipe *ExtA,
4974 VPWidenCastRecipe *&ExtB, VPValue *&ValB,
4975 VPWidenRecipe *Mul) {
4976 if (!ExtA || ExtB || !isa<VPIRValue>(ValB))
4977 return;
4978 Type *NarrowTy = ExtA->getOperand(0)->getScalarType();
4979 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4980 const APInt *Const;
4981 if (!match(ValB, m_APInt(Const)) ||
4983 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4984 return;
4985 // The truncate ensures that the type of each extended operand is the
4986 // same, and it's been proven that the constant can be extended from
4987 // NarrowTy safely. Necessary since ExtA's extended operand would be
4988 // e.g. an i8, while the const will likely be an i32. This will be
4989 // elided by later optimisations.
4990 VPBuilder Builder(Mul);
4991 auto *Trunc =
4992 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4993 Type *WideTy = ExtA->getScalarType();
4994 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4995 Mul->setOperand(1, ExtB);
4996 };
4997
4998 // Try to match reduce.add(mul(...)).
4999 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
5000 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(A);
5001 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(B);
5002 auto *Mul = cast<VPWidenRecipe>(VecOp);
5003
5004 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
5005 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
5006
5007 // Match reduce.add/sub(mul(ext, ext)).
5008 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
5009 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
5010 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
5011 if (Sub)
5012 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
5013 cast<VPWidenRecipe>(Sub), Red);
5014 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
5015 }
5016 // TODO: Add an expression type for this variant with a negated mul
5017 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
5018 return new VPExpressionRecipe(Mul, Red);
5019 }
5020 // TODO: Add an expression type for negated versions of other expression
5021 // variants.
5022 if (Sub)
5023 return nullptr;
5024
5025 // Match reduce.add(ext(mul(A, B))).
5026 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
5027 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
5028 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
5029 auto *Ext0 = dyn_cast<VPWidenCastRecipe>(A);
5030 auto *Ext1 = dyn_cast<VPWidenCastRecipe>(B);
5031
5032 // reduce.add(ext(mul(ext, const)))
5033 // -> reduce.add(ext(mul(ext, ext(const))))
5034 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
5035
5036 // reduce.add(ext(mul(ext(A), ext(B))))
5037 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
5038 // The inner extends must either have the same opcode as the outer extend or
5039 // be the same recipe (Ext0 == Ext1), in which case the multiply can never
5040 // result in a negative value and the outer extend can be folded away by
5041 // doing wider extends for the operands of the mul.
5042 if (Ext0 && Ext1 &&
5043 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
5044 Ext0->getOpcode() == Ext1->getOpcode() &&
5045 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
5046 auto *NewExt0 = new VPWidenCastRecipe(
5047 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getScalarType(), nullptr,
5048 *Ext0, *Ext0, Ext0->getDebugLoc());
5049 NewExt0->insertBefore(Ext0);
5050
5051 VPWidenCastRecipe *NewExt1 = NewExt0;
5052 if (Ext0 != Ext1) {
5053 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
5054 Ext->getScalarType(), nullptr, *Ext1,
5055 *Ext1, Ext1->getDebugLoc());
5056 NewExt1->insertBefore(Ext1);
5057 }
5058 auto *NewMul = Mul->cloneWithOperands({NewExt0, NewExt1});
5059 NewMul->insertBefore(Mul);
5060 Ext->replaceAllUsesWith(NewMul);
5061 Ext->eraseFromParent();
5062 Mul->eraseFromParent();
5063 return new VPExpressionRecipe(NewExt0, NewExt1, NewMul, Red);
5064 }
5065 }
5066 return nullptr;
5067}
5068
5069/// This function tries to create abstract recipes from the reduction recipe for
5070/// use by subsequent optimizations and cost estimation.
5072 VPCostContext &Ctx,
5073 VFRange &Range) {
5074 // Creation of VPExpressions for partial reductions is entirely handled in
5075 // transformToPartialReduction.
5076 assert(!Red->isPartialReduction() &&
5077 "This path does not support partial reductions");
5078
5079 VPExpressionRecipe *AbstractR = nullptr;
5080 auto IP = std::next(Red->getIterator());
5081 auto *VPBB = Red->getParent();
5082 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
5083 AbstractR = MulAcc;
5084 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
5085 AbstractR = ExtRed;
5086 // Cannot create abstract inloop reduction recipes.
5087 if (!AbstractR)
5088 return;
5089
5090 AbstractR->insertBefore(*VPBB, IP);
5091 Red->replaceAllUsesWith(AbstractR);
5092}
5093
5104
5106 if (Plan.hasScalarVFOnly())
5107 return;
5108
5109#ifndef NDEBUG
5110 VPDominatorTree VPDT(Plan);
5111#endif
5112
5113 SmallVector<VPValue *> VPValues;
5114 if (VPValue *BTC = Plan.getBackedgeTakenCount())
5115 VPValues.push_back(BTC);
5116 append_range(VPValues, Plan.getLiveIns());
5117 for (VPRecipeBase &R : *Plan.getEntry())
5118 append_range(VPValues, R.definedValues());
5119
5120 auto *VectorPreheader = Plan.getVectorPreheader();
5121 for (VPValue *VPV : VPValues) {
5123 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
5124 continue;
5125
5126 // Add explicit broadcast at the insert point that dominates all users.
5127 VPBasicBlock *HoistBlock = VectorPreheader;
5128 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
5129 for (VPUser *User : VPV->users()) {
5130 if (User->usesScalars(VPV))
5131 continue;
5132 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
5133 HoistPoint = HoistBlock->begin();
5134 else
5135 assert(VPDT.dominates(VectorPreheader,
5136 cast<VPRecipeBase>(User)->getParent()) &&
5137 "All users must be in the vector preheader or dominated by it");
5138 }
5139
5140 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
5141 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
5142 VPV->replaceUsesWithIf(Broadcast,
5143 [VPV, Broadcast](VPUser &U, unsigned Idx) {
5144 return Broadcast != &U && !U.usesScalars(VPV);
5145 });
5146 }
5147}
5148
5149// Collect common metadata from a group of replicate recipes by intersecting
5150// metadata from all recipes in the group.
5152 VPIRMetadata CommonMetadata = *Recipes.front();
5153 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
5154 CommonMetadata.intersect(*Recipe);
5155 return CommonMetadata;
5156}
5157
5158template <unsigned Opcode>
5162 const Loop *L) {
5163 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
5164 "Only Load and Store opcodes supported");
5165 [[maybe_unused]] constexpr bool IsLoad = (Opcode == Instruction::Load);
5166
5167 // For each address, collect operations with the same or complementary masks.
5170 Plan, PSE, L,
5171 [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
5172 for (auto Recipes : Groups) {
5173 if (Recipes.size() < 2)
5174 continue;
5175
5177 map_range(Recipes, bind_back<getLoadStoreValueType>(IsLoad))) &&
5178 "Expected all recipes in group to have the same load-store type");
5179
5180 // Collect groups with the same or complementary masks.
5181 for (VPReplicateRecipe *&RecipeI : Recipes) {
5182 if (!RecipeI)
5183 continue;
5184
5185 VPValue *MaskI = RecipeI->getMask();
5187 Group.push_back(RecipeI);
5188 RecipeI = nullptr;
5189
5190 // Find all operations with the same or complementary masks.
5191 bool HasComplementaryMask = false;
5192 for (VPReplicateRecipe *&RecipeJ : Recipes) {
5193 if (!RecipeJ)
5194 continue;
5195
5196 VPValue *MaskJ = RecipeJ->getMask();
5197 // Check if any operation in the group has a complementary mask with
5198 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
5199 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
5200 match(MaskJ, m_Not(m_Specific(MaskI)));
5201 Group.push_back(RecipeJ);
5202 RecipeJ = nullptr;
5203 }
5204
5205 if (HasComplementaryMask) {
5206 assert(Group.size() >= 2 && "must have at least 2 entries");
5207 AllGroups.push_back(std::move(Group));
5208 }
5209 }
5210 }
5211
5212 return AllGroups;
5213}
5214
5215// Find the recipe with minimum alignment in the group.
5216template <typename InstType>
5217static VPReplicateRecipe *
5219 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
5220 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
5221 cast<InstType>(B->getUnderlyingInstr())->getAlign();
5222 });
5223}
5224
5227 const Loop *L) {
5228 auto Groups =
5230 if (Groups.empty())
5231 return;
5232
5233 // Process each group of loads.
5234 for (auto &Group : Groups) {
5235 // Try to use the earliest (most dominating) load to replace all others.
5236 VPReplicateRecipe *EarliestLoad = Group[0];
5237 VPBasicBlock *FirstBB = EarliestLoad->getParent();
5238 VPBasicBlock *LastBB = Group.back()->getParent();
5239
5240 // Check that the load doesn't alias with stores between first and last.
5241 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
5242 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
5243 continue;
5244
5245 // Collect common metadata from all loads in the group.
5246 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
5247
5248 // Find the load with minimum alignment to use.
5249 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
5250
5251 bool IsSingleScalar = EarliestLoad->isSingleScalar();
5252 assert(all_of(Group,
5253 [IsSingleScalar](VPReplicateRecipe *R) {
5254 return R->isSingleScalar() == IsSingleScalar;
5255 }) &&
5256 "all members in group must agree on IsSingleScalar");
5257
5258 // Create an unpredicated version of the earliest load with common
5259 // metadata.
5260 auto *UnpredicatedLoad = new VPReplicateRecipe(
5261 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
5262 IsSingleScalar, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);
5263
5264 UnpredicatedLoad->insertBefore(EarliestLoad);
5265
5266 // Replace all loads in the group with the unpredicated load.
5267 for (VPReplicateRecipe *Load : Group) {
5268 Load->replaceAllUsesWith(UnpredicatedLoad);
5269 Load->eraseFromParent();
5270 }
5271 }
5272}
5273
5274static bool
5276 PredicatedScalarEvolution &PSE, const Loop &L) {
5277 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
5278 if (!StoreLoc || !StoreLoc->AATags.Scope)
5279 return false;
5280
5281 // When sinking a group of stores, all members of the group alias each other.
5282 // Skip them during the alias checks.
5283 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
5284 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
5285 SinkStoreInfo SinkInfo(StoresToSink, *StoresToSink[0], PSE, L);
5286 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
5287}
5288
5291 const Loop *L) {
5292 auto Groups =
5294 if (Groups.empty())
5295 return;
5296
5297 for (auto &Group : Groups) {
5298 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L))
5299 continue;
5300
5301 // Use the last (most dominated) store's location for the unconditional
5302 // store.
5303 VPReplicateRecipe *LastStore = Group.back();
5304 VPBasicBlock *InsertBB = LastStore->getParent();
5305
5306 // Collect common alias metadata from all stores in the group.
5307 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
5308
5309 // Build select chain for stored values.
5310 VPValue *SelectedValue = Group[0]->getOperand(0);
5311 VPBuilder Builder(InsertBB, LastStore->getIterator());
5312
5313 bool IsSingleScalar = Group[0]->isSingleScalar();
5314 for (unsigned I = 1; I < Group.size(); ++I) {
5315 assert(IsSingleScalar == Group[I]->isSingleScalar() &&
5316 "all members in group must agree on IsSingleScalar");
5317 VPValue *Mask = Group[I]->getMask();
5318 VPValue *Value = Group[I]->getOperand(0);
5319 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
5320 Group[I]->getDebugLoc());
5321 }
5322
5323 // Find the store with minimum alignment to use.
5324 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
5325
5326 // Create unconditional store with selected value and common metadata.
5327 auto *UnpredicatedStore = new VPReplicateRecipe(
5328 StoreWithMinAlign->getUnderlyingInstr(),
5329 {SelectedValue, LastStore->getOperand(1)}, IsSingleScalar,
5330 /*Mask=*/nullptr, *LastStore, CommonMetadata);
5331 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
5332
5333 // Remove all predicated stores from the group.
5334 for (VPReplicateRecipe *Store : Group)
5335 Store->eraseFromParent();
5336 }
5337}
5338
5340 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
5342 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
5343 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
5344
5345 VPValue *TC = Plan.getTripCount();
5346 if (TC->user_empty())
5347 return;
5348
5349 // Skip cases for which the trip count may be non-trivial to materialize.
5350 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
5351 // tail is required.
5352 if (!Plan.hasScalarTail() ||
5354 Plan.getScalarPreheader() ||
5355 !isa<VPIRValue>(TC))
5356 return;
5357
5358 // Materialize vector trip counts for constants early if they can simply
5359 // be computed as (Original TC / (VF * UF)) * (VF * UF).
5360 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
5361 // tail-folded loops.
5362 ScalarEvolution &SE = *PSE.getSE();
5363 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
5364 if (!isa<SCEVConstant>(TCScev))
5365 return;
5366 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
5367 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
5368 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
5369 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
5370}
5371
5373 VPBasicBlock *VectorPH) {
5375 if (BTC->user_empty())
5376 return;
5377
5378 VPBuilder Builder(VectorPH, VectorPH->begin());
5379 auto *TCTy = Plan.getTripCount()->getScalarType();
5380 auto *TCMO =
5381 Builder.createSub(Plan.getTripCount(), Plan.getConstantInt(TCTy, 1),
5382 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
5383 BTC->replaceAllUsesWith(TCMO);
5384}
5385
5387 if (Plan.hasScalarVFOnly())
5388 return;
5389
5390 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
5391 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5393 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5394 vp_depth_first_shallow(LoopRegion->getEntry()));
5395 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
5396 // VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
5397 // regions. Those are not materialized explicitly yet.
5398 // TODO: materialize build vectors for replicating recipes in replicating
5399 // regions.
5400 for (VPBasicBlock *VPBB :
5401 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
5402 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
5404 continue;
5405 auto *DefR = cast<VPSingleDefRecipe>(&R);
5406 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
5407 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5408 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
5409 };
5410 if ((isa<VPReplicateRecipe>(DefR) &&
5411 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
5412 (isa<VPInstruction>(DefR) &&
5414 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
5415 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
5416 continue;
5417
5418 Type *ScalarTy = DefR->getScalarType();
5419 unsigned Opcode = ScalarTy->isStructTy()
5422 auto *BuildVector = new VPInstruction(Opcode, {DefR});
5423 BuildVector->insertAfter(DefR);
5424
5425 DefR->replaceUsesWithIf(
5426 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
5427 VPUser &U, unsigned) {
5428 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
5429 });
5430 }
5431 }
5432
5433 // Create explicit VPInstructions to convert vectors to scalars. The current
5434 // implementation is conservative - it may miss some cases that may or may not
5435 // be vector values. TODO: introduce Unpacks speculatively - remove them later
5436 // if they are known to operate on scalar values.
5437 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
5438 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
5440 VPDerivedIVRecipe>(&R))
5441 continue;
5442 for (VPValue *Def : R.definedValues()) {
5443 // Skip recipes that are single-scalar.
5444 // TODO: The Defs skipped here may or may not be vector values.
5445 // Introduce Unpacks, and remove them later, if they are guaranteed to
5446 // produce scalar values.
5447 if (vputils::isSingleScalar(Def))
5448 continue;
5449
5450 // Only introduce an Unpack if some, but not all, users use the first
5451 // lane only.
5452 unsigned NumFirstLaneUsers = count_if(Def->users(), [&Def](VPUser *U) {
5453 return U->usesFirstLaneOnly(Def);
5454 });
5455 if (!NumFirstLaneUsers || NumFirstLaneUsers == Def->getNumUsers())
5456 continue;
5457
5458 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
5459 if (R.isPhi())
5460 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
5461 else
5462 Unpack->insertAfter(&R);
5463 Def->replaceUsesWithIf(Unpack, [&Def](VPUser &U, unsigned) {
5464 return U.usesFirstLaneOnly(Def);
5465 });
5466 }
5467 }
5468 }
5469}
5470
5472 VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking,
5473 bool RequiresScalarEpilogue, VPValue *Step,
5474 std::optional<uint64_t> MaxRuntimeStep) {
5475 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
5476 // There's nothing to do if there are no users of the vector trip count or its
5477 // IR value has already been set.
5478 if (VectorTC.user_empty() || VectorTC.getUnderlyingValue())
5479 return;
5480
5481 VPValue *TC = Plan.getTripCount();
5482 Type *TCTy = TC->getScalarType();
5483 VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
5484 if (auto *StepR = Step->getDefiningRecipe()) {
5485 assert(VPDominatorTree(Plan).dominates(StepR->getParent(), VectorPHVPBB) &&
5486 "Step VPBB must dominate VectorPHVPBB");
5487 // Insert after Step's definition to maintain valid def-use ordering.
5488 InsertPt = std::next(StepR->getIterator());
5489 }
5490 VPBuilder Builder(VectorPHVPBB, InsertPt);
5491
5492 // For scalable steps, if TC is a constant and is divisible by the maximum
5493 // possible runtime step, then TC % Step == 0 for all valid vscale values
5494 // and the vector trip count equals TC directly.
5495 const APInt *TCVal;
5496 if (!RequiresScalarEpilogue && match(TC, m_APInt(TCVal)) && MaxRuntimeStep &&
5497 TCVal->urem(*MaxRuntimeStep) == 0) {
5498 VectorTC.replaceAllUsesWith(TC);
5499 return;
5500 }
5501
5502 // If the tail is to be folded by masking, round the number of iterations N
5503 // up to a multiple of Step instead of rounding down. This is done by first
5504 // adding Step-1 and then rounding down. Note that it's ok if this addition
5505 // overflows: the vector induction variable will eventually wrap to zero given
5506 // that it starts at zero and its Step is a power of two; the loop will then
5507 // exit, with the last early-exit vector comparison also producing all-true.
5508 if (TailByMasking) {
5509 TC = Builder.createAdd(
5510 TC, Builder.createSub(Step, Plan.getConstantInt(TCTy, 1)),
5511 DebugLoc::getCompilerGenerated(), "n.rnd.up");
5512 }
5513
5514 // Now we need to generate the expression for the part of the loop that the
5515 // vectorized body will execute. This is equal to N - (N % Step) if scalar
5516 // iterations are not required for correctness, or N - Step, otherwise. Step
5517 // is equal to the vectorization factor (number of SIMD elements) times the
5518 // unroll factor (number of SIMD instructions).
5519 VPValue *R =
5520 Builder.createNaryOp(Instruction::URem, {TC, Step},
5521 DebugLoc::getCompilerGenerated(), "n.mod.vf");
5522
5523 // There are cases where we *must* run at least one iteration in the remainder
5524 // loop. See the cost model for when this can happen. If the step evenly
5525 // divides the trip count, we set the remainder to be equal to the step. If
5526 // the step does not evenly divide the trip count, no adjustment is necessary
5527 // since there will already be scalar iterations. Note that the minimum
5528 // iterations check ensures that N >= Step.
5529 if (RequiresScalarEpilogue) {
5530 assert(!TailByMasking &&
5531 "requiring scalar epilogue is not supported with fail folding");
5532 VPValue *IsZero =
5533 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getZero(TCTy));
5534 R = Builder.createSelect(IsZero, Step, R);
5535 }
5536
5537 VPValue *Res =
5538 Builder.createSub(TC, R, DebugLoc::getCompilerGenerated(), "n.vec");
5539 VectorTC.replaceAllUsesWith(Res);
5540}
5541
5543 ElementCount VFEC) {
5544 // If VF and VFxUF have already been materialized (no remaining users),
5545 // there's nothing more to do.
5546 if (Plan.getVF().isMaterialized()) {
5547 assert(Plan.getVFxUF().isMaterialized() &&
5548 "VF and VFxUF must be materialized together");
5549 return;
5550 }
5551
5552 VPBuilder Builder(VectorPH, VectorPH->begin());
5553 Type *TCTy = Plan.getTripCount()->getScalarType();
5554 VPValue &VF = Plan.getVF();
5555 VPValue &VFxUF = Plan.getVFxUF();
5556 // If there are no users of the runtime VF, compute VFxUF by constant folding
5557 // the multiplication of VF and UF.
5558 if (VF.user_empty()) {
5559 VPValue *RuntimeVFxUF =
5560 Builder.createElementCount(TCTy, VFEC * Plan.getConcreteUF());
5561 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
5562 return;
5563 }
5564
5565 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
5566 // vscale) * UF.
5567 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
5569 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
5571 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
5572 }
5573 VF.replaceAllUsesWith(RuntimeVF);
5574
5575 VPValue *MulByUF = Builder.createOverflowingOp(
5576 Instruction::Mul,
5577 {RuntimeVF, Plan.getConstantInt(TCTy, Plan.getConcreteUF())},
5578 {true, false});
5579 VFxUF.replaceAllUsesWith(MulByUF);
5580}
5581
5583 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
5584 auto *HeaderMaskDef = HeaderMask->getDefiningRecipe();
5585 Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext());
5586
5587 VPBuilder Builder(Plan.getVectorPreheader());
5588 auto *AliasMask = Builder.createNaryOp(
5589 VPInstruction::IncomingAliasMask, {}, nullptr, {}, {},
5590 DebugLoc::getUnknown(), "incoming.alias.mask", I1Ty);
5591
5592 if (HeaderMaskDef->isPhi())
5593 Builder = VPBuilder(&*HeaderMaskDef->getParent()->getFirstNonPhi());
5594 else
5595 Builder = VPBuilder::getToInsertAfter(HeaderMaskDef);
5596
5597 // Update all existing users of the header mask to "HeaderMask & AliasMask".
5598 auto *ClampedHeaderMask = Builder.createAnd(HeaderMask, AliasMask);
5599 HeaderMask->replaceUsesWithIf(ClampedHeaderMask, [&](VPUser &U, unsigned) {
5600 return &U != ClampedHeaderMask;
5601 });
5602}
5603
5604VPValue *
5606 ArrayRef<PointerDiffInfo> DiffChecks) {
5607 VPBuilder Builder(AliasCheckVPBB);
5608 Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext());
5609
5610 VPValue *IncomingAliasMask = vputils::findIncomingAliasMask(Plan);
5611 assert(IncomingAliasMask && "Expected an alias mask!");
5612
5613 VPValue *AliasMask = nullptr;
5614 for (const PointerDiffInfo &Check : DiffChecks) {
5616 VPValue *Sink =
5618 Type *AddrType = Src->getScalarType();
5619
5620 // TODO: Only freeze the required pointer (not both src and sink).
5621 if (Check.NeedsFreeze) {
5622 Src = Builder.createScalarFreeze(Src, AddrType, DebugLoc::getUnknown());
5623 Sink = Builder.createScalarFreeze(Sink, AddrType, DebugLoc::getUnknown());
5624 }
5625
5626 // TODO: Generate loop_dependence_raw_mask when there's a read-after-write
5627 // dependency between the source and the sink. This is not necessary for
5628 // correctness of the mask, but using the "raw" variant prevents loads
5629 // depending on the completion of stores.
5630 VPWidenIntrinsicRecipe *WARMask = Builder.insert(new VPWidenIntrinsicRecipe(
5631 Intrinsic::loop_dependence_war_mask,
5632 {Src, Sink, Plan.getConstantInt(AddrType, Check.AccessSize)}, I1Ty));
5633
5634 if (AliasMask)
5635 AliasMask = Builder.createAnd(AliasMask, WARMask);
5636 else
5637 AliasMask = WARMask;
5638 }
5639
5641 Type *IndexTy = Plan.getDataLayout().getIndexType(Plan.getContext(), 0);
5642 VPValue *NumActive = Builder.createNaryOp(
5643 VPInstruction::NumActiveLanes, {AliasMask}, nullptr, {}, {},
5644 DebugLoc::getUnknown(), "num.active.lanes", IndexTy);
5645 VPValue *ClampedVF = Builder.createScalarZExtOrTrunc(
5646 NumActive, IVTy, IndexTy, DebugLoc::getCompilerGenerated());
5647
5648 IncomingAliasMask->replaceAllUsesWith(AliasMask);
5649
5650 return ClampedVF;
5651}
5652
5654 VPlan &Plan, ArrayRef<PointerDiffInfo> DiffChecks, bool HasBranchWeights) {
5655 VPBasicBlock *ClampedVFCheck =
5656 Plan.createVPBasicBlock("vector.clamped.vf.check");
5657
5658 VPValue *ClampedVF = materializeAliasMask(Plan, ClampedVFCheck, DiffChecks);
5659 VPBuilder Builder(ClampedVFCheck);
5661 Type *TCTy = Plan.getTripCount()->getScalarType();
5662
5663 // Check the "ClampedVF" from the alias mask is larger than one.
5664 VPValue *IsScalar =
5665 Builder.createICmp(CmpInst::ICMP_ULE, ClampedVF,
5666 Plan.getConstantInt(TCTy, 1), DL, "vf.is.scalar");
5667
5668 VPValue *TripCount = Plan.getTripCount();
5669 VPValue *MaxUIntTripCount =
5671 VPValue *DistanceToMax = Builder.createSub(MaxUIntTripCount, TripCount);
5672
5673 // For tail-folding: Don't execute the vector loop if (UMax - n) < ClampedVF.
5674 // Note: The ClampedVF may not be a power-of-two. This means the loop exit
5675 // condition (index.next == n.vec) may not be correct in the case of an
5676 // overflow. The issue is that `n.vec` could be zero due to an overflow, but
5677 // index.next is not guaranteed to overflow to zero, as the ClampedVF may not
5678 // be a power-of-two.
5679 VPValue *TripCountCheck = Builder.createICmp(
5680 ICmpInst::ICMP_ULT, DistanceToMax, ClampedVF, DL, "vf.step.overflow");
5681
5682 VPValue *Cond = Builder.createOr(IsScalar, TripCountCheck, DL);
5683 attachVPCheckBlock(Plan, Cond, ClampedVFCheck, HasBranchWeights);
5684
5685 // Materialize the trip count early as this will add a use of (VFxUF) that
5686 // needs to be replaced with the ClampedVF.
5688 /*TailByMasking=*/true,
5689 /*RequiresScalarEpilogue=*/false,
5690 &Plan.getVFxUF());
5691
5692 assert(Plan.getConcreteUF() == 1 &&
5693 "Clamped VF not supported with interleaving");
5694 Plan.getVF().replaceAllUsesWith(ClampedVF);
5695 Plan.getVFxUF().replaceAllUsesWith(ClampedVF);
5696}
5697
5699 ScalarEvolution &SE) {
5700 auto *Entry = Plan.getEntry();
5701 VPBuilder Builder(Entry, Entry->begin());
5703 ->getIRBasicBlock()
5704 ->getTerminator()
5705 ->getDebugLoc();
5706 VPSCEVExpander Expander(Builder, SE, DL);
5707
5708 // Expand VPExpandSCEVRecipes to VPInstructions using VPSCEVExpander. During
5709 // the transition, unsupported VPExpandSCEVRecipes are skipped and left for
5710 // late expansion.
5711 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5712 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5713 if (!ExpSCEV || ExpSCEV->user_empty())
5714 continue;
5715 Builder.setInsertPoint(ExpSCEV);
5716 VPValue *Expanded = Expander.tryToExpand(ExpSCEV->getSCEV());
5717 if (!Expanded)
5718 continue;
5719 ExpSCEV->replaceAllUsesWith(Expanded);
5720 // TripCount should not be used after expansion to VPInstructions. Reset to
5721 // poison to avoid dangling references.
5722 if (Plan.getTripCount() == ExpSCEV)
5723 Plan.resetTripCount(Plan.getPoison(ExpSCEV->getScalarType()));
5724 ExpSCEV->eraseFromParent();
5725 }
5726}
5727
5730 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
5731
5732 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
5733 BasicBlock *EntryBB = Entry->getIRBasicBlock();
5734 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
5735 // Expand remaining VPExpandSCEVRecipes to IR instructions using SCEVExpander.
5736 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5737 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5738 if (!ExpSCEV)
5739 continue;
5740 const SCEV *Expr = ExpSCEV->getSCEV();
5741 Value *Res =
5742 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
5743 ExpandedSCEVs[Expr] = Res;
5744 VPValue *Exp = Plan.getOrAddLiveIn(Res);
5745 ExpSCEV->replaceAllUsesWith(Exp);
5746 if (Plan.getTripCount() == ExpSCEV)
5747 Plan.resetTripCount(Exp);
5748 ExpSCEV->eraseFromParent();
5749 }
5751 "all VPExpandSCEVRecipes must have been expanded");
5752 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
5753 // to the VPIRBasicBlock.
5754 auto EI = Entry->begin();
5755 for (Instruction &I : drop_end(*EntryBB)) {
5756 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
5757 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
5758 EI++;
5759 continue;
5760 }
5762 }
5763
5764 return ExpandedSCEVs;
5765}
5766
5767/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
5768/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
5769/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
5770/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
5771/// an index-independent load if it feeds all wide ops at all indices (\p OpV
5772/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
5773/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
5774/// is defined at \p Idx of a load interleave group.
5775/// A live-in or recipe defined outside the loop region can be converted, if it
5776/// is the same across all lanes, or we can create a BuildVector for it.
5777static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5778 VPValue *OpV, unsigned Idx, bool IsScalable) {
5779 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
5780 if (Member0Op->isDefinedOutsideLoopRegions()) {
5781 // Operand matches Member0, broadcast across all fields for both live-ins
5782 // and recipes.
5783 if (Member0Op == OpV)
5784 return true;
5785 // Otherwise distinct per-field VPValues are assembled into a BuildVector.
5786 return !IsScalable && OpV->isDefinedOutsideLoopRegions() &&
5787 OpV->getScalarType() == Member0Op->getScalarType();
5788 }
5789 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5790 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
5791 // For scalable VFs, the narrowed plan processes vscale iterations at once,
5792 // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
5793 return !IsScalable && !W->getMask() && W->isConsecutive() &&
5794 Member0Op == OpV;
5795 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
5796 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
5797 return false;
5798}
5799
5800static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
5802 auto *WideMember0 = dyn_cast<VPRecipeWithIRFlags>(Ops[0]);
5803 if (!WideMember0)
5804 return false;
5805 for (VPValue *V : Ops) {
5807 return false;
5808 auto *R = cast<VPRecipeWithIRFlags>(V);
5809 if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
5810 return false;
5811 if (R->getScalarType() != WideMember0->getScalarType())
5812 return false;
5813 if (R->hasPredicate() && R->getPredicate() != WideMember0->getPredicate())
5814 return false;
5815 }
5816
5817 for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
5819 for (VPValue *Op : Ops)
5820 OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));
5821
5822 if (canNarrowOps(OpsI, IsScalable))
5823 continue;
5824
5825 if (any_of(enumerate(OpsI), [WideMember0, Idx, IsScalable](const auto &P) {
5826 const auto &[OpIdx, OpV] = P;
5827 return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
5828 }))
5829 return false;
5830 }
5831
5832 return true;
5833}
5834
5835/// Returns VF from \p VFs if \p InterleaveR is a full interleave group with
5836/// factor and number of members both equal to VF. The interleave group must
5837/// also access the full vector width.
5838static std::optional<ElementCount>
5841 const TargetTransformInfo &TTI) {
5842 if (!InterleaveR || InterleaveR->getMask())
5843 return std::nullopt;
5844
5845 Type *GroupElementTy = nullptr;
5846 if (InterleaveR->getStoredValues().empty()) {
5847 GroupElementTy = InterleaveR->getVPValue(0)->getScalarType();
5848 if (!all_of(InterleaveR->definedValues(), [GroupElementTy](VPValue *Op) {
5849 return Op->getScalarType() == GroupElementTy;
5850 }))
5851 return std::nullopt;
5852 } else {
5853 GroupElementTy = InterleaveR->getStoredValues()[0]->getScalarType();
5854 if (!all_of(InterleaveR->getStoredValues(), [GroupElementTy](VPValue *Op) {
5855 return Op->getScalarType() == GroupElementTy;
5856 }))
5857 return std::nullopt;
5858 }
5859
5860 auto IG = InterleaveR->getInterleaveGroup();
5861 if (IG->getFactor() != IG->getNumMembers())
5862 return std::nullopt;
5863
5864 auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
5865 TypeSize Size = TTI.getRegisterBitWidth(
5868 assert(Size.isScalable() == VF.isScalable() &&
5869 "if Size is scalable, VF must be scalable and vice versa");
5870 return Size.getKnownMinValue();
5871 };
5872
5873 for (ElementCount VF : VFs) {
5874 unsigned MinVal = VF.getKnownMinValue();
5875 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
5876 if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
5877 return {VF};
5878 }
5879 return std::nullopt;
5880}
5881
5882/// Returns true if \p VPValue is a narrow VPValue.
5883static bool isAlreadyNarrow(VPValue *VPV) {
5884 if (isa<VPIRValue>(VPV))
5885 return true;
5886 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
5887 return RepR && RepR->isSingleScalar();
5888}
5889
5890// Convert the wide recipes defining the VPValues in \p Members feeding an
5891// interleave group to a single narrow variant. The first member is reused as
5892// the narrowed recipe. BuildVectors for live-in operands are inserted into \p
5893// Preheader.
5895 SmallPtrSetImpl<VPValue *> &NarrowedOps,
5896 VPBasicBlock *Preheader) {
5897 VPValue *V = Members.front();
5898 if (NarrowedOps.contains(V))
5899 return V;
5900
5901 if (V->isDefinedOutsideLoopRegions()) {
5902 assert(all_of(Members,
5903 [V](VPValue *M) {
5904 return M->isDefinedOutsideLoopRegions() &&
5905 M->getScalarType() == V->getScalarType();
5906 }) &&
5907 "expected distinct loop-invariant values of matching scalar type");
5908 auto *BV = new VPInstruction(VPInstruction::BuildVector, Members);
5909 Preheader->appendRecipe(BV);
5910 NarrowedOps.insert(BV);
5911 return BV;
5912 }
5913
5914 if (isAlreadyNarrow(V))
5915 return V;
5916
5917 VPRecipeBase *R = V->getDefiningRecipe();
5919 auto *WideMember0 = cast<VPRecipeWithIRFlags>(R);
5920 for (VPValue *Member : Members.drop_front())
5921 WideMember0->intersectFlags(*cast<VPRecipeWithIRFlags>(Member));
5922 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx) {
5924 for (VPValue *Member : Members)
5925 OpsI.push_back(Member->getDefiningRecipe()->getOperand(Idx));
5926 WideMember0->setOperand(
5927 Idx, narrowInterleaveGroupOp(OpsI, NarrowedOps, Preheader));
5928 }
5929 return V;
5930 }
5931
5932 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
5933 // Narrow interleave group to wide load, as transformed VPlan will only
5934 // process one original iteration.
5935 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
5936 auto *L = new VPWidenLoadRecipe(*LI, LoadGroup->getAddr(),
5937 LoadGroup->getMask(), /*Consecutive=*/true,
5938 *LoadGroup, LoadGroup->getDebugLoc());
5939 L->insertBefore(LoadGroup);
5940 NarrowedOps.insert(L);
5941 return L;
5942 }
5943
5944 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
5945 assert(RepR->isSingleScalar() && RepR->getOpcode() == Instruction::Load &&
5946 "must be a single scalar load");
5947 NarrowedOps.insert(RepR);
5948 return RepR;
5949 }
5950
5951 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
5952 VPValue *PtrOp = WideLoad->getAddr();
5953 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
5954 PtrOp = VecPtr->getOperand(0);
5955 // Narrow wide load to uniform scalar load, as transformed VPlan will only
5956 // process one original iteration.
5957 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
5958 /*IsUniform*/ true,
5959 /*Mask*/ nullptr, {}, *WideLoad);
5960 N->insertBefore(WideLoad);
5961 NarrowedOps.insert(N);
5962 return N;
5963}
5964
5965std::unique_ptr<VPlan>
5967 const TargetTransformInfo &TTI) {
5968 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5969
5970 if (!VectorLoop)
5971 return nullptr;
5972
5973 // Only handle single-block loops for now.
5974 if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
5975 return nullptr;
5976
5977 // Skip plans when we may not be able to properly narrow.
5978 VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
5979 if (!match(&Exiting->back(), m_BranchOnCount()))
5980 return nullptr;
5981
5982 assert(match(&Exiting->back(),
5984 m_Specific(&Plan.getVectorTripCount()))) &&
5985 "unexpected branch-on-count");
5986
5988 std::optional<ElementCount> VFToOptimize;
5989 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
5992 continue;
5993
5994 // Bail out on recipes not supported at the moment:
5995 // * phi recipes other than the canonical induction
5996 // * recipes writing to memory except interleave groups
5997 // Only support plans with a canonical induction phi.
5998 if (R.isPhi())
5999 return nullptr;
6000
6001 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
6002 if (R.mayWriteToMemory() && !InterleaveR)
6003 return nullptr;
6004
6005 // Bail out if any recipe defines a vector value used outside the
6006 // vector loop region.
6007 if (any_of(R.definedValues(), [&](VPValue *V) {
6008 return any_of(V->users(), [&](VPUser *U) {
6009 auto *UR = cast<VPRecipeBase>(U);
6010 return UR->getParent()->getParent() != VectorLoop;
6011 });
6012 }))
6013 return nullptr;
6014
6015 // All other ops are allowed, but we reject uses that cannot be converted
6016 // when checking all allowed consumers (store interleave groups) below.
6017 if (!InterleaveR)
6018 continue;
6019
6020 // Try to find a single VF, where all interleave groups are consecutive and
6021 // saturate the full vector width. If we already have a candidate VF, check
6022 // if it is applicable for the current InterleaveR, otherwise look for a
6023 // suitable VF across the Plan's VFs.
6025 VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
6026 : to_vector(Plan.vectorFactors());
6027 std::optional<ElementCount> NarrowedVF =
6028 isConsecutiveInterleaveGroup(InterleaveR, VFs, TTI);
6029 if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
6030 return nullptr;
6031 VFToOptimize = NarrowedVF;
6032
6033 // Skip read interleave groups.
6034 if (InterleaveR->getStoredValues().empty())
6035 continue;
6036
6037 // Narrow interleave groups, if all operands are already matching narrow
6038 // ops.
6039 auto *Member0 = InterleaveR->getStoredValues()[0];
6040 if (isAlreadyNarrow(Member0) &&
6041 all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
6042 StoreGroups.push_back(InterleaveR);
6043 continue;
6044 }
6045
6046 // For now, we only support full interleave groups storing load interleave
6047 // groups.
6048 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
6049 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
6050 if (!DefR)
6051 return false;
6052 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
6053 return IR && IR->getInterleaveGroup()->isFull() &&
6054 IR->getVPValue(Op.index()) == Op.value();
6055 })) {
6056 StoreGroups.push_back(InterleaveR);
6057 continue;
6058 }
6059
6060 // Check if all values feeding InterleaveR are matching wide recipes whose
6061 // operands can be narrowed.
6062 if (!canNarrowOps(InterleaveR->getStoredValues(),
6063 VFToOptimize->isScalable()))
6064 return nullptr;
6065 StoreGroups.push_back(InterleaveR);
6066 }
6067
6068 if (StoreGroups.empty())
6069 return nullptr;
6070
6071 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
6072 bool RequiresScalarEpilogue =
6073 MiddleVPBB->getNumSuccessors() == 1 &&
6074 MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
6075 // Bail out for tail-folding (middle block with a single successor to exit).
6076 if (MiddleVPBB->getNumSuccessors() != 2 && !RequiresScalarEpilogue)
6077 return nullptr;
6078
6079 // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
6080 // original Plan into 2: a) a new clone which contains all VFs of Plan, except
6081 // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
6082 // TODO: Handle cases where only some interleave groups can be narrowed.
6083 std::unique_ptr<VPlan> NewPlan;
6084 if (size(Plan.vectorFactors()) != 1) {
6085 NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
6086 Plan.setVF(*VFToOptimize);
6087 NewPlan->removeVF(*VFToOptimize);
6088 }
6089
6090 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
6091 SmallPtrSet<VPValue *, 4> NarrowedOps;
6092 VPBasicBlock *Preheader = Plan.getVectorPreheader();
6093 // Narrow operation tree rooted at store groups.
6094 for (auto *StoreGroup : StoreGroups) {
6095 VPValue *Res = narrowInterleaveGroupOp(StoreGroup->getStoredValues(),
6096 NarrowedOps, Preheader);
6097 auto *SI =
6098 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
6099 auto *S = new VPWidenStoreRecipe(*SI, StoreGroup->getAddr(), Res, nullptr,
6100 /*Consecutive=*/true, *StoreGroup,
6101 StoreGroup->getDebugLoc());
6102 S->insertBefore(StoreGroup);
6103 StoreGroup->eraseFromParent();
6104 }
6105
6106 // Adjust induction to reflect that the transformed plan only processes one
6107 // original iteration.
6109 Type *CanIVTy = VectorLoop->getCanonicalIVType();
6110 VPBasicBlock *VectorPH = Plan.getVectorPreheader();
6111 VPBuilder PHBuilder(VectorPH, VectorPH->begin());
6112
6113 VPValue *UF = &Plan.getUF();
6114 VPValue *Step;
6115 if (VFToOptimize->isScalable()) {
6116 VPValue *VScale =
6117 PHBuilder.createElementCount(CanIVTy, ElementCount::getScalable(1));
6118 Step = PHBuilder.createOverflowingOp(Instruction::Mul, {VScale, UF},
6119 {true, false});
6120 Plan.getVF().replaceAllUsesWith(VScale);
6121 } else {
6122 Step = UF;
6123 Plan.getVF().replaceAllUsesWith(Plan.getConstantInt(CanIVTy, 1));
6124 }
6125 // Materialize vector trip count with the narrowed step.
6126 materializeVectorTripCount(Plan, VectorPH, /*TailByMasking=*/false,
6127 RequiresScalarEpilogue, Step);
6128
6129 CanIVInc->setOperand(1, Step);
6130 Plan.getVFxUF().replaceAllUsesWith(Step);
6131
6132 removeDeadRecipes(Plan);
6133 assert(none_of(*VectorLoop->getEntryBasicBlock(),
6135 "All VPVectorPointerRecipes should have been removed");
6136 return NewPlan;
6137}
6138
6139/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
6140/// BranchOnCond recipe.
///
/// The weights are derived from the vector step, computed below as the plan's
/// concrete UF times the known-minimum value of \p VF, additionally multiplied
/// by \p VScaleForTuning when \p VF is scalable and a tuning value is given.
/// Nothing is emitted when no middle-block terminator is found.
// NOTE(review): listing line 6141 (the start of the function signature) is
// absent from this extract; only the parameter list on 6142 is visible.
6142 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
6143 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
6144 auto *MiddleTerm =
// NOTE(review): listing line 6145 (the initializer of MiddleTerm) is absent
// from this extract. The null check below shows that it may yield nullptr.
6146 // Only add branch metadata if there is a (conditional) terminator.
6147 if (!MiddleTerm)
6148 return;
6149
6150 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
6151 "must have a BranchOnCond");
6152 // Assume that `TripCount % VectorStep` is equally distributed, which gives
// the weights 1 and VectorStep - 1 created below.
6153 unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
6154 if (VF.isScalable() && VScaleForTuning.has_value())
6155 VectorStep *= *VScaleForTuning;
// NOTE(review): the condition guards VectorStep (needed for the
// `VectorStep - 1` weight below), although the message mentions the trip count.
6156 assert(VectorStep > 0 && "trip count should not be zero");
6157 MDBuilder MDB(Plan.getContext());
6158 MDNode *BranchWeights =
6159 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
// Attach the weights to the terminator as !prof metadata.
6160 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
6161}
6162
/// Second phase of vectorizing first-order recurrences. For every
/// VPFirstOrderRecurrencePHIRecipe in the vector loop header, the exit-phi
/// (VPIRPhi) users of the middle block's last-lane extract of the recurrence
/// splice are redirected to a newly created ExtractPenultimateElement of the
/// splice's second operand.
// NOTE(review): listing line 6163 (the start of the function signature) is
// absent from this extract; only the trailing `VFRange &Range` parameter is
// visible. `Plan` is used below, so it is presumably a parameter as well.
6164 VFRange &Range) {
6165 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
6166 auto *MiddleVPBB = Plan.getMiddleBlock();
// New extracts are inserted at the first non-phi position of the middle block.
6167 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
6168
// Predicate that is true only for VF = vscale x 1.
// NOTE(review): its only use is presumably on the absent listing line 6195.
6169 auto IsScalableOne = [](ElementCount VF) -> bool {
6170 return VF == ElementCount::getScalable(1);
6171 };
6172
6173 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
6174 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
6175 if (!FOR)
6176 continue;
6177
6178 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
6179 "Cannot handle loops with uncountable early exits");
6180
6181 // Find the existing splice for this FOR, created in
6182 // createHeaderPhiRecipes. All uses of FOR have already been replaced with
6183 // RecurSplice there; only RecurSplice itself still references FOR.
6184 auto *RecurSplice =
// NOTE(review): listing line 6185 (the initializer of RecurSplice) is absent
// from this extract.
6186 assert(RecurSplice && "expected FirstOrderRecurrenceSplice");
6187
6188 // For VF vscale x 1, if vscale = 1, we are unable to extract the
6189 // penultimate value of the recurrence. Instead we rely on the existing
6190 // extract of the last element from the result of
6191 // VPInstruction::FirstOrderRecurrenceSplice.
6192 // TODO: Consider vscale_range info and UF.
// The first half of the condition checks whether any user of the splice has
// no enclosing region, i.e. lives outside the vector loop region.
6193 if (any_of(RecurSplice->users(),
6194 [](VPUser *U) { return !cast<VPRecipeBase>(U)->getRegion(); }) &&
// NOTE(review): listing line 6195 (the start of the second half of the
// condition, which takes `Range` as its last argument) is absent from this
// extract.
6196 Range))
// Note that this leaves the whole function, so any header phis after this one
// are not processed either.
6197 return;
6198
6199 // This is the second phase of vectorizing first-order recurrences, creating
6200 // extracts for users outside the loop. An overview of the transformation is
6201 // described below. Suppose we have the following loop with some use after
6202 // the loop of the last a[i-1],
6203 //
6204 // for (int i = 0; i < n; ++i) {
6205 // t = a[i - 1];
6206 // b[i] = a[i] - t;
6207 // }
6208 // use t;
6209 //
6210 // There is a first-order recurrence on "a". For this loop, the shorthand
6211 // scalar IR looks like:
6212 //
6213 // scalar.ph:
6214 // s.init = a[-1]
6215 // br scalar.body
6216 //
6217 // scalar.body:
6218 // i = phi [0, scalar.ph], [i+1, scalar.body]
6219 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
6220 // s2 = a[i]
6221 // b[i] = s2 - s1
6222 // br cond, scalar.body, exit.block
6223 //
6224 // exit.block:
6225 // use = lcssa.phi [s1, scalar.body]
6226 //
6227 // In this example, s1 is a recurrence because its value depends on the
6228 // previous iteration. In the first phase of vectorization, we created a
6229 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
6230 // for users in the scalar preheader and exit block.
6231 //
6232 // vector.ph:
6233 // v_init = vector(..., ..., ..., a[-1])
6234 // br vector.body
6235 //
6236 // vector.body
6237 // i = phi [0, vector.ph], [i+4, vector.body]
6238 // v1 = phi [v_init, vector.ph], [v2, vector.body]
6239 // v2 = a[i, i+1, i+2, i+3]
6240 // v1' = splice(v1(3), v2(0, 1, 2))
6241 // b[i, i+1, i+2, i+3] = v2 - v1'
6242 // br cond, vector.body, middle.block
6243 //
6244 // middle.block:
6245 // vector.recur.extract.for.phi = v2(2)
6246 // vector.recur.extract = v2(3)
6247 // br cond, scalar.ph, exit.block
6248 //
6249 // scalar.ph:
6250 // scalar.recur.init = phi [vector.recur.extract, middle.block],
6251 // [s.init, otherwise]
6252 // br scalar.body
6253 //
6254 // scalar.body:
6255 // i = phi [0, scalar.ph], [i+1, scalar.body]
6256 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
6257 // s2 = a[i]
6258 // b[i] = s2 - s1
6259 // br cond, scalar.body, exit.block
6260 //
6261 // exit.block:
6262 // lo = lcssa.phi [s1, scalar.body],
6263 // [vector.recur.extract.for.phi, middle.block]
6264 //
6265 // Update extracts of the splice in the middle block: they extract the
6266 // penultimate element of the recurrence.
// NOTE(review): listing line 6267 (the head of the loop over the middle
// block's non-phi recipes, which binds `R`) is absent from this extract.
6268 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
// Only last-lane-of-last-part extracts of this splice are of interest.
6269 if (!match(&R, m_ExtractLastLaneOfLastPart(m_Specific(RecurSplice))))
6270 continue;
6271
6272 auto *ExtractR = cast<VPInstruction>(&R);
// The penultimate element is taken from the splice's second operand.
6273 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
6274 VPInstruction::ExtractPenultimateElement, RecurSplice->getOperand(1),
6275 {}, "vector.recur.extract.for.phi");
// Iterate over a copy of the user list, because the uses are rewritten inside
// the loop. Only VPIRPhi users are redirected; other users keep ExtractR.
6276 for (VPUser *ExitU : to_vector(ExtractR->users())) {
6277 if (auto *ExitPhi = dyn_cast<VPIRPhi>(ExitU))
6278 ExitPhi->replaceUsesOfWith(ExtractR, PenultimateElement);
6279 }
6280 }
6281 }
6282}
6283
6284/// Check if \p V is a binary expression of a widened IV and a loop-invariant
6285/// value. Returns the widened IV if found, nullptr otherwise.
///
/// Only VPWidenRecipe binary operators are accepted; integer division and
/// remainder opcodes are rejected. The IV may be either operand, and the other
/// operand must be defined outside of loop regions.
// NOTE(review): listing line 6286 (the function signature) is absent from this
// extract. The call site in this file passes a single VPValue.
6287 auto *BinOp = dyn_cast<VPWidenRecipe>(V);
6288 if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()) ||
6289 Instruction::isIntDivRem(BinOp->getOpcode()))
6290 return nullptr;
6291
// Start by assuming the IV is operand 0 and swap the roles if it is not.
6292 VPValue *WidenIVCandidate = BinOp->getOperand(0);
6293 VPValue *InvariantCandidate = BinOp->getOperand(1);
6294 if (!isa<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate))
6295 std::swap(WidenIVCandidate, InvariantCandidate);
6296
6297 if (!InvariantCandidate->isDefinedOutsideLoopRegions())
6298 return nullptr;
6299
// dyn_cast yields nullptr when neither operand is a widened int/FP induction.
6300 return dyn_cast<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate);
6301}
6302
6303/// Create a scalar version of \p BinOp, with its \p WidenIV operand replaced
6304/// by \p ScalarIV, and place it after \p ScalarIV's defining recipe.
///
/// \p BinOp itself is left untouched; the modified clone is returned.
// NOTE(review): listing lines 6305-6307 (the function signature and the start
// of the assertion that ends on 6308) are absent from this extract.
6308 BinOp->getNumOperands() == 2 && "BinOp must have 2 operands");
6309 auto *ClonedOp = BinOp->clone();
// Replace whichever operand of the clone is the widened IV.
6310 if (ClonedOp->getOperand(0) == WidenIV) {
6311 ClonedOp->setOperand(0, ScalarIV);
6312 } else {
6313 assert(ClonedOp->getOperand(1) == WidenIV && "one operand must be WideIV");
6314 ClonedOp->setOperand(1, ScalarIV);
6315 }
6316 ClonedOp->insertAfter(ScalarIV->getDefiningRecipe());
6317 return ClonedOp;
6318}
6319
/// For each reduction phi in the vector loop header that passes the checks
/// below, rewrite a find-last style select reduction over an affine IV
/// expression into a min/max reduction of the IV, using a new
/// RecurKind::FindIV reduction phi. Whether the select condition was ever true
/// is detected either through a sentinel value outside the IV's range or, if
/// there is none, through an additional boolean Or reduction.
// NOTE(review): listing lines 6320-6321 (the start of the function signature)
// are absent from this extract; only the trailing `Loop &L` parameter is
// visible. `Plan` and `PSE` are used below, so they are presumably parameters.
6322 Loop &L) {
6323 ScalarEvolution &SE = *PSE.getSE();
6324 VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();
6325
6326 // Helper lambda to check if the IV range excludes the sentinel value. Try
6327 // signed first, then unsigned. Return an excluded sentinel if found,
6328 // otherwise return std::nullopt.
// The candidate sentinel is the minimum value of the type when UseMax is set
// and the maximum value otherwise. The signedness of the returned APSInt
// records which of the two ranges excluded it.
6329 auto CheckSentinel = [&SE](const SCEV *IVSCEV,
6330 bool UseMax) -> std::optional<APSInt> {
6331 unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
6332 for (bool Signed : {true, false}) {
6333 APSInt Sentinel = UseMax ? APSInt::getMinValue(BW, /*Unsigned=*/!Signed)
6334 : APSInt::getMaxValue(BW, /*Unsigned=*/!Signed);
6335
6336 ConstantRange IVRange =
6337 Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
6338 if (!IVRange.contains(Sentinel))
6339 return Sentinel;
6340 }
6341 return std::nullopt;
6342 };
6343
6344 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
// Early-increment iteration, because the current phi is erased at the end of
// a successful rewrite.
6345 for (VPRecipeBase &Phi :
6346 make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
6347 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
// NOTE(review): listing line 6348 (the start of the condition that filters on
// PhiR and its recurrence kind) is absent from this extract.
6349 PhiR->getRecurrenceKind()))
6350 continue;
6351
// Pointer and floating-point reductions are not handled.
6352 Type *PhiTy = PhiR->getScalarType();
6353 if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
6354 continue;
6355
6356 // If there's a header mask, the backedge select will not be the find-last
6357 // select.
// In that case the backedge value must be select(HeaderMask, X, PhiR), and
// the match rebinds FindLastSelect to X.
6358 VPValue *BackedgeVal = PhiR->getBackedgeValue();
6359 auto *FindLastSelect = cast<VPSingleDefRecipe>(BackedgeVal);
6360 if (HeaderMask &&
6361 !match(BackedgeVal,
6362 m_Select(m_Specific(HeaderMask),
6363 m_VPSingleDefRecipe(FindLastSelect), m_Specific(PhiR))))
6364 continue;
6365
6366 // Get the find-last expression from the find-last select of the reduction
6367 // phi. The find-last select should be a select between the phi and the
6368 // find-last expression.
// The phi may be either the true or the false operand.
6369 VPValue *Cond, *FindLastExpression;
6370 if (!match(FindLastSelect, m_SelectLike(m_VPValue(Cond), m_Specific(PhiR),
6371 m_VPValue(FindLastExpression))) &&
6372 !match(FindLastSelect,
6373 m_SelectLike(m_VPValue(Cond), m_VPValue(FindLastExpression),
6374 m_Specific(PhiR))))
6375 continue;
6376
6377 // Check if FindLastExpression is a simple expression of a widened IV. If
6378 // so, we can track the underlying IV instead and sink the expression.
6379 auto *IVOfExpressionToSink = getExpressionIV(FindLastExpression);
6380 const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(
6381 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression, PSE,
6382 &L);
// Only affine add-recurrences are handled; Step receives the step of the
// recurrence.
6383 const SCEV *Step;
6384 if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
6385 assert(!match(vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L),
// NOTE(review): listing line 6386 (the pattern argument of the match inside
// this assertion) is absent from this extract.
6387 "IVOfExpressionToSink not being an AddRec must imply "
6388 "FindLastExpression not being an AddRec.");
6389 continue;
6390 }
6391
6392 // Determine direction from SCEV step.
6393 if (!SE.isKnownNonZero(Step))
6394 continue;
6395
6396 // Positive step means we need UMax/SMax to find the last IV value, and
6397 // UMin/SMin otherwise.
6398 bool UseMax = SE.isKnownPositive(Step);
6399 std::optional<APSInt> SentinelVal = CheckSentinel(IVSCEV, UseMax);
6400 bool UseSigned = SentinelVal && SentinelVal->isSigned();
6401
6402 // Sinking an expression will disable epilogue vectorization. Only use it,
6403 // if FindLastExpression cannot be vectorized via a sentinel. Sinking may
6404 // also prevent vectorizing using a sentinel (e.g., if the expression is a
6405 // multiply or divide by large constant, respectively), which also makes
6406 // sinking undesirable.
6407 if (IVOfExpressionToSink) {
6408 const SCEV *FindLastExpressionSCEV =
6409 vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L);
// Note that this match overwrites Step with the step of the full expression.
6410 if (match(FindLastExpressionSCEV,
6411 m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
6412 bool NewUseMax = SE.isKnownPositive(Step);
6413 if (auto NewSentinel =
6414 CheckSentinel(FindLastExpressionSCEV, NewUseMax)) {
6415 // The original expression already has a sentinel, so prefer not
6416 // sinking to keep epilogue vectorization possible.
6417 SentinelVal = *NewSentinel;
6418 UseSigned = NewSentinel->isSigned();
6419 UseMax = NewUseMax;
6420 IVSCEV = FindLastExpressionSCEV;
6421 IVOfExpressionToSink = nullptr;
6422 }
6423 }
6424 }
6425
6426 // If no sentinel was found, fall back to a boolean AnyOf reduction to track
6427 // if the condition was ever true. Requires the IV to not wrap, otherwise we
6428 // cannot use min/max.
6429 if (!SentinelVal) {
6430 auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
6431 if (AR->hasNoSignedWrap())
6432 UseSigned = true;
6433 else if (AR->hasNoUnsignedWrap())
6434 UseSigned = false;
6435 else
6436 continue;
6437 }
6438
// NOTE(review): listing lines 6439 and 6441 are absent from this extract.
// `RdxResult`, which is used further down, is presumably declared by the
// statement that these lines and 6440 belong to.
6440 BackedgeVal,
6442
6443 VPValue *NewFindLastSelect = BackedgeVal;
6444 VPValue *SelectCond = Cond;
6445 if (!SentinelVal || IVOfExpressionToSink) {
6446 // When we need to create a new select, normalize the condition so that
6447 // PhiR is the last operand and include the header mask if needed.
6448 DebugLoc DL = FindLastSelect->getDefiningRecipe()->getDebugLoc();
6449 VPBuilder LoopBuilder(FindLastSelect->getDefiningRecipe());
// If PhiR is the true operand of the original select, invert the condition.
6450 if (FindLastSelect->getDefiningRecipe()->getOperand(1) == PhiR)
6451 SelectCond = LoopBuilder.createNot(SelectCond);
6452
6453 // When tail folding, mask the condition with the header mask to prevent
6454 // propagating poison from inactive lanes in the last vector iteration.
6455 if (HeaderMask)
6456 SelectCond = LoopBuilder.createLogicalAnd(HeaderMask, SelectCond);
6457
// A new select is only needed if the condition changed or the IV replaces the
// original expression as the selected value.
6458 if (SelectCond != Cond || IVOfExpressionToSink) {
6459 NewFindLastSelect = LoopBuilder.createSelect(
6460 SelectCond,
6461 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression,
6462 PhiR, DL);
6463 }
6464 }
6465
6466 // Create the reduction result in the middle block using sentinel directly.
6467 RecurKind MinMaxKind =
6468 UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
6469 : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
6470 VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
6471 FastMathFlags());
6472 DebugLoc ExitDL = RdxResult->getDebugLoc();
// New middle-block recipes are inserted before the old reduction result.
6473 VPBuilder MiddleBuilder(RdxResult);
6474 VPValue *ReducedIV =
// NOTE(review): listing line 6475 (the start of the builder call that creates
// ReducedIV) is absent from this extract.
6476 NewFindLastSelect, Flags, ExitDL);
6477
6478 // If IVOfExpressionToSink is an expression to sink, sink it now.
// The clone of the binary op is fed the reduced IV and placed after the
// recipe defining it.
6479 VPValue *VectorRegionExitingVal = ReducedIV;
6480 if (IVOfExpressionToSink)
6481 VectorRegionExitingVal =
6482 cloneBinOpForScalarIV(cast<VPWidenRecipe>(FindLastExpression),
6483 ReducedIV, IVOfExpressionToSink);
6484
6485 VPValue *NewRdxResult;
6486 VPValue *StartVPV = PhiR->getStartValue();
6487 if (SentinelVal) {
6488 // Sentinel-based approach: reduce IVs with min/max, compare against
6489 // sentinel to detect if condition was ever true, select accordingly.
6490 VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
6491 auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
6492 Sentinel, ExitDL);
6493 NewRdxResult = MiddleBuilder.createSelect(Cmp, VectorRegionExitingVal,
6494 StartVPV, ExitDL);
// The new phi starts at the sentinel; the original start value is only used
// by the select above.
6495 StartVPV = Sentinel;
6496 } else {
6497 // Introduce a boolean AnyOf reduction to track if the condition was ever
6498 // true in the loop. Use it to select the initial start value, if it was
6499 // never true.
6500 auto *AnyOfPhi = new VPReductionPHIRecipe(
6501 /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
6502 RdxUnordered{1}, {}, /*HasUsesOutsideReductionChain=*/false);
6503 AnyOfPhi->insertAfter(PhiR);
6504
// The phi was created with a placeholder as its second operand; it is
// replaced with the real Or once that has been built.
6505 VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
6506 VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, SelectCond);
6507 AnyOfPhi->setOperand(1, OrVal);
6508
6509 NewRdxResult = MiddleBuilder.createAnyOfReduction(
6510 OrVal, VectorRegionExitingVal, StartVPV, ExitDL);
6511
6512 // Initialize the IV reduction phi with the neutral element, not the
6513 // original start value, to ensure correct min/max reduction results.
6514 StartVPV = Plan.getOrAddLiveIn(
6515 getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
6516 }
6517 RdxResult->replaceAllUsesWith(NewRdxResult);
6518 RdxResult->eraseFromParent();
6519
// Replace the original phi with a FindIV reduction phi that uses the new
// start value and the (possibly new) find-last select as backedge value.
6520 auto *NewPhiR = new VPReductionPHIRecipe(
6521 cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
6522 *NewFindLastSelect, RdxUnordered{1}, {},
6523 PhiR->hasUsesOutsideReductionChain());
6524 NewPhiR->insertBefore(PhiR);
6525 PhiR->replaceAllUsesWith(NewPhiR);
6526 PhiR->eraseFromParent();
6527 }
6528}
6529
6530namespace {
6531
// Shorthand for the extend kind used by the partial reduction cost queries.
6532using ExtendKind = TTI::PartialReductionExtendKind;
/// Describes a single extend feeding a partial reduction: the scalar type
/// before the extend and the kind of extend. The defaults (no type, PR_None)
/// describe an absent extend.
6533struct ReductionExtend {
// Type of the value before it is extended.
6534 Type *SrcType = nullptr;
// Kind of the extend; PR_None if there is no extend.
6535 ExtendKind Kind = ExtendKind::PR_None;
6536};
6537
6538/// Describes the extends used to compute the extended reduction operand.
6539/// ExtendB is optional. If ExtendB is present, ExtendsUser is a binary
6540/// operation.
///
/// Note that this struct is aggregate-initialized positionally in this file,
/// so the member order matters.
6541struct ExtendedReductionOperand {
6542 /// The recipe that consumes the extends.
6543 VPWidenRecipe *ExtendsUser = nullptr;
6544 /// Extend descriptions (inputs to getPartialReductionCost).
/// An absent ExtendB keeps its default kind, PR_None.
6545 ReductionExtend ExtendA, ExtendB;
6546};
6547
6548/// A chain of recipes that form a partial reduction. Matches either
6549/// reduction_bin_op (extended op, accumulator), or
6550/// reduction_bin_op (accumulator, extended op).
6551/// The possible forms of the "extended op" are listed in
6552/// matchExtendedReductionOperand.
6553struct VPPartialReductionChain {
6554 /// The top-level binary operation that forms the reduction to a scalar
6555 /// after the loop body.
6556 VPWidenRecipe *ReductionBinOp = nullptr;
6557 /// The user of the extends that is then reduced.
6558 ExtendedReductionOperand ExtendedOp;
6559 /// The recurrence kind for the entire partial reduction chain.
6560 /// This allows distinguishing between Sub and AddWithSub recurrences,
6561 /// when the ReductionBinOp is a Instruction::Sub.
// NOTE(review): RK, AccumulatorOpIdx and ScaleFactor have no default member
// initializers, unlike the pointer members of this struct.
6562 RecurKind RK;
6563 /// The index of the accumulator operand of ReductionBinOp. The index of the
6564 /// extended op is `1 - AccumulatorOpIdx`.
6565 unsigned AccumulatorOpIdx;
/// The VF scale factor of the chain. transformToPartialReduction passes it
/// as the VFScaleFactor of the created reduction recipe and sets it on the
/// reduction phi.
6566 unsigned ScaleFactor;
6567 /// Optional blend to represent predication for the block that updates the
6568 /// reduction.
6569 VPBlendRecipe *Blend = nullptr;
6570};
6571
/// Rewrites the extended operand \p Op of a partial reduction into a form
/// whose extends can be folded into the reduction. Three patterns are tried
/// in order (see the comments below); if none applies, \p Op is returned
/// unchanged. The returned recipe is the operand to use from now on: it is
/// \p Op itself (possibly with a replaced operand) or a newly created recipe.
6572static VPSingleDefRecipe *
6573optimizeExtendsForPartialReduction(VPSingleDefRecipe *Op) {
6574 // reduce.add(mul(ext(A), C))
6575 // -> reduce.add(mul(ext(A), ext(trunc(C))))
6576 const APInt *Const;
6577 if (match(Op, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(Const)))) {
6578 auto *ExtA = cast<VPWidenCastRecipe>(Op->getOperand(0));
6579 Instruction::CastOps ExtOpc = ExtA->getOpcode();
6580 Type *NarrowTy = ExtA->getOperand(0)->getScalarType();
6581 if (!Op->hasOneUse() ||
// NOTE(review): listing line 6582 (the start of the second half of this
// condition, a call taking the constant, the narrow type and the extend kind)
// is absent from this extract.
6583 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
6584 return Op;
6585
// Replace the constant operand in place with ext(trunc(C)), using the same
// extend opcode and types as the extend of A.
6586 VPBuilder Builder(Op);
6587 auto *Trunc = Builder.createWidenCast(Instruction::CastOps::Trunc,
6588 Op->getOperand(1), NarrowTy);
6589 Type *WideTy = ExtA->getScalarType();
6590 Op->setOperand(1, Builder.createWidenCast(ExtOpc, Trunc, WideTy));
6591 return Op;
6592 }
6593
6594 // reduce.add(abs(sub(ext(A), ext(B))))
6595 // -> reduce.add(ext(absolute-difference(A, B)))
6596 VPValue *X, *Y;
// NOTE(review): listing lines 6597-6598 (the `if` whose match binds X and Y)
// are absent from this extract.
6599 auto *Sub = Op->getOperand(0)->getDefiningRecipe();
6600 auto *Ext = cast<VPWidenCastRecipe>(Sub->getOperand(0));
6601 assert(Ext->getOpcode() ==
6602 cast<VPWidenCastRecipe>(Sub->getOperand(1))->getOpcode() &&
6603 "Expected both the LHS and RHS extends to be the same");
6604 bool IsSigned = Ext->getOpcode() == Instruction::SExt;
6605 VPBuilder Builder(Op);
6606 Type *SrcTy = X->getScalarType();
// The absolute difference is computed in the narrow source type as
// max(X, Y) - min(X, Y) on frozen copies of the inputs, with signed or
// unsigned min/max chosen to match the extend, and then zero-extended to the
// type of Op.
6607 auto *FreezeX = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {X}));
6608 auto *FreezeY = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {Y}));
6609 auto *Max = Builder.insert(
6610 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smax : Intrinsic::umax,
6611 {FreezeX, FreezeY}, SrcTy));
6612 auto *Min = Builder.insert(
6613 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smin : Intrinsic::umin,
6614 {FreezeX, FreezeY}, SrcTy));
6615 auto *AbsDiff =
6616 Builder.insert(new VPWidenRecipe(Instruction::Sub, {Max, Min}));
// Op is neither erased nor are its uses replaced here; the new cast is only
// returned.
6617 return Builder.createWidenCast(Instruction::CastOps::ZExt, AbsDiff,
6618 Op->getScalarType());
6619 }
6620
6621 // reduce.add(ext(mul(ext(A), ext(B))))
6622 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
6623 // TODO: Support this optimization for float types.
// NOTE(review): listing line 6624 (the start of the `if` with the match for
// this pattern) is absent from this extract.
6625 m_ZExtOrSExt(m_VPValue()))))) {
6626 auto *Ext = cast<VPWidenCastRecipe>(Op);
6627 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
6628 auto *MulLHS = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6629 auto *MulRHS = cast<VPWidenCastRecipe>(Mul->getOperand(1));
// Bail out if the mul has other users, if the inner extends differ from each
// other, or if the outer extend differs from the inner one (the latter is
// tolerated when both mul operands are the same recipe).
6630 if (!Mul->hasOneUse() ||
6631 (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
6632 MulLHS->getOpcode() != MulRHS->getOpcode())
6633 return Op;
6634 VPBuilder Builder(Mul);
// Re-create the inner extends so that they extend directly to the outer
// extend's type, sharing a single cast if both operands were the same recipe.
6635 auto *NewLHS = Builder.createWidenCast(
6636 MulLHS->getOpcode(), MulLHS->getOperand(0), Ext->getScalarType());
6637 auto *NewRHS = MulLHS == MulRHS
6638 ? NewLHS
6639 : Builder.createWidenCast(MulRHS->getOpcode(),
6640 MulRHS->getOperand(0),
6641 Ext->getScalarType());
6642 auto *NewMul = Mul->cloneWithOperands({NewLHS, NewRHS});
6643 Builder.insert(NewMul);
// The outer extend and the old mul are replaced by the new, wider mul.
6644 Op->replaceAllUsesWith(NewMul);
6645 Op->eraseFromParent();
6646 Mul->eraseFromParent();
6647 return NewMul;
6648 }
6649
6650 return Op;
6651}
6652
/// Bundles the reduction \p Red together with the recipes computing its
/// vector operand (extends, optional mul, optional negation) into a newly
/// allocated VPExpressionRecipe, which is returned without being inserted.
/// The vector operand must have one of the forms listed below; any other form
/// hits llvm_unreachable.
6653static VPExpressionRecipe *
6654createPartialReductionExpression(VPReductionRecipe *Red) {
6655 VPValue *VecOp = Red->getVecOp();
6656
6657 // reduce.[f]add(ext(op))
6658 // -> VPExpressionRecipe(op, red)
6659 if (match(VecOp, m_WidenAnyExtend(m_VPValue())))
6660 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
6661
6662 // reduce.[f]add(neg(ext(op)))
6663 // -> VPExpressionRecipe(op, sub/neg, red)
6664 if (match(VecOp, m_AnyNeg(m_WidenAnyExtend(m_VPValue())))) {
6665 auto *Neg = cast<VPWidenRecipe>(VecOp);
// The negated value is the last operand of the negation recipe.
6666 auto *Ext =
6667 cast<VPWidenCastRecipe>(Neg->getOperand(Neg->getNumOperands() - 1));
6668 return new VPExpressionRecipe(Ext, Neg, Red);
6669 }
6670
6671 // reduce.[f]add([f]mul(ext(a), ext(b)))
6672 // -> VPExpressionRecipe(a, b, mul, red)
6673 if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue()))) ||
6674 match(VecOp,
// NOTE(review): listing line 6675 (the pattern of the second match, presumably
// the integer mul counterpart) is absent from this extract.
6676 auto *Mul = cast<VPWidenRecipe>(VecOp);
6677 auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6678 auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
6679 return new VPExpressionRecipe(ExtA, ExtB, Mul, Red);
6680 }
6681
6682 // reduce.fadd(fneg(fmul(fpext(a), fpext(b))))
6683 // -> VPExpressionRecipe(a, b, fmul, fsub, red)
6684 if (match(VecOp,
// NOTE(review): listing line 6685 (the pattern of this match) is absent from
// this extract.
6686 auto *FNeg = cast<VPWidenRecipe>(VecOp);
6687 auto *FMul = cast<VPWidenRecipe>(FNeg->getOperand(0));
6688 auto *ExtA = cast<VPWidenCastRecipe>(FMul->getOperand(0));
6689 auto *ExtB = cast<VPWidenCastRecipe>(FMul->getOperand(1));
6690 return new VPExpressionRecipe(ExtA, ExtB, FMul, FNeg, Red);
6691 }
6692
6693 // reduce.add(neg(mul(ext(a), ext(b))))
6694 // -> VPExpressionRecipe(a, b, mul, sub, red)
// NOTE(review): listing line 6695 (the start of the `if` with the match for
// this pattern) is absent from this extract.
6696 m_ZExtOrSExt(m_VPValue()))))) {
6697 auto *Sub = cast<VPWidenRecipe>(VecOp);
// The mul is operand 1 of the sub that implements the negation.
6698 auto *Mul = cast<VPWidenRecipe>(Sub->getOperand(1));
6699 auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6700 auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
6701 return new VPExpressionRecipe(ExtA, ExtB, Mul, Sub, Red);
6702 }
6703
6704 llvm_unreachable("Unsupported expression");
6705}
6706
6707// Helper to transform a partial reduction chain into a partial reduction
6708// recipe. Assumes profitability has been checked.
//
// Replaces the chain's reduction binary op (and the select or blend wrapping
// it, if any) with a VPReductionRecipe scaled by Chain.ScaleFactor, wrapped
// in a VPExpressionRecipe. When the link is the last one in the chain of
// RdxPhi, the phi and its start value are updated as well.
6709static void transformToPartialReduction(const VPPartialReductionChain &Chain,
6710 VPlan &Plan,
6711 VPReductionPHIRecipe *RdxPhi) {
6712 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
6713 assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
6714
// The binary op has exactly two operands: the accumulator and the extended
// operand.
6715 VPValue *Accumulator = WidenRecipe->getOperand(Chain.AccumulatorOpIdx);
6716 auto *ExtendedOp = cast<VPSingleDefRecipe>(
6717 WidenRecipe->getOperand(1 - Chain.AccumulatorOpIdx));
6718
6719 // FIXME: Do these transforms before invoking the cost-model.
6720 ExtendedOp = optimizeExtendsForPartialReduction(ExtendedOp);
6721
6722 // Sub-reductions can be implemented in two ways:
6723 // (1) negate the operand in the vector loop (the default way).
6724 // (2) subtract the reduced value from the init value in the middle block.
6725 // Both ways keep the reduction itself as an 'add' reduction.
6726 //
6727 // The ISD nodes for partial reductions don't support folding the
6728 // sub/negation into its operands because the following is not a valid
6729 // transformation:
6730 // sub(0, mul(ext(a), ext(b)))
6731 // -> mul(ext(a), ext(sub(0, b)))
6732 //
6733 // It's therefore better to choose option (2) such that the partial
6734 // reduction is always positive (starting at '0') and to do a final
6735 // subtract in the middle block.
// The block below implements option (1): it applies when the binary op is a
// sub/fsub but the chain's recurrence kind is not Sub/FSub. Option (2) is
// handled at the end of this function.
6736 if ((WidenRecipe->getOpcode() == Instruction::Sub &&
6737 Chain.RK != RecurKind::Sub) ||
6738 (WidenRecipe->getOpcode() == Instruction::FSub &&
6739 Chain.RK != RecurKind::FSub)) {
6740 VPBuilder Builder(WidenRecipe);
6741 Type *ElemTy = ExtendedOp->getScalarType();
6742 VPWidenRecipe *NegRecipe;
6743 if (WidenRecipe->getOpcode() == Instruction::FSub) {
6744 NegRecipe =
6745 new VPWidenRecipe(Instruction::FNeg, {ExtendedOp}, VPIRFlags(),
// NOTE(review): listing line 6746 (the remaining constructor arguments) is
// absent from this extract.
6747 } else {
6748 auto *Zero = Plan.getZero(ElemTy);
6749 NegRecipe =
6750 new VPWidenRecipe(Instruction::Sub, {Zero, ExtendedOp}, VPIRFlags(),
// NOTE(review): listing line 6751 (the remaining constructor arguments) is
// absent from this extract.
6752 }
6753 Builder.insert(NegRecipe);
6754 ExtendedOp = NegRecipe;
6755 }
6756
6757 // Check if WidenRecipe is the final result of the reduction. If so, look
6758 // through the Select recipe introduced by tail-folding, otherwise look
6759 // through any Blend recipe introduced by predication for the block.
6760 VPValue *ExitSearch =
6761 Chain.Blend ? cast<VPValue>(Chain.Blend) : cast<VPValue>(WidenRecipe);
6762
6763 VPValue *Cond = nullptr;
// NOTE(review): listing line 6764 (the declaration of `ExitValue`, initialized
// from the findUserOf call below) is absent from this extract.
6765 findUserOf(ExitSearch, m_Select(m_VPValue(Cond), m_Specific(ExitSearch),
6766 m_Specific(RdxPhi))));
6767
// With a blend, the reduction is predicated on the blend's first mask,
// combined with the select condition if a select was found as well.
6768 if (Chain.Blend) {
6769 VPValue *BlendCond = Chain.Blend->getMask(0);
6770 Cond = ExitValue ? VPBuilder(WidenRecipe)
6771 .createLogicalAnd(Cond, BlendCond,
6772 WidenRecipe->getDebugLoc())
6773 : BlendCond;
6774 }
6775
// This link is the last one in the chain if the phi's backedge value is the
// binary op itself, the select found above, or the blend.
6776 bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
6777 RdxPhi->getBackedgeValue() == ExitValue ||
6778 RdxPhi->getBackedgeValue() == Chain.Blend;
6779 assert((!ExitValue || IsLastInChain) &&
6780 "if we found ExitValue, it must match RdxPhi's backedge value");
6781
6782 Type *PhiType = RdxPhi->getScalarType();
6783 RecurKind RdxKind =
// NOTE(review): listing line 6784 (the initializer of RdxKind) is absent from
// this extract; PhiType is presumably consumed there.
6785 auto *PartialRed = new VPReductionRecipe(
6786 RdxKind,
// Fast-math flags are only carried over for FAdd reductions.
6787 RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlagsOrNone()
6788 : FastMathFlags(),
6789 WidenRecipe->getUnderlyingInstr(), Accumulator, ExtendedOp, Cond,
6790 RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
6791 PartialRed->insertBefore(WidenRecipe);
6792
// Redirect all users of the select, the blend and the binary op to the new
// reduction.
6793 if (ExitValue)
6794 ExitValue->replaceAllUsesWith(PartialRed);
6795 if (Chain.Blend)
6796 Chain.Blend->replaceAllUsesWith(PartialRed);
6797 WidenRecipe->replaceAllUsesWith(PartialRed);
6798
6799 // For cost-model purposes, fold this into a VPExpression.
6800 VPExpressionRecipe *E = createPartialReductionExpression(PartialRed);
6801 E->insertBefore(WidenRecipe);
6802 PartialRed->replaceAllUsesWith(E);
6803
6804 // We only need to update the PHI node once, which is when we find the
6805 // last reduction in the chain.
6806 if (!IsLastInChain)
6807 return;
6808
6809 // Scale the PHI and ReductionStartVector by the VFScaleFactor
6810 assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
6811 RdxPhi->setVFScaleFactor(Chain.ScaleFactor);
6812
6813 auto *StartInst = cast<VPInstruction>(RdxPhi->getStartValue());
6814 assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
// Operand 2 of the start vector is replaced with the scale factor as a 32-bit
// constant.
6815 auto *NewScaleFactor = Plan.getConstantInt(32, Chain.ScaleFactor);
6816 StartInst->setOperand(2, NewScaleFactor);
6817
6818 // If this is the last value in a sub-reduction chain, then update the PHI
6819 // node to start at `0` and update the reduction-result to subtract from
6820 // the PHI's start value.
6821 if (Chain.RK != RecurKind::Sub && Chain.RK != RecurKind::FSub)
6822 return;
6823
// Operand 0 of the start vector is overwritten with its operand 1.
// NOTE(review): per the comment above, operand 1 is presumably the '0'
// (identity) value; confirm against ReductionStartVector's definition.
6824 VPValue *OldStartValue = StartInst->getOperand(0);
6825 StartInst->setOperand(0, StartInst->getOperand(1));
6826
6827 // Replace reduction_result by 'sub (startval, reductionresult)'.
// NOTE(review): listing line 6828 (the declaration of `RdxResult`) is absent
// from this extract.
6829 assert(RdxResult && "Could not find reduction result");
6830
6831 VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult);
6832 unsigned SubOpc = Chain.RK == RecurKind::FSub ? Instruction::BinaryOps::FSub
6833 : Instruction::BinaryOps::Sub;
6834 VPInstruction *NewResult = Builder.createNaryOp(
6835 SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc),
6836 RdxPhi->getDebugLoc());
// Redirect every user of the old result except the new subtraction itself,
// which must keep using it as an operand.
6837 RdxResult->replaceUsesWithIf(
6838 NewResult,
6839 [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
6840}
6841
6842/// Returns the cost of a link in a partial-reduction chain for a given VF.
///
/// The cost is obtained from TTI's getPartialReductionCost using the
/// reduction type, the source types and kinds of the link's extends, the
/// opcode of the binary op consuming the extends (if there are two extends)
/// and, for floating-point reductions, the fast-math flags of the reduction
/// op.
6843static InstructionCost
6844getPartialReductionLinkCost(VPCostContext &CostCtx,
6845 const VPPartialReductionChain &Link,
6846 ElementCount VF) {
6847 Type *RdxType = Link.ReductionBinOp->getScalarType();
6848 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6849 std::optional<unsigned> BinOpc = std::nullopt;
6850 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6851 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6852 BinOpc = ExtendedOp.ExtendsUser->getOpcode();
6853
// Fast-math flags are only passed on for floating-point reductions.
6854 std::optional<llvm::FastMathFlags> Flags;
6855 if (RdxType->isFloatingPointTy())
6856 Flags = Link.ReductionBinOp->getFastMathFlagsOrNone();
6857
// Sub and FSub recurrences are costed as Add and FAdd respectively; all other
// kinds use the opcode of the reduction binary op.
6858 auto GetLinkOpcode = [&Link]() -> unsigned {
6859 switch (Link.RK) {
6860 case RecurKind::Sub:
6861 return Instruction::Add;
6862 case RecurKind::FSub:
6863 return Instruction::FAdd;
6864 default:
6865 return Link.ReductionBinOp->getOpcode();
6866 }
6867 };
6868
6869 return CostCtx.TTI.getPartialReductionCost(
6870 GetLinkOpcode(), ExtendedOp.ExtendA.SrcType, ExtendedOp.ExtendB.SrcType,
6871 RdxType, VF, ExtendedOp.ExtendA.Kind, ExtendedOp.ExtendB.Kind, BinOpc,
6872 CostCtx.CostKind, Flags);
6873}
6874
/// Returns the partial-reduction extend kind for the cast recipe \p Cast.
6875static ExtendKind getPartialReductionExtendKind(VPWidenCastRecipe *Cast) {
// NOTE(review): listing line 6876 (the body's only statement) is absent from
// this extract. It presumably maps Cast's opcode to an extend kind via TTI,
// like the TTI::getPartialReductionExtendKind(ExtOpc) call used elsewhere in
// this file; confirm against the original source.
6877}
6878
6879/// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
6880/// operand. This is an operand where the source of the value (e.g. a load) has
6881/// been extended (sext, zext, or fpext) before it is used in the reduction.
6882///
6883/// Possible forms matched by this function:
6884/// - UpdateR(PrevValue, ext(...))
6885/// - UpdateR(PrevValue, mul(ext(...), ext(...)))
6886/// - UpdateR(PrevValue, mul(ext(...), Constant))
6887/// - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
6888/// - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
6889/// - UpdateR(PrevValue, abs(sub(ext(...), ext(...))))
6890///
6891/// Note: The second operand of UpdateR corresponds to \p Op in the examples.
///
/// Returns a description of the extends and of the recipe consuming them, or
/// std::nullopt if \p Op has none of the forms above.
6892static std::optional<ExtendedReductionOperand>
6893matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op) {
6894 assert(is_contained(UpdateR->operands(), Op) &&
6895 "Op should be operand of UpdateR");
6896
6897 // Try matching an absolute difference operand of the form
6898 // `abs(sub(ext(A), ext(B)))`. This will be later transformed into
6899 // `ext(absolute-difference(A, B))`. This allows us to perform the absolute
6900 // difference on a wider type and get the extend for "free" from the partial
6901 // reduction.
6902 VPValue *X, *Y;
6903 if (Op->hasOneUse() &&
// NOTE(review): listing lines 6904-6906 (the rest of the condition, whose
// match binds X and Y) are absent from this extract.
6907 auto *Abs = cast<VPWidenIntrinsicRecipe>(Op);
6908 auto *Sub = cast<VPWidenRecipe>(Abs->getOperand(0));
6909 auto *LHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(0));
6910 auto *RHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(1));
6911 Type *LHSInputType = X->getScalarType();
6912 Type *RHSInputType = Y->getScalarType();
// Both inputs must have the same type and be extended in the same way.
6913 if (LHSInputType != RHSInputType ||
6914 LHSExt->getOpcode() != RHSExt->getOpcode())
6915 return std::nullopt;
6916 // Note: This is essentially the same as matching ext(...) as we will
6917 // rewrite this operand to ext(absolute-difference(A, B)).
6918 return ExtendedReductionOperand{
6919 Sub,
6920 /*ExtendA=*/{LHSInputType, getPartialReductionExtendKind(LHSExt)},
6921 /*ExtendB=*/{}};
6922 }
6923
6924 std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
// NOTE(review): listing line 6925 (the `if` checking that Op is an extend) is
// absent from this extract.
6926 auto *CastRecipe = cast<VPWidenCastRecipe>(Op);
6927 VPValue *CastSource = CastRecipe->getOperand(0);
6928 OuterExtKind = getPartialReductionExtendKind(CastRecipe);
6929 if (match(CastSource, m_Mul(m_VPValue(), m_VPValue())) ||
6930 match(CastSource, m_FMul(m_VPValue(), m_VPValue()))) {
6931 // Match: ext(mul(...))
6932 // Record the outer extend kind and set `Op` to the mul. We can then match
6933 // this as a binary operation. Note: We can optimize out the outer extend
6934 // by widening the inner extends to match it. See
6935 // optimizeExtendsForPartialReduction.
6936 Op = CastSource;
6937 } else {
// Plain ext(...): UpdateR itself is recorded as the user of the extend.
6938 return ExtendedReductionOperand{
6939 UpdateR,
6940 /*ExtendA=*/{CastSource->getScalarType(), *OuterExtKind},
6941 /*ExtendB=*/{}};
6942 }
6943 }
6944
6945 if (!Op->hasOneUse())
6946 return std::nullopt;
6947
// NOTE(review): listing line 6948 (the declaration of `MulOp`, presumably a
// dyn_cast of Op) is absent from this extract.
6949 if (!MulOp ||
6950 !is_contained({Instruction::Mul, Instruction::FMul}, MulOp->getOpcode()))
6951 return std::nullopt;
6952
6953 // The rest of the matching assumes `Op` is a (possibly extended) mul
6954 // operation.
6955
6956 VPValue *LHS = MulOp->getOperand(0);
6957 VPValue *RHS = MulOp->getOperand(1);
6958
6959 // The LHS of the operation must always be an extend.
// NOTE(review): listing line 6960 (the `if` performing this check) is absent
// from this extract.
6961 return std::nullopt;
6962
6963 auto *LHSCast = cast<VPWidenCastRecipe>(LHS);
6964 Type *LHSInputType = LHSCast->getOperand(0)->getScalarType();
6965 ExtendKind LHSExtendKind = getPartialReductionExtendKind(LHSCast);
6966
6967 // The RHS of the operation can be an extend or a constant integer.
6968 const APInt *RHSConst = nullptr;
6969 VPWidenCastRecipe *RHSCast = nullptr;
// NOTE(review): listing line 6970 (the `if` checking that RHS is an extend) is
// absent from this extract.
6971 RHSCast = cast<VPWidenCastRecipe>(RHS);
// A constant RHS is only accepted if it can be extended from the LHS input
// type using the LHS extend kind.
6972 else if (!match(RHS, m_APInt(RHSConst)) ||
6973 !canConstantBeExtended(RHSConst, LHSInputType, LHSExtendKind))
6974 return std::nullopt;
6975
6976 // The outer extend kind must match the inner extends for folding.
// RHSCast is null for a constant RHS and is skipped in that case.
6977 for (VPWidenCastRecipe *Cast : {LHSCast, RHSCast})
6978 if (Cast && OuterExtKind &&
6979 getPartialReductionExtendKind(Cast) != OuterExtKind)
6980 return std::nullopt;
6981
// A constant RHS is described as if it were extended like the LHS.
6982 Type *RHSInputType = LHSInputType;
6983 ExtendKind RHSExtendKind = LHSExtendKind;
6984 if (RHSCast) {
6985 RHSInputType = RHSCast->getOperand(0)->getScalarType();
6986 RHSExtendKind = getPartialReductionExtendKind(RHSCast);
6987 }
6988
6989 return ExtendedReductionOperand{
6990 MulOp, {LHSInputType, LHSExtendKind}, {RHSInputType, RHSExtendKind}};
6991}
6992
6993/// Examines each operation in the reduction chain corresponding to \p RedPhiR,
6994/// and determines if the target can use a cheaper operation with a wider
6995/// per-iteration input VF and narrower PHI VF. If successful, returns the chain
6996/// of operations in the reduction.
6997static std::optional<SmallVector<VPPartialReductionChain>>
6998getScaledReductions(VPReductionPHIRecipe *RedPhiR) {
6999 // Get the backedge value from the reduction PHI and find the
7000 // ComputeReductionResult that uses it (directly or through a select for
7001 // predicated reductions).
7002 auto *RdxResult = vputils::findComputeReductionResult(RedPhiR);
7003 if (!RdxResult)
7004 return std::nullopt;
7005 VPValue *ExitValue = RdxResult->getOperand(0);
7006 match(ExitValue, m_Select(m_VPValue(), m_VPValue(ExitValue), m_VPValue()));
7007
7009 RecurKind RK = RedPhiR->getRecurrenceKind();
7010 Type *PhiType = RedPhiR->getScalarType();
7011 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
7012
7013 // Work backwards from the ExitValue examining each reduction operation.
7014 VPValue *CurrentValue = ExitValue;
7015 while (CurrentValue != RedPhiR) {
7016 VPBlendRecipe *Blend = dyn_cast<VPBlendRecipe>(CurrentValue);
7017 if (Blend) {
7018 assert(!Blend->isNormalized() && "Expect Blend not to be normalized.");
7019 CurrentValue = Blend->getIncomingValue(0);
7020 if (Blend->getNumIncomingValues() != 2 || !CurrentValue->hasOneUse())
7021 return std::nullopt;
7022 }
7023
7024 auto *UpdateR = dyn_cast<VPWidenRecipe>(CurrentValue);
7025 if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
7026 return std::nullopt;
7027
7028 VPValue *Op = UpdateR->getOperand(1);
7029 VPValue *PrevValue = UpdateR->getOperand(0);
7030
7031 // Find the extended operand. The other operand (PrevValue) is the next link
7032 // in the reduction chain.
7033 std::optional<ExtendedReductionOperand> ExtendedOp =
7034 matchExtendedReductionOperand(UpdateR, Op);
7035 if (!ExtendedOp) {
7036 ExtendedOp = matchExtendedReductionOperand(UpdateR, PrevValue);
7037 if (!ExtendedOp)
7038 return std::nullopt;
7039 std::swap(Op, PrevValue);
7040 }
7041
7042 // Look for VPBlend(reduce(PrevValue, Op), PrevValue), where
7043 // reduce is equal to CurrentValue. This can be lowered as
7044 // a conditional reduction by hoisting the select to the inputs.
7045 if (Blend && Blend->getIncomingValue(1) != PrevValue)
7046 return std::nullopt;
7047
7048 Type *ExtSrcType = ExtendedOp->ExtendA.SrcType;
7049 TypeSize ExtSrcSize = ExtSrcType->getPrimitiveSizeInBits();
7050 if (!PHISize.hasKnownScalarFactor(ExtSrcSize))
7051 return std::nullopt;
7052
7053 VPPartialReductionChain Link(
7054 {UpdateR, *ExtendedOp, RK,
7055 PrevValue == UpdateR->getOperand(0) ? 0U : 1U,
7056 static_cast<unsigned>(PHISize.getKnownScalarFactor(ExtSrcSize)),
7057 Blend});
7058 Chain.push_back(Link);
7059 CurrentValue = PrevValue;
7060 }
7061
7062 // The chain links were collected by traversing backwards from the exit value.
7063 // Reverse the chains so they are in program order.
7064 std::reverse(Chain.begin(), Chain.end());
7065 return Chain;
7066}
7067} // namespace
7068
7070 VPCostContext &CostCtx,
7071 VFRange &Range) {
7072 // Find all possible valid partial reductions, grouping chains by their PHI.
7073 // This grouping allows invalidating the whole chain, if any link is not a
7074 // valid partial reduction.
7076 ChainsByPhi;
7077 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
7078 for (VPRecipeBase &R : HeaderVPBB->phis()) {
7079 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
7080 if (!RedPhiR)
7081 continue;
7082
7083 if (auto Chains = getScaledReductions(RedPhiR))
7084 ChainsByPhi.try_emplace(RedPhiR, std::move(*Chains));
7085 }
7086
7087 if (ChainsByPhi.empty())
7088 return;
7089
7090 // Build set of partial reduction operations and blends for user validation
7091 // and a map of reduction bin ops to their scale factors for scale validation.
7092 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
7093 SmallPtrSet<VPBlendRecipe *, 4> PartialReductionBlends;
7094 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
7095 for (const auto &[_, Chains] : ChainsByPhi)
7096 for (const VPPartialReductionChain &Chain : Chains) {
7097 PartialReductionOps.insert(Chain.ExtendedOp.ExtendsUser);
7098 if (Chain.Blend)
7099 PartialReductionBlends.insert(Chain.Blend);
7100 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
7101 }
7102
7103 // A partial reduction is invalid if any of its extends are used by
7104 // something that isn't another partial reduction. This is because the
7105 // extends are intended to be lowered along with the reduction itself.
7106 auto ExtendUsersValid = [&](VPValue *Ext) {
7107 return !isa<VPWidenCastRecipe>(Ext) || all_of(Ext->users(), [&](VPUser *U) {
7108 return PartialReductionOps.contains(cast<VPRecipeBase>(U));
7109 });
7110 };
7111
7112 auto IsProfitablePartialReductionChainForVF =
7113 [&](ArrayRef<VPPartialReductionChain> Chain, ElementCount VF) -> bool {
7114 InstructionCost PartialCost = 0, RegularCost = 0;
7115
7116 // The chain is a profitable partial reduction chain if the cost of handling
7117 // the entire chain is cheaper when using partial reductions than when
7118 // handling the entire chain using regular reductions.
7119 for (const VPPartialReductionChain &Link : Chain) {
7120 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
7121 InstructionCost LinkCost = getPartialReductionLinkCost(CostCtx, Link, VF);
7122 if (!LinkCost.isValid())
7123 return false;
7124
7125 PartialCost += LinkCost;
7126 RegularCost += Link.ReductionBinOp->computeCost(VF, CostCtx);
7127 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
7128 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
7129 RegularCost += ExtendedOp.ExtendsUser->computeCost(VF, CostCtx);
7130 for (VPValue *Op : ExtendedOp.ExtendsUser->operands())
7131 if (auto *Extend = dyn_cast<VPWidenCastRecipe>(Op))
7132 RegularCost += Extend->computeCost(VF, CostCtx);
7133 }
7134 return PartialCost.isValid() && PartialCost < RegularCost;
7135 };
7136
7137 // Validate chains: check that extends are only used by partial reductions,
7138 // and that reduction bin ops are only used by other partial reductions with
7139 // matching scale factors, are outside the loop region or the select
7140 // introduced by tail-folding. Otherwise we would create users of scaled
7141 // reductions where the types of the other operands don't match.
7142 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
7143 for (const VPPartialReductionChain &Chain : Chains) {
7144 if (!all_of(Chain.ExtendedOp.ExtendsUser->operands(), ExtendUsersValid)) {
7145 Chains.clear();
7146 break;
7147 }
7148 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
7149 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
7150 return PhiR == RedPhiR;
7151 auto *R = cast<VPSingleDefRecipe>(U);
7152
7153 if (auto *Blend = dyn_cast<VPBlendRecipe>(R))
7154 return Blend == Chain.Blend || PartialReductionBlends.contains(Blend);
7155
7156 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
7158 m_Specific(Chain.ReductionBinOp))) ||
7159 match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
7160 m_Specific(RedPhiR)));
7161 };
7162 if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
7163 Chains.clear();
7164 break;
7165 }
7166
7167 // Check if the compute-reduction-result is used by a sunk store.
7168 // TODO: Also form partial reductions in those cases.
7169 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
7170 if (any_of(RdxResult->users(), [](VPUser *U) {
7171 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
7172 return RepR && RepR->getOpcode() == Instruction::Store;
7173 })) {
7174 Chains.clear();
7175 break;
7176 }
7177 }
7178 }
7179
7180 // Clear the chain if it is not profitable.
7182 [&, &Chains = Chains](ElementCount VF) {
7183 return IsProfitablePartialReductionChainForVF(Chains, VF);
7184 },
7185 Range))
7186 Chains.clear();
7187 }
7188
7189 for (auto &[Phi, Chains] : ChainsByPhi)
7190 for (const VPPartialReductionChain &Chain : Chains)
7191 transformToPartialReduction(Chain, Plan, Phi);
7192}
7193
7194/// If the pointer operand \p Addr of a memory access is an affine AddRec
7195/// w.r.t. \p L with a constant stride, return the stride in units of
7196/// \p AccessTy. Otherwise return std::nullopt.
7197static std::optional<int64_t> getConstantStride(VPValue *Addr, Type *AccessTy,
7199 const Loop *L) {
7200 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
7201 auto *AddRec = dyn_cast<SCEVAddRecExpr>(AddrSCEV);
7202 if (!AddRec)
7203 return {};
7204
7205 return getStrideFromAddRec(AddRec, L, AccessTy, /*Ptr=*/nullptr, PSE);
7206}
7207
7209 VPRecipeBuilder &RecipeBuilder,
7210 VPCostContext &CostCtx) {
7211 // Collect all loads/stores first. We will start with ones having simpler
7212 // decisions followed by more complex ones that are potentially
7213 // guided/dependent on the simpler ones.
7215 for (VPBasicBlock *VPBB :
7218 for (VPRecipeBase &R : *VPBB) {
7219 auto *VPI = dyn_cast<VPInstruction>(&R);
7220 if (VPI && VPI->getUnderlyingValue() &&
7221 is_contained({Instruction::Load, Instruction::Store},
7222 VPI->getOpcode()))
7223 MemOps.push_back(VPI);
7224 }
7225 }
7226
7227 // Few helpers to process different kinds of memory operations.
7228
7229 // To be used as argument to `VPlanTransforms::runPass` which explicitly
7230 // specified pass name, hence `VPlan &` parameter.
7231 auto ProcessSubset = [&](VPlan &, auto ProcessVPInst) {
7232 SmallVector<VPInstruction *> RemainingMemOps;
7233 for (VPInstruction *VPI : MemOps) {
7234 if (!ProcessVPInst(VPI))
7235 RemainingMemOps.push_back(VPI);
7236 }
7237
7238 MemOps.clear();
7239 std::swap(MemOps, RemainingMemOps);
7240 };
7241
7242 auto ReplaceWith = [&](VPInstruction *VPI, VPRecipeBase *New) {
7243 New->insertBefore(VPI);
7244 if (VPI->getOpcode() == Instruction::Load)
7245 VPI->replaceAllUsesWith(New->getVPSingleValue());
7246 VPI->eraseFromParent();
7247
7248 // VPI has been processed.
7249 return true;
7250 };
7251
7252 auto Scalarize = [&](VPInstruction *VPI) {
7253 return ReplaceWith(VPI, RecipeBuilder.handleReplication(VPI, Range));
7254 };
7255
7256 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
7257 VPBuilder FinalRedStoresBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
7259 "lowerMemoryIdioms", ProcessSubset, Plan, [&](VPInstruction *VPI) {
7260 if (RecipeBuilder.replaceWithFinalIfReductionStore(
7261 VPI, FinalRedStoresBuilder))
7262 return true;
7263
7264 // Filter out scalar VPlan for the remaining idioms.
7266 [](ElementCount VF) { return VF.isScalar(); }, Range))
7267 return false;
7268
7269 if (VPHistogramRecipe *Histogram = RecipeBuilder.widenIfHistogram(VPI))
7270 return ReplaceWith(VPI, Histogram);
7271
7272 return false;
7273 });
7274
7275 // Filter out scalar VPlan for the remaining memory operations.
7277 [](ElementCount VF) { return VF.isScalar(); }, Range))
7278 return;
7279
7280 // If the instruction's allocated size doesn't equal it's type size, it
7281 // requires padding and will be scalarized.
7283 "scalarizeMemOpsWithIrregularTypes", ProcessSubset, Plan,
7284 [&](VPInstruction *VPI) {
7286 if (hasIrregularType(getLoadStoreType(I), I->getDataLayout()))
7287 return Scalarize(VPI);
7288
7289 return false;
7290 });
7291
7292 if (!RecipeBuilder.prefersVectorizedAddressing()) {
7294 "makeVPlanMemOpDecision", ProcessSubset, Plan, [&](VPInstruction *VPI) {
7296 bool IsLoad = VPI->getOpcode() == Instruction::Load;
7297 if (RecipeBuilder.isPredicatedInst(I) || !IsLoad ||
7299 return false;
7300
7301 // Scalarize loads used as addresses, matching the legacy CM. The load
7302 // is single-scalar if the pointer is loop-invariant, otherwise it is
7303 // replicated per-lane. No mask is needed as the load is not
7304 // predicated.
7305 VPValue *Ptr = VPI->getOperand(0);
7306 const SCEV *PtrSCEV =
7307 vputils::getSCEVExprForVPValue(Ptr, CostCtx.PSE, CostCtx.L);
7308 bool IsSingleScalarLoad =
7309 !isa<SCEVCouldNotCompute>(PtrSCEV) &&
7310 CostCtx.PSE.getSE()->isLoopInvariant(PtrSCEV, CostCtx.L);
7311
7312 ReplaceWith(VPI,
7314 I, Ptr, /*IsSingleScalar=*/IsSingleScalarLoad,
7315 /*Mask=*/nullptr, *VPI, *VPI, VPI->getDebugLoc()));
7316 return true;
7317 });
7318 }
7319
7320 // Widen unmasked unit-stride consecutive accesses, matching the legacy CM.
7322 "widenConsecutiveMemOps", ProcessSubset, Plan, [&](VPInstruction *VPI) {
7324 if (RecipeBuilder.isPredicatedInst(I))
7325 return false;
7326
7327 bool IsLoad = VPI->getOpcode() == Instruction::Load;
7328 VPValue *Ptr = VPI->getOperand(!IsLoad);
7329 Type *ScalarTy =
7330 IsLoad ? VPI->getScalarType() : VPI->getOperand(0)->getScalarType();
7331 if (getConstantStride(Ptr, ScalarTy, CostCtx.PSE, CostCtx.L) != 1)
7332 return false;
7333
7334 Type *StrideTy =
7336 VPValue *StrideOne = Plan.getConstantInt(StrideTy, 1);
7337 auto *VectorPtr = new VPVectorPointerRecipe(
7338 Ptr, ScalarTy, StrideOne, vputils::getGEPFlagsForPtr(Ptr),
7339 VPI->getDebugLoc());
7340 VectorPtr->insertBefore(VPI);
7341 VPRecipeBase *WidenedR;
7342 if (IsLoad)
7343 WidenedR = new VPWidenLoadRecipe(*cast<LoadInst>(I), VectorPtr,
7344 /*Mask=*/nullptr,
7345 /*Consecutive=*/true, *VPI,
7346 VPI->getDebugLoc());
7347 else
7348 WidenedR = new VPWidenStoreRecipe(
7349 *cast<StoreInst>(I), VectorPtr, VPI->getOperand(0),
7350 /*Mask=*/nullptr, /*Consecutive=*/true, *VPI, VPI->getDebugLoc());
7351 return ReplaceWith(VPI, WidenedR);
7352 });
7353
7354 VPlanTransforms::runPass("delegateMemOpWideningToLegacyCM", ProcessSubset,
7355 Plan, [&](VPInstruction *VPI) {
7356 if (VPRecipeBase *Recipe =
7357 RecipeBuilder.tryToWidenMemory(VPI, Range))
7358 return ReplaceWith(VPI, Recipe);
7359
7360 return Scalarize(VPI);
7361 });
7362}
7363
7366 [&](ElementCount VF) { return VF.isScalar(); }, Range))
7367 return;
7368
7370 Plan.getEntry());
7372 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
7373 auto *VPI = dyn_cast<VPInstruction>(&R);
7374 if (!VPI)
7375 continue;
7376
7377 auto *I = cast_or_null<Instruction>(VPI->getUnderlyingValue());
7378 // Wouldn't be able to create a `VPReplicateRecipe` anyway.
7379 if (!I)
7380 continue;
7381
7382 // If executing other lanes produces side-effects we can't avoid them.
7383 if (VPI->mayHaveSideEffects())
7384 continue;
7385
7386 // We want to drop the mask operand, verify we can safely do that.
7387 if (VPI->isMasked() && !VPI->isSafeToSpeculativelyExecute())
7388 continue;
7389
7390 // Avoid rewriting IV increment as that interferes with
7391 // `removeRedundantCanonicalIVs`.
7392 if (VPI->getOpcode() == Instruction::Add &&
7394 continue;
7395
7396 // Other lanes are needed - can't drop them.
7398 continue;
7399
7400 auto *Recipe = VPBuilder::createSingleScalarOp(
7401 VPI->getOpcode(), VPI->operandsWithoutMask(), /*Mask=*/nullptr, *VPI,
7402 *VPI, VPI->getDebugLoc(), I);
7403 Recipe->insertBefore(VPI);
7404 VPI->replaceAllUsesWith(Recipe);
7405 VPI->eraseFromParent();
7406 }
7407 }
7408}
7409
7410/// Returns true if \p Info's parameter kinds are compatible with \p Args.
7411static bool areVFParamsOk(const VFInfo &Info, ArrayRef<VPValue *> Args,
7412 PredicatedScalarEvolution &PSE, const Loop *L) {
7413 ScalarEvolution *SE = PSE.getSE();
7414 return all_of(Info.Shape.Parameters, [&](VFParameter Param) {
7415 switch (Param.ParamKind) {
7416 case VFParamKind::Vector:
7417 case VFParamKind::GlobalPredicate:
7418 return true;
7419 case VFParamKind::OMP_Uniform:
7420 return SE->isSCEVable(Args[Param.ParamPos]->getScalarType()) &&
7421 SE->isLoopInvariant(
7422 vputils::getSCEVExprForVPValue(Args[Param.ParamPos], PSE, L),
7423 L);
7424 case VFParamKind::OMP_Linear:
7425 return match(vputils::getSCEVExprForVPValue(Args[Param.ParamPos], PSE, L),
7426 m_scev_AffineAddRec(
7427 m_SCEV(), m_scev_SpecificSInt(Param.LinearStepOrPos),
7428 m_SpecificLoop(L)));
7429 default:
7430 return false;
7431 }
7432 });
7433}
7434
7435/// Find a vector variant of \p CI for \p VF, respecting \p MaskRequired.
7436/// Returns the variant function, or nullptr. Masked variants are assumed to
7437/// take the mask as a trailing parameter.
7439 ElementCount VF, bool MaskRequired,
7441 const Loop *L) {
7442 if (CI->isNoBuiltin())
7443 return nullptr;
7444 auto Mappings = VFDatabase::getMappings(*CI);
7445 const auto *It = find_if(Mappings, [&](const VFInfo &Info) {
7446 return Info.Shape.VF == VF && (!MaskRequired || Info.isMasked()) &&
7447 areVFParamsOk(Info, Args, PSE, L);
7448 });
7449 if (It == Mappings.end())
7450 return nullptr;
7451 return CI->getModule()->getFunction(It->VectorName);
7452}
7453
7454namespace {
7455/// The outcome of choosing how to widen a call at a given VF.
7456struct CallWideningDecision {
7457 enum class KindTy { Scalarize, Intrinsic, VectorVariant };
7458 CallWideningDecision(KindTy Kind, Function *Variant = nullptr)
7459 : Kind(Kind), Variant(Variant) {}
7460 KindTy Kind;
7461
7462 /// Set when Kind == VectorVariant.
7464
7465 bool operator==(const CallWideningDecision &Other) const {
7466 return Kind == Other.Kind && Variant == Other.Variant;
7467 }
7468};
7469} // namespace
7470
7471/// Pick the cheapest widening for the call \p VPI at \p VF among scalarization,
7472/// vector intrinsic, and vector library variant.
7473static CallWideningDecision decideCallWidening(VPInstruction &VPI,
7475 ElementCount VF,
7476 VPCostContext &CostCtx) {
7477 auto *CI = cast<CallInst>(VPI.getUnderlyingInstr());
7478
7479 // Scalar VFs and calls forced or known to scalarize always replicate.
7480 if (VF.isScalar() || CostCtx.willBeScalarized(CI, VF))
7481 return CallWideningDecision::KindTy::Scalarize;
7482
7483 auto *CalledFn = cast<Function>(
7485 Type *ResultTy = VPI.getScalarType();
7487 bool MaskRequired = CostCtx.isMaskRequired(CI);
7488
7489 // Pseudo intrinsics (assume, lifetime, ...) are always scalarized.
7491 return CallWideningDecision::KindTy::Scalarize;
7492
7493 InstructionCost ScalarCost =
7494 VPReplicateRecipe::computeCallCost(CalledFn, ResultTy, Ops,
7495 /*IsSingleScalar=*/false, VF, CostCtx);
7496
7497 Function *VecFunc =
7498 findVectorVariant(CI, Ops, VF, MaskRequired, CostCtx.PSE, CostCtx.L);
7500 if (VecFunc)
7501 VecCallCost = VPWidenCallRecipe::computeCallCost(VecFunc, CostCtx);
7502
7503 // Prefer the intrinsic if it is at least as cheap as scalarizing and any
7504 // available vector variant.
7505 if (ID) {
7508 if (IntrinsicCost.isValid() && ScalarCost >= IntrinsicCost &&
7509 (!VecFunc || VecCallCost >= IntrinsicCost))
7510 return CallWideningDecision::KindTy::Intrinsic;
7511 }
7512
7513 // Otherwise, use a vector library variant when it beats scalarizing.
7514 if (VecFunc && ScalarCost >= VecCallCost)
7515 return {CallWideningDecision::KindTy::VectorVariant, VecFunc};
7516
7517 return CallWideningDecision::KindTy::Scalarize;
7518}
7519
7521 VPRecipeBuilder &RecipeBuilder,
7522 VPCostContext &CostCtx) {
7525 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
7526 auto *VPI = dyn_cast<VPInstruction>(&R);
7527 if (!VPI || !VPI->getUnderlyingValue() ||
7528 VPI->getOpcode() != Instruction::Call)
7529 continue;
7530
7531 auto *CI = cast<CallInst>(VPI->getUnderlyingInstr());
7532 SmallVector<VPValue *, 4> Ops(VPI->op_begin(),
7533 VPI->op_begin() + CI->arg_size());
7534
7535 CallWideningDecision Decision =
7536 decideCallWidening(*VPI, Ops, Range.Start, CostCtx);
7538 [&](ElementCount VF) {
7539 return Decision == decideCallWidening(*VPI, Ops, VF, CostCtx);
7540 },
7541 Range);
7542
7543 VPSingleDefRecipe *Replacement = nullptr;
7544 switch (Decision.Kind) {
7545 case CallWideningDecision::KindTy::Intrinsic: {
7547 Type *ResultTy = VPI->getScalarType();
7548 Replacement = new VPWidenIntrinsicRecipe(*CI, ID, Ops, ResultTy, *VPI,
7549 *VPI, VPI->getDebugLoc());
7550 break;
7551 }
7552 case CallWideningDecision::KindTy::VectorVariant: {
7553 // Masked variants take the mask as a trailing parameter, so they have
7554 // one more parameter than the original call's arguments.
7555 if (Decision.Variant->arg_size() > Ops.size()) {
7556 VPValue *Mask = VPI->isMasked() ? VPI->getMask() : Plan.getTrue();
7557 Ops.push_back(Mask);
7558 }
7559 Ops.push_back(VPI->getOperand(VPI->getNumOperandsWithoutMask() - 1));
7560 Replacement = new VPWidenCallRecipe(CI, Decision.Variant, Ops, *VPI,
7561 *VPI, VPI->getDebugLoc());
7562 break;
7563 }
7564 case CallWideningDecision::KindTy::Scalarize:
7565 Replacement = RecipeBuilder.handleReplication(VPI, Range);
7566 break;
7567 }
7568
7569 Replacement->insertBefore(VPI);
7570 VPI->replaceAllUsesWith(Replacement);
7571 VPI->eraseFromParent();
7572 }
7573 }
7574}
7575
7578 Loop &L, VPCostContext &Ctx,
7579 VFRange &Range) {
7580 if (Plan.hasScalarVFOnly())
7581 return;
7582
7583 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
7584 VPValue *I32VF = nullptr;
7586 vp_depth_first_shallow(VectorLoop->getEntry()))) {
7587 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
7588 auto *LoadR = dyn_cast<VPWidenLoadRecipe>(&R);
7589 // TODO: Support strided store.
7590 // TODO: Transform reverse access into strided access with -1 stride.
7591 // TODO: Transform gather/scatter with uniform address into strided access
7592 // with 0 stride.
7593 // TODO: Transform interleave access into multiple strided accesses.
7594 if (!LoadR || LoadR->isConsecutive())
7595 continue;
7596
7597 auto *Ptr = dyn_cast<VPWidenGEPRecipe>(LoadR->getAddr());
7598 if (!Ptr)
7599 continue;
7600
7601 // Check if this is a strided access by analyzing the address SCEV for an
7602 // affine addRec.
7603 const SCEV *PtrSCEV = vputils::getSCEVExprForVPValue(Ptr, PSE, &L);
7604 const SCEV *Start;
7605 const SCEVConstant *Step;
7606 // TODO: Support non-constant loop invariant stride.
7607 if (!match(PtrSCEV,
7609 m_SpecificLoop(&L))))
7610 continue;
7611
7612 Type *LoadTy = LoadR->getScalarType();
7613 Align Alignment = LoadR->getAlign();
7614 auto IsProfitable = [&](ElementCount VF) {
7615 Type *DataTy = toVectorTy(LoadTy, VF);
7616 if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment))
7617 return false;
7618 const InstructionCost CurrentCost = LoadR->computeCost(VF, Ctx);
7619 const InstructionCost StridedLoadStoreCost =
7621 Intrinsic::experimental_vp_strided_load, DataTy,
7622 LoadR->isMasked(), Alignment, Ctx);
7623 return StridedLoadStoreCost < CurrentCost;
7624 };
7625
7627 Range))
7628 continue;
7629
7630 // Invalidate the legacy widening decision so the cost of replaced load is
7631 // not counted during precomputeCosts.
7632 // TODO: Remove once the legacy exit cost computation is retired.
7633 for (ElementCount VF : Range)
7634 Ctx.invalidateWideningDecision(&LoadR->getIngredient(), VF);
7635
7636 // Get VF as i32 for the vector length operand.
7637 if (!I32VF) {
7638 VPBuilder Builder(Plan.getVectorPreheader());
7639 I32VF = Builder.createScalarZExtOrTrunc(
7640 &Plan.getVF(), Type::getInt32Ty(Plan.getContext()),
7642 }
7643
7644 VPBuilder Builder(LoadR);
7645 // Create the base pointer of strided access.
7646 // TODO: reuse VPDerivedIVRecipe for base pointer computation when it
7647 // supports a general VPValue as the start value.
7648 VPValue *StartVPV = vputils::getOrCreateVPValueForSCEVExpr(Plan, Start);
7649 VPValue *StrideInBytes = Plan.getOrAddLiveIn(Step->getValue());
7650 Type *IndexTy = Plan.getDataLayout().getIndexType(Ptr->getScalarType());
7651 assert(IndexTy == StrideInBytes->getScalarType() &&
7652 "Stride type from SCEV must match the index type");
7653 VPValue *CanIV = Builder.createScalarSExtOrTrunc(
7654 VectorLoop->getCanonicalIV(), IndexTy,
7655 VectorLoop->getCanonicalIVType(), DebugLoc::getUnknown());
7656 auto *AddRecPtr = cast<SCEVAddRecExpr>(PtrSCEV);
7657 auto *Offset = Builder.createOverflowingOp(
7658 Instruction::Mul, {CanIV, StrideInBytes},
7659 {AddRecPtr->hasNoUnsignedWrap(), AddRecPtr->hasNoSignedWrap()});
7660 auto *BasePtr = Builder.createNoWrapPtrAdd(
7661 StartVPV, Offset,
7662 AddRecPtr->hasNoUnsignedWrap() ? GEPNoWrapFlags::noUnsignedWrap()
7664
7665 // Create a new vector pointer for strided access.
7666 VPValue *NewPtr = Builder.createVectorPointer(
7667 BasePtr, Type::getInt8Ty(Plan.getContext()), StrideInBytes,
7668 Ptr->getGEPNoWrapFlags(), Ptr->getDebugLoc());
7669
7670 VPValue *Mask = LoadR->getMask();
7671 if (!Mask)
7672 Mask = Plan.getTrue();
7673 auto *StridedLoad = Builder.createWidenMemIntrinsic(
7674 Intrinsic::experimental_vp_strided_load,
7675 {NewPtr, StrideInBytes, Mask, I32VF}, LoadTy, Alignment, *LoadR,
7676 LoadR->getDebugLoc());
7677 LoadR->replaceAllUsesWith(StridedLoad);
7678 }
7679 }
7680}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:856
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
@ Default
Hexagon Common GEP
#define _
iv Induction Variable Users
Definition IVUsers.cpp:48
iv users
Definition IVUsers.cpp:48
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:81
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static bool dominates(InstrPosIndexes &PosIndexes, const MachineInstr &A, const MachineInstr &B)
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool handleUncountableExitsWithSideEffects(VPlan &Plan, SmallVectorImpl< EarlyExitInfo > &Exits, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT, AssumptionCache *AC)
Update Plan to mask memory operations in the loop based on whether the early exit is taken or not.
static CallWideningDecision decideCallWidening(VPInstruction &VPI, ArrayRef< VPValue * > Ops, ElementCount VF, VPCostContext &CostCtx)
Pick the cheapest widening for the call VPI at VF among scalarization, vector intrinsic,...
static bool areVFParamsOk(const VFInfo &Info, ArrayRef< VPValue * > Args, PredicatedScalarEvolution &PSE, const Loop *L)
Returns true if Info's parameter kinds are compatible with Args.
static std::optional< VPValue * > getRecipesForUncountableExit(SmallVectorImpl< VPInstruction * > &Recipes, VPBasicBlock *LatchVPBB)
Returns the VPValue representing the uncountable exit comparison used by AnyOf if the recipes it depe...
static bool simplifyLogicalRecipe(VPSingleDefRecipe *Def, VPBuilder &Builder, bool CanCreateNewRecipe)
Try to simplify logical and bitwise recipes in Def.
static bool sinkScalarOperands(VPlan &Plan)
static std::optional< int64_t > getConstantStride(VPValue *Addr, Type *AccessTy, PredicatedScalarEvolution &PSE, const Loop *L)
If the pointer operand Addr of a memory access is an affine AddRec w.r.t.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static VPValue * cloneBinOpForScalarIV(VPWidenRecipe *BinOp, VPValue *ScalarIV, VPWidenIntOrFpInductionRecipe *WidenIV)
Create a scalar version of BinOp, with its WidenIV operand replaced by ScalarIV, and place it after S...
static VPWidenIntOrFpInductionRecipe * getExpressionIV(VPValue *V)
Check if V is a binary expression of a widened IV and a loop-invariant value.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static std::optional< ElementCount > isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI)
Returns VF from VFs if IR is a full interleave group with factor and number of members both equal to ...
static Type * getLoadStoreValueType(VPReplicateRecipe *R, bool IsLoad)
Get the value type of the replicate load or store.
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static Function * findVectorVariant(CallInst *CI, ArrayRef< VPValue * > Args, ElementCount VF, bool MaskRequired, PredicatedScalarEvolution &PSE, const Loop *L)
Find a vector variant of CI for VF, respecting MaskRequired.
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with an EVL-based IV, fix up recipes that use VF to use the EVL instea...
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx, bool IsScalable)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void simplifyRecipe(VPSingleDefRecipe *Def)
Try to simplify VPSingleDefRecipe Def.
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectGroupedReplicateMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L, function_ref< bool(VPReplicateRecipe *)> FilterFn)
Collect either replicated Loads or Stores grouped by their address SCEV and their load-store type,...
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPValue *VectorTC)
Compute the end value for WideIV, unless it is truncated.
static std::optional< Intrinsic::ID > getVPDivRemIntrinsic(Intrinsic::ID IntrID)
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant ExpandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static VPValue * narrowInterleaveGroupOp(ArrayRef< VPValue * > Members, SmallPtrSetImpl< VPValue * > &NarrowedOps, VPBasicBlock *Preheader)
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL)
Try to fold R using InstSimplifyFolder.
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
static void recursivelyDeleteDeadRecipes(VPValue *V)
static void reassociateHeaderMask(VPlan &Plan)
Reassociate (headermask && x) && y -> headermask && (x && y) to allow the header mask to be simplifie...
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan)
static void expandVPDerivedIV(VPDerivedIVRecipe *R)
Expand a VPDerivedIVRecipe into executable recipes.
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations using scoped noalias metadata,...
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPRegionBlock *ParentRegion, VPlan &Plan)
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool cannotHoistOrSinkRecipe(VPRecipeBase &R, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, bool Sinking=false)
Return true if we do not know how to (mechanically) hoist or sink a non-memory or memory recipe R out...
static std::optional< Instruction::BinaryOps > getUnmaskedDivRemOpcode(Intrinsic::ID ID)
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool canNarrowOps(ArrayRef< VPValue * > Ops, bool IsScalable)
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L)
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(ArrayRef< VPReplicateRecipe * > ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L)
SinkStoreInfo(VPReplicateRecipe &GroupLeader)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1055
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1535
APInt abs() const
Get the absolute value.
Definition APInt.h:1818
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1692
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
int32_t exactLogBase2() const
Definition APInt.h:1806
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:1028
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getMinValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the minimum integer value with the given bit width and signedness.
Definition APSInt.h:310
static APSInt getMaxValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the maximum integer value with the given bit width and signedness.
Definition APSInt.h:302
@ NoAlias
The two locations do not alias at all.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
Get the last element.
Definition ArrayRef.h:150
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:194
const T & front() const
Get the first element.
Definition ArrayRef.h:144
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:766
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:852
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This class represents a range of values.
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
A parsed version of the target data layout string and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
A debug info location.
Definition DebugLoc.h:126
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:154
static DebugLoc getUnknown()
Definition DebugLoc.h:153
ValueT lookup(const_arg_type_t< KeyT > Val) const
Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition DenseMap.h:250
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:299
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:260
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:151
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:202
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:291
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
size_t arg_size() const
Definition Function.h:875
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags noUnsignedWrap()
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
static LLVM_ABI InductionDescriptor getCanonicalIntInduction(Type *Ty, ScalarEvolution &SE)
Returns the canonical integer induction for type Ty with start = 0 and step = 1.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:348
The group of interleaved loads/stores sharing the same stride and close to each other.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1646
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1069
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:38
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:110
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:118
bool empty() const
Definition MapVector.h:79
Representation for a specific memory location.
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition Module.cpp:235
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
Post-order traversal of a graph.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class represents a constant integer value.
ConstantInt * getValue() const
This class uses information about analyzed scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(SCEVUse SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getUDivExpr(SCEVUse LHS, SCEVUse RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
@ SK_Broadcast
Broadcast element 0 to all other elements.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:276
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:306
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
op_range operands()
Definition User.h:267
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:76
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:4042
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4377
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4452
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4404
iterator end()
Definition VPlan.h:4414
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4412
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4465
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:266
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:560
const VPRecipeBase & front() const
Definition VPlan.h:4424
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:639
const VPRecipeBase & back() const
Definition VPlan.h:4426
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2944
VPValue * getIncomingValue(unsigned Idx) const
Return incoming value number Idx.
Definition VPlan.h:2989
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2994
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2984
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:3000
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2980
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:94
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccs as successor of this VPBlockBase.
Definition VPlan.h:315
VPRegionBlock * getParent()
Definition VPlan.h:186
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:236
size_t getNumSuccessors() const
Definition VPlan.h:237
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:306
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:222
VPlan * getPlan()
Definition VPlan.cpp:211
const std::string & getName() const
Definition VPlan.h:177
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:325
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:233
void clearPredecessors()
Remove all the predecessors of this block.
Definition VPlan.h:322
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:216
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:279
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:227
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:211
static auto blocksAs(T &&Range)
Return an iterator range over Range with each block cast to BlockTy.
Definition VPlanUtils.h:331
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:350
static bool isLatch(const VPBlockBase *VPB, const VPDominatorTree &VPDT)
Returns true if VPB is a loop latch, using isHeader().
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:240
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:258
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:276
static auto blocksOnly(T &&Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:312
static void transferSuccessors(VPBlockBase *Old, VPBlockBase *New)
Transfer successors from Old to New. New must have no successors.
Definition VPlanUtils.h:296
static SmallVector< VPBasicBlock * > blocksInSingleSuccessorChainBetween(VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Returns the blocks between FirstBB and LastBB, where FirstBB to LastBB forms a single-successor chain.
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3489
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createFirstActiveLane(ArrayRef< VPValue * > Masks, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createAdd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false})
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPInstruction * createLogicalOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createAnyOfReduction(VPValue *ChainOp, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown())
Create an AnyOf reduction pattern: or-reduce ChainOp, freeze the result, then select between TrueVal ...
Definition VPlan.cpp:1633
VPInstruction * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step)
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRMetadata &Metadata={})
VPWidenPHIRecipe * createWidenPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPWidenCastRecipe * createWidenCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={}, Type *ResultTy=nullptr)
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", Type *ResultTy=nullptr)
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
static VPSingleDefRecipe * createSingleScalarOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPValue *Mask, const VPIRFlags &Flags, const VPIRMetadata &Metadata, DebugLoc DL, Instruction *UV)
Create a single-scalar recipe with Opcode and Operands without inserting it.
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
A recipe for generating the phi node tracking the current scalar iteration index.
Definition VPlan.h:4074
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:562
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:535
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:547
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:557
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:4175
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B) const
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3534
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2436
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2483
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2472
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition VPlan.h:2163
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4530
Class to record and manage LLVM IR flags.
Definition VPlan.h:695
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlagsOrNone() const
void dropPoisonGeneratingFlags()
Drop all poison-generating flags.
Definition VPlan.h:892
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for I, if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1171
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1226
unsigned getNumOperandsWithoutMask() const
Returns the number of operands, excluding the mask if the VPInstruction is masked.
Definition VPlan.h:1473
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1319
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1269
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1315
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1264
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed-width vectors each containing a ...
Definition VPlan.h:1261
@ CanonicalIVIncrementForPart
Definition VPlan.h:1245
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1272
unsigned getOpcode() const
Definition VPlan.h:1417
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:3096
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3088
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:3117
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:3169
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:3127
void addIncoming(VPValue *IncomingV)
Append IncomingV as an incoming value to the phi-like recipe.
Definition VPlan.h:1665
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3700
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:402
VPBasicBlock * getParent()
Definition VPlan.h:477
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:555
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPHistogramRecipe * widenIfHistogram(VPInstruction *VPI)
If VPI represents a histogram operation (as determined by LoopVectorizationLegality) make that safe f...
bool prefersVectorizedAddressing() const
Returns true if the target prefers vectorized addressing.
VPRecipeBase * tryToWidenMemory(VPInstruction *VPI, VFRange &Range)
Check if the load or store instruction VPI should be widened for Range.Start and potentially masked.
bool replaceWithFinalIfReductionStore(VPInstruction *VPI, VPBuilder &FinalRedStoresBuilder)
If VPI is a store of a reduction into an invariant address, delete it.
VPSingleDefRecipe * handleReplication(VPInstruction *VPI, VFRange &Range)
Build a replicating or single-scalar recipe for VPI.
bool isPredicatedInst(Instruction *I) const
Returns true if I needs to be predicated (i.e.
Type * getScalarType() const
Returns the scalar type of this VPRecipeValue.
Definition VPlanValue.h:338
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3340
A recipe for handling reduction phis.
Definition VPlan.h:2851
void setVFScaleFactor(unsigned ScaleFactor)
Set the VFScaleFactor for this reduction phi.
Definition VPlan.h:2902
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2895
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2908
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3220
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4587
const VPBlockBase * getEntry() const
Definition VPlan.h:4631
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4663
VPInstruction * getOrCreateCanonicalIVIncrement()
Get the canonical IV increment instruction if it exists.
Definition VPlan.cpp:857
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4648
Type * getCanonicalIVType() const
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4707
void clearCanonicalIVNUW(VPInstruction *Increment)
Unsets NUW for the canonical IV increment Increment, for loop regions.
Definition VPlan.h:4715
VPRegionValue * getCanonicalIV()
Return the canonical induction variable of the region, null for replicating regions.
Definition VPlan.h:4699
const VPBlockBase * getExiting() const
Definition VPlan.h:4643
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4656
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3385
bool isSingleScalar() const
Definition VPlan.h:3443
static InstructionCost computeCallCost(Function *CalledFn, Type *ResultTy, ArrayRef< const VPValue * > ArgOps, bool IsSingleScalar, ElementCount VF, VPCostContext &Ctx)
Return the cost of scalarizing a call to CalledFn with argument operands ArgOps for a given VF.
operand_range operandsWithoutMask()
Return the recipe's operands, excluding the mask of a predicated recipe.
Definition VPlan.h:3468
bool isPredicated() const
Definition VPlan.h:3445
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3462
Lightweight SCEV-to-VPlan expander.
Definition VPlanUtils.h:178
VPValue * tryToExpand(const SCEV *S)
Try to expand S into recipes and live-ins using the builder.
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:4235
VPSingleDefRecipe is a base class for recipes that model a sequence of one or more output IR that def...
Definition VPlan.h:609
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:680
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:217
bool isMaterialized() const
Returns true if this value has been materialized.
Definition VPlanValue.h:235
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:385
operand_range operands()
Definition VPlanValue.h:458
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:431
unsigned getNumOperands() const
Definition VPlanValue.h:425
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:426
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:50
Type * getScalarType() const
Returns the scalar type of this VPValue, dispatching based on the concrete subclass.
Definition VPlan.cpp:149
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:143
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1458
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:130
bool hasMoreThanOneUniqueUser() const
Returns true if the value has more than one unique user.
Definition VPlanValue.h:164
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:75
bool user_empty() const
Definition VPlanValue.h:161
bool hasOneUse() const
Definition VPlanValue.h:175
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:209
VPUser * getSingleUser()
Return the single user of this value, or nullptr if there is not exactly one user.
Definition VPlanValue.h:179
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1461
unsigned getNumUsers() const
Definition VPlanValue.h:115
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1467
user_range users()
Definition VPlanValue.h:157
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2266
A recipe to compute the pointers for widened memory accesses of SourceElementTy, with the Stride expr...
Definition VPlan.h:2348
A recipe for widening Call instructions using library calls.
Definition VPlan.h:2097
static InstructionCost computeCallCost(Function *Variant, VPCostContext &Ctx)
Return the cost of widening a call using the vector function Variant.
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1878
Instruction::CastOps getOpcode() const
Definition VPlan.h:1914
A recipe for handling GEP instructions.
Definition VPlan.h:2206
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2510
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2558
PHINode * getPHINode() const
Returns the underlying PHINode if one exists, or null otherwise.
Definition VPlan.h:2576
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2561
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2581
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2610
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2657
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2661
TruncInst * getTruncInst()
Returns the first defined value as TruncInst, if it is one or nullptr otherwise.
Definition VPlan.h:2672
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2683
A recipe for widening vector intrinsics.
Definition VPlan.h:1925
static InstructionCost computeCallCost(Intrinsic::ID ID, ArrayRef< const VPValue * > Operands, const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx)
Compute the cost of a vector intrinsic with ID and Operands.
static InstructionCost computeMemIntrinsicCost(Intrinsic::ID IID, Type *Ty, bool IsMasked, Align Alignment, VPCostContext &Ctx)
Helper function for computing the cost of vector memory intrinsic.
A common mixin class for widening memory operations.
Definition VPlan.h:3736
virtual VPRecipeBase * getAsRecipe()=0
Return a VPRecipeBase* to the current object.
A recipe for widened phis.
Definition VPlan.h:2741
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1817
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
VPWidenRecipe * clone() override
Clone the current recipe.
Definition VPlan.h:1838
unsigned getOpcode() const
Definition VPlan.h:1857
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4735
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:5061
bool hasVF(ElementCount VF) const
Definition VPlan.h:4954
const DataLayout & getDataLayout() const
Definition VPlan.h:4936
LLVMContext & getContext() const
Definition VPlan.h:4932
VPBasicBlock * getEntry()
Definition VPlan.h:4831
bool hasScalableVF() const
Definition VPlan.h:4955
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4890
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4911
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4961
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:5027
VPSymbolicValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4930
VPIRValue * getAllOnesValue(Type *Ty)
Return a VPIRValue wrapping the AllOnes value of type Ty.
Definition VPlan.h:5033
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:5110
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:5064
bool hasUF(unsigned UF) const
Definition VPlan.h:4979
VPIRValue * getPoison(Type *Ty)
Return a VPIRValue wrapping a poison value of type Ty.
Definition VPlan.h:5055
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4884
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4920
VPValue * getBackedgeTakenCount() const
Definition VPlan.h:4917
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:5004
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:5030
void setVF(ElementCount VF)
Definition VPlan.h:4942
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4995
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1053
unsigned getConcreteUF() const
Returns the concrete UF of the plan, after unrolling.
Definition VPlan.h:4982
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4904
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4860
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:5087
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:5024
VPBasicBlock * getVectorPreheader() const
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4836
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4927
bool hasScalarVFOnly() const
Definition VPlan.h:4972
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4874
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4923
void setUF(unsigned UF)
Definition VPlan.h:4987
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop, i.e.
Definition VPlan.h:5142
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1209
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:5038
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:261
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:319
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2798
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition DwarfDebug.h:190
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
auto m_Cmp()
Matches any compare instruction and ignores it.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
match_combine_or< CastInst_match< OpTy, TruncInst >, OpTy > m_TruncOrSelf(const OpTy &Op)
auto m_Poison()
Match an arbitrary poison constant.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
SelectLike_match< CondTy, LTy, RTy > m_SelectLike(const CondTy &C, const LTy &TrueC, const RTy &FalseC)
Matches a value that behaves like a boolean-controlled select, i.e.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
specificloop_ty m_SpecificLoop(const Loop *L)
bool match(const SCEV *S, const Pattern &P)
SCEVAffineAddRec_match< Op0_t, Op1_t, match_isa< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
auto m_WidenAnyExtend(const Op0_t &Op0)
match_bind< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
auto m_VPPhi(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
auto m_WidenIntrinsic(const T &...Ops)
canonical_widen_iv_match m_CanonicalWidenIV()
VPInstruction_match< VPInstruction::ExitingIVValue, Op0_t > m_ExitingIVValue(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
match_bind< VPSingleDefRecipe > m_VPSingleDefRecipe(VPSingleDefRecipe *&V)
Match a VPSingleDefRecipe, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
auto m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, without matching its operands, as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
match_bind< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
auto m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
auto m_AnyNeg(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
bool cannotHoistOrSinkRecipe(const VPRecipeBase &R, bool Sinking=false)
Return true if we do not know how to (mechanically) hoist or sink R.
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
VPInstruction * findCanonicalIVIncrement(VPlan &Plan)
Find the canonical IV increment of Plan's vector loop region.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPValue * findIncomingAliasMask(const VPlan &Plan)
Finds the incoming alias-mask within the vector preheader.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:128
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) Note: If ...
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isUniformAcrossVFsAndUFs(const VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
bool isUsedByLoadStoreAddress(const VPValue *V)
Returns true if V is used as part of the address of another load or store.
GEPNoWrapFlags getGEPFlagsForPtr(VPValue *Ptr)
Returns the GEP nowrap flags for Ptr, looking through pointer casts mirroring Value::stripPointerCast...
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
SmallVector< VPBasicBlock * > vp_rpo_plain_cfg_loop_body(VPBasicBlock *Header)
Returns the VPBasicBlocks forming the loop body of a plain (pre-region) VPlan in reverse post-order s...
Definition VPlanCFG.h:262
@ Offset
Definition DWP.cpp:573
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2180
void stable_sort(R &&Range)
Definition STLExtras.h:2116
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:840
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:250
constexpr auto bind_back(FnT &&Fn, BindArgsT &&...BindArgs)
C++23 bind_back.
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:285
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
SmallVector< VPRegisterUsage, 8 > calculateRegisterUsageForPlan(VPlan &Plan, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Estimate the register usage for Plan and vectorization factors in VFs by calculating the highest numb...
auto map_range(ContainerTy &&C, FuncTy F)
Return a range that applies F to the elements of C.
Definition STLExtras.h:365
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1151
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2200
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
constexpr size_t range_size(R &&Range)
Returns the size of the Range, i.e., the number of elements.
Definition STLExtras.h:1694
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
UncountableExitStyle
Different methods of handling early exits.
Definition VPlan.h:79
@ MaskedHandleExitInScalarLoop
All memory operations other than the load(s) required to determine whether an uncountable exit occurr...
Definition VPlan.h:89
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:551
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant C can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1830
T * find_singleton(R &&Range, Predicate P, bool AllowRepeats=false)
Return the single value in Range that satisfies P(<member of Range> *, AllowRepeats)->T * returning n...
Definition STLExtras.h:1837
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
@ Other
Any other memory.
Definition ModRef.h:68
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ FSub
Subtraction of floats.
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2088
ArrayRef(const T &OneElt) -> ArrayRef< T >
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1409
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2019
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Next
Definition InstrProf.h:147
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2166
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:305
LLVM_ABI std::optional< int64_t > getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, Value *Ptr, PredicatedScalarEvolution &PSE)
If AR is an affine AddRec for Lp with a constant step, return the step in units of AccessTy's allocat...
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2146
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI bool isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L, ScalarEvolution &SE, DominatorTree &DT, AssumptionCache *AC=nullptr, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Return true if we can prove that the given load (which is assumed to be within the specified loop) wo...
Definition Loads.cpp:304
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:285
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
VPBasicBlock * EarlyExitingVPBB
VPIRBasicBlock * EarlyExitVPBB
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2833
Holds the VFShape for a specific scalar to vector function mapping.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
static bool isFreeScalarIntrinsic(Intrinsic::ID ID)
Returns true if ID is a pseudo intrinsic that is dropped via scalarization rather than widened.
Definition VPlan.cpp:1926
bool isMaskRequired(Instruction *I) const
Forwards to LoopVectorizationCostModel::isMaskRequired.
PredicatedScalarEvolution & PSE
bool willBeScalarized(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalarized at VF.
TargetTransformInfo::TargetCostKind CostKind
const TargetLibraryInfo & TLI
const TargetTransformInfo & TTI
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:277
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:147
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
InstructionCost spillCost(const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, unsigned OverrideMaxNumRegs=0) const
Calculate the estimated cost of any spills due to using more registers than the number available for ...
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3850
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3800
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3953
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3899
static VPValue * materializeAliasMask(VPlan &Plan, VPBasicBlock *AliasCheckVPBB, ArrayRef< PointerDiffInfo > DiffChecks)
Materializes the alias mask within the AliasCheckVPBB block.
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static decltype(auto) runPass(StringRef PassName, PassTy &&Pass, VPlan &Plan, ArgsTy &&...Args)
Helper to run a VPlan pass Pass on VPlan, forwarding extra arguments to the pass.
static void expandSCEVsToVPInstructions(VPlan &Plan, ScalarEvolution &SE)
Try to expand VPExpandSCEVRecipes in Plan's entry block to VPInstructions.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, const bool &EpilogueAllowed)
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL is known to be <= VF,...
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void replaceWideCanonicalIVWithWideIV(VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, ElementCount VF, unsigned UF, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Replace a VPWidenCanonicalIVRecipe if it is present in Plan, with a VPWidenIntOrFpInductionRecipe,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static void materializeAliasMaskCheckBlock(VPlan &Plan, ArrayRef< PointerDiffInfo > DiffChecks, bool HasBranchWeights)
Materializes the alias mask within a check block before the loop.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand remaining VPExpandSCEVRecipes in Plan's entry block using SCEVExpander.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder, VPCostContext &CostCtx)
Convert load/store VPInstructions in Plan into widened or replicate recipes.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step, std::optional< uint64_t > MaxRuntimeStep=std::nullopt)
Materialize vector trip count computations to a set of VPInstructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static bool mergeBlocksIntoPredecessors(VPlan &Plan)
Remove redundant VPBasicBlocks by merging them into their single predecessor if the latter has a sing...
static void attachAliasMaskToHeaderMask(VPlan &Plan)
Attaches the alias-mask to the existing header-mask.
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void makeScalarizationDecisions(VPlan &Plan, VFRange &Range)
Make VPlan-based scalarization decision prior to delegating to the ones made by the legacy CM.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replace all uses of the canonical ...
static void simplifyReverses(VPlan &Plan)
Cancel out redundant reverses in Plan, e.g. reverse(reverse(x)) -> x.
static void makeCallWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder, VPCostContext &CostCtx)
Convert call VPInstructions in Plan into widened call, vector intrinsic or replicate recipes based on...
static void adjustFirstOrderRecurrenceMiddleUsers(VPlan &Plan, VFRange &Range)
Adjust first-order recurrence users in the middle block: create penultimate element extracts for LCSS...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap, const VPDominatorTree &VPDT)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static bool removeBranchOnConst(VPlan &Plan, bool OnlyLatches=false)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void convertToStridedAccesses(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L, VPCostContext &Ctx, VFRange &Range)
Transform widen memory recipes into strided access recipes when legal and profitable.
static bool handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT, AssumptionCache *AC, UncountableExitStyle Style)
Update Plan to account for uncountable early exits by introducing appropriate branching logic in the ...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void attachVPCheckBlock(VPlan &Plan, VPValue *Cond, VPBasicBlock *CheckBlock, bool AddBranchWeights)
Wrap runtime check block CheckBlock in a VPIRBB and Cond in a VPValue and connect the block to Plan,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static void dropPoisonGeneratingRecipes(VPlan &Plan)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...