//===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements explicit unrolling for VPlans.
///
//===----------------------------------------------------------------------===//

#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;
using namespace llvm::VPlanPatternMatch;

namespace {

/// Helper to hold state needed for unrolling. It holds the Plan to unroll by
/// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate
/// the unrolling transformation, where the original VPValues are retained for
/// part zero.
class UnrollState {
  /// Plan to unroll.
  VPlan &Plan;
  /// Unroll factor to unroll by.
  const unsigned UF;
  /// Analysis for types.
  VPTypeAnalysis TypeInfo;

  /// Unrolling may create recipes that should not be unrolled themselves.
  /// Those are tracked in ToSkip.
  SmallPtrSet<VPRecipeBase *, 8> ToSkip;

  // Associate with each VPValue of part 0 its unrolled instances of parts 1,
  // ..., UF-1.
  DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts;
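  //
  // For example, when unrolling by UF 4, a recipe defining %v in part 0 has
  // its part 1-3 copies recorded as VPV2Parts[%v] = {%v.1, %v.2, %v.3}, and
  // getValueForPart(%v, P) below returns %v for part 0 and %v.P otherwise.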

  /// Unroll replicate region \p VPR by cloning the region UF - 1 times.
  void unrollReplicateRegionByUF(VPRegionBlock *VPR);

  /// Add a start index operand to \p Steps for \p Part.
  void addStartIndexForScalarSteps(VPScalarIVStepsRecipe *Steps, unsigned Part);

  /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform across
  /// all parts.
  void unrollRecipeByUF(VPRecipeBase &R);

  /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled
  /// depends on the concrete header phi. Inserts newly created recipes at \p
  /// InsertPtForPhi.
  void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                           VPBasicBlock::iterator InsertPtForPhi);

  /// Unroll a widen induction recipe \p IV. This introduces recipes to compute
  /// the induction steps for each part.
  void unrollWidenInductionByUF(VPWidenInductionRecipe *IV,
                                VPBasicBlock::iterator InsertPtForPhi);

  VPValue *getConstantInt(unsigned Part) {
    Type *CanIVIntTy = Plan.getVectorLoopRegion()->getCanonicalIVType();
    return Plan.getConstantInt(CanIVIntTy, Part);
  }

public:
  UnrollState(VPlan &Plan, unsigned UF) : Plan(Plan), UF(UF), TypeInfo(Plan) {}

  void unrollBlock(VPBlockBase *VPB);

  VPValue *getValueForPart(VPValue *V, unsigned Part) {
    if (Part == 0 || isa<VPIRValue, VPSymbolicValue>(V))
      return V;
    assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&
           "accessed value does not exist");
    return VPV2Parts[V][Part - 1];
  }

  /// Given a single original recipe \p OrigR (of part zero), and its copy \p
  /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its
  /// corresponding VPValue defined by \p CopyR.
  void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
                        unsigned Part) {
    for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {
      const auto &[V, _] = VPV2Parts.try_emplace(VPV);
      assert(V->second.size() == Part - 1 && "earlier parts not set");
      V->second.push_back(CopyR->getVPValue(Idx));
    }
  }

  /// Given a uniform recipe \p R, add it for all parts.
  void addUniformForAllParts(VPSingleDefRecipe *R) {
    const auto &[V, Inserted] = VPV2Parts.try_emplace(R);
    assert(Inserted && "uniform value already added");
    for (unsigned Part = 0; Part != UF; ++Part)
      V->second.push_back(R);
  }

  bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }

  /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part
  /// \p Part.
  void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
    auto *Op = R->getOperand(OpIdx);
    R->setOperand(OpIdx, getValueForPart(Op, Part));
  }

  /// Update \p R's operands with their corresponding VPValues for part
  /// \p Part.
  void remapOperands(VPRecipeBase *R, unsigned Part) {
    for (const auto &[OpIdx, Op] : enumerate(R->operands()))
      R->setOperand(OpIdx, getValueForPart(Op, Part));
  }
};
} // namespace

void UnrollState::addStartIndexForScalarSteps(VPScalarIVStepsRecipe *Steps,
                                              unsigned Part) {
  if (Part == 0)
    return;

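  // The start index for part P is VF * P scalars into the unrolled iteration
  // space, e.g. the part-2 copy of the steps starts at scalar index VF * 2.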
  VPBuilder Builder(Steps);
  Type *BaseIVTy = TypeInfo.inferScalarType(Steps->getOperand(0));
  Type *IntStepTy =
      IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
  VPValue *StartIndex = Steps->getVFValue();
  if (Part > 1) {
    StartIndex = Builder.createOverflowingOp(
        Instruction::Mul,
        {StartIndex,
         Plan.getConstantInt(TypeInfo.inferScalarType(StartIndex), Part)});
  }
  StartIndex = Builder.createScalarSExtOrTrunc(
      StartIndex, IntStepTy, TypeInfo.inferScalarType(StartIndex),
      Steps->getDebugLoc());

  if (BaseIVTy->isFloatingPointTy())
    StartIndex = Builder.createScalarCast(Instruction::SIToFP, StartIndex,
                                          BaseIVTy, Steps->getDebugLoc());

  Steps->addOperand(StartIndex);
}

void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
  VPBlockBase *InsertPt = VPR->getSingleSuccessor();
  for (unsigned Part = 1; Part != UF; ++Part) {
    auto *Copy = VPR->clone();
    VPBlockUtils::insertBlockBefore(Copy, InsertPt);

    auto PartI = vp_depth_first_shallow(Copy->getEntry());
    auto Part0 = vp_depth_first_shallow(VPR->getEntry());
    for (const auto &[PartIVPBB, Part0VPBB] :
         zip(VPBlockUtils::blocksOnly<VPBasicBlock>(PartI),
             VPBlockUtils::blocksOnly<VPBasicBlock>(Part0))) {
      for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
        remapOperands(&PartIR, Part);
        if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR))
          addStartIndexForScalarSteps(Steps, Part);

        addRecipeForPart(&Part0R, &PartIR, Part);
      }
    }
  }
}

void UnrollState::unrollWidenInductionByUF(
    VPWidenInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) {
  VPBasicBlock *PH = cast<VPBasicBlock>(
      IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
  Type *IVTy = TypeInfo.inferScalarType(IV);
  auto &ID = IV->getInductionDescriptor();
  VPIRFlags Flags;
  if (isa_and_present<FPMathOperator>(ID.getInductionBinOp()))
    Flags = ID.getInductionBinOp()->getFastMathFlags();

  VPValue *ScalarStep = IV->getStepValue();
  VPBuilder Builder(PH);
  Type *VectorStepTy =
      IVTy->isPointerTy() ? TypeInfo.inferScalarType(ScalarStep) : IVTy;
  VPInstruction *VectorStep = Builder.createNaryOp(
      VPInstruction::WideIVStep, {&Plan.getVF(), ScalarStep}, VectorStepTy,
      Flags, IV->getDebugLoc());

  ToSkip.insert(VectorStep);

  // Now create recipes to compute the induction steps for part 1 .. UF. Part 0
  // remains the header phi. Parts > 0 are computed by adding Step to the
  // previous part. The header phi recipe will get 2 new operands: the step
  // value for a single part and the last part, used to compute the backedge
  // value during VPWidenInductionRecipe::execute.
  // %Part.0 = VPWidenInductionRecipe %Start, %ScalarStep, %VectorStep, %Part.3
  // %Part.1 = %Part.0 + %VectorStep
  // %Part.2 = %Part.1 + %VectorStep
  // %Part.3 = %Part.2 + %VectorStep
  //
  // The newly added recipes are added to ToSkip to avoid interleaving them
  // again.
  VPValue *Prev = IV;
  Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);
  unsigned AddOpc;
  VPIRFlags AddFlags;
  if (IVTy->isPointerTy()) {
    AddOpc = VPInstruction::WidePtrAdd;
    AddFlags = GEPNoWrapFlags::none();
  } else if (IVTy->isFloatingPointTy()) {
    AddOpc = ID.getInductionOpcode();
    AddFlags = Flags; // FMF flags
  } else {
    AddOpc = Instruction::Add;
    AddFlags = VPIRFlags::getDefaultFlags(AddOpc);
    if (cast<VPWidenIntOrFpInductionRecipe>(IV)->isCanonical())
      AddFlags = VPIRFlags::WrapFlagsTy(/*NUW=*/true, /*NSW=*/false);
  }
  for (unsigned Part = 1; Part != UF; ++Part) {
    std::string Name =
        Part > 1 ? "step.add." + std::to_string(Part) : "step.add";

    VPInstruction *Add =
        Builder.createNaryOp(AddOpc,
                             {
                                 Prev,
                                 VectorStep,
                             },
                             AddFlags, IV->getDebugLoc(), Name);
    ToSkip.insert(Add);
    addRecipeForPart(IV, Add, Part);
    Prev = Add;
  }
  IV->addOperand(VectorStep);
  IV->addOperand(Prev);
}

void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                                      VPBasicBlock::iterator InsertPtForPhi) {
  // First-order recurrences pass a single vector or scalar through their header
  // phis, irrespective of interleaving.
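  // E.g. a first-order recurrence phi %for keeps its single recipe here; the
  // per-part values are produced by unrolling its FirstOrderRecurrenceSplice
  // users in unrollRecipeByUF.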
  if (isa<VPFirstOrderRecurrencePHIRecipe>(R))
    return;

  // Generate step vectors for each unrolled part.
  if (auto *IV = dyn_cast<VPWidenInductionRecipe>(R)) {
    unrollWidenInductionByUF(IV, InsertPtForPhi);
    return;
  }

  auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(R);
  if (RdxPhi && RdxPhi->isOrdered())
    return;

  auto InsertPt = std::next(R->getIterator());
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R->clone();
    Copy->insertBefore(*R->getParent(), InsertPt);
    addRecipeForPart(R, Copy, Part);
    if (RdxPhi) {
      // If the start value is a ReductionStartVector, use the identity value
      // (second operand) for unrolled parts. If the scaling factor is > 1,
      // create a new ReductionStartVector with the scale factor and both
      // operands set to the identity value.
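      // E.g. with UF = 2 and start ReductionStartVector(%start, %iden,
      // %scale), the part-1 phi starts from %iden when %scale is 1, or from a
      // clone with both value operands set to %iden otherwise.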
      if (auto *VPI = dyn_cast<VPInstruction>(RdxPhi->getStartValue())) {
        assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
               "unexpected start VPInstruction");
        if (Part != 1)
          continue;
        VPValue *StartV;
        if (match(VPI->getOperand(2), m_One())) {
          StartV = VPI->getOperand(1);
        } else {
          auto *C = VPI->clone();
          C->setOperand(0, C->getOperand(1));
          C->insertAfter(VPI);
          StartV = C;
        }
        for (unsigned Part = 1; Part != UF; ++Part)
          VPV2Parts[VPI][Part - 1] = StartV;
      }
    } else {
      assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
             "unexpected header phi recipe not needing unrolled part");
    }
  }
}

/// Handle non-header-phi recipes.
void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
  if (match(&R, m_CombineOr(m_BranchOnCond(), m_BranchOnCount())))
    return;

  if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
    if (vputils::onlyFirstPartUsed(VPI)) {
      addUniformForAllParts(VPI);
      return;
    }
  }
  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
    if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
        RepR->getOperand(1)->isDefinedOutsideLoopRegions()) {
      // Stores to an invariant address only need to store the last part.
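      // E.g. with UF = 4 all parts write the same address, so the single
      // remaining store uses the operands of part 3.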
      remapOperands(&R, UF - 1);
      return;
    }
    if (match(RepR,
              m_Intrinsic<Intrinsic::experimental_noalias_scope_decl>())) {
      addUniformForAllParts(RepR);
      return;
    }
  }

  // Unroll non-uniform recipes.
  auto InsertPt = std::next(R.getIterator());
  VPBasicBlock &VPBB = *R.getParent();
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R.clone();
    Copy->insertBefore(VPBB, InsertPt);
    addRecipeForPart(&R, Copy, Part);

    VPValue *Op;
    if (match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
                      m_VPValue(), m_VPValue(Op)))) {
      Copy->setOperand(0, getValueForPart(Op, Part - 1));
      Copy->setOperand(1, getValueForPart(Op, Part));
      continue;
    }
    if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(&R)) {
      VPBuilder Builder(VPR);
      const DataLayout &DL =
          Plan.getScalarHeader()->getIRBasicBlock()->getDataLayout();
      Type *IndexTy = DL.getIndexType(TypeInfo.inferScalarType(VPR));
      Type *VFTy = TypeInfo.inferScalarType(&Plan.getVF());
      VPValue *VF = Builder.createScalarZExtOrTrunc(
          &Plan.getVF(), IndexTy, VFTy, DebugLoc::getUnknown());
      // VFxUF does not wrap, so VF * Part also cannot wrap.
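      // E.g. the part-2 copy of the vector pointer is offset by VF * 2
      // elements from the part-0 base pointer.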
      VPValue *VFxPart = Builder.createOverflowingOp(
          Instruction::Mul, {VF, Plan.getConstantInt(IndexTy, Part)},
          {true, true});
      Copy->setOperand(0, VPR->getOperand(0));
      Copy->addOperand(VFxPart);
      continue;
    }
    if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
      auto *Phi = dyn_cast<VPReductionPHIRecipe>(R.getOperand(0));
      if (Phi && Phi->isOrdered()) {
        auto &Parts = VPV2Parts[Phi];
        if (Part == 1) {
          Parts.clear();
          Parts.push_back(Red);
        }
        Parts.push_back(Copy->getVPSingleValue());
        Phi->setOperand(1, Copy->getVPSingleValue());
      }
    }
    if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(Copy)) {
      // Materialize PartN offset for VectorEndPointer.
      VEPR->setOperand(0, R.getOperand(0));
      VEPR->setOperand(1, R.getOperand(1));
      VEPR->materializeOffset(Part);
      continue;
    }

    remapOperands(Copy, Part);

    if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Copy))
      addStartIndexForScalarSteps(ScalarIVSteps, Part);

    // Add operand indicating the part to generate code for, to recipes still
    // requiring it.
    if (isa<VPWidenCanonicalIVRecipe>(Copy))
      Copy->addOperand(getConstantInt(Part));

    if (match(Copy,
              m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>())) {
      VPBuilder Builder(Copy);
      VPValue *ScaledByPart = Builder.createOverflowingOp(
          Instruction::Mul, {Copy->getOperand(1), getConstantInt(Part)});
      Copy->setOperand(1, ScaledByPart);
    }
  }
  if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
    // Materialize Part0 offset for VectorEndPointer.
    VEPR->materializeOffset();
  }
}

void UnrollState::unrollBlock(VPBlockBase *VPB) {
  auto *VPR = dyn_cast<VPRegionBlock>(VPB);
  if (VPR) {
    if (VPR->isReplicator())
      return unrollReplicateRegionByUF(VPR);

    // Traverse blocks in region in RPO to ensure defs are visited before uses
    // across blocks.
    ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
        RPOT(VPR->getEntry());
    for (VPBlockBase *VPB : RPOT)
      unrollBlock(VPB);
    return;
  }

  // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes.
  auto *VPBB = cast<VPBasicBlock>(VPB);
  auto InsertPtForPhi = VPBB->getFirstNonPhi();
  for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
    if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
      continue;

    // Add all VPValues for all parts to AnyOf, FirstActiveLaneMask and
    // Compute*Result which combine all parts to compute the final value.
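    // E.g. with UF = 2, AnyOf(%c) becomes AnyOf(%c, %c.1), with %c.1 the
    // part-1 copy of %c.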
    VPValue *Op1;
    if (match(&R, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(Op1))) ||
        match(&R, m_FirstActiveLane(m_VPValue(Op1))) ||
        match(&R, m_LastActiveLane(m_VPValue(Op1))) ||
        match(&R,
              m_ComputeAnyOfResult(m_VPValue(), m_VPValue(), m_VPValue(Op1))) ||
        match(&R, m_ComputeReductionResult(m_VPValue(Op1)))) {
      addUniformForAllParts(cast<VPInstruction>(&R));
      for (unsigned Part = 1; Part != UF; ++Part)
        R.addOperand(getValueForPart(Op1, Part));
      continue;
    }
    VPValue *Op0;
    if (match(&R, m_ExtractLane(m_VPValue(Op0), m_VPValue(Op1)))) {
      addUniformForAllParts(cast<VPInstruction>(&R));
      for (unsigned Part = 1; Part != UF; ++Part)
        R.addOperand(getValueForPart(Op1, Part));
      continue;
    }

    if (Plan.hasScalarVFOnly()) {
      if (match(&R, m_ExtractLastPart(m_VPValue(Op0))) ||
          match(&R, m_ExtractPenultimateElement(m_VPValue(Op0)))) {
        auto *I = cast<VPInstruction>(&R);
        bool IsPenultimatePart =
            I->getOpcode() == VPInstruction::ExtractPenultimateElement;
        unsigned PartIdx = IsPenultimatePart ? UF - 2 : UF - 1;
        // For scalar VF, directly use the scalar part value.
        I->replaceAllUsesWith(getValueForPart(Op0, PartIdx));
        continue;
      }
    }
    // For vector VF, the penultimate element is always extracted from the
    // last part.
    if (match(&R, m_ExtractLastLaneOfLastPart(m_VPValue(Op0))) ||
        match(&R, m_ExtractPenultimateElement(m_VPValue(Op0)))) {
      addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
      R.setOperand(0, getValueForPart(Op0, UF - 1));
      continue;
    }

    auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
    if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
      addUniformForAllParts(SingleDef);
      continue;
    }

    if (auto *H = dyn_cast<VPHeaderPHIRecipe>(&R)) {
      unrollHeaderPHIByUF(H, InsertPtForPhi);
      continue;
    }

    unrollRecipeByUF(R);
  }
}

void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) {
  assert(UF > 0 && "Unroll factor must be positive");
  Plan.setUF(UF);
  llvm::scope_exit Cleanup([&Plan, UF]() {
    auto Iter = vp_depth_first_deep(Plan.getEntry());
    // Remove recipes that are redundant after unrolling.
    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
      for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
        auto *VPI = dyn_cast<VPInstruction>(&R);
        if (VPI &&
            VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
            VPI->getOperand(1) == &Plan.getVF()) {
          VPI->replaceAllUsesWith(VPI->getOperand(0));
          VPI->eraseFromParent();
        }
      }
    }

    Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
    Plan.getUF().replaceAllUsesWith(Plan.getConstantInt(TCTy, UF));
  });
  if (UF == 1) {
    return;
  }

  UnrollState Unroller(Plan, UF);

  // Iterate over all blocks in the plan starting from Entry, and unroll
  // recipes inside them. This includes the vector preheader and middle blocks,
  // which may set up or post-process per-part values.
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  for (VPBlockBase *VPB : RPOT)
    Unroller.unrollBlock(VPB);

  unsigned Part = 1;
  // Remap operands of cloned header phis to update backedge values. The header
  // phis cloned during unrolling are just after the header phi for part 0.
  // Reset Part to 1 when reaching the first (part 0) recipe of a block.
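  // E.g. for UF = 2, the part-0 phi %p is skipped (resetting Part to 1) and
  // the backedge operand of its copy %p.1 is remapped to the part-1 value.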
  for (VPRecipeBase &H :
       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    // The second operand of Fixed Order Recurrence phi's, feeding the spliced
    // value across the backedge, needs to remap to the last part of the spliced
    // value.
    if (isa<VPFirstOrderRecurrencePHIRecipe>(&H)) {
      Unroller.remapOperand(&H, 1, UF - 1);
      continue;
    }
    if (Unroller.contains(H.getVPSingleValue())) {
      Part = 1;
      continue;
    }
    Unroller.remapOperands(&H, Part);
    Part++;
  }

  VPlanTransforms::removeDeadRecipes(Plan);
}

/// Create a single-scalar clone of \p DefR (must be a VPReplicateRecipe or
/// VPInstruction) for lane \p Lane. Use \p Def2LaneDefs to look up scalar
/// definitions for operands of \p DefR.
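/// E.g. for lane 2, an operand %op becomes its lane-2 definition from \p
/// Def2LaneDefs if available, and extractelement(%op, 2) otherwise.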
static VPValue *
cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
             VPSingleDefRecipe *DefR, VPLane Lane,
             const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) {
  VPValue *Op;
  if (match(DefR, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op)))) {
    auto LaneDefs = Def2LaneDefs.find(Op);
    if (LaneDefs != Def2LaneDefs.end())
      return LaneDefs->second[Lane.getKnownLane()];

    VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane());
    return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
  }

  // Collect the operands at Lane, creating extracts as needed.
  SmallVector<VPValue *> NewOps;
  for (VPValue *Op : DefR->operands()) {
    // If Op is a definition that has been unrolled, directly use the clone for
    // the corresponding lane.
    auto LaneDefs = Def2LaneDefs.find(Op);
    if (LaneDefs != Def2LaneDefs.end()) {
      NewOps.push_back(LaneDefs->second[Lane.getKnownLane()]);
      continue;
    }
    if (Lane.getKind() == VPLane::Kind::ScalableLast) {
      // Look through mandatory Unpack.
      [[maybe_unused]] bool Matched =
          match(Op, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op)));
      assert(Matched && "original op must have been Unpack");
      auto *ExtractPart =
          Builder.createNaryOp(VPInstruction::ExtractLastPart, {Op});
      NewOps.push_back(
          Builder.createNaryOp(VPInstruction::ExtractLastLane, {ExtractPart}));
      continue;
    }
    if (vputils::isSingleScalar(Op)) {
      NewOps.push_back(Op);
      continue;
    }

    // Look through buildvector to avoid unnecessary extracts.
    if (match(Op, m_BuildVector())) {
      NewOps.push_back(
          cast<VPInstruction>(Op)->getOperand(Lane.getKnownLane()));
      continue;
    }
    VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane());
    VPValue *Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
    NewOps.push_back(Ext);
  }

  VPSingleDefRecipe *New;
  if (auto *RepR = dyn_cast<VPReplicateRecipe>(DefR)) {
    // TODO: have cloning of replicate recipes also provide the desired result
    // coupled with setting its operands to NewOps (deriving IsSingleScalar and
    // Mask from the operands?)
    New = new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
                                /*IsSingleScalar=*/true, /*Mask=*/nullptr,
                                *RepR, *RepR, RepR->getDebugLoc());
  } else {
    assert(isa<VPInstruction>(DefR) &&
           "DefR must be a VPReplicateRecipe or VPInstruction");
    New = DefR->clone();
    for (const auto &[Idx, Op] : enumerate(NewOps)) {
      New->setOperand(Idx, Op);
    }
  }
  New->insertBefore(DefR);
  return New;
}

void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
  if (Plan.hasScalarVFOnly())
    return;

  Type *IdxTy = IntegerType::get(
      Plan.getScalarHeader()->getIRBasicBlock()->getContext(), 32);

  // Visit all VPBBs outside the loop region and directly inside the top-level
  // loop region.
  auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
      vp_depth_first_shallow(Plan.getEntry()));
  auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
      vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()));
  auto VPBBsToUnroll =
      concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion);
  // A mapping of current VPValue definitions to collections of new VPValues
  // defined per lane. Serves to hook-up potential users of current VPValue
  // definition that are replicated-per-VF later.
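  // E.g. with VF = 4, a replicated definition %d maps to its four
  // single-scalar clones: Def2LaneDefs[%d] = {%d.0, %d.1, %d.2, %d.3}.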
  DenseMap<VPValue *, SmallVector<VPValue *>> Def2LaneDefs;
  // The removal of current recipes being replaced by new ones needs to be
  // delayed after Def2LaneDefs is no longer in use.
  SmallVector<VPSingleDefRecipe *> ToRemove;
  for (VPBasicBlock *VPBB : VPBBsToUnroll) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      if (!isa<VPReplicateRecipe, VPInstruction>(&R) ||
          (isa<VPReplicateRecipe>(&R) &&
           cast<VPReplicateRecipe>(&R)->isSingleScalar()) ||
          (isa<VPInstruction>(&R) &&
           !cast<VPInstruction>(&R)->doesGeneratePerAllLanes() &&
           cast<VPInstruction>(&R)->getOpcode() != VPInstruction::Unpack))
        continue;

      auto *DefR = cast<VPSingleDefRecipe>(&R);
      VPBuilder Builder(DefR);
      if (DefR->getNumUsers() == 0) {
        // Create single-scalar version of DefR for all lanes.
        for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
          cloneForLane(Plan, Builder, IdxTy, DefR, VPLane(I), Def2LaneDefs);
        DefR->eraseFromParent();
        continue;
      }
      // Create single-scalar version of DefR for all lanes.
      SmallVector<VPValue *> LaneDefs;
      for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
        LaneDefs.push_back(
            cloneForLane(Plan, Builder, IdxTy, DefR, VPLane(I), Def2LaneDefs));

      Def2LaneDefs[DefR] = LaneDefs;
      // Users that only demand the first lane can use the definition for lane
      // 0.
      DefR->replaceUsesWithIf(LaneDefs[0], [DefR](VPUser &U, unsigned) {
        return U.usesFirstLaneOnly(DefR);
      });

      // Update each build vector user that currently has DefR as its only
      // operand, to have all LaneDefs as its operands.
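      // E.g. with VF = 2, BuildVector(%d) becomes BuildVector(%d.0, %d.1).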
      for (VPUser *U : to_vector(DefR->users())) {
        auto *VPI = dyn_cast<VPInstruction>(U);
        if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector &&
                     VPI->getOpcode() != VPInstruction::BuildStructVector))
          continue;
        assert(VPI->getNumOperands() == 1 &&
               "Build(Struct)Vector must have a single operand before "
               "replicating by VF");
        VPI->setOperand(0, LaneDefs[0]);
        for (VPValue *LaneDef : drop_begin(LaneDefs))
          VPI->addOperand(LaneDef);
      }
      ToRemove.push_back(DefR);
    }
  }
  for (auto *R : reverse(ToRemove))
    R->eraseFromParent();
}