LLVM 23.0.0git
VPlanUnroll.cpp
Go to the documentation of this file.
1//===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements explicit unrolling for VPlans.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPRecipeBuilder.h"
15#include "VPlan.h"
16#include "VPlanAnalysis.h"
17#include "VPlanCFG.h"
18#include "VPlanHelpers.h"
19#include "VPlanPatternMatch.h"
20#include "VPlanTransforms.h"
21#include "VPlanUtils.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/IR/Intrinsics.h"
27
28using namespace llvm;
29using namespace llvm::VPlanPatternMatch;
30
31namespace {
32
33/// Helper to hold state needed for unrolling. It holds the Plan to unroll by
34/// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate
35/// the unrolling transformation, where the original VPValues are retained for
36/// part zero.
37class UnrollState {
38 /// Plan to unroll.
39 VPlan &Plan;
40 /// Unroll factor to unroll by.
41 const unsigned UF;
42 /// Analysis for types.
43 VPTypeAnalysis TypeInfo;
44
45 /// Unrolling may create recipes that should not be unrolled themselves.
46 /// Those are tracked in ToSkip.
47 SmallPtrSet<VPRecipeBase *, 8> ToSkip;
48
49 // Associate with each VPValue of part 0 its unrolled instances of parts 1,
50 // ..., UF-1.
51 DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts;
52
53 /// Unroll replicate region \p VPR by cloning the region UF - 1 times.
54 void unrollReplicateRegionByUF(VPRegionBlock *VPR);
55
56 /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform across
57 /// all parts.
58 void unrollRecipeByUF(VPRecipeBase &R);
59
60 /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled
61 /// depends on the concrete header phi. Inserts newly created recipes at \p
62 /// InsertPtForPhi.
63 void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
64 VPBasicBlock::iterator InsertPtForPhi);
65
66 /// Unroll a widen induction recipe \p IV. This introduces recipes to compute
67 /// the induction steps for each part.
68 void unrollWidenInductionByUF(VPWidenInductionRecipe *IV,
69 VPBasicBlock::iterator InsertPtForPhi);
70
71 VPValue *getConstantInt(unsigned Part) {
72 Type *CanIVIntTy = Plan.getVectorLoopRegion()->getCanonicalIVType();
73 return Plan.getConstantInt(CanIVIntTy, Part);
74 }
75
76public:
77 UnrollState(VPlan &Plan, unsigned UF) : Plan(Plan), UF(UF), TypeInfo(Plan) {}
78
79 void unrollBlock(VPBlockBase *VPB);
80
81 VPValue *getValueForPart(VPValue *V, unsigned Part) {
82 if (Part == 0 || isa<VPIRValue, VPSymbolicValue>(V))
83 return V;
84 assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&
85 "accessed value does not exist");
86 return VPV2Parts[V][Part - 1];
87 }
88
89 /// Given a single original recipe \p OrigR (of part zero), and its copy \p
90 /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its
91 /// corresponding VPValue defined by \p CopyR.
92 void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
93 unsigned Part) {
94 for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {
95 const auto &[V, _] = VPV2Parts.try_emplace(VPV);
96 assert(V->second.size() == Part - 1 && "earlier parts not set");
97 V->second.push_back(CopyR->getVPValue(Idx));
98 }
99 }
100
101 /// Given a uniform recipe \p R, add it for all parts.
102 void addUniformForAllParts(VPSingleDefRecipe *R) {
103 const auto &[V, Inserted] = VPV2Parts.try_emplace(R);
104 assert(Inserted && "uniform value already added");
105 for (unsigned Part = 0; Part != UF; ++Part)
106 V->second.push_back(R);
107 }
108
109 bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }
110
111 /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part
112 /// \p P.
113 void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
114 auto *Op = R->getOperand(OpIdx);
115 R->setOperand(OpIdx, getValueForPart(Op, Part));
116 }
117
118 /// Update \p R's operands with their corresponding VPValues for part \p P.
119 void remapOperands(VPRecipeBase *R, unsigned Part) {
120 for (const auto &[OpIdx, Op] : enumerate(R->operands()))
121 R->setOperand(OpIdx, getValueForPart(Op, Part));
122 }
123};
124} // namespace
125
// NOTE(review): extraction artifact — the opening signature line (file line
// 126, presumably `static void addStartIndexForScalarSteps(
// VPScalarIVStepsRecipe *Steps,`) is missing from this dump; confirm against
// upstream VPlanUnroll.cpp.
// Sets the start index of the scalar-IV-steps recipe for unroll part \p Part.
// Part 0 keeps its implicit zero start index and returns early; part N > 0
// starts at VF * N, converted to the step's integer type (and via SIToFP for
// floating-point inductions).
127 unsigned Part, VPlan &Plan,
128 VPTypeAnalysis &TypeInfo) {
129 if (Part == 0)
130 return;
131
132 VPBuilder Builder(Steps);
133 Type *BaseIVTy = TypeInfo.inferScalarType(Steps->getOperand(0));
134 Type *IntStepTy =
135 IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
// Part 1 starts directly at VF; parts > 1 scale VF by the part number.
136 VPValue *StartIndex = Steps->getVFValue();
137 if (Part > 1) {
138 StartIndex = Builder.createOverflowingOp(
139 Instruction::Mul,
140 {StartIndex,
141 Plan.getConstantInt(TypeInfo.inferScalarType(StartIndex), Part)});
142 }
// Match the start index's width to the IV's integer width.
143 StartIndex = Builder.createScalarSExtOrTrunc(
144 StartIndex, IntStepTy, TypeInfo.inferScalarType(StartIndex),
145 Steps->getDebugLoc());
146
// For FP inductions, convert the integer start index to floating point.
147 if (BaseIVTy->isFloatingPointTy())
148 StartIndex = Builder.createScalarCast(Instruction::SIToFP, StartIndex,
149 BaseIVTy, Steps->getDebugLoc());
150
151 Steps->setStartIndex(StartIndex);
152}
153
// Clone replicate region \p VPR UF - 1 times, inserting each clone before the
// region's single successor, remapping the clones' operands to the values of
// the corresponding part, and recording each cloned recipe's defined values
// as the per-part instances of the part-0 recipe.
154void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
155 VPBlockBase *InsertPt = VPR->getSingleSuccessor();
156 for (unsigned Part = 1; Part != UF; ++Part) {
157 auto *Copy = VPR->clone();
158 VPBlockUtils::insertBlockBefore(Copy, InsertPt);
159
// Walk the clone and the original region in lockstep so recipes pair up.
160 auto PartI = vp_depth_first_shallow(Copy->getEntry());
161 auto Part0 = vp_depth_first_shallow(VPR->getEntry());
162 for (const auto &[PartIVPBB, Part0VPBB] :
// NOTE(review): file lines 163-164 missing from this dump (presumably the
// zip(PartI, Part0) range expression) — confirm against upstream.
165 for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
166 remapOperands(&PartIR, Part);
// Scalar IV steps additionally need a per-part start index.
167 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR))
168 addStartIndexForScalarSteps(Steps, Part, Plan, TypeInfo);
169
170 addRecipeForPart(&Part0R, &PartIR, Part);
171 }
172 }
173 }
174}
175
// Unroll a widened induction: part 0 stays the header phi; each later part is
// the previous part plus a wide step (VF * ScalarStep) computed once in the
// loop preheader. See the in-body comment for the resulting recipe shape.
176void UnrollState::unrollWidenInductionByUF(
177 VPWidenInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) {
178 VPBasicBlock *PH = cast<VPBasicBlock>(
179 IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
180 Type *IVTy = TypeInfo.inferScalarType(IV);
181 auto &ID = IV->getInductionDescriptor();
// Carry the FP induction's fast-math flags, if any, onto the step recipes.
182 VPIRFlags Flags;
183 if (isa_and_present<FPMathOperator>(ID.getInductionBinOp()))
184 Flags = ID.getInductionBinOp()->getFastMathFlags();
185
186 VPValue *ScalarStep = IV->getStepValue();
187 VPBuilder Builder(PH);
// For pointer IVs the wide step is integer-typed (the step's own type).
188 Type *VectorStepTy =
189 IVTy->isPointerTy() ? TypeInfo.inferScalarType(ScalarStep) : IVTy;
190 VPInstruction *VectorStep = Builder.createNaryOp(
191 VPInstruction::WideIVStep, {&Plan.getVF(), ScalarStep}, VectorStepTy,
192 Flags, IV->getDebugLoc());
193
194 ToSkip.insert(VectorStep);
195
196 // Now create recipes to compute the induction steps for part 1 .. UF. Part 0
197 // remains the header phi. Parts > 0 are computed by adding Step to the
198 // previous part. The header phi recipe will get 2 new operands: the step
199 // value for a single part and the last part, used to compute the backedge
200 // value during VPWidenInductionRecipe::execute.
201 // %Part.0 = VPWidenInductionRecipe %Start, %ScalarStep, %VectorStep, %Part.3
202 // %Part.1 = %Part.0 + %VectorStep
203 // %Part.2 = %Part.1 + %VectorStep
204 // %Part.3 = %Part.2 + %VectorStep
205 //
206 // The newly added recipes are added to ToSkip to avoid interleaving them
207 // again.
208 VPValue *Prev = IV;
209 Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);
// Select the add opcode and flags matching the IV's type.
210 unsigned AddOpc;
211 VPIRFlags AddFlags;
212 if (IVTy->isPointerTy()) {
// NOTE(review): file line 213 missing from this dump (presumably the AddOpc
// assignment for the pointer case) — confirm against upstream.
214 AddFlags = GEPNoWrapFlags::none();
215 } else if (IVTy->isFloatingPointTy()) {
216 AddOpc = ID.getInductionOpcode();
217 AddFlags = Flags; // FMF flags
218 } else {
219 AddOpc = Instruction::Add;
220 AddFlags = VPIRFlags::getDefaultFlags(AddOpc);
// NOTE(review): file line 221 missing from this dump (presumably a condition
// guarding the NUW override below) — confirm against upstream.
222 AddFlags = VPIRFlags::WrapFlagsTy(/*NUW=*/true, /*NSW=*/false);
223 }
224 for (unsigned Part = 1; Part != UF; ++Part) {
225 std::string Name =
226 Part > 1 ? "step.add." + std::to_string(Part) : "step.add";
227
228 VPInstruction *Add =
229 Builder.createNaryOp(AddOpc,
230 {
231 Prev,
232 VectorStep,
233 },
234 AddFlags, IV->getDebugLoc(), Name);
235 ToSkip.insert(Add);
236 addRecipeForPart(IV, Add, Part);
237 Prev = Add;
238 }
// Give the phi the single-part step and the last part for the backedge value.
239 IV->addOperand(VectorStep);
240 IV->addOperand(Prev);
241}
242
// Unroll a header phi: inductions get dedicated step recipes; first-order
// recurrences and ordered reductions are left alone; all other header phis
// are cloned per part, with reduction start values adjusted to identities.
243void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
244 VPBasicBlock::iterator InsertPtForPhi) {
245 // First-order recurrences pass a single vector or scalar through their header
246 // phis, irrespective of interleaving.
// NOTE(review): file line 247 missing from this dump (presumably the
// first-order-recurrence isa<> check guarding this return) — confirm upstream.
248 return;
249
250 // Generate step vectors for each unrolled part.
251 if (auto *IV = dyn_cast<VPWidenInductionRecipe>(R)) {
252 unrollWidenInductionByUF(IV, InsertPtForPhi);
253 return;
254 }
255
// Ordered reductions keep a single phi; their chain is handled when the
// reduction recipes themselves are unrolled.
256 auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(R);
257 if (RdxPhi && RdxPhi->isOrdered())
258 return;
259
260 auto InsertPt = std::next(R->getIterator());
261 for (unsigned Part = 1; Part != UF; ++Part) {
262 VPRecipeBase *Copy = R->clone();
263 Copy->insertBefore(*R->getParent(), InsertPt);
264 addRecipeForPart(R, Copy, Part);
265 if (RdxPhi) {
266 // If the start value is a ReductionStartVector, use the identity value
267 // (second operand) for unrolled parts. If the scaling factor is > 1,
268 // create a new ReductionStartVector with the scale factor and both
269 // operands set to the identity value.
270 if (auto *VPI = dyn_cast<VPInstruction>(RdxPhi->getStartValue())) {
271 assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
272 "unexpected start VPInstruction");
// The start-value mapping is computed once, on the first cloned part; the
// inner loop below fills the entry for every part at that point.
273 if (Part != 1)
274 continue;
275 VPValue *StartV;
276 if (match(VPI->getOperand(2), m_One())) {
277 StartV = VPI->getOperand(1);
278 } else {
279 auto *C = VPI->clone();
280 C->setOperand(0, C->getOperand(1));
281 C->insertAfter(VPI);
282 StartV = C;
283 }
284 for (unsigned Part = 1; Part != UF; ++Part)
285 VPV2Parts[VPI][Part - 1] = StartV;
286 }
287 } else {
// NOTE(review): file line 288 missing from this dump (presumably the start of
// an assert listing the header phi kinds expected here) — confirm upstream.
289 "unexpected header phi recipe not needing unrolled part");
290 }
291 }
292}
293
294/// Handle non-header-phi recipes.
/// Handle non-header-phi recipes.
// Clones \p R UF - 1 times unless it is uniform across parts, with special
// handling for invariant-address stores, splices, vector pointers, ordered
// reductions and vector-end pointers.
295void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
// NOTE(review): file line 296 missing from this dump (the condition guarding
// this early return) — confirm against upstream.
297 return;
298
299 if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
// NOTE(review): file line 300 missing from this dump (the condition selecting
// which VPInstructions are treated as uniform) — confirm upstream.
301 addUniformForAllParts(VPI);
302 return;
303 }
304 }
305 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
306 if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
307 RepR->getOperand(1)->isDefinedOutsideLoopRegions()) {
308 // Stores to an invariant address only need to store the last part.
309 remapOperands(&R, UF - 1);
310 return;
311 }
312 if (match(RepR,
// NOTE(review): file line 313 missing from this dump (the match pattern for
// uniform replicate recipes) — confirm upstream.
314 addUniformForAllParts(RepR);
315 return;
316 }
317 }
318
319 // Unroll non-uniform recipes.
320 auto InsertPt = std::next(R.getIterator());
321 VPBasicBlock &VPBB = *R.getParent();
322 for (unsigned Part = 1; Part != UF; ++Part) {
323 VPRecipeBase *Copy = R.clone();
324 Copy->insertBefore(VPBB, InsertPt);
325 addRecipeForPart(&R, Copy, Part);
326
327 // Phi operands are updated once all other recipes have been unrolled.
328 if (isa<VPWidenPHIRecipe>(Copy))
329 continue;
330
331 VPValue *Op;
// NOTE(review): file line 332 missing from this dump (presumably an if(match)
// for a splice-like recipe taking the previous and current part) — confirm
// upstream.
333 m_VPValue(), m_VPValue(Op)))) {
334 Copy->setOperand(0, getValueForPart(Op, Part - 1));
335 Copy->setOperand(1, getValueForPart(Op, Part));
336 continue;
337 }
338 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(&R)) {
// Each later part offsets the pointer by VF * Part (in index-type units).
339 VPBuilder Builder(VPR);
340 const DataLayout &DL = Plan.getDataLayout();
341 Type *IndexTy = DL.getIndexType(TypeInfo.inferScalarType(VPR));
342 Type *VFTy = TypeInfo.inferScalarType(&Plan.getVF());
343 VPValue *VF = Builder.createScalarZExtOrTrunc(
344 &Plan.getVF(), IndexTy, VFTy, DebugLoc::getUnknown());
345 // VFxUF does not wrap, so VF * Part also cannot wrap.
346 VPValue *VFxPart = Builder.createOverflowingOp(
347 Instruction::Mul, {VF, Plan.getConstantInt(IndexTy, Part)},
348 {true, true});
349 Copy->setOperand(0, VPR->getOperand(0));
350 Copy->addOperand(VFxPart);
351 continue;
352 }
353 if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
// Ordered reductions chain their parts: record the chain in VPV2Parts and
// feed the latest copy into the phi's backedge operand.
354 auto *Phi = dyn_cast<VPReductionPHIRecipe>(R.getOperand(0));
355 if (Phi && Phi->isOrdered()) {
356 auto &Parts = VPV2Parts[Phi];
357 if (Part == 1) {
358 Parts.clear();
359 Parts.push_back(Red);
360 }
361 Parts.push_back(Copy->getVPSingleValue());
362 Phi->setOperand(1, Copy->getVPSingleValue());
363 }
364 }
365 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(Copy)) {
366 // Materialize PartN offset for VectorEndPointer.
367 VEPR->setOperand(0, R.getOperand(0));
368 VEPR->setOperand(1, R.getOperand(1));
369 VEPR->materializeOffset(Part);
370 continue;
371 }
372
373 remapOperands(Copy, Part);
374
375 if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Copy))
376 addStartIndexForScalarSteps(ScalarIVSteps, Part, Plan, TypeInfo);
377
378 // Add operand indicating the part to generate code for, to recipes still
379 // requiring it.
// NOTE(review): file line 380 missing from this dump (the condition selecting
// which recipes still need a part operand) — confirm upstream.
381 Copy->addOperand(getConstantInt(Part));
382
383 if (match(Copy,
// NOTE(review): file line 384 missing from this dump (the match pattern whose
// second operand gets scaled by the part below) — confirm upstream.
385 VPBuilder Builder(Copy);
386 VPValue *ScaledByPart = Builder.createOverflowingOp(
387 Instruction::Mul, {Copy->getOperand(1), getConstantInt(Part)});
388 Copy->setOperand(1, ScaledByPart);
389 }
390 }
391 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
392 // Materialize Part0 offset for VectorEndPointer.
393 VEPR->materializeOffset();
394 }
395}
396
// Unroll one block: recurse into (non-replicator) regions in RPO, and for a
// basic block dispatch each recipe to the appropriate unrolling strategy —
// part-combining VPInstructions get extra operands, uniform recipes are
// shared across parts, header phis and ordinary recipes get cloned.
397void UnrollState::unrollBlock(VPBlockBase *VPB) {
398 auto *VPR = dyn_cast<VPRegionBlock>(VPB);
399 if (VPR) {
400 if (VPR->isReplicator())
401 return unrollReplicateRegionByUF(VPR);
402
403 // Traverse blocks in region in RPO to ensure defs are visited before uses
404 // across blocks.
405 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
406 RPOT(VPR->getEntry());
407 for (VPBlockBase *VPB : RPOT)
408 unrollBlock(VPB);
409 return;
410 }
411
412 // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes.
413 auto *VPBB = cast<VPBasicBlock>(VPB);
414 auto InsertPtForPhi = VPBB->getFirstNonPhi();
415 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
416 if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
417 continue;
418
419 // Add all VPValues for all parts to AnyOf, FirstActiveLaneMask and
420 // Compute*Result which combine all parts to compute the final value.
421 VPValue *Op1;
// NOTE(review): file line 422 missing from this dump (the start of this
// if(match(...) || chain, presumably matching AnyOf) — confirm upstream.
423 match(&R, m_FirstActiveLane(m_VPValue(Op1))) ||
424 match(&R, m_LastActiveLane(m_VPValue(Op1))) ||
425 match(&R,
// NOTE(review): file lines 426-427 missing from this dump (the remaining
// match pattern(s), presumably Compute*Result) — confirm upstream.
428 addUniformForAllParts(cast<VPInstruction>(&R));
429 for (unsigned Part = 1; Part != UF; ++Part)
430 R.addOperand(getValueForPart(Op1, Part));
431 continue;
432 }
433 VPValue *Op0;
434 if (match(&R, m_ExtractLane(m_VPValue(Op0), m_VPValue(Op1)))) {
435 addUniformForAllParts(cast<VPInstruction>(&R));
436 for (unsigned Part = 1; Part != UF; ++Part)
437 R.addOperand(getValueForPart(Op1, Part));
438 continue;
439 }
440
441 VPValue *Op2;
// NOTE(review): file line 442 missing from this dump (the if(match with a
// two-value pattern binding Op1 and Op2) — confirm upstream.
443 m_VPValue(Op2)))) {
444 addUniformForAllParts(cast<VPInstruction>(&R));
445 for (unsigned Part = 1; Part != UF; ++Part) {
446 R.addOperand(getValueForPart(Op1, Part));
447 R.addOperand(getValueForPart(Op2, Part));
448 }
449 continue;
450 }
451
452 if (Plan.hasScalarVFOnly()) {
453 if (match(&R, m_ExtractLastPart(m_VPValue(Op0))) ||
// NOTE(review): file line 454 missing from this dump (the second match,
// presumably extracting the penultimate part/element) — confirm upstream.
455 auto *I = cast<VPInstruction>(&R);
456 bool IsPenultimatePart =
// NOTE(review): file line 457 missing from this dump (the initializer of
// IsPenultimatePart) — confirm upstream.
458 unsigned PartIdx = IsPenultimatePart ? UF - 2 : UF - 1;
459 // For scalar VF, directly use the scalar part value.
460 I->replaceAllUsesWith(getValueForPart(Op0, PartIdx));
461 continue;
462 }
463 }
464 // For vector VF, the penultimate element is always extracted from the last part.
// NOTE(review): file lines 465-466 missing from this dump (the condition
// guarding this rewrite) — confirm upstream.
467 addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
468 R.setOperand(0, getValueForPart(Op0, UF - 1));
469 continue;
470 }
471
472 auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
473 if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
474 addUniformForAllParts(SingleDef);
475 continue;
476 }
477
478 if (auto *H = dyn_cast<VPHeaderPHIRecipe>(&R)) {
479 unrollHeaderPHIByUF(H, InsertPtForPhi);
480 continue;
481 }
482
483 unrollRecipeByUF(R);
484 }
485}
486
// Entry point: unroll \p Plan by \p UF. A scope-exit cleanup removes
// now-redundant CanonicalIVIncrementForPart recipes and replaces the symbolic
// UF with a constant; the cleanup runs even on the early UF == 1 return.
487void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) {
488 assert(UF > 0 && "Unroll factor must be positive");
489 Plan.setUF(UF);
490 llvm::scope_exit Cleanup([&Plan, UF]() {
491 auto Iter = vp_depth_first_deep(Plan.getEntry());
492 // Remove recipes that are redundant after unrolling.
// NOTE(review): file line 493 missing from this dump (presumably the for loop
// over the basic blocks in Iter, binding VPBB) — confirm upstream.
494 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
495 auto *VPI = dyn_cast<VPInstruction>(&R);
496 if (VPI &&
497 VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
498 VPI->getOperand(1) == &Plan.getVF()) {
// Incrementing part 0 by VF*0 is a no-op; forward the original IV value.
499 VPI->replaceAllUsesWith(VPI->getOperand(0));
500 VPI->eraseFromParent();
501 }
502 }
503 }
504
// Replace the symbolic UF with its now-known constant value.
505 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
506 Plan.getUF().replaceAllUsesWith(Plan.getConstantInt(TCTy, UF));
507 });
508 if (UF == 1) {
509 return;
510 }
511
512 UnrollState Unroller(Plan, UF);
513
514 // Iterate over all blocks in the plan starting from Entry, and unroll
515 // recipes inside them. This includes the vector preheader and middle blocks,
516 // which may set up or post-process per-part values.
// NOTE(review): file line 517 missing from this dump (presumably the RPOT
// traversal declaration over the plan's blocks) — confirm upstream.
518 Plan.getEntry());
519 for (VPBlockBase *VPB : RPOT)
520 Unroller.unrollBlock(VPB);
521
522 unsigned Part = 1;
523 // Remap operands of cloned header phis to update backedge values. The header
524 // phis cloned during unrolling are just after the header phi for part 0.
525 // Reset Part to 1 when reaching the first (part 0) recipe of a block.
526 for (VPRecipeBase &H :
// NOTE(review): file line 527 missing from this dump (the phi range being
// iterated) — confirm upstream.
528 // The second operand of Fixed Order Recurrence phi's, feeding the spliced
529 // value across the backedge, needs to remap to the last part of the spliced
530 // value.
// NOTE(review): file line 531 missing from this dump (the condition matching
// fixed-order recurrence phis) — confirm upstream.
532 Unroller.remapOperand(&H, 1, UF - 1);
533 continue;
534 }
// A phi recorded in VPV2Parts is a part-0 phi: restart the part counter.
535 if (Unroller.contains(H.getVPSingleValue())) {
536 Part = 1;
537 continue;
538 }
539 Unroller.remapOperands(&H, Part);
540 Part++;
541 }
542
// NOTE(review): file line 543 missing from this dump (possible trailing
// statement before the closing brace) — confirm upstream.
544}
545
546/// Create a single-scalar clone of \p DefR (must be a VPReplicateRecipe,
547/// VPInstruction or VPScalarIVStepsRecipe) for lane \p Lane. Use \p
548/// Def2LaneDefs to look up scalar definitions for operands of \DefR.
549static VPValue *
550cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
551 VPSingleDefRecipe *DefR, VPLane Lane,
552 const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) {
// NOTE(review): file line 553 missing from this dump (presumably the start of
// the assert whose message follows) — confirm upstream.
554 "DefR must be a VPReplicateRecipe, VPInstruction or "
555 "VPScalarIVStepsRecipe");
556 VPValue *Op;
// NOTE(review): file line 557 missing from this dump (presumably an if(match)
// binding Op for this early-return extract path) — confirm upstream.
558 auto LaneDefs = Def2LaneDefs.find(Op);
559 if (LaneDefs != Def2LaneDefs.end())
560 return LaneDefs->second[Lane.getKnownLane()];
561
562 VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane());
563 return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
564 }
565
566 // Collect the operands at Lane, creating extracts as needed.
// NOTE(review): file line 567 missing from this dump (presumably the
// declaration of NewOps used below) — confirm upstream.
568 for (VPValue *Op : DefR->operands()) {
569 // If Op is a definition that has been unrolled, directly use the clone for
570 // the corresponding lane.
571 auto LaneDefs = Def2LaneDefs.find(Op);
572 if (LaneDefs != Def2LaneDefs.end()) {
573 NewOps.push_back(LaneDefs->second[Lane.getKnownLane()]);
574 continue;
575 }
576 if (Lane.getKind() == VPLane::Kind::ScalableLast) {
577 // Look through mandatory Unpack.
578 [[maybe_unused]] bool Matched =
// NOTE(review): file line 579 missing from this dump (presumably the
// match(Op, ...) initializer for Matched) — confirm upstream.
580 assert(Matched && "original op must have been Unpack");
// For the last lane of a scalable vector the index is not known statically,
// so extract the last lane of the last part instead.
581 auto *ExtractPart =
582 Builder.createNaryOp(VPInstruction::ExtractLastPart, {Op});
583 NewOps.push_back(
584 Builder.createNaryOp(VPInstruction::ExtractLastLane, {ExtractPart}));
585 continue;
586 }
// NOTE(review): file line 587 missing from this dump (presumably a condition
// under which Op can be used directly without an extract) — confirm upstream.
588 NewOps.push_back(Op);
589 continue;
590 }
591
592 // Look through buildvector to avoid unnecessary extracts.
593 if (match(Op, m_BuildVector())) {
594 NewOps.push_back(
595 cast<VPInstruction>(Op)->getOperand(Lane.getKnownLane()));
596 continue;
597 }
598 VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane());
599 VPValue *Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
600 NewOps.push_back(Ext);
601 }
602
// NOTE(review): file line 603 missing from this dump (presumably the
// declaration of New assigned in both branches below) — confirm upstream.
604 if (auto *RepR = dyn_cast<VPReplicateRecipe>(DefR)) {
605 // TODO: have cloning of replicate recipes also provide the desired result
606 // coupled with setting its operands to NewOps (deriving IsSingleScalar and
607 // Mask from the operands?)
608 New = new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
609 /*IsSingleScalar=*/true, /*Mask=*/nullptr,
610 *RepR, *RepR, RepR->getDebugLoc());
611 } else {
612 New = DefR->clone();
613 for (const auto &[Idx, Op] : enumerate(NewOps)) {
614 New->setOperand(Idx, Op);
615 }
616 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(New)) {
617 // Skip lane 0: an absent start index is implicitly zero.
618 unsigned KnownLane = Lane.getKnownLane();
619 if (KnownLane != 0) {
620 VPTypeAnalysis TypeInfo(Plan);
621 Type *BaseIVTy = TypeInfo.inferScalarType(DefR->getOperand(0));
622
623 VPValue *StartIndex = Steps->getStartIndex();
624 VPValue *LaneOffset;
625 unsigned AddOp;
626 VPIRFlags Flags;
627 if (BaseIVTy->isFloatingPointTy()) {
// For FSub FP inductions without a start index, lanes step downwards, so the
// lane offset is negated.
628 int SignedLane = static_cast<int>(KnownLane);
629 if (!StartIndex && Steps->getInductionOpcode() == Instruction::FSub)
630 SignedLane = -SignedLane;
631 LaneOffset =
632 Plan.getOrAddLiveIn(ConstantFP::get(BaseIVTy, SignedLane));
633 AddOp = Steps->getInductionOpcode();
634 Flags = VPIRFlags(FastMathFlags());
635 } else {
636 unsigned BaseIVBits = BaseIVTy->getScalarSizeInBits();
637 LaneOffset = Plan.getConstantInt(APInt(BaseIVBits, KnownLane,
638 /*isSigned*/ false,
639 /*implicitTrunc*/ true));
640 AddOp = Instruction::Add;
641 Flags = VPIRFlags(VPIRFlags::WrapFlagsTy(false, false));
642 }
643
// Combine an existing start index with the per-lane offset.
644 if (StartIndex) {
645 VPBuilder LaneBuilder(DefR);
646 LaneOffset =
647 LaneBuilder.createNaryOp(AddOp, {StartIndex, LaneOffset}, Flags);
648 }
649 Steps->setStartIndex(LaneOffset);
650 }
651 }
652 }
653 New->insertBefore(DefR);
654 return New;
655}
656
// NOTE(review): extraction artifact — this function's signature line (file
// line 657) is missing from this dump; per its use of Plan and VF, it is
// presumably `void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount
// VF)` — confirm against upstream VPlanUnroll.cpp.
// Replaces each single-scalar-per-lane recipe with VF single-scalar clones
// (one per lane), rewiring first-lane-only users and Build(Struct)Vector
// users to the new lane definitions.
658 if (Plan.hasScalarVFOnly())
659 return;
660
661 Type *IdxTy = IntegerType::get(
// NOTE(review): file line 662 missing from this dump (the IntegerType::get
// arguments: context and bit width) — confirm upstream.
663
664 // Visit all VPBBs outside the loop region and directly inside the top-level
665 // loop region.
666 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
// NOTE(review): file line 667 missing from this dump (the traversal range
// argument) — confirm upstream.
668 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
// NOTE(review): file line 669 missing from this dump (the traversal range
// argument) — confirm upstream.
670 auto VPBBsToUnroll =
671 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion);
672 // A mapping of current VPValue definitions to collections of new VPValues
673 // defined per lane. Serves to hook-up potential users of current VPValue
674 // definition that are replicated-per-VF later.
// NOTE(review): file line 675 missing from this dump (presumably the
// Def2LaneDefs map declaration) — confirm upstream.
676 // The removal of current recipes being replaced by new ones needs to be
677 // delayed after Def2LaneDefs is no longer in use.
// NOTE(review): file line 678 missing from this dump (presumably the ToRemove
// vector declaration) — confirm upstream.
679 for (VPBasicBlock *VPBB : VPBBsToUnroll) {
680 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
// NOTE(review): file lines 681-682 missing from this dump (the start of the
// skip condition completed by the lines below) — confirm upstream.
683 cast<VPReplicateRecipe>(&R)->isSingleScalar()) ||
684 (isa<VPInstruction>(&R) &&
685 !cast<VPInstruction>(&R)->doesGeneratePerAllLanes() &&
// NOTE(review): file line 686 missing from this dump (the remainder of this
// condition) — confirm upstream.
687 continue;
688
689 auto *DefR = cast<VPSingleDefRecipe>(&R);
690 VPBuilder Builder(DefR);
// A recipe without users is cloned per lane for its side effects only, and
// can be erased immediately.
691 if (DefR->getNumUsers() == 0) {
692 // Create single-scalar version of DefR for all lanes.
693 for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
694 cloneForLane(Plan, Builder, IdxTy, DefR, VPLane(I), Def2LaneDefs);
695 DefR->eraseFromParent();
696 continue;
697 }
698 /// Create single-scalar version of DefR for all lanes.
699 SmallVector<VPValue *> LaneDefs;
700 for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
701 LaneDefs.push_back(
702 cloneForLane(Plan, Builder, IdxTy, DefR, VPLane(I), Def2LaneDefs));
703
704 Def2LaneDefs[DefR] = LaneDefs;
705 /// Users that only demand the first lane can use the definition for lane
706 /// 0.
707 DefR->replaceUsesWithIf(LaneDefs[0], [DefR](VPUser &U, unsigned) {
708 return U.usesFirstLaneOnly(DefR);
709 });
710
711 // Update each build vector user that currently has DefR as its only
712 // operand, to have all LaneDefs as its operands.
713 for (VPUser *U : to_vector(DefR->users())) {
714 auto *VPI = dyn_cast<VPInstruction>(U);
715 if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector &&
716 VPI->getOpcode() != VPInstruction::BuildStructVector))
717 continue;
718 assert(VPI->getNumOperands() == 1 &&
719 "Build(Struct)Vector must have a single operand before "
720 "replicating by VF");
721 VPI->setOperand(0, LaneDefs[0]);
722 for (VPValue *LaneDef : drop_begin(LaneDefs))
723 VPI->addOperand(LaneDef);
724 }
// Defer erasure: Def2LaneDefs may still reference DefR's operand mappings.
725 ToRemove.push_back(DefR);
726 }
727 }
728 for (auto *R : reverse(ToRemove))
729 R->eraseFromParent();
730}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isCanonical(const MDString *S)
ManagedStatic< HTTPClientCleanup > Cleanup
#define _
#define I(x, y, z)
Definition MD5.cpp:57
#define H(x, y, z)
Definition MD5.cpp:56
MachineInstr unsigned OpIdx
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static ConstantInt * getConstantInt(Value *V, const DataLayout &DL)
Extract ConstantInt from value, looking through IntToPtr and PointerNullValue.
This file contains the declarations of different VPlan-related auxiliary helpers.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
This file provides utility VPlan to VPlan transformations.
static VPValue * cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, VPSingleDefRecipe *DefR, VPLane Lane, const DenseMap< VPValue *, SmallVector< VPValue * > > &Def2LaneDefs)
Create a single-scalar clone of DefR (must be a VPReplicateRecipe, VPInstruction or VPScalarIVStepsRe...
static void addStartIndexForScalarSteps(VPScalarIVStepsRecipe *Steps, unsigned Part, VPlan &Plan, VPTypeAnalysis &TypeInfo)
static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry, DenseMap< VPValue *, VPValue * > &Old2NewVPValues)
Definition VPlan.cpp:1174
This file contains the declarations of the Vectorization Plan base classes:
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
static DebugLoc getUnknown()
Definition DebugLoc.h:161
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
static GEPNoWrapFlags none()
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4255
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4282
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4343
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:232
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:98
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:182
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:231
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:266
static void insertBlockBefore(VPBlockBase *NewBlock, VPBlockBase *BlockPtr)
Insert disconnected block NewBlock before Blockptr.
Definition VPlanUtils.h:182
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:450
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:460
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:4432
Class to record and manage LLVM IR flags.
Definition VPlan.h:690
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
@ WideIVStep
Scale the first operand (vector step) by the second operand (scalar-step).
Definition VPlan.h:1303
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1255
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1307
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1250
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1247
@ CanonicalIVIncrementForPart
Definition VPlan.h:1231
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
Kind getKind() const
Returns the Kind of lane offset.
unsigned getKnownLane() const
Returns a compile-time known value for the lane index and asserts if the lane can only be calculated ...
@ ScalableLast
For ScalableLast, Lane is the offset from the start of the last N-element subvector in a scalable vec...
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:406
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:555
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4443
VPRegionBlock * clone() override
Clone all blocks in the single-entry single-exit region of the block and their recipes without updati...
Definition VPlan.cpp:749
const VPBlockBase * getEntry() const
Definition VPlan.h:4479
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4511
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3203
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:4059
void setStartIndex(VPValue *StartIndex)
Set or add the StartIndex operand.
Definition VPlan.h:4116
VPValue * getVFValue() const
Return the number of scalars to produce per unroll part, used to compute StartIndex during unrolling.
Definition VPlan.h:4107
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:607
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:296
operand_range operands()
Definition VPlanValue.h:364
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:335
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:46
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1434
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4573
const DataLayout & getDataLayout() const
Definition VPlan.h:4768
VPBasicBlock * getEntry()
Definition VPlan.h:4665
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4723
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4829
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1064
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4759
bool hasScalarVFOnly() const
Definition VPlan.h:4797
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4709
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4755
void setUF(unsigned UF)
Definition VPlan.h:4812
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4863
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
bool match(Val *V, const Pattern &P)
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExtractLastActive, Op0_t, Op1_t, Op2_t > m_ExtractLastActive(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, w/o matching its operands, as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::ComputeAnyOfResult, Op0_t, Op1_t, Op2_t > m_ComputeAnyOfResult(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
bool onlyFirstPartUsed(const VPValue *Def)
Returns true if only the first part of Def is used.
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:831
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:280
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1152
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
Definition Casting.h:669
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
@ Add
Sum of integers.
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
static void unrollByUF(VPlan &Plan, unsigned UF)
Explicitly unroll Plan by UF.
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void replicateByVF(VPlan &Plan, ElementCount VF)
Replace each replicating VPReplicateRecipe and VPInstruction outside of any replicate region in Plan ...