VPlanRecipes.cpp
1//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains implementations for different VPlan recipes.
11///
12//===----------------------------------------------------------------------===//
13
15#include "VPlan.h"
16#include "VPlanAnalysis.h"
17#include "VPlanHelpers.h"
18#include "VPlanPatternMatch.h"
19#include "VPlanUtils.h"
20#include "llvm/ADT/STLExtras.h"
22#include "llvm/ADT/Twine.h"
27#include "llvm/IR/BasicBlock.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/Instruction.h"
31#include "llvm/IR/Intrinsics.h"
32#include "llvm/IR/Type.h"
33#include "llvm/IR/Value.h"
36#include "llvm/Support/Debug.h"
40#include <cassert>
41
42using namespace llvm;
43using namespace llvm::VPlanPatternMatch;
44
46
47#define LV_NAME "loop-vectorize"
48#define DEBUG_TYPE LV_NAME
49
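/// Conservative memory-effect query: recipes are classified by their VPDefID,
/// and any recipe kind not explicitly listed falls through to the default and
/// is treated as possibly writing to memory.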
50bool VPRecipeBase::mayWriteToMemory() const {
51 switch (getVPDefID()) {
52 case VPExpressionSC:
53 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
54 case VPInstructionSC: {
55 auto *VPI = cast<VPInstruction>(this);
56 // Loads read from memory but don't write to memory.
57 if (VPI->getOpcode() == Instruction::Load)
58 return false;
59 return VPI->opcodeMayReadOrWriteFromMemory();
60 }
61 case VPInterleaveEVLSC:
62 case VPInterleaveSC:
63 return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
64 case VPWidenStoreEVLSC:
65 case VPWidenStoreSC:
66 return true;
67 case VPReplicateSC:
68 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
69 ->mayWriteToMemory();
70 case VPWidenCallSC:
71 return !cast<VPWidenCallRecipe>(this)
72 ->getCalledScalarFunction()
73 ->onlyReadsMemory();
74 case VPWidenIntrinsicSC:
75 return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
76 case VPCanonicalIVPHISC:
77 case VPBranchOnMaskSC:
78 case VPDerivedIVSC:
79 case VPFirstOrderRecurrencePHISC:
80 case VPReductionPHISC:
81 case VPScalarIVStepsSC:
82 case VPPredInstPHISC:
83 return false;
84 case VPBlendSC:
85 case VPReductionEVLSC:
86 case VPReductionSC:
87 case VPVectorPointerSC:
88 case VPWidenCanonicalIVSC:
89 case VPWidenCastSC:
90 case VPWidenGEPSC:
91 case VPWidenIntOrFpInductionSC:
92 case VPWidenLoadEVLSC:
93 case VPWidenLoadSC:
94 case VPWidenPHISC:
95 case VPWidenPointerInductionSC:
96 case VPWidenSC: {
97 const Instruction *I =
98 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
99 (void)I;
100 assert((!I || !I->mayWriteToMemory()) &&
101 "underlying instruction may write to memory");
102 return false;
103 }
104 default:
105 return true;
106 }
107}
108
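/// Read-side counterpart of mayWriteToMemory(): widened loads and calls that
/// are not write-only return true, while recipes known not to touch memory
/// return false.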
109bool VPRecipeBase::mayReadFromMemory() const {
110 switch (getVPDefID()) {
111 case VPExpressionSC:
112 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
113 case VPInstructionSC:
114 return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
115 case VPWidenLoadEVLSC:
116 case VPWidenLoadSC:
117 return true;
118 case VPReplicateSC:
119 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
120 ->mayReadFromMemory();
121 case VPWidenCallSC:
122 return !cast<VPWidenCallRecipe>(this)
123 ->getCalledScalarFunction()
124 ->onlyWritesMemory();
125 case VPWidenIntrinsicSC:
126 return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
127 case VPBranchOnMaskSC:
128 case VPDerivedIVSC:
129 case VPFirstOrderRecurrencePHISC:
130 case VPPredInstPHISC:
131 case VPScalarIVStepsSC:
132 case VPWidenStoreEVLSC:
133 case VPWidenStoreSC:
134 return false;
135 case VPBlendSC:
136 case VPReductionEVLSC:
137 case VPReductionSC:
138 case VPVectorPointerSC:
139 case VPWidenCanonicalIVSC:
140 case VPWidenCastSC:
141 case VPWidenGEPSC:
142 case VPWidenIntOrFpInductionSC:
143 case VPWidenPHISC:
144 case VPWidenPointerInductionSC:
145 case VPWidenSC: {
146 const Instruction *I =
147 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
148 (void)I;
149 assert((!I || !I->mayReadFromMemory()) &&
150 "underlying instruction may read from memory");
151 return false;
152 }
153 default:
154 // FIXME: Return false if the recipe represents an interleaved store.
155 return true;
156 }
157}
158
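/// A recipe has side effects if it may write to memory, may throw or not
/// return (e.g. widened calls), or carries control-flow semantics such as the
/// branch-on-* VPInstruction opcodes handled below.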
159bool VPRecipeBase::mayHaveSideEffects() const {
160 switch (getVPDefID()) {
161 case VPExpressionSC:
162 return cast<VPExpressionRecipe>(this)->mayHaveSideEffects();
163 case VPDerivedIVSC:
164 case VPFirstOrderRecurrencePHISC:
165 case VPPredInstPHISC:
166 case VPVectorEndPointerSC:
167 return false;
168 case VPInstructionSC: {
169 auto *VPI = cast<VPInstruction>(this);
170 return mayWriteToMemory() ||
171 VPI->getOpcode() == VPInstruction::BranchOnCount ||
172 VPI->getOpcode() == VPInstruction::BranchOnCond ||
173 VPI->getOpcode() == VPInstruction::BranchOnTwoConds;
174 }
175 case VPWidenCallSC: {
176 Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
177 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
178 }
179 case VPWidenIntrinsicSC:
180 return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
181 case VPBlendSC:
182 case VPReductionEVLSC:
183 case VPReductionSC:
184 case VPScalarIVStepsSC:
185 case VPVectorPointerSC:
186 case VPWidenCanonicalIVSC:
187 case VPWidenCastSC:
188 case VPWidenGEPSC:
189 case VPWidenIntOrFpInductionSC:
190 case VPWidenPHISC:
191 case VPWidenPointerInductionSC:
192 case VPWidenSC: {
193 const Instruction *I =
194 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
195 (void)I;
196 assert((!I || !I->mayHaveSideEffects()) &&
197 "underlying instruction has side-effects");
198 return false;
199 }
200 case VPInterleaveEVLSC:
201 case VPInterleaveSC:
202 return mayWriteToMemory();
203 case VPWidenLoadEVLSC:
204 case VPWidenLoadSC:
205 case VPWidenStoreEVLSC:
206 case VPWidenStoreSC:
207 assert(
208 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
209 mayWriteToMemory() &&
210 "mayHaveSideEffects result for ingredient differs from this "
211 "implementation");
212 return mayWriteToMemory();
213 case VPReplicateSC: {
214 auto *R = cast<VPReplicateRecipe>(this);
215 return R->getUnderlyingInstr()->mayHaveSideEffects();
216 }
217 default:
218 return true;
219 }
220}
221
222void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
223 assert(!Parent && "Recipe already in some VPBasicBlock");
224 assert(InsertPos->getParent() &&
225 "Insertion position not in any VPBasicBlock");
226 InsertPos->getParent()->insert(this, InsertPos->getIterator());
227}
228
229void VPRecipeBase::insertBefore(VPBasicBlock &BB,
230 iplist<VPRecipeBase>::iterator I) {
231 assert(!Parent && "Recipe already in some VPBasicBlock");
232 assert(I == BB.end() || I->getParent() == &BB);
233 BB.insert(this, I);
234}
235
236void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
237 assert(!Parent && "Recipe already in some VPBasicBlock");
238 assert(InsertPos->getParent() &&
239 "Insertion position not in any VPBasicBlock");
240 InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
241}
242
243void VPRecipeBase::removeFromParent() {
244 assert(getParent() && "Recipe not in any VPBasicBlock");
245 getParent()->getRecipeList().remove(getIterator());
246 Parent = nullptr;
247}
248
249iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
250 assert(getParent() && "Recipe not in any VPBasicBlock");
251 return getParent()->getRecipeList().erase(getIterator());
252}
253
254void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
255 removeFromParent();
256 insertAfter(InsertPos);
257}
258
264
265InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
266 // Get the underlying instruction for the recipe, if there is one. It is used
267 // to
268 // * decide if cost computation should be skipped for this recipe,
269 // * apply forced target instruction cost.
270 Instruction *UI = nullptr;
271 if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
272 UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
273 else if (auto *IG = dyn_cast<VPInterleaveBase>(this))
274 UI = IG->getInsertPos();
275 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
276 UI = &WidenMem->getIngredient();
277
278 InstructionCost RecipeCost;
279 if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
280 RecipeCost = 0;
281 } else {
282 RecipeCost = computeCost(VF, Ctx);
283 if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
284 RecipeCost.isValid()) {
285 if (UI)
286 RecipeCost = InstructionCost(ForceTargetInstructionCost);
287 else
288 RecipeCost = InstructionCost(0);
289 }
290 }
291
292 LLVM_DEBUG({
293 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
294 dump();
295 });
296 return RecipeCost;
297}
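// Note: cost() above is the shared entry point; it handles recipes whose cost
// computation is skipped and the ForceTargetInstructionCost override, and
// delegates the per-recipe cost to the computeCost() overrides below.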
298
299InstructionCost VPRecipeBase::computeCost(ElementCount VF,
300 VPCostContext &Ctx) const {
301 llvm_unreachable("subclasses should implement computeCost");
302}
303
305 return (getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC) ||
307}
308
310 auto *VPI = dyn_cast<VPInstruction>(this);
311 return VPI && Instruction::isCast(VPI->getOpcode());
312}
313
315 assert(OpType == Other.OpType && "OpType must match");
316 switch (OpType) {
317 case OperationType::OverflowingBinOp:
318 WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
319 WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
320 break;
321 case OperationType::Trunc:
322 TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
323 TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
324 break;
325 case OperationType::DisjointOp:
326 DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
327 break;
328 case OperationType::PossiblyExactOp:
329 ExactFlags.IsExact &= Other.ExactFlags.IsExact;
330 break;
331 case OperationType::GEPOp:
332 GEPFlags &= Other.GEPFlags;
333 break;
334 case OperationType::FPMathOp:
335 case OperationType::FCmp:
336 assert((OpType != OperationType::FCmp ||
337 FCmpFlags.Pred == Other.FCmpFlags.Pred) &&
338 "Cannot drop CmpPredicate");
339 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
340 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
341 break;
342 case OperationType::NonNegOp:
343 NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
344 break;
345 case OperationType::Cmp:
346 assert(CmpPredicate == Other.CmpPredicate && "Cannot drop CmpPredicate");
347 break;
348 case OperationType::ReductionOp:
349 assert(ReductionFlags.Kind == Other.ReductionFlags.Kind &&
350 "Cannot change RecurKind");
351 assert(ReductionFlags.IsOrdered == Other.ReductionFlags.IsOrdered &&
352 "Cannot change IsOrdered");
353 assert(ReductionFlags.IsInLoop == Other.ReductionFlags.IsInLoop &&
354 "Cannot change IsInLoop");
355 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
356 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
357 break;
358 case OperationType::Other:
359 assert(AllFlags == Other.AllFlags && "Cannot drop other flags");
360 break;
361 }
362}
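// Intersecting flags keeps only the flags valid on both recipes (wrap, exact,
// disjoint, GEP and FP flags are AND-ed together), while properties that must
// agree exactly, such as compare predicates and reduction kinds, are asserted
// rather than dropped.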
363
365 assert((OpType == OperationType::FPMathOp || OpType == OperationType::FCmp ||
366 OpType == OperationType::ReductionOp) &&
367 "recipe doesn't have fast math flags");
368 const FastMathFlagsTy &F = getFMFsRef();
369 FastMathFlags Res;
370 Res.setAllowReassoc(F.AllowReassoc);
371 Res.setNoNaNs(F.NoNaNs);
372 Res.setNoInfs(F.NoInfs);
373 Res.setNoSignedZeros(F.NoSignedZeros);
374 Res.setAllowReciprocal(F.AllowReciprocal);
375 Res.setAllowContract(F.AllowContract);
376 Res.setApproxFunc(F.ApproxFunc);
377 return Res;
378}
379
380#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
382
383void VPRecipeBase::print(raw_ostream &O, const Twine &Indent,
384 VPSlotTracker &SlotTracker) const {
385 printRecipe(O, Indent, SlotTracker);
386 if (auto DL = getDebugLoc()) {
387 O << ", !dbg ";
388 DL.print(O);
389 }
390
391 if (auto *Metadata = dyn_cast<VPIRMetadata>(this))
393}
394#endif
395
396template <unsigned PartOpIdx>
397VPValue *
399 if (U.getNumOperands() == PartOpIdx + 1)
400 return U.getOperand(PartOpIdx);
401 return nullptr;
402}
403
404template <unsigned PartOpIdx>
406 if (auto *UnrollPartOp = getUnrollPartOperand(U))
407 return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue();
408 return 0;
409}
410
411namespace llvm {
412template class VPUnrollPartAccessor<1>;
413template class VPUnrollPartAccessor<2>;
414template class VPUnrollPartAccessor<3>;
415}
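// Explicit instantiations make the out-of-line definitions of
// VPUnrollPartAccessor available for the part-operand indices used by the
// recipe classes declared in VPlan.h.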
416
418 const VPIRFlags &Flags, const VPIRMetadata &MD,
419 DebugLoc DL, const Twine &Name)
420 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL),
421 VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) {
423 "Set flags not supported for the provided opcode");
424 assert((getNumOperandsForOpcode(Opcode) == -1u ||
426 "number of operands does not match opcode");
427}
428
429#ifndef NDEBUG
430unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
431 if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
432 return 1;
433
434 if (Instruction::isBinaryOp(Opcode))
435 return 2;
436
437 switch (Opcode) {
440 return 0;
441 case Instruction::Alloca:
442 case Instruction::ExtractValue:
443 case Instruction::Freeze:
444 case Instruction::Load:
460 return 1;
461 case Instruction::ICmp:
462 case Instruction::FCmp:
463 case Instruction::ExtractElement:
464 case Instruction::Store:
472 return 2;
473 case Instruction::Select:
477 return 3;
479 return 4;
480 case Instruction::Call:
481 case Instruction::GetElementPtr:
482 case Instruction::PHI:
483 case Instruction::Switch:
490 // Cannot determine the number of operands from the opcode.
491 return -1u;
492 }
493 llvm_unreachable("all cases should be handled above");
494}
495#endif
496
500
501bool VPInstruction::canGenerateScalarForFirstLane() const {
503 return true;
505 return true;
506 switch (Opcode) {
507 case Instruction::Freeze:
508 case Instruction::ICmp:
509 case Instruction::PHI:
510 case Instruction::Select:
520 return true;
521 default:
522 return false;
523 }
524}
525
526Value *VPInstruction::generate(VPTransformState &State) {
527 IRBuilderBase &Builder = State.Builder;
528
530 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
531 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
532 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
533 auto *Res =
534 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
535 if (auto *I = dyn_cast<Instruction>(Res))
536 applyFlags(*I);
537 return Res;
538 }
539
540 switch (getOpcode()) {
541 case VPInstruction::Not: {
542 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
543 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
544 return Builder.CreateNot(A, Name);
545 }
546 case Instruction::ExtractElement: {
547 assert(State.VF.isVector() && "Only extract elements from vectors");
548 if (auto *IdxIRV = dyn_cast<VPIRValue>(getOperand(1))) {
549 unsigned IdxToExtract =
550 cast<ConstantInt>(IdxIRV->getValue())->getZExtValue();
551 return State.get(getOperand(0), VPLane(IdxToExtract));
552 }
553 Value *Vec = State.get(getOperand(0));
554 Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
555 return Builder.CreateExtractElement(Vec, Idx, Name);
556 }
557 case Instruction::Freeze: {
559 return Builder.CreateFreeze(Op, Name);
560 }
561 case Instruction::FCmp:
562 case Instruction::ICmp: {
563 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
564 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
565 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
566 return Builder.CreateCmp(getPredicate(), A, B, Name);
567 }
568 case Instruction::PHI: {
569 llvm_unreachable("should be handled by VPPhi::execute");
570 }
571 case Instruction::Select: {
572 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
573 Value *Cond =
574 State.get(getOperand(0),
575 OnlyFirstLaneUsed || vputils::isSingleScalar(getOperand(0)));
576 Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
577 Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
578 return Builder.CreateSelect(Cond, Op1, Op2, Name);
579 }
581 // Get first lane of vector induction variable.
582 Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
583 // Get the original loop tripcount.
584 Value *ScalarTC = State.get(getOperand(1), VPLane(0));
585
586 // If this part of the active lane mask is scalar, generate the CMP directly
587 // to avoid unnecessary extracts.
588 if (State.VF.isScalar())
589 return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
590 Name);
591
592 ElementCount EC = State.VF.multiplyCoefficientBy(
593 cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue());
594 auto *PredTy = VectorType::get(Builder.getInt1Ty(), EC);
595 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
596 {PredTy, ScalarTC->getType()},
597 {VIVElem0, ScalarTC}, nullptr, Name);
598 }
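// Illustrative IR for the vector path above (operand types are examples only):
//   %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %index, i64 %tc)
// For scalar VFs a single icmp ult replaces the intrinsic call.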
600 // Generate code to combine the previous and current values in vector v3.
601 //
602 // vector.ph:
603 // v_init = vector(..., ..., ..., a[-1])
604 // br vector.body
605 //
606 // vector.body
607 // i = phi [0, vector.ph], [i+4, vector.body]
608 // v1 = phi [v_init, vector.ph], [v2, vector.body]
609 // v2 = a[i, i+1, i+2, i+3];
610 // v3 = vector(v1(3), v2(0, 1, 2))
611
612 auto *V1 = State.get(getOperand(0));
613 if (!V1->getType()->isVectorTy())
614 return V1;
615 Value *V2 = State.get(getOperand(1));
616 return Builder.CreateVectorSplice(V1, V2, -1, Name);
617 }
619 unsigned UF = getParent()->getPlan()->getUF();
620 Value *ScalarTC = State.get(getOperand(0), VPLane(0));
621 Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
622 Value *Sub = Builder.CreateSub(ScalarTC, Step);
623 Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
625 return Builder.CreateSelect(Cmp, Sub, Zero);
626 }
628 // TODO: Restructure this code with an explicit remainder loop, vsetvli can
629 // be outside of the main loop.
630 Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
631 // Compute EVL
632 assert(AVL->getType()->isIntegerTy() &&
633 "Requested vector length should be an integer.");
634
635 assert(State.VF.isScalable() && "Expected scalable vector factor.");
636 Value *VFArg = Builder.getInt32(State.VF.getKnownMinValue());
637
638 Value *EVL = Builder.CreateIntrinsic(
639 Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
640 {AVL, VFArg, Builder.getTrue()});
641 return EVL;
642 }
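// Illustrative IR for the EVL computation above (AVL type is an example only):
//   %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl, i32 <vf>, i1 true)
// The trailing i1 true operand marks the vectorization factor as scalable.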
644 unsigned Part = getUnrollPart(*this);
645 auto *IV = State.get(getOperand(0), VPLane(0));
646 assert(Part != 0 && "Must have a positive part");
647 // The canonical IV is incremented by the vectorization factor (num of
648 // SIMD elements) times the unroll part.
649 Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
650 return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
652 }
654 Value *Cond = State.get(getOperand(0), VPLane(0));
655 // Replace the temporary unreachable terminator with a new conditional
656 // branch, hooking it up to backward destination for latch blocks now, and
657 // to forward destination(s) later when they are created.
658 // Second successor may be backwards - iff it is already in VPBB2IRBB.
659 VPBasicBlock *SecondVPSucc =
660 cast<VPBasicBlock>(getParent()->getSuccessors()[1]);
661 BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc);
662 BasicBlock *IRBB = State.CFG.VPBB2IRBB[getParent()];
663 auto *Br = Builder.CreateCondBr(Cond, IRBB, SecondIRSucc);
664 // First successor is always forward, reset it to nullptr.
665 Br->setSuccessor(0, nullptr);
667 applyMetadata(*Br);
668 return Br;
669 }
671 return Builder.CreateVectorSplat(
672 State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
673 }
675 // For struct types, we need to build a new 'wide' struct type, where each
676 // element is widened, i.e., we create a struct of vectors.
677 auto *StructTy =
679 Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF));
680 for (const auto &[LaneIndex, Op] : enumerate(operands())) {
681 for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
682 FieldIndex++) {
683 Value *ScalarValue =
684 Builder.CreateExtractValue(State.get(Op, true), FieldIndex);
685 Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex);
686 VectorValue =
687 Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex);
688 Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex);
689 }
690 }
691 return Res;
692 }
694 auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0));
695 auto NumOfElements = ElementCount::getFixed(getNumOperands());
696 Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements));
697 for (const auto &[Idx, Op] : enumerate(operands()))
698 Res = Builder.CreateInsertElement(Res, State.get(Op, true),
699 Builder.getInt32(Idx));
700 return Res;
701 }
703 if (State.VF.isScalar())
704 return State.get(getOperand(0), true);
705 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
707 // If this start vector is scaled then it should produce a vector with fewer
708 // elements than the VF.
709 ElementCount VF = State.VF.divideCoefficientBy(
710 cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue());
711 auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true));
712 return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true),
713 Builder.getInt32(0));
714 }
716 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
717 // and will be removed by breaking up the recipe further.
718 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
719 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
720 Value *ReducedPartRdx = State.get(getOperand(2));
721 for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx)
722 ReducedPartRdx =
723 Builder.CreateBinOp(Instruction::Or, State.get(getOperand(Idx)),
724 ReducedPartRdx, "bin.rdx");
725 return createAnyOfReduction(Builder, ReducedPartRdx,
726 State.get(getOperand(1), VPLane(0)), OrigPhi);
727 }
729 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
730 // and will be removed by breaking up the recipe further.
731 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
732 // Get its reduction variable descriptor.
733 RecurKind RK = PhiR->getRecurrenceKind();
735 "Unexpected reduction kind");
736 assert(!PhiR->isInLoop() &&
737 "In-loop FindLastIV reduction is not supported yet");
738
739 // The recipe's operands are the reduction phi, the start value, the
740 // sentinel value, followed by one operand for each part of the reduction.
741 unsigned UF = getNumOperands() - 3;
742 Value *ReducedPartRdx = State.get(getOperand(3));
743 RecurKind MinMaxKind;
746 MinMaxKind = IsSigned ? RecurKind::SMax : RecurKind::UMax;
747 else
748 MinMaxKind = IsSigned ? RecurKind::SMin : RecurKind::UMin;
749 for (unsigned Part = 1; Part < UF; ++Part)
750 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
751 State.get(getOperand(3 + Part)));
752
753 Value *Start = State.get(getOperand(1), true);
755
756 // Reduce the vector to a scalar.
758 Value *ReducedIV =
759 ReducedPartRdx->getType()->isVectorTy()
760 ? (IsFindLast
761 ? Builder.CreateIntMaxReduce(ReducedPartRdx, IsSigned)
762 : Builder.CreateIntMinReduce(ReducedPartRdx, IsSigned))
763 : ReducedPartRdx;
764 // Correct the final reduction result back to the start value if the
765 // reduction result is the sentinel value.
766 Value *Cmp = Builder.CreateICmpNE(ReducedIV, Sentinel, "rdx.select.cmp");
767 return Builder.CreateSelect(Cmp, ReducedIV, Start, "rdx.select");
768 }
770 RecurKind RK = getRecurKind();
771 bool IsOrdered = isReductionOrdered();
772 bool IsInLoop = isReductionInLoop();
774 "should be handled by ComputeFindIVResult");
775
776 // The recipe may have multiple operands to be reduced together.
777 unsigned NumOperandsToReduce = getNumOperands();
778 VectorParts RdxParts(NumOperandsToReduce);
779 for (unsigned Part = 0; Part < NumOperandsToReduce; ++Part)
780 RdxParts[Part] = State.get(getOperand(Part), IsInLoop);
781
782 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
783 if (hasFastMathFlags())
785
786 // Reduce multiple operands into one.
787 Value *ReducedPartRdx = RdxParts[0];
788 if (IsOrdered) {
789 ReducedPartRdx = RdxParts[NumOperandsToReduce - 1];
790 } else {
791 // Floating-point operations should have some FMF to enable the reduction.
792 for (unsigned Part = 1; Part < NumOperandsToReduce; ++Part) {
793 Value *RdxPart = RdxParts[Part];
795 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
796 else {
797 // For sub-recurrences, each part's reduction variable is already
798 // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1)
800 RK == RecurKind::Sub
801 ? Instruction::Add
803 ReducedPartRdx =
804 Builder.CreateBinOp(Opcode, RdxPart, ReducedPartRdx, "bin.rdx");
805 }
806 }
807 }
808
809 // Create the reduction after the loop. Note that inloop reductions create
810 // the target reduction in the loop using a Reduction recipe.
811 if (State.VF.isVector() && !IsInLoop) {
812 // TODO: Support in-order reductions based on the recurrence descriptor.
813 // All ops in the reduction inherit fast-math-flags from the recurrence
814 // descriptor.
815 ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK);
816 }
817
818 return ReducedPartRdx;
819 }
822 unsigned Offset =
824 Value *Res;
825 if (State.VF.isVector()) {
826 assert(Offset <= State.VF.getKnownMinValue() &&
827 "invalid offset to extract from");
828 // Extract lane VF - Offset from the operand.
829 Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
830 } else {
831 // TODO: Remove ExtractLastLane for scalar VFs.
832 assert(Offset <= 1 && "invalid offset to extract from");
833 Res = State.get(getOperand(0));
834 }
836 Res->setName(Name);
837 return Res;
838 }
840 Value *A = State.get(getOperand(0));
841 Value *B = State.get(getOperand(1));
842 return Builder.CreateLogicalAnd(A, B, Name);
843 }
846 "can only generate first lane for PtrAdd");
847 Value *Ptr = State.get(getOperand(0), VPLane(0));
848 Value *Addend = State.get(getOperand(1), VPLane(0));
849 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
850 }
852 Value *Ptr =
854 Value *Addend = State.get(getOperand(1));
855 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
856 }
858 Value *Res = Builder.CreateFreeze(State.get(getOperand(0)));
859 for (VPValue *Op : drop_begin(operands()))
860 Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op)));
861 return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
862 }
864 Value *LaneToExtract = State.get(getOperand(0), true);
865 Type *IdxTy = State.TypeAnalysis.inferScalarType(getOperand(0));
866 Value *Res = nullptr;
867 Value *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
868
869 for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
870 Value *VectorStart =
871 Builder.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Idx - 1));
872 Value *VectorIdx = Idx == 1
873 ? LaneToExtract
874 : Builder.CreateSub(LaneToExtract, VectorStart);
875 Value *Ext = State.VF.isScalar()
876 ? State.get(getOperand(Idx))
877 : Builder.CreateExtractElement(
878 State.get(getOperand(Idx)), VectorIdx);
879 if (Res) {
880 Value *Cmp = Builder.CreateICmpUGE(LaneToExtract, VectorStart);
881 Res = Builder.CreateSelect(Cmp, Ext, Res);
882 } else {
883 Res = Ext;
884 }
885 }
886 return Res;
887 }
889 if (getNumOperands() == 1) {
890 Value *Mask = State.get(getOperand(0));
891 return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
892 /*ZeroIsPoison=*/false, Name);
893 }
894 // If there are multiple operands, create a chain of selects to pick the
895 // first operand with an active lane and add the number of lanes of the
896 // preceding operands.
897 Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt64Ty(), State.VF);
898 unsigned LastOpIdx = getNumOperands() - 1;
899 Value *Res = nullptr;
900 for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
901 Value *TrailingZeros =
902 State.VF.isScalar()
903 ? Builder.CreateZExt(
904 Builder.CreateICmpEQ(State.get(getOperand(Idx)),
905 Builder.getFalse()),
906 Builder.getInt64Ty())
908 Builder.getInt64Ty(), State.get(getOperand(Idx)),
909 /*ZeroIsPoison=*/false, Name);
910 Value *Current = Builder.CreateAdd(
911 Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros);
912 if (Res) {
913 Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF);
914 Res = Builder.CreateSelect(Cmp, Current, Res);
915 } else {
916 Res = Current;
917 }
918 }
919
920 return Res;
921 }
923 return State.get(getOperand(0), true);
925 return Builder.CreateVectorReverse(State.get(getOperand(0)), "reverse");
926 default:
927 llvm_unreachable("Unsupported opcode for instruction");
928 }
929}
930
932 unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
933 Type *ScalarTy = Ctx.Types.inferScalarType(this);
934 Type *ResultTy = VF.isVector() ? toVectorTy(ScalarTy, VF) : ScalarTy;
935 switch (Opcode) {
936 case Instruction::FNeg:
937 return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, Ctx.CostKind);
938 case Instruction::UDiv:
939 case Instruction::SDiv:
940 case Instruction::SRem:
941 case Instruction::URem:
942 case Instruction::Add:
943 case Instruction::FAdd:
944 case Instruction::Sub:
945 case Instruction::FSub:
946 case Instruction::Mul:
947 case Instruction::FMul:
948 case Instruction::FDiv:
949 case Instruction::FRem:
950 case Instruction::Shl:
951 case Instruction::LShr:
952 case Instruction::AShr:
953 case Instruction::And:
954 case Instruction::Or:
955 case Instruction::Xor: {
958
959 if (VF.isVector()) {
960 // Certain instructions can be cheaper to vectorize if they have a
961 // constant second vector operand. One example of this are shifts on x86.
962 VPValue *RHS = getOperand(1);
963 RHSInfo = Ctx.getOperandInfo(RHS);
964
965 if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
968 }
969
972 if (CtxI)
973 Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
974 return Ctx.TTI.getArithmeticInstrCost(
975 Opcode, ResultTy, Ctx.CostKind,
976 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
977 RHSInfo, Operands, CtxI, &Ctx.TLI);
978 }
979 case Instruction::Freeze:
980 // This opcode is unknown. Assume that it is the same as 'mul'.
981 return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, ResultTy,
982 Ctx.CostKind);
983 case Instruction::ExtractValue:
984 return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
985 Ctx.CostKind);
986 case Instruction::ICmp:
987 case Instruction::FCmp: {
988 Type *ScalarOpTy = Ctx.Types.inferScalarType(getOperand(0));
989 Type *OpTy = VF.isVector() ? toVectorTy(ScalarOpTy, VF) : ScalarOpTy;
991 return Ctx.TTI.getCmpSelInstrCost(
992 Opcode, OpTy, CmpInst::makeCmpResultType(OpTy), getPredicate(),
993 Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None},
994 {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
995 }
996 case Instruction::BitCast: {
997 Type *ScalarTy = Ctx.Types.inferScalarType(this);
998 if (ScalarTy->isPointerTy())
999 return 0;
1000 [[fallthrough]];
1001 }
1002 case Instruction::SExt:
1003 case Instruction::ZExt:
1004 case Instruction::FPToUI:
1005 case Instruction::FPToSI:
1006 case Instruction::FPExt:
1007 case Instruction::PtrToInt:
1008 case Instruction::PtrToAddr:
1009 case Instruction::IntToPtr:
1010 case Instruction::SIToFP:
1011 case Instruction::UIToFP:
1012 case Instruction::Trunc:
1013 case Instruction::FPTrunc:
1014 case Instruction::AddrSpaceCast: {
1015 // Computes the CastContextHint from a recipe that may access memory.
1016 auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
1017 if (isa<VPInterleaveBase>(R))
1019 if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R)) {
1020 // Only compute CCH for memory operations, matching the legacy model
1021 // which only considers loads/stores for cast context hints.
1022 auto *UI = cast<Instruction>(ReplicateRecipe->getUnderlyingValue());
1023 if (!isa<LoadInst, StoreInst>(UI))
1025 return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
1028 const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
1029 if (WidenMemoryRecipe == nullptr)
1031 if (VF.isScalar())
1033 if (!WidenMemoryRecipe->isConsecutive())
1035 if (WidenMemoryRecipe->isReverse())
1037 if (WidenMemoryRecipe->isMasked())
1040 };
1041
1042 VPValue *Operand = getOperand(0);
1044 // For Trunc/FPTrunc, get the context from the only user.
1045 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
1046 auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * {
1047 if (R->getNumUsers() == 0 || R->hasMoreThanOneUniqueUser())
1048 return nullptr;
1049 return dyn_cast<VPRecipeBase>(*R->user_begin());
1050 };
1051 if (VPRecipeBase *Recipe = GetOnlyUser(this)) {
1052 if (match(Recipe, m_Reverse(m_VPValue())))
1053 Recipe = GetOnlyUser(cast<VPInstruction>(Recipe));
1054 if (Recipe)
1055 CCH = ComputeCCH(Recipe);
1056 }
1057 }
1058 // For Z/Sext, get the context from the operand.
1059 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
1060 Opcode == Instruction::FPExt) {
1061 if (auto *Recipe = Operand->getDefiningRecipe()) {
1062 VPValue *ReverseOp;
1063 if (match(Recipe, m_Reverse(m_VPValue(ReverseOp))))
1064 Recipe = ReverseOp->getDefiningRecipe();
1065 if (Recipe)
1066 CCH = ComputeCCH(Recipe);
1067 }
1068 }
1069
1070 auto *ScalarSrcTy = Ctx.Types.inferScalarType(Operand);
1071 Type *SrcTy = VF.isVector() ? toVectorTy(ScalarSrcTy, VF) : ScalarSrcTy;
1072 // Arm TTI will use the underlying instruction to determine the cost.
1073 return Ctx.TTI.getCastInstrCost(
1074 Opcode, ResultTy, SrcTy, CCH, Ctx.CostKind,
1076 }
1077 case Instruction::Select: {
1079 bool IsScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
1080 Type *ScalarTy = Ctx.Types.inferScalarType(this);
1081
1082 VPValue *Op0, *Op1;
1083 bool IsLogicalAnd =
1084 match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1)));
1085 bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
1086
1087 if (!IsScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
1088 (IsLogicalAnd || IsLogicalOr)) {
1089 // select x, y, false --> x & y
1090 // select x, true, y --> x | y
1091 const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0);
1092 const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1);
1093
1094 SmallVector<const Value *, 2> Operands;
1095 if (SI && all_of(operands(),
1096 [](VPValue *Op) { return Op->getUnderlyingValue(); }))
1097 append_range(Operands, SI->operands());
1098 return Ctx.TTI.getArithmeticInstrCost(
1099 IsLogicalOr ? Instruction::Or : Instruction::And, ResultTy,
1100 Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
1101 }
1102
1103 Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
1104 if (!IsScalarCond)
1105 CondTy = VectorType::get(CondTy, VF);
1106
1107 llvm::CmpPredicate Pred;
1108 if (!match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue())))
1109 if (auto *CondIRV = dyn_cast<VPIRValue>(getOperand(0)))
1110 if (auto *Cmp = dyn_cast<CmpInst>(CondIRV->getValue()))
1111 Pred = Cmp->getPredicate();
1112 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1113 return Ctx.TTI.getCmpSelInstrCost(
1114 Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
1115 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
1116 }
1117 }
1118 llvm_unreachable("called for unsupported opcode");
1119}
1120
1122 VPCostContext &Ctx) const {
1124 if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) {
1125 // TODO: Compute cost for VPInstructions without underlying values once
1126 // the legacy cost model has been retired.
1127 return 0;
1128 }
1129
1131 "Should only generate a vector value or single scalar, not scalars "
1132 "for all lanes.");
1134 getOpcode(),
1136 }
1137
1138 switch (getOpcode()) {
1139 case Instruction::Select: {
1141 match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue()));
1142 auto *CondTy = Ctx.Types.inferScalarType(getOperand(0));
1143 auto *VecTy = Ctx.Types.inferScalarType(getOperand(1));
1144 if (!vputils::onlyFirstLaneUsed(this)) {
1145 CondTy = toVectorTy(CondTy, VF);
1146 VecTy = toVectorTy(VecTy, VF);
1147 }
1148 return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred,
1149 Ctx.CostKind);
1150 }
1151 case Instruction::ExtractElement:
1153 if (VF.isScalar()) {
1154 // ExtractLane with VF=1 takes care of handling extracting across multiple
1155 // parts.
1156 return 0;
1157 }
1158
1159 // Add on the cost of extracting the element.
1160 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1161 return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
1162 Ctx.CostKind);
1163 }
1164 case VPInstruction::AnyOf: {
1165 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1166 return Ctx.TTI.getArithmeticReductionCost(
1167 Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
1168 }
1170 Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
1171 if (VF.isScalar())
1172 return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
1174 CmpInst::ICMP_EQ, Ctx.CostKind);
1175 // Calculate the cost of determining the lane index.
1176 auto *PredTy = toVectorTy(ScalarTy, VF);
1177 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
1178 Type::getInt64Ty(Ctx.LLVMCtx),
1179 {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
1180 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1181 }
1183 Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
1184 if (VF.isScalar())
1185 return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
1187 CmpInst::ICMP_EQ, Ctx.CostKind);
1188 // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB.
1189 auto *PredTy = toVectorTy(ScalarTy, VF);
1190 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
1191 Type::getInt64Ty(Ctx.LLVMCtx),
1192 {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
1193 InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1194 // Add cost of NOT operation on the predicate.
1195 Cost += Ctx.TTI.getArithmeticInstrCost(
1196 Instruction::Xor, PredTy, Ctx.CostKind,
1197 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1198 {TargetTransformInfo::OK_UniformConstantValue,
1199 TargetTransformInfo::OP_None});
1200 // Add cost of SUB operation on the index.
1201 Cost += Ctx.TTI.getArithmeticInstrCost(
1202 Instruction::Sub, Type::getInt64Ty(Ctx.LLVMCtx), Ctx.CostKind);
1203 return Cost;
1204 }
1206 assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
1208 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
1209 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1210
1211 return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
1212 cast<VectorType>(VectorTy),
1213 cast<VectorType>(VectorTy), Mask,
1214 Ctx.CostKind, VF.getKnownMinValue() - 1);
1215 }
1217 Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
1218 unsigned Multiplier =
1219 cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue();
1220 Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
1221 IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
1222 {ArgTy, ArgTy});
1223 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1224 }
1226 Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0));
1227 Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
1228 Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
1229 IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
1230 I32Ty, {Arg0Ty, I32Ty, I1Ty});
1231 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1232 }
1234 assert(VF.isVector() && "Reverse operation must be vector type");
1235 auto *VectorTy = cast<VectorType>(
1236 toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
1237 return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
1238 VectorTy, /*Mask=*/{}, Ctx.CostKind,
1239 /*Index=*/0);
1240 }
1242 // Add on the cost of extracting the element.
1243 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1244 return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
1245 VecTy, Ctx.CostKind, 0);
1246 }
1248 if (VF == ElementCount::getScalable(1))
1250 [[fallthrough]];
1251 default:
1252 // TODO: Compute cost of other VPInstructions once the legacy cost model has
1253 // been retired.
1255 "unexpected VPInstruction with underlying value");
1256 return 0;
1257 }
1258}
1259
1272
1274 switch (getOpcode()) {
1275 case Instruction::PHI:
1279 return true;
1280 default:
1281 return isScalarCast();
1282 }
1283}
1284
1286 assert(!State.Lane && "VPInstruction executing a Lane");
1287 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1289 "Set flags not supported for the provided opcode");
1290 if (hasFastMathFlags())
1291 State.Builder.setFastMathFlags(getFastMathFlags());
1292 Value *GeneratedValue = generate(State);
1293 if (!hasResult())
1294 return;
1295 assert(GeneratedValue && "generate must produce a value");
1296 bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
1299 assert((((GeneratedValue->getType()->isVectorTy() ||
1300 GeneratedValue->getType()->isStructTy()) ==
1301 !GeneratesPerFirstLaneOnly) ||
1302 State.VF.isScalar()) &&
1303 "scalar value but not only first lane defined");
1304 State.set(this, GeneratedValue,
1305 /*IsScalar*/ GeneratesPerFirstLaneOnly);
1306}
1307
1310 return false;
1311 switch (getOpcode()) {
1312 case Instruction::GetElementPtr:
1313 case Instruction::ExtractElement:
1314 case Instruction::Freeze:
1315 case Instruction::FCmp:
1316 case Instruction::ICmp:
1317 case Instruction::Select:
1318 case Instruction::PHI:
1338 case VPInstruction::Not:
1347 return false;
1348 default:
1349 return true;
1350 }
1351}
1352
1354 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1356 return vputils::onlyFirstLaneUsed(this);
1357
1358 switch (getOpcode()) {
1359 default:
1360 return false;
1361 case Instruction::ExtractElement:
1362 return Op == getOperand(1);
1363 case Instruction::PHI:
1364 return true;
1365 case Instruction::FCmp:
1366 case Instruction::ICmp:
1367 case Instruction::Select:
1368 case Instruction::Or:
1369 case Instruction::Freeze:
1370 case VPInstruction::Not:
1371 // TODO: Cover additional opcodes.
1372 return vputils::onlyFirstLaneUsed(this);
1381 return true;
1384 // Before replicating by VF, Build(Struct)Vector uses all lanes of the
1385 // operand, after replicating its operands only the first lane is used.
1386 // Before replicating, it will have only a single operand.
1387 return getNumOperands() > 1;
1389 return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
1391 // WidePtrAdd supports scalar and vector base addresses.
1392 return false;
1395 return Op == getOperand(1);
1397 return Op == getOperand(0);
1398 };
1399 llvm_unreachable("switch should return");
1400}
1401
1403 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1405 return vputils::onlyFirstPartUsed(this);
1406
1407 switch (getOpcode()) {
1408 default:
1409 return false;
1410 case Instruction::FCmp:
1411 case Instruction::ICmp:
1412 case Instruction::Select:
1413 return vputils::onlyFirstPartUsed(this);
1418 return true;
1419 };
1420 llvm_unreachable("switch should return");
1421}
1422
1423#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1425 VPSlotTracker SlotTracker(getParent()->getPlan());
1427}
1428
1430 VPSlotTracker &SlotTracker) const {
1431 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1432
1433 if (hasResult()) {
1435 O << " = ";
1436 }
1437
1438 switch (getOpcode()) {
1439 case VPInstruction::Not:
1440 O << "not";
1441 break;
1443 O << "combined load";
1444 break;
1446 O << "combined store";
1447 break;
1449 O << "active lane mask";
1450 break;
1452 O << "EXPLICIT-VECTOR-LENGTH";
1453 break;
1455 O << "first-order splice";
1456 break;
1458 O << "branch-on-cond";
1459 break;
1461 O << "branch-on-two-conds";
1462 break;
1464 O << "TC > VF ? TC - VF : 0";
1465 break;
1467 O << "VF * Part +";
1468 break;
1470 O << "branch-on-count";
1471 break;
1473 O << "broadcast";
1474 break;
1476 O << "buildstructvector";
1477 break;
1479 O << "buildvector";
1480 break;
1482 O << "extract-lane";
1483 break;
1485 O << "extract-last-lane";
1486 break;
1488 O << "extract-last-part";
1489 break;
1491 O << "extract-penultimate-element";
1492 break;
1494 O << "compute-anyof-result";
1495 break;
1497 O << "compute-find-iv-result";
1498 break;
1500 O << "compute-reduction-result";
1501 break;
1503 O << "logical-and";
1504 break;
1506 O << "ptradd";
1507 break;
1509 O << "wide-ptradd";
1510 break;
1512 O << "any-of";
1513 break;
1515 O << "first-active-lane";
1516 break;
1518 O << "last-active-lane";
1519 break;
1521 O << "reduction-start-vector";
1522 break;
1524 O << "resume-for-epilogue";
1525 break;
1527 O << "reverse";
1528 break;
1530 O << "unpack";
1531 break;
1532 default:
1534 }
1535
1536 printFlags(O);
1538}
1539#endif
1540
1542 State.setDebugLocFrom(getDebugLoc());
1543 if (isScalarCast()) {
1544 Value *Op = State.get(getOperand(0), VPLane(0));
1545 Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
1546 Op, ResultTy);
1547 State.set(this, Cast, VPLane(0));
1548 return;
1549 }
1550 switch (getOpcode()) {
1552 Value *StepVector =
1553 State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF));
1554 State.set(this, StepVector);
1555 break;
1556 }
1557 case VPInstruction::VScale: {
1558 Value *VScale = State.Builder.CreateVScale(ResultTy);
1559 State.set(this, VScale, true);
1560 break;
1561 }
1562
1563 default:
1564 llvm_unreachable("opcode not implemented yet");
1565 }
1566}
1567
1568#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1570 VPSlotTracker &SlotTracker) const {
1571 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1573 O << " = ";
1574
1575 switch (getOpcode()) {
1577 O << "wide-iv-step ";
1579 break;
1581 O << "step-vector " << *ResultTy;
1582 break;
1584 O << "vscale " << *ResultTy;
1585 break;
1586 default:
1587 assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
1590 O << " to " << *ResultTy;
1591 }
1592}
1593#endif
1594
1596 State.setDebugLocFrom(getDebugLoc());
1597 PHINode *NewPhi = State.Builder.CreatePHI(
1598 State.TypeAnalysis.inferScalarType(this), 2, getName());
1599 unsigned NumIncoming = getNumIncoming();
1600 if (getParent() != getParent()->getPlan()->getScalarPreheader()) {
1601 // TODO: Fixup all incoming values of header phis once recipes defining them
1602 // are introduced.
1603 NumIncoming = 1;
1604 }
1605 for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
1606 Value *IncV = State.get(getIncomingValue(Idx), VPLane(0));
1607 BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(getIncomingBlock(Idx));
1608 NewPhi->addIncoming(IncV, PredBB);
1609 }
1610 State.set(this, NewPhi, VPLane(0));
1611}
1612
1613#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1614void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent,
1615 VPSlotTracker &SlotTracker) const {
1616 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1618 O << " = phi ";
1620}
1621#endif
1622
1623VPIRInstruction *VPIRInstruction::create(Instruction &I) {
1624 if (auto *Phi = dyn_cast<PHINode>(&I))
1625 return new VPIRPhi(*Phi);
1626 return new VPIRInstruction(I);
1627}
1628
1630 assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
1631 "PHINodes must be handled by VPIRPhi");
1632 // Advance the insert point after the wrapped IR instruction. This allows
1633 // interleaving VPIRInstructions and other recipes.
1634 State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
1635}
1636
1638 VPCostContext &Ctx) const {
1639 // The recipe wraps an existing IR instruction on the border of VPlan's scope,
1640 // hence it does not contribute to the cost-modeling for the VPlan.
1641 return 0;
1642}
1643
1645 VPBuilder &Builder) {
1647 "can only update exiting operands to phi nodes");
1648 assert(getNumOperands() > 0 && "must have at least one operand");
1649 VPValue *Exiting = getOperand(0);
1650 if (isa<VPIRValue>(Exiting))
1651 return;
1652
1653 Exiting = Builder.createNaryOp(VPInstruction::ExtractLastPart, Exiting);
1654 Exiting = Builder.createNaryOp(VPInstruction::ExtractLastLane, Exiting);
1655 setOperand(0, Exiting);
1656}
1657
1658#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1660 VPSlotTracker &SlotTracker) const {
1661 O << Indent << "IR " << I;
1662}
1663#endif
1664
1666 PHINode *Phi = &getIRPhi();
1667 for (const auto &[Idx, Op] : enumerate(operands())) {
1668 VPValue *ExitValue = Op;
1669 auto Lane = vputils::isSingleScalar(ExitValue)
1671 : VPLane::getLastLaneForVF(State.VF);
1672 VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
1673 auto *PredVPBB = Pred->getExitingBasicBlock();
1674 BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
1675 // Set insertion point in PredBB in case an extract needs to be generated.
1676 // TODO: Model extracts explicitly.
1677 State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
1678 Value *V = State.get(ExitValue, VPLane(Lane));
1679 // If there is no existing block for PredBB in the phi, add a new incoming
1680 // value. Otherwise update the existing incoming value for PredBB.
1681 if (Phi->getBasicBlockIndex(PredBB) == -1)
1682 Phi->addIncoming(V, PredBB);
1683 else
1684 Phi->setIncomingValueForBlock(PredBB, V);
1685 }
1686
1687 // Advance the insert point after the wrapped IR instruction. This allows
1688 // interleaving VPIRInstructions and other recipes.
1689 State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator()));
1690}
1691
1693 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1694 assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
1695 "Number of phi operands must match number of predecessors");
1696 unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock);
1697 R->removeOperand(Position);
1698}
1699
1700#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1702 VPSlotTracker &SlotTracker) const {
1703 interleaveComma(enumerate(getAsRecipe()->operands()), O,
1704 [this, &O, &SlotTracker](auto Op) {
1705 O << "[ ";
1706 Op.value()->printAsOperand(O, SlotTracker);
1707 O << ", ";
1708 getIncomingBlock(Op.index())->printAsOperand(O);
1709 O << " ]";
1710 });
1711}
1712#endif
1713
1714#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1716 VPSlotTracker &SlotTracker) const {
1718
1719 if (getNumOperands() != 0) {
1720 O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
1722 [&O, &SlotTracker](auto Op) {
1723 std::get<0>(Op)->printAsOperand(O, SlotTracker);
1724 O << " from ";
1725 std::get<1>(Op)->printAsOperand(O);
1726 });
1727 O << ")";
1728 }
1729}
1730#endif
1731
1733 for (const auto &[Kind, Node] : Metadata)
1734 I.setMetadata(Kind, Node);
1735}
1736
1738 SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection;
1739 for (const auto &[KindA, MDA] : Metadata) {
1740 for (const auto &[KindB, MDB] : Other.Metadata) {
1741 if (KindA == KindB && MDA == MDB) {
1742 MetadataIntersection.emplace_back(KindA, MDA);
1743 break;
1744 }
1745 }
1746 }
1747 Metadata = std::move(MetadataIntersection);
1748}
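// After intersect() only the (kind, node) metadata pairs present on both
// recipes remain, which is the conservative result when two recipes are
// combined into one.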
1749
1750#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1752 const Module *M = SlotTracker.getModule();
1753 if (Metadata.empty() || !M)
1754 return;
1755
1756 ArrayRef<StringRef> MDNames = SlotTracker.getMDNames();
1757 O << " (";
1758 interleaveComma(Metadata, O, [&](const auto &KindNodePair) {
1759 auto [Kind, Node] = KindNodePair;
1760 assert(Kind < MDNames.size() && !MDNames[Kind].empty() &&
1761 "Unexpected unnamed metadata kind");
1762 O << "!" << MDNames[Kind] << " ";
1763 Node->printAsOperand(O, M);
1764 });
1765 O << ")";
1766}
1767#endif
1768
1770 assert(State.VF.isVector() && "not widening");
1771 assert(Variant != nullptr && "Can't create vector function.");
1772
1773 FunctionType *VFTy = Variant->getFunctionType();
1774 // Add return type if intrinsic is overloaded on it.
1776 for (const auto &I : enumerate(args())) {
1777 Value *Arg;
1778 // Some vectorized function variants may also take a scalar argument,
1779 // e.g. linear parameters for pointers. This needs to be the scalar value
1780 // from the start of the respective part when interleaving.
1781 if (!VFTy->getParamType(I.index())->isVectorTy())
1782 Arg = State.get(I.value(), VPLane(0));
1783 else
1784 Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
1785 Args.push_back(Arg);
1786 }
1787
1790 if (CI)
1791 CI->getOperandBundlesAsDefs(OpBundles);
1792
1793 CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
1794 applyFlags(*V);
1795 applyMetadata(*V);
1796 V->setCallingConv(Variant->getCallingConv());
1797
1798 if (!V->getType()->isVoidTy())
1799 State.set(this, V);
1800}
1801
1803 VPCostContext &Ctx) const {
1804 return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
1805 Variant->getFunctionType()->params(),
1806 Ctx.CostKind);
1807}
1808
1809#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1811 VPSlotTracker &SlotTracker) const {
1812 O << Indent << "WIDEN-CALL ";
1813
1814 Function *CalledFn = getCalledScalarFunction();
1815 if (CalledFn->getReturnType()->isVoidTy())
1816 O << "void ";
1817 else {
1819 O << " = ";
1820 }
1821
1822 O << "call";
1823 printFlags(O);
1824 O << " @" << CalledFn->getName() << "(";
1825 interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
1826 Op->printAsOperand(O, SlotTracker);
1827 });
1828 O << ")";
1829
1830 O << " (using library function";
1831 if (Variant->hasName())
1832 O << ": " << Variant->getName();
1833 O << ")";
1834}
1835#endif
1836
1838 assert(State.VF.isVector() && "not widening");
1839
1840 SmallVector<Type *, 2> TysForDecl;
1841 // Add return type if intrinsic is overloaded on it.
1842 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1,
1843 State.TTI)) {
1844 Type *RetTy = toVectorizedTy(getResultType(), State.VF);
1845 append_range(TysForDecl, getContainedTypes(RetTy));
1846 }
1848 for (const auto &I : enumerate(operands())) {
1849 // Some intrinsics have a scalar argument - don't replace it with a
1850 // vector.
1851 Value *Arg;
1852 if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
1853 State.TTI))
1854 Arg = State.get(I.value(), VPLane(0));
1855 else
1856 Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
1857 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
1858 State.TTI))
1859 TysForDecl.push_back(Arg->getType());
1860 Args.push_back(Arg);
1861 }
1862
1863 // Use vector version of the intrinsic.
1864 Module *M = State.Builder.GetInsertBlock()->getModule();
1865 Function *VectorF =
1866 Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
1867 assert(VectorF &&
1868 "Can't retrieve vector intrinsic or vector-predication intrinsics.");
1869
1872 if (CI)
1873 CI->getOperandBundlesAsDefs(OpBundles);
1874
1875 CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
1876
1877 applyFlags(*V);
1878 applyMetadata(*V);
1879
1880 if (!V->getType()->isVoidTy())
1881 State.set(this, V);
1882}
1883
1884/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R.
1887 const VPRecipeWithIRFlags &R,
1888 ElementCount VF,
1889 VPCostContext &Ctx) {
1890 // Some backends analyze intrinsic arguments to determine cost. Use the
1891 // underlying value for the operand if it has one. Otherwise try to use the
1892 // operand of the underlying call instruction, if there is one. Otherwise
1893 // clear Arguments.
1894 // TODO: Rework TTI interface to be independent of concrete IR values.
1896 for (const auto &[Idx, Op] : enumerate(Operands)) {
1897 auto *V = Op->getUnderlyingValue();
1898 if (!V) {
1899 if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) {
1900 Arguments.push_back(UI->getArgOperand(Idx));
1901 continue;
1902 }
1903 Arguments.clear();
1904 break;
1905 }
1906 Arguments.push_back(V);
1907 }
1908
1909 Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
1910 Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
1911 SmallVector<Type *> ParamTys;
1912 for (const VPValue *Op : Operands) {
1913 ParamTys.push_back(VF.isVector()
1914 ? toVectorTy(Ctx.Types.inferScalarType(Op), VF)
1915 : Ctx.Types.inferScalarType(Op));
1916 }
1917
1918 // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
1919 FastMathFlags FMF =
1920 R.hasFastMathFlags() ? R.getFastMathFlags() : FastMathFlags();
1921 IntrinsicCostAttributes CostAttrs(
1922 ID, RetTy, Arguments, ParamTys, FMF,
1923 dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()),
1924 InstructionCost::getInvalid(), &Ctx.TLI);
1925 return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
1926}
1927
1929 VPCostContext &Ctx) const {
1931 return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx);
1932}
1933
1935 return Intrinsic::getBaseName(VectorIntrinsicID);
1936}
1937
1939 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1940 return all_of(enumerate(operands()), [this, &Op](const auto &X) {
1941 auto [Idx, V] = X;
1943 Idx, nullptr);
1944 });
1945}
1946
1947#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1949 VPSlotTracker &SlotTracker) const {
1950 O << Indent << "WIDEN-INTRINSIC ";
1951 if (ResultTy->isVoidTy()) {
1952 O << "void ";
1953 } else {
1955 O << " = ";
1956 }
1957
1958 O << "call";
1959 printFlags(O);
1960 O << getIntrinsicName() << "(";
1961
1963 Op->printAsOperand(O, SlotTracker);
1964 });
1965 O << ")";
1966}
1967#endif
1968
1970 IRBuilderBase &Builder = State.Builder;
1971
1972 Value *Address = State.get(getOperand(0));
1973 Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
1974 VectorType *VTy = cast<VectorType>(Address->getType());
1975
1976 // The histogram intrinsic requires a mask even if the recipe doesn't;
1977 // if the mask operand was omitted then all lanes should be executed and
1978 // we just need to synthesize an all-true mask.
1979 Value *Mask = nullptr;
1980 if (VPValue *VPMask = getMask())
1981 Mask = State.get(VPMask);
1982 else
1983 Mask =
1984 Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));
1985
1986 // If this is a subtract, we want to invert the increment amount. We may
1987 // add a separate intrinsic in the future, but for now we'll try this.
1988 if (Opcode == Instruction::Sub)
1989 IncAmt = Builder.CreateNeg(IncAmt);
1990 else
1991 assert(Opcode == Instruction::Add && "only add or sub supported for now");
1992
1993 State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
1994 {VTy, IncAmt->getType()},
1995 {Address, IncAmt, Mask});
1996}
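// Illustrative output (assuming VF=4, i64 counters, no mask operand and an
// increment of 1); the code above emits roughly:
//   call void @llvm.experimental.vector.histogram.add.v4p0.i64(
//       <4 x ptr> %buckets, i64 1, <4 x i1> splat (i1 true))
// For Instruction::Sub the increment is negated first, so histogram
// subtraction reuses the same add intrinsic with a negative increment.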
1997
1999 VPCostContext &Ctx) const {
2000 // FIXME: Take the gather and scatter into account as well. For now we're
2001 // generating the same cost as the fallback path, but we'll likely
2002 // need to create a new TTI method for determining the cost, including
2003 // whether we can use base + vec-of-smaller-indices or just
2004 // vec-of-pointers.
2005 assert(VF.isVector() && "Invalid VF for histogram cost");
2006 Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
2007 VPValue *IncAmt = getOperand(1);
2008 Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
2009 VectorType *VTy = VectorType::get(IncTy, VF);
2010
2011 // Assume that a non-constant update value (or a constant != 1) requires
2012 // a multiply, and add that into the cost.
2013 InstructionCost MulCost =
2014 Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
2015 if (auto *IncAmountIRV = dyn_cast<VPIRValue>(IncAmt)) {
2016 ConstantInt *CI = dyn_cast<ConstantInt>(IncAmountIRV->getValue());
2017
2018 if (CI && CI->getZExtValue() == 1)
2019 MulCost = TTI::TCC_Free;
2020 }
2021
2022 // Find the cost of the histogram operation itself.
2023 Type *PtrTy = VectorType::get(AddressTy, VF);
2024 Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
2025 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
2026 Type::getVoidTy(Ctx.LLVMCtx),
2027 {PtrTy, IncTy, MaskTy});
2028
2029 // Add the costs together with the add/sub operation.
2030 return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
2031 Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
2032}
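// Worked example (assuming VF=4 and an i32 bucket update): the returned cost
// is the histogram intrinsic cost plus MulCost plus the add/sub cost, where
// MulCost collapses to TCC_Free when the increment is the constant 1 and
// otherwise accounts for a <4 x i32> multiply.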
2033
2034#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2036 VPSlotTracker &SlotTracker) const {
2037 O << Indent << "WIDEN-HISTOGRAM buckets: ";
2039
2040 if (Opcode == Instruction::Sub)
2041 O << ", dec: ";
2042 else {
2043 assert(Opcode == Instruction::Add);
2044 O << ", inc: ";
2045 }
2047
2048 if (VPValue *Mask = getMask()) {
2049 O << ", mask: ";
2050 Mask->printAsOperand(O, SlotTracker);
2051 }
2052}
2053#endif
2054
2055VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
2056 AllowReassoc = FMF.allowReassoc();
2057 NoNaNs = FMF.noNaNs();
2058 NoInfs = FMF.noInfs();
2059 NoSignedZeros = FMF.noSignedZeros();
2060 AllowReciprocal = FMF.allowReciprocal();
2061 AllowContract = FMF.allowContract();
2062 ApproxFunc = FMF.approxFunc();
2063}
2064
2065#if !defined(NDEBUG)
2066bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
2067 switch (OpType) {
2068 case OperationType::OverflowingBinOp:
2069 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
2070 Opcode == Instruction::Mul || Opcode == Instruction::Shl ||
2071 Opcode == VPInstruction::CanonicalIVIncrementForPart;
2072 case OperationType::Trunc:
2073 return Opcode == Instruction::Trunc;
2074 case OperationType::DisjointOp:
2075 return Opcode == Instruction::Or;
2076 case OperationType::PossiblyExactOp:
2077 return Opcode == Instruction::AShr || Opcode == Instruction::LShr ||
2078 Opcode == Instruction::UDiv || Opcode == Instruction::SDiv;
2079 case OperationType::GEPOp:
2080 return Opcode == Instruction::GetElementPtr ||
2081 Opcode == VPInstruction::PtrAdd ||
2082 Opcode == VPInstruction::WidePtrAdd;
2083 case OperationType::FPMathOp:
2084 return Opcode == Instruction::Call || Opcode == Instruction::FAdd ||
2085 Opcode == Instruction::FMul || Opcode == Instruction::FSub ||
2086 Opcode == Instruction::FNeg || Opcode == Instruction::FDiv ||
2087 Opcode == Instruction::FRem || Opcode == Instruction::FPExt ||
2088 Opcode == Instruction::FPTrunc || Opcode == Instruction::Select ||
2089 Opcode == VPInstruction::WideIVStep ||
2091 case OperationType::FCmp:
2092 return Opcode == Instruction::FCmp;
2093 case OperationType::NonNegOp:
2094 return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP;
2095 case OperationType::Cmp:
2096 return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
2097 case OperationType::ReductionOp:
2099 case OperationType::Other:
2100 return true;
2101 }
2102 llvm_unreachable("Unknown OperationType enum");
2103}
2104#endif
2105
2106#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2108 switch (OpType) {
2109 case OperationType::Cmp:
2111 break;
2112 case OperationType::FCmp:
2115 break;
2116 case OperationType::DisjointOp:
2117 if (DisjointFlags.IsDisjoint)
2118 O << " disjoint";
2119 break;
2120 case OperationType::PossiblyExactOp:
2121 if (ExactFlags.IsExact)
2122 O << " exact";
2123 break;
2124 case OperationType::OverflowingBinOp:
2125 if (WrapFlags.HasNUW)
2126 O << " nuw";
2127 if (WrapFlags.HasNSW)
2128 O << " nsw";
2129 break;
2130 case OperationType::Trunc:
2131 if (TruncFlags.HasNUW)
2132 O << " nuw";
2133 if (TruncFlags.HasNSW)
2134 O << " nsw";
2135 break;
2136 case OperationType::FPMathOp:
2138 break;
2139 case OperationType::GEPOp:
2140 if (GEPFlags.isInBounds())
2141 O << " inbounds";
2142 else if (GEPFlags.hasNoUnsignedSignedWrap())
2143 O << " nusw";
2144 if (GEPFlags.hasNoUnsignedWrap())
2145 O << " nuw";
2146 break;
2147 case OperationType::NonNegOp:
2148 if (NonNegFlags.NonNeg)
2149 O << " nneg";
2150 break;
2151 case OperationType::ReductionOp: {
2152 RecurKind RK = getRecurKind();
2153 O << " ("
2155 if (isReductionInLoop())
2156 O << ", in-loop";
2157 if (isReductionOrdered())
2158 O << ", ordered";
2159 O << ")";
2161 break;
2162 }
2163 case OperationType::Other:
2164 break;
2165 }
2166 O << " ";
2167}
2168#endif
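// Illustrative printed forms (assumed examples): an OverflowingBinOp with both
// wrap flags prints " nuw nsw ", a GEPOp with inbounds prints " inbounds ",
// and a ReductionOp prints something like " (add, in-loop) " depending on the
// recurrence kind and reduction style.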
2169
2171 auto &Builder = State.Builder;
2172 switch (Opcode) {
2173 case Instruction::Call:
2174 case Instruction::Br:
2175 case Instruction::PHI:
2176 case Instruction::GetElementPtr:
2177 llvm_unreachable("This instruction is handled by a different recipe.");
2178 case Instruction::UDiv:
2179 case Instruction::SDiv:
2180 case Instruction::SRem:
2181 case Instruction::URem:
2182 case Instruction::Add:
2183 case Instruction::FAdd:
2184 case Instruction::Sub:
2185 case Instruction::FSub:
2186 case Instruction::FNeg:
2187 case Instruction::Mul:
2188 case Instruction::FMul:
2189 case Instruction::FDiv:
2190 case Instruction::FRem:
2191 case Instruction::Shl:
2192 case Instruction::LShr:
2193 case Instruction::AShr:
2194 case Instruction::And:
2195 case Instruction::Or:
2196 case Instruction::Xor: {
2197 // Just widen unops and binops.
2199 for (VPValue *VPOp : operands())
2200 Ops.push_back(State.get(VPOp));
2201
2202 Value *V = Builder.CreateNAryOp(Opcode, Ops);
2203
2204 if (auto *VecOp = dyn_cast<Instruction>(V)) {
2205 applyFlags(*VecOp);
2206 applyMetadata(*VecOp);
2207 }
2208
2209 // Use this vector value for all users of the original instruction.
2210 State.set(this, V);
2211 break;
2212 }
2213 case Instruction::ExtractValue: {
2214 assert(getNumOperands() == 2 && "expected single level extractvalue");
2215 Value *Op = State.get(getOperand(0));
2217 Value *Extract = Builder.CreateExtractValue(Op, CI->getZExtValue());
2218 State.set(this, Extract);
2219 break;
2220 }
2221 case Instruction::Freeze: {
2222 Value *Op = State.get(getOperand(0));
2223 Value *Freeze = Builder.CreateFreeze(Op);
2224 State.set(this, Freeze);
2225 break;
2226 }
2227 case Instruction::ICmp:
2228 case Instruction::FCmp: {
2229 // Widen compares. Generate vector compares.
2230 bool FCmp = Opcode == Instruction::FCmp;
2231 Value *A = State.get(getOperand(0));
2232 Value *B = State.get(getOperand(1));
2233 Value *C = nullptr;
2234 if (FCmp) {
2235 C = Builder.CreateFCmp(getPredicate(), A, B);
2236 } else {
2237 C = Builder.CreateICmp(getPredicate(), A, B);
2238 }
2239 if (auto *I = dyn_cast<Instruction>(C)) {
2240 applyFlags(*I);
2241 applyMetadata(*I);
2242 }
2243 State.set(this, C);
2244 break;
2245 }
2246 case Instruction::Select: {
2247 VPValue *CondOp = getOperand(0);
2248 Value *Cond = State.get(CondOp, vputils::isSingleScalar(CondOp));
2249 Value *Op0 = State.get(getOperand(1));
2250 Value *Op1 = State.get(getOperand(2));
2251 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
2252 State.set(this, Sel);
2253 if (auto *I = dyn_cast<Instruction>(Sel)) {
2255 applyFlags(*I);
2256 applyMetadata(*I);
2257 }
2258 break;
2259 }
2260 default:
2261 // This instruction is not vectorized by simple widening.
2262 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
2263 << Instruction::getOpcodeName(Opcode));
2264 llvm_unreachable("Unhandled instruction!");
2265 } // end of switch.
2266
2267#if !defined(NDEBUG)
2268 // Verify that VPlan type inference results agree with the type of the
2269 // generated values.
2270 assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
2271 State.get(this)->getType() &&
2272 "inferred type and type from generated instructions do not match");
2273#endif
2274}
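// Illustrative example (assuming VF=4): a scalar
//   %r = add nsw i32 %a, %b
// handled by the binop path above becomes roughly
//   %r.vec = add nsw <4 x i32> %a.vec, %b.vec
// with flags and metadata re-applied via applyFlags/applyMetadata and the
// widened value recorded for all users through State.set.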
2275
2277 VPCostContext &Ctx) const {
2278 switch (Opcode) {
2279 case Instruction::UDiv:
2280 case Instruction::SDiv:
2281 case Instruction::SRem:
2282 case Instruction::URem:
2283 // If the div/rem operation isn't safe to speculate and requires
2284 // predication, then the only way we can even create a vplan is to insert
2285 // a select on the second input operand to ensure we use the value of 1
2286 // for the inactive lanes. The select will be costed separately.
2287 case Instruction::FNeg:
2288 case Instruction::Add:
2289 case Instruction::FAdd:
2290 case Instruction::Sub:
2291 case Instruction::FSub:
2292 case Instruction::Mul:
2293 case Instruction::FMul:
2294 case Instruction::FDiv:
2295 case Instruction::FRem:
2296 case Instruction::Shl:
2297 case Instruction::LShr:
2298 case Instruction::AShr:
2299 case Instruction::And:
2300 case Instruction::Or:
2301 case Instruction::Xor:
2302 case Instruction::Freeze:
2303 case Instruction::ExtractValue:
2304 case Instruction::ICmp:
2305 case Instruction::FCmp:
2306 case Instruction::Select:
2307 return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
2308 default:
2309 llvm_unreachable("Unsupported opcode for instruction");
2310 }
2311}
2312
2313#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2315 VPSlotTracker &SlotTracker) const {
2316 O << Indent << "WIDEN ";
2318 O << " = " << Instruction::getOpcodeName(Opcode);
2319 printFlags(O);
2321}
2322#endif
2323
2325 auto &Builder = State.Builder;
2326 /// Vectorize casts.
2327 assert(State.VF.isVector() && "Not vectorizing?");
2328 Type *DestTy = VectorType::get(getResultType(), State.VF);
2329 VPValue *Op = getOperand(0);
2330 Value *A = State.get(Op);
2331 Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
2332 State.set(this, Cast);
2333 if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
2334 applyFlags(*CastOp);
2335 applyMetadata(*CastOp);
2336 }
2337}
2338
2340 VPCostContext &Ctx) const {
2341 // TODO: In some cases, VPWidenCastRecipes are created but not considered in
2342 // the legacy cost model, including truncates/extends when evaluating a
2343 // reduction in a smaller type.
2344 if (!getUnderlyingValue())
2345 return 0;
2346 return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
2347}
2348
2349#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2351 VPSlotTracker &SlotTracker) const {
2352 O << Indent << "WIDEN-CAST ";
2354 O << " = " << Instruction::getOpcodeName(Opcode);
2355 printFlags(O);
2357 O << " to " << *getResultType();
2358}
2359#endif
2360
2362 VPCostContext &Ctx) const {
2363 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
2364}
2365
2366#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2368 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
2369 O << Indent;
2371 O << " = WIDEN-INDUCTION";
2372 printFlags(O);
2374
2375 if (auto *TI = getTruncInst())
2376 O << " (truncated to " << *TI->getType() << ")";
2377}
2378#endif
2379
2381 // The step may be defined by a recipe in the preheader (e.g. if it requires
2382 // SCEV expansion), but for the canonical induction the step is required to be
2383 // 1, which is represented as live-in.
2385 if (!Step)
2386 return false;
2388 auto *StepC = dyn_cast<ConstantInt>(Step->getValue());
2389 auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getValue());
2390 return StartC && StartC->isZero() && StepC && StepC->isOne() &&
2391 getScalarType() == getRegion()->getCanonicalIVType();
2392}
2393
2394#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2396 VPSlotTracker &SlotTracker) const {
2397 O << Indent;
2399 O << " = DERIVED-IV ";
2400 getStartValue()->printAsOperand(O, SlotTracker);
2401 O << " + ";
2402 getOperand(1)->printAsOperand(O, SlotTracker);
2403 O << " * ";
2404 getStepValue()->printAsOperand(O, SlotTracker);
2405}
2406#endif
2407
2409 // Fast-math-flags propagate from the original induction instruction.
2410 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
2411 if (hasFastMathFlags())
2412 State.Builder.setFastMathFlags(getFastMathFlags());
2413
2414 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
2415 /// variable on which to base the steps; \p Step is the size of the step.
2416
2417 Value *BaseIV = State.get(getOperand(0), VPLane(0));
2418 Value *Step = State.get(getStepValue(), VPLane(0));
2419 IRBuilderBase &Builder = State.Builder;
2420
2421 // Ensure step has the same type as that of scalar IV.
2422 Type *BaseIVTy = BaseIV->getType()->getScalarType();
2423 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
2424
2425 // We build scalar steps for both integer and floating-point induction
2426 // variables. Here, we determine the kind of arithmetic we will perform.
2429 if (BaseIVTy->isIntegerTy()) {
2430 AddOp = Instruction::Add;
2431 MulOp = Instruction::Mul;
2432 } else {
2433 AddOp = InductionOpcode;
2434 MulOp = Instruction::FMul;
2435 }
2436
2437 // Determine the number of scalars we need to generate for each unroll
2438 // iteration.
2439 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
2440 // Compute the scalar steps and save the results in State.
2441 Type *IntStepTy =
2442 IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
2443
2444 unsigned StartLane = 0;
2445 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2446 if (State.Lane) {
2447 StartLane = State.Lane->getKnownLane();
2448 EndLane = StartLane + 1;
2449 }
2450 Value *StartIdx0;
2451 if (getUnrollPart(*this) == 0)
2452 StartIdx0 = ConstantInt::get(IntStepTy, 0);
2453 else {
2454 StartIdx0 = State.get(getOperand(2), true);
2455 if (getUnrollPart(*this) != 1) {
2456 StartIdx0 =
2457 Builder.CreateMul(StartIdx0, ConstantInt::get(StartIdx0->getType(),
2458 getUnrollPart(*this)));
2459 }
2460 StartIdx0 = Builder.CreateSExtOrTrunc(StartIdx0, IntStepTy);
2461 }
2462
2463 if (BaseIVTy->isFloatingPointTy())
2464 StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
2465
2466 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
2467 // It is okay if the induction variable type cannot hold the lane number;
2468 // we expect truncation in this case.
2469 Constant *LaneValue =
2470 BaseIVTy->isIntegerTy()
2471 ? ConstantInt::get(BaseIVTy, Lane, /*IsSigned=*/false,
2472 /*ImplicitTrunc=*/true)
2473 : ConstantFP::get(BaseIVTy, Lane);
2474 Value *StartIdx = Builder.CreateBinOp(AddOp, StartIdx0, LaneValue);
2475 // The step returned by `createStepForVF` is a runtime-evaluated value
2476 // when VF is scalable. Otherwise, it should be folded into a Constant.
2477 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2478 "Expected StartIdx to be folded to a constant when VF is not "
2479 "scalable");
2480 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2481 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
2482 State.set(this, Add, VPLane(Lane));
2483 }
2484}
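// Worked example (assuming an integer IV, Step=2, VF=4 and unroll part 0):
// the loop above produces the per-lane scalars
//   lane0: BaseIV + 0*2, lane1: BaseIV + 1*2, lane2: BaseIV + 2*2,
//   lane3: BaseIV + 3*2
// For a later unroll part, StartIdx0 comes from operand 2 scaled by the part
// number, so its lanes continue where the previous part left off.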
2485
2486#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2488 VPSlotTracker &SlotTracker) const {
2489 O << Indent;
2491 O << " = SCALAR-STEPS ";
2493}
2494#endif
2495
2497 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
2499}
2500
2502 assert(State.VF.isVector() && "not widening");
2503 // Construct a vector GEP by widening the operands of the scalar GEP as
2504 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
2505 // results in a vector of pointers when at least one operand of the GEP
2506 // is vector-typed. Thus, to keep the representation compact, we only use
2507 // vector-typed operands for loop-varying values.
2508
2509 bool AllOperandsAreInvariant = all_of(operands(), [](VPValue *Op) {
2510 return Op->isDefinedOutsideLoopRegions();
2511 });
2512 if (AllOperandsAreInvariant) {
2513 // If we are vectorizing, but the GEP has only loop-invariant operands,
2514 // the GEP we build (by only using vector-typed operands for
2515 // loop-varying values) would be a scalar pointer. Thus, to ensure we
2516 // produce a vector of pointers, we need to either arbitrarily pick an
2517 // operand to broadcast, or broadcast a clone of the original GEP.
2518 // Here, we broadcast a clone of the original.
2519
2521 for (unsigned I = 0, E = getNumOperands(); I != E; I++)
2522 Ops.push_back(State.get(getOperand(I), VPLane(0)));
2523
2524 auto *NewGEP =
2525 State.Builder.CreateGEP(getSourceElementType(), Ops[0], drop_begin(Ops),
2526 "", getGEPNoWrapFlags());
2527 Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
2528 State.set(this, Splat);
2529 return;
2530 }
2531
2532 // If the GEP has at least one loop-varying operand, we are sure to
2533 // produce a vector of pointers unless VF is scalar.
2534 // The pointer operand of the new GEP. If it's loop-invariant, we
2535 // won't broadcast it.
2536 auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant());
2537
2538 // Collect all the indices for the new GEP. If any index is
2539 // loop-invariant, we won't broadcast it.
2541 for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
2542 VPValue *Operand = getOperand(I);
2543 Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1)));
2544 }
2545
2546 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
2547 // but it should be a vector, otherwise.
2548 auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices,
2549 "", getGEPNoWrapFlags());
2550 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
2551 "NewGEP is not a pointer vector");
2552 State.set(this, NewGEP);
2553}
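// Illustrative example (assuming VF=4): a GEP whose operands are all
// loop-invariant, e.g.
//   %p = getelementptr inbounds i32, ptr %base, i64 %inv
// is emitted once as a scalar GEP and then broadcast with CreateVectorSplat,
// whereas a GEP with a loop-varying index
//   %q = getelementptr inbounds i32, ptr %base, i64 %iv
// keeps %base scalar, uses a <4 x i64> index vector and yields a <4 x ptr>.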
2554
2555#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2557 VPSlotTracker &SlotTracker) const {
2558 O << Indent << "WIDEN-GEP ";
2559 O << (isPointerLoopInvariant() ? "Inv" : "Var");
2560 for (size_t I = 0; I < getNumOperands() - 1; ++I)
2561 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
2562
2563 O << " ";
2565 O << " = getelementptr";
2566 printFlags(O);
2568}
2569#endif
2570
2572 auto &Builder = State.Builder;
2573 unsigned CurrentPart = getUnrollPart(*this);
2574 const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
2575 Type *IndexTy = DL.getIndexType(State.TypeAnalysis.inferScalarType(this));
2576
2577 // The wide store needs to start at the last vector element.
2578 Value *RunTimeVF = State.get(getVFValue(), VPLane(0));
2579 if (IndexTy != RunTimeVF->getType())
2580 RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy);
2581 // NumElt = Stride * CurrentPart * RunTimeVF
2582 Value *NumElt = Builder.CreateMul(
2583 ConstantInt::getSigned(IndexTy, Stride * (int64_t)CurrentPart),
2584 RunTimeVF);
2585 // LastLane = Stride * (RunTimeVF - 1)
2586 Value *LastLane = Builder.CreateSub(RunTimeVF, ConstantInt::get(IndexTy, 1));
2587 if (Stride != 1)
2588 LastLane =
2589 Builder.CreateMul(ConstantInt::getSigned(IndexTy, Stride), LastLane);
2590 Value *Ptr = State.get(getOperand(0), VPLane(0));
2591 Value *ResultPtr =
2592 Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags());
2593 ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "",
2595
2596 State.set(this, ResultPtr, /*IsScalar*/ true);
2597}
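// Worked example (assuming a reverse access with Stride=-1, VF=4, unroll part
// 0 and RunTimeVF=4): NumElt = -1*0*4 = 0 and LastLane = -1*(4-1) = -3, so the
// two GEPs step the pointer back by three elements and a subsequent reversed
// wide load or store covers the four lanes ending at the original address.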
2598
2599#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2601 VPSlotTracker &SlotTracker) const {
2602 O << Indent;
2604 O << " = vector-end-pointer";
2605 printFlags(O);
2607}
2608#endif
2609
2611 auto &Builder = State.Builder;
2612 assert(getOffset() &&
2613 "Expected prior simplification of recipe without offset");
2614 Value *Ptr = State.get(getOperand(0), VPLane(0));
2615 Value *Offset = State.get(getOffset(), true);
2616 Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Offset, "",
2618 State.set(this, ResultPtr, /*IsScalar*/ true);
2619}
2620
2621#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2623 VPSlotTracker &SlotTracker) const {
2624 O << Indent;
2626 O << " = vector-pointer";
2627 printFlags(O);
2629}
2630#endif
2631
2633 VPCostContext &Ctx) const {
2634 // A blend will be expanded to a select VPInstruction, which will generate a
2635 // scalar select if only the first lane is used.
2637 VF = ElementCount::getFixed(1);
2638
2639 Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
2640 Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
2641 return (getNumIncomingValues() - 1) *
2642 Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
2643 CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
2644}
2645
2646#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2648 VPSlotTracker &SlotTracker) const {
2649 O << Indent << "BLEND ";
2651 O << " =";
2652 if (getNumIncomingValues() == 1) {
2653 // Not a User of any mask: not really blending, this is a
2654 // single-predecessor phi.
2655 O << " ";
2656 getIncomingValue(0)->printAsOperand(O, SlotTracker);
2657 } else {
2658 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
2659 O << " ";
2660 getIncomingValue(I)->printAsOperand(O, SlotTracker);
2661 if (I == 0)
2662 continue;
2663 O << "/";
2664 getMask(I)->printAsOperand(O, SlotTracker);
2665 }
2666 }
2667}
2668#endif
2669
2671 assert(!State.Lane && "Reduction being replicated.");
2674 "In-loop AnyOf reductions aren't currently supported");
2675 // Propagate the fast-math flags carried by the underlying instruction.
2676 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
2677 State.Builder.setFastMathFlags(getFastMathFlags());
2678 Value *NewVecOp = State.get(getVecOp());
2679 if (VPValue *Cond = getCondOp()) {
2680 Value *NewCond = State.get(Cond, State.VF.isScalar());
2681 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
2682 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
2683
2684 Value *Start = getRecurrenceIdentity(Kind, ElementTy, getFastMathFlags());
2685 if (State.VF.isVector())
2686 Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
2687
2688 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
2689 NewVecOp = Select;
2690 }
2691 Value *NewRed;
2692 Value *NextInChain;
2693 if (isOrdered()) {
2694 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2695 if (State.VF.isVector())
2696 NewRed =
2697 createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain);
2698 else
2699 NewRed = State.Builder.CreateBinOp(
2701 PrevInChain, NewVecOp);
2702 PrevInChain = NewRed;
2703 NextInChain = NewRed;
2704 } else if (isPartialReduction()) {
2705 assert(Kind == RecurKind::Add && "Unexpected partial reduction kind");
2706 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ false);
2707 NewRed = State.Builder.CreateIntrinsic(
2708 PrevInChain->getType(), Intrinsic::vector_partial_reduce_add,
2709 {PrevInChain, NewVecOp}, nullptr, "partial.reduce");
2710 PrevInChain = NewRed;
2711 NextInChain = NewRed;
2712 } else {
2713 assert(isInLoop() &&
2714 "The reduction must either be ordered, partial or in-loop");
2715 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2716 NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind);
2718 NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain);
2719 else
2720 NextInChain = State.Builder.CreateBinOp(
2722 PrevInChain, NewRed);
2723 }
2724 State.set(this, NextInChain, /*IsScalar*/ !isPartialReduction());
2725}
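// Illustrative lowering (assuming an in-loop, conditional integer add
// reduction at VF=4); the code above emits roughly:
//   %sel  = select <4 x i1> %mask, <4 x i32> %vec.op, <4 x i32> zeroinitializer
//   %red  = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %sel)
//   %next = add i32 %prev.chain, %red
// where zeroinitializer is the identity of the add recurrence; min/max kinds
// combine the scalar result with the chain via createMinMaxOp instead.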
2726
2728 assert(!State.Lane && "Reduction being replicated.");
2729
2730 auto &Builder = State.Builder;
2731 // Propagate the fast-math flags carried by the underlying instruction.
2732 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
2733 Builder.setFastMathFlags(getFastMathFlags());
2734
2736 Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
2737 Value *VecOp = State.get(getVecOp());
2738 Value *EVL = State.get(getEVL(), VPLane(0));
2739
2740 Value *Mask;
2741 if (VPValue *CondOp = getCondOp())
2742 Mask = State.get(CondOp);
2743 else
2744 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2745
2746 Value *NewRed;
2747 if (isOrdered()) {
2748 NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL);
2749 } else {
2750 NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL);
2752 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
2753 else
2754 NewRed = Builder.CreateBinOp(
2756 Prev);
2757 }
2758 State.set(this, NewRed, /*IsScalar*/ true);
2759}
2760
2762 VPCostContext &Ctx) const {
2763 RecurKind RdxKind = getRecurrenceKind();
2764 Type *ElementTy = Ctx.Types.inferScalarType(this);
2765 auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
2766 unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
2768 std::optional<FastMathFlags> OptionalFMF =
2769 ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
2770
2771 if (isPartialReduction()) {
2772 InstructionCost CondCost = 0;
2773 if (isConditional()) {
2775 auto *CondTy = cast<VectorType>(
2776 toVectorTy(Ctx.Types.inferScalarType(getCondOp()), VF));
2777 CondCost = Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VectorTy,
2778 CondTy, Pred, Ctx.CostKind);
2779 }
2780 return CondCost + Ctx.TTI.getPartialReductionCost(
2781 Opcode, ElementTy, ElementTy, ElementTy, VF,
2783 TargetTransformInfo::PR_None, std::nullopt,
2784 Ctx.CostKind);
2785 }
2786
2787 // TODO: Support any-of reductions.
2788 assert(
2790 ForceTargetInstructionCost.getNumOccurrences() > 0) &&
2791 "Any-of reduction not implemented in VPlan-based cost model currently.");
2792
2793 // Note that TTI should model the cost of moving the result to the scalar register
2794 // and the BinOp cost in the getMinMaxReductionCost().
2797 return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
2798 }
2799
2800 // Note that TTI should model the cost of moving the result to the scalar register
2801 // and the BinOp cost in the getArithmeticReductionCost().
2802 return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
2803 Ctx.CostKind);
2804}
2805
2806VPExpressionRecipe::VPExpressionRecipe(
2807 ExpressionTypes ExpressionType,
2808 ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
2809 : VPSingleDefRecipe(VPDef::VPExpressionSC, {}, {}),
2810 ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) {
2811 assert(!ExpressionRecipes.empty() && "Nothing to combine?");
2812 assert(
2813 none_of(ExpressionRecipes,
2814 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
2815 "expression cannot contain recipes with side-effects");
2816
2817 // Maintain a copy of the expression recipes as a set of users.
2818 SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
2819 for (auto *R : ExpressionRecipes)
2820 ExpressionRecipesAsSetOfUsers.insert(R);
2821
2822 // Recipes in the expression, except the last one, must only be used by
2823 // (other) recipes inside the expression. If there are other users, external
2824 // to the expression, use a clone of the recipe for external users.
2825 for (VPSingleDefRecipe *R : reverse(ExpressionRecipes)) {
2826 if (R != ExpressionRecipes.back() &&
2827 any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
2828 return !ExpressionRecipesAsSetOfUsers.contains(U);
2829 })) {
2830 // There are users outside of the expression. Clone the recipe and use the
2831 // clone for those external users.
2832 VPSingleDefRecipe *CopyForExtUsers = R->clone();
2833 R->replaceUsesWithIf(CopyForExtUsers, [&ExpressionRecipesAsSetOfUsers](
2834 VPUser &U, unsigned) {
2835 return !ExpressionRecipesAsSetOfUsers.contains(&U);
2836 });
2837 CopyForExtUsers->insertBefore(R);
2838 }
2839 if (R->getParent())
2840 R->removeFromParent();
2841 }
2842
2843 // Internalize all external operands to the expression recipes. To do so,
2844 // create new temporary VPValues for all operands defined by a recipe outside
2845 // the expression. The original operands are added as operands of the
2846 // VPExpressionRecipe itself.
2847 for (auto *R : ExpressionRecipes) {
2848 for (const auto &[Idx, Op] : enumerate(R->operands())) {
2849 auto *Def = Op->getDefiningRecipe();
2850 if (Def && ExpressionRecipesAsSetOfUsers.contains(Def))
2851 continue;
2852 addOperand(Op);
2853 LiveInPlaceholders.push_back(new VPSymbolicValue());
2854 }
2855 }
2856
2857 // Replace each external operand with the first one created for it in
2858 // LiveInPlaceholders.
2859 for (auto *R : ExpressionRecipes)
2860 for (auto const &[LiveIn, Tmp] : zip(operands(), LiveInPlaceholders))
2861 R->replaceUsesOfWith(LiveIn, Tmp);
2862}
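// Illustrative sketch (assuming an ext-mul-reduce chain is being bundled):
// the constructor above detaches the three recipes from their block, clones
// any of them that still have users outside the bundle, and rewires operands
// defined outside the bundle to temporary placeholder values, while the
// original external operands become operands of the VPExpressionRecipe itself
// so the bundle can later be costed, printed or unbundled as a single unit.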
2863
2865 for (auto *R : ExpressionRecipes)
2866 // Since the list could contain duplicates, make sure the recipe hasn't
2867 // already been inserted.
2868 if (!R->getParent())
2869 R->insertBefore(this);
2870
2871 for (const auto &[Idx, Op] : enumerate(operands()))
2872 LiveInPlaceholders[Idx]->replaceAllUsesWith(Op);
2873
2874 replaceAllUsesWith(ExpressionRecipes.back());
2875 ExpressionRecipes.clear();
2876}
2877
2879 VPCostContext &Ctx) const {
2880 Type *RedTy = Ctx.Types.inferScalarType(this);
2881 auto *SrcVecTy = cast<VectorType>(
2882 toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
2883 assert(RedTy->isIntegerTy() &&
2884 "VPExpressionRecipe only supports integer types currently.");
2885 unsigned Opcode = RecurrenceDescriptor::getOpcode(
2886 cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
2887 switch (ExpressionType) {
2888 case ExpressionTypes::ExtendedReduction: {
2889 unsigned Opcode = RecurrenceDescriptor::getOpcode(
2890 cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
2891 auto *ExtR = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
2892
2893 return cast<VPReductionRecipe>(ExpressionRecipes.back())
2894 ->isPartialReduction()
2895 ? Ctx.TTI.getPartialReductionCost(
2896 Opcode, Ctx.Types.inferScalarType(getOperand(0)), nullptr,
2897 RedTy, VF,
2899 ExtR->getOpcode()),
2900 TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind)
2901 : Ctx.TTI.getExtendedReductionCost(
2902 Opcode, ExtR->getOpcode() == Instruction::ZExt, RedTy,
2903 SrcVecTy, std::nullopt, Ctx.CostKind);
2904 }
2905 case ExpressionTypes::MulAccReduction:
2906 return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
2907 Ctx.CostKind);
2908
2909 case ExpressionTypes::ExtNegatedMulAccReduction:
2910 assert(Opcode == Instruction::Add && "Unexpected opcode");
2911 Opcode = Instruction::Sub;
2912 [[fallthrough]];
2913 case ExpressionTypes::ExtMulAccReduction: {
2914 auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());
2915 if (RedR->isPartialReduction()) {
2916 auto *Ext0R = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
2917 auto *Ext1R = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
2918 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
2919 return Ctx.TTI.getPartialReductionCost(
2920 Opcode, Ctx.Types.inferScalarType(getOperand(0)),
2921 Ctx.Types.inferScalarType(getOperand(1)), RedTy, VF,
2923 Ext0R->getOpcode()),
2925 Ext1R->getOpcode()),
2926 Mul->getOpcode(), Ctx.CostKind);
2927 }
2928 return Ctx.TTI.getMulAccReductionCost(
2929 cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
2930 Instruction::ZExt,
2931 Opcode, RedTy, SrcVecTy, Ctx.CostKind);
2932 }
2933 }
2934 llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
2935}
2936
2938 return any_of(ExpressionRecipes, [](VPSingleDefRecipe *R) {
2939 return R->mayReadFromMemory() || R->mayWriteToMemory();
2940 });
2941}
2942
2944 assert(
2945 none_of(ExpressionRecipes,
2946 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
2947 "expression cannot contain recipes with side-effects");
2948 return false;
2949}
2950
2952 // Cannot use vputils::isSingleScalar(), because all external operands
2953 // of the expression will be live-ins while bundled.
2954 auto *RR = dyn_cast<VPReductionRecipe>(ExpressionRecipes.back());
2955 return RR && !RR->isPartialReduction();
2956}
2957
2958#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2959
2961 VPSlotTracker &SlotTracker) const {
2962 O << Indent << "EXPRESSION ";
2964 O << " = ";
2965 auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
2966 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
2967
2968 switch (ExpressionType) {
2969 case ExpressionTypes::ExtendedReduction: {
2971 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
2972 O << Instruction::getOpcodeName(Opcode) << " (";
2974 Red->printFlags(O);
2975
2976 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
2977 O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
2978 << *Ext0->getResultType();
2979 if (Red->isConditional()) {
2980 O << ", ";
2981 Red->getCondOp()->printAsOperand(O, SlotTracker);
2982 }
2983 O << ")";
2984 break;
2985 }
2986 case ExpressionTypes::ExtNegatedMulAccReduction: {
2988 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
2990 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
2991 << " (sub (0, mul";
2992 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
2993 Mul->printFlags(O);
2994 O << "(";
2996 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
2997 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
2998 << *Ext0->getResultType() << "), (";
3000 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3001 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3002 << *Ext1->getResultType() << ")";
3003 if (Red->isConditional()) {
3004 O << ", ";
3005 Red->getCondOp()->printAsOperand(O, SlotTracker);
3006 }
3007 O << "))";
3008 break;
3009 }
3010 case ExpressionTypes::MulAccReduction:
3011 case ExpressionTypes::ExtMulAccReduction: {
3013 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3015 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
3016 << " (";
3017 O << "mul";
3018 bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
3019 auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
3020 : ExpressionRecipes[0]);
3021 Mul->printFlags(O);
3022 if (IsExtended)
3023 O << "(";
3025 if (IsExtended) {
3026 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3027 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3028 << *Ext0->getResultType() << "), (";
3029 } else {
3030 O << ", ";
3031 }
3033 if (IsExtended) {
3034 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3035 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3036 << *Ext1->getResultType() << ")";
3037 }
3038 if (Red->isConditional()) {
3039 O << ", ";
3040 Red->getCondOp()->printAsOperand(O, SlotTracker);
3041 }
3042 O << ")";
3043 break;
3044 }
3045 }
3046}
3047
3049 VPSlotTracker &SlotTracker) const {
3050 if (isPartialReduction())
3051 O << Indent << "PARTIAL-REDUCE ";
3052 else
3053 O << Indent << "REDUCE ";
3055 O << " = ";
3057 O << " +";
3058 printFlags(O);
3059 O << " reduce."
3062 << " (";
3064 if (isConditional()) {
3065 O << ", ";
3067 }
3068 O << ")";
3069}
3070
3072 VPSlotTracker &SlotTracker) const {
3073 O << Indent << "REDUCE ";
3075 O << " = ";
3077 O << " +";
3078 printFlags(O);
3079 O << " vp.reduce."
3082 << " (";
3084 O << ", ";
3086 if (isConditional()) {
3087 O << ", ";
3089 }
3090 O << ")";
3091}
3092
3093#endif
3094
3095/// A helper function to scalarize a single Instruction in the innermost loop.
3096/// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue
3097/// operands from \p RepRecipe instead of \p Instr's operands.
3098static void scalarizeInstruction(const Instruction *Instr,
3099 VPReplicateRecipe *RepRecipe,
3100 const VPLane &Lane, VPTransformState &State) {
3101 assert((!Instr->getType()->isAggregateType() ||
3102 canVectorizeTy(Instr->getType())) &&
3103 "Expected vectorizable or non-aggregate type.");
3104
3105 // Does this instruction return a value?
3106 bool IsVoidRetTy = Instr->getType()->isVoidTy();
3107
3108 Instruction *Cloned = Instr->clone();
3109 if (!IsVoidRetTy) {
3110 Cloned->setName(Instr->getName() + ".cloned");
3111 Type *ResultTy = State.TypeAnalysis.inferScalarType(RepRecipe);
3112 // The operands of the replicate recipe may have been narrowed, resulting in
3113 // a narrower result type. Update the type of the cloned instruction to the
3114 // correct type.
3115 if (ResultTy != Cloned->getType())
3116 Cloned->mutateType(ResultTy);
3117 }
3118
3119 RepRecipe->applyFlags(*Cloned);
3120 RepRecipe->applyMetadata(*Cloned);
3121
3122 if (RepRecipe->hasPredicate())
3123 cast<CmpInst>(Cloned)->setPredicate(RepRecipe->getPredicate());
3124
3125 if (auto DL = RepRecipe->getDebugLoc())
3126 State.setDebugLocFrom(DL);
3127
3128 // Replace the operands of the cloned instruction with their scalar
3129 // equivalents in the new loop.
3130 for (const auto &I : enumerate(RepRecipe->operands())) {
3131 auto InputLane = Lane;
3132 VPValue *Operand = I.value();
3133 if (vputils::isSingleScalar(Operand))
3134 InputLane = VPLane::getFirstLane();
3135 Cloned->setOperand(I.index(), State.get(Operand, InputLane));
3136 }
3137
3138 // Place the cloned scalar in the new loop.
3139 State.Builder.Insert(Cloned);
3140
3141 State.set(RepRecipe, Cloned, Lane);
3142
3143 // If we just cloned a new assumption, add it to the assumption cache.
3144 if (auto *II = dyn_cast<AssumeInst>(Cloned))
3145 State.AC->registerAssumption(II);
3146
3147 assert(
3148 (RepRecipe->getRegion() ||
3149 !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
3150 all_of(RepRecipe->operands(),
3151 [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
3152 "Expected a recipe is either within a region or all of its operands "
3153 "are defined outside the vectorized region.");
3154}
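// Illustrative example (assuming a replicated udiv and lane 2): the helper
// above clones
//   %q = udiv i32 %a, %b
// as %q.cloned, rewires its operands to the lane-2 scalar values from State
// (or lane 0 for single-scalar operands), re-applies flags and metadata, and
// records the clone as the lane-2 value of the replicate recipe.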
3155
3158
3159 if (!State.Lane) {
3160 assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
3161 "must have already been unrolled");
3162 scalarizeInstruction(UI, this, VPLane(0), State);
3163 return;
3164 }
3165
3166 assert((State.VF.isScalar() || !isSingleScalar()) &&
3167 "uniform recipe shouldn't be predicated");
3168 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
3169 scalarizeInstruction(UI, this, *State.Lane, State);
3170 // Insert scalar instance packing it into a vector.
3171 if (State.VF.isVector() && shouldPack()) {
3172 Value *WideValue =
3173 State.Lane->isFirstLane()
3174 ? PoisonValue::get(toVectorizedTy(UI->getType(), State.VF))
3175 : State.get(this);
3176 State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
3177 *State.Lane));
3178 }
3179}
3180
3182 // Find if the recipe is used by a widened recipe via an intervening
3183 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
3184 return any_of(users(), [](const VPUser *U) {
3185 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
3186 return !vputils::onlyScalarValuesUsed(PredR);
3187 return false;
3188 });
3189}
3190
3191/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
3192/// which the legacy cost model computes a SCEV expression when computing the
3193/// address cost. Computing SCEVs for VPValues is incomplete and returns
3194 /// SCEVCouldNotCompute in cases where the legacy cost model can compute SCEVs. In
3195/// those cases we fall back to the legacy cost model. Otherwise return nullptr.
3196static const SCEV *getAddressAccessSCEV(const VPValue *Ptr,
3198 const Loop *L) {
3199 const SCEV *Addr = vputils::getSCEVExprForVPValue(Ptr, PSE, L);
3200 if (isa<SCEVCouldNotCompute>(Addr))
3201 return Addr;
3202
3203 return vputils::isAddressSCEVForCost(Addr, *PSE.getSE(), L) ? Addr : nullptr;
3204}
3205
3206/// Returns true if \p V is used as part of the address of another load or
3207/// store.
3208static bool isUsedByLoadStoreAddress(const VPUser *V) {
3210 SmallVector<const VPUser *> WorkList = {V};
3211
3212 while (!WorkList.empty()) {
3213 auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
3214 if (!Cur || !Seen.insert(Cur).second)
3215 continue;
3216
3217 auto *Blend = dyn_cast<VPBlendRecipe>(Cur);
3218 // Skip blends that use V only through a compare by checking if any incoming
3219 // value was already visited.
3220 if (Blend && none_of(seq<unsigned>(0, Blend->getNumIncomingValues()),
3221 [&](unsigned I) {
3222 return Seen.contains(
3223 Blend->getIncomingValue(I)->getDefiningRecipe());
3224 }))
3225 continue;
3226
3227 for (VPUser *U : Cur->users()) {
3228 if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
3229 if (InterleaveR->getAddr() == Cur)
3230 return true;
3231 if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
3232 if (RepR->getOpcode() == Instruction::Load &&
3233 RepR->getOperand(0) == Cur)
3234 return true;
3235 if (RepR->getOpcode() == Instruction::Store &&
3236 RepR->getOperand(1) == Cur)
3237 return true;
3238 }
3239 if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
3240 if (MemR->getAddr() == Cur && MemR->isConsecutive())
3241 return true;
3242 }
3243 }
3244
3245 // The legacy cost model only supports scalarized loads/stores with phi
3246 // addresses if the phi is directly used as the load/store address. Don't
3247 // traverse further for Blends.
3248 if (Blend)
3249 continue;
3250
3251 append_range(WorkList, Cur->users());
3252 }
3253 return false;
3254}
3255
3257 VPCostContext &Ctx) const {
3259 // A VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
3260 // transform; avoid computing their cost multiple times for now.
3261 Ctx.SkipCostComputation.insert(UI);
3262
3263 if (VF.isScalable() && !isSingleScalar())
3265
3266 switch (UI->getOpcode()) {
3267 case Instruction::Alloca:
3268 if (VF.isScalable())
3270 return Ctx.TTI.getArithmeticInstrCost(
3271 Instruction::Mul, Ctx.Types.inferScalarType(this), Ctx.CostKind);
3272 case Instruction::GetElementPtr:
3273 // We mark this instruction as zero-cost because the cost of GEPs in
3274 // vectorized code depends on whether the corresponding memory instruction
3275 // is scalarized or not. Therefore, we handle GEPs with the memory
3276 // instruction cost.
3277 return 0;
3278 case Instruction::Call: {
3279 auto *CalledFn =
3281
3284 for (const VPValue *ArgOp : ArgOps)
3285 Tys.push_back(Ctx.Types.inferScalarType(ArgOp));
3286
3287 if (CalledFn->isIntrinsic())
3288 // Various pseudo-intrinsics with costs of 0 are scalarized instead of
3289 // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early.
3290 switch (CalledFn->getIntrinsicID()) {
3291 case Intrinsic::assume:
3292 case Intrinsic::lifetime_end:
3293 case Intrinsic::lifetime_start:
3294 case Intrinsic::sideeffect:
3295 case Intrinsic::pseudoprobe:
3296 case Intrinsic::experimental_noalias_scope_decl: {
3297 assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
3298 ElementCount::getFixed(1), Ctx) == 0 &&
3299 "scalarizing intrinsic should be free");
3300 return InstructionCost(0);
3301 }
3302 default:
3303 break;
3304 }
3305
3306 Type *ResultTy = Ctx.Types.inferScalarType(this);
3307 InstructionCost ScalarCallCost =
3308 Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
3309 if (isSingleScalar()) {
3310 if (CalledFn->isIntrinsic())
3311 ScalarCallCost = std::min(
3312 ScalarCallCost,
3313 getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
3314 ElementCount::getFixed(1), Ctx));
3315 return ScalarCallCost;
3316 }
3317
3318 return ScalarCallCost * VF.getFixedValue() +
3319 Ctx.getScalarizationOverhead(ResultTy, ArgOps, VF);
3320 }
3321 case Instruction::Add:
3322 case Instruction::Sub:
3323 case Instruction::FAdd:
3324 case Instruction::FSub:
3325 case Instruction::Mul:
3326 case Instruction::FMul:
3327 case Instruction::FDiv:
3328 case Instruction::FRem:
3329 case Instruction::Shl:
3330 case Instruction::LShr:
3331 case Instruction::AShr:
3332 case Instruction::And:
3333 case Instruction::Or:
3334 case Instruction::Xor:
3335 case Instruction::ICmp:
3336 case Instruction::FCmp:
3338 Ctx) *
3339 (isSingleScalar() ? 1 : VF.getFixedValue());
3340 case Instruction::SDiv:
3341 case Instruction::UDiv:
3342 case Instruction::SRem:
3343 case Instruction::URem: {
3344 InstructionCost ScalarCost =
3346 if (isSingleScalar())
3347 return ScalarCost;
3348
3349 ScalarCost = ScalarCost * VF.getFixedValue() +
3350 Ctx.getScalarizationOverhead(Ctx.Types.inferScalarType(this),
3351 to_vector(operands()), VF);
3352 // If the recipe is not predicated (i.e. not in a replicate region), return
3353 // the scalar cost. Otherwise handle predicated cost.
3354 if (!getRegion()->isReplicator())
3355 return ScalarCost;
3356
3357 // Account for the phi nodes that we will create.
3358 ScalarCost += VF.getFixedValue() *
3359 Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
3360 // Scale the cost by the probability of executing the predicated blocks.
3361 // This assumes the predicated block for each vector lane is equally
3362 // likely.
3363 ScalarCost /= Ctx.getPredBlockCostDivisor(UI->getParent());
3364 return ScalarCost;
3365 }
3366 case Instruction::Load:
3367 case Instruction::Store: {
3368 // TODO: See getMemInstScalarizationCost for how to handle replicating and
3369 // predicated cases.
3370 const VPRegionBlock *ParentRegion = getRegion();
3371 if (ParentRegion && ParentRegion->isReplicator())
3372 break;
3373
3374 bool IsLoad = UI->getOpcode() == Instruction::Load;
3375 const VPValue *PtrOp = getOperand(!IsLoad);
3376 const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.PSE, Ctx.L);
3378 break;
3379
3380 Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
3381 Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
3382 const Align Alignment = getLoadStoreAlignment(UI);
3383 unsigned AS = cast<PointerType>(ScalarPtrTy)->getAddressSpace();
3385 InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
3386 UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
3387
3388 Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
3389 bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
3390 bool UsedByLoadStoreAddress =
3391 !PreferVectorizedAddressing && isUsedByLoadStoreAddress(this);
3392 InstructionCost ScalarCost =
3393 ScalarMemOpCost +
3394 Ctx.TTI.getAddressComputationCost(
3395 PtrTy, UsedByLoadStoreAddress ? nullptr : Ctx.PSE.getSE(), PtrSCEV,
3396 Ctx.CostKind);
3397 if (isSingleScalar())
3398 return ScalarCost;
3399
3400 SmallVector<const VPValue *> OpsToScalarize;
3401 Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
3402 // Set ResultTy and OpsToScalarize if scalarization is needed. Currently we
3403 // don't assign scalarization overhead if the target prefers vectorized
3404 // addressing or if the loaded value is used as part of the address of
3405 // another load or store.
3406 if (!UsedByLoadStoreAddress) {
3407 bool EfficientVectorLoadStore =
3408 Ctx.TTI.supportsEfficientVectorElementLoadStore();
3409 if (!(IsLoad && !PreferVectorizedAddressing) &&
3410 !(!IsLoad && EfficientVectorLoadStore))
3411 append_range(OpsToScalarize, operands());
3412
3413 if (!EfficientVectorLoadStore)
3414 ResultTy = Ctx.Types.inferScalarType(this);
3415 }
3416
3417 return (ScalarCost * VF.getFixedValue()) +
3418 Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
3419 }
3420 case Instruction::SExt:
3421 case Instruction::ZExt:
3422 case Instruction::FPToUI:
3423 case Instruction::FPToSI:
3424 case Instruction::FPExt:
3425 case Instruction::PtrToInt:
3426 case Instruction::PtrToAddr:
3427 case Instruction::IntToPtr:
3428 case Instruction::SIToFP:
3429 case Instruction::UIToFP:
3430 case Instruction::Trunc:
3431 case Instruction::FPTrunc:
3432 case Instruction::AddrSpaceCast: {
3434 Ctx) *
3435 (isSingleScalar() ? 1 : VF.getFixedValue());
3436 }
3437 case Instruction::ExtractValue:
3438 case Instruction::InsertValue:
3439 return Ctx.TTI.getInsertExtractValueCost(getOpcode(), Ctx.CostKind);
3440 }
3441
3442 return Ctx.getLegacyCost(UI, VF);
3443}
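// Worked example (assuming a replicated, non-single-scalar load at VF=4): the
// load/store case above returns 4 * (scalar memory-op cost + address
// computation cost) plus a scalarization overhead for extracting address
// lanes and rebuilding the loaded vector; that overhead is skipped when the
// loaded value is itself only used as the address of other loads or stores.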
3444
3445#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3447 VPSlotTracker &SlotTracker) const {
3448 O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");
3449
3450 if (!getUnderlyingInstr()->getType()->isVoidTy()) {
3452 O << " = ";
3453 }
3454 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
3455 O << "call";
3456 printFlags(O);
3457 O << "@" << CB->getCalledFunction()->getName() << "(";
3459 O, [&O, &SlotTracker](VPValue *Op) {
3460 Op->printAsOperand(O, SlotTracker);
3461 });
3462 O << ")";
3463 } else {
3465 printFlags(O);
3467 }
3468
3469 if (shouldPack())
3470 O << " (S->V)";
3471}
3472#endif
3473
3475 assert(State.Lane && "Branch on Mask works only on single instance.");
3476
3477 VPValue *BlockInMask = getOperand(0);
3478 Value *ConditionBit = State.get(BlockInMask, *State.Lane);
3479
3480 // Replace the temporary unreachable terminator with a new conditional branch,
3481 // whose two destinations will be set later when they are created.
3482 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
3483 assert(isa<UnreachableInst>(CurrentTerminator) &&
3484 "Expected to replace unreachable terminator with conditional branch.");
3485 auto CondBr =
3486 State.Builder.CreateCondBr(ConditionBit, State.CFG.PrevBB, nullptr);
3487 CondBr->setSuccessor(0, nullptr);
3488 CurrentTerminator->eraseFromParent();
3489}
3490
3492 VPCostContext &Ctx) const {
3493 // The legacy cost model doesn't assign costs to branches for individual
3494 // replicate regions. Match the current behavior in the VPlan cost model for
3495 // now.
3496 return 0;
3497}
3498
3500 assert(State.Lane && "Predicated instruction PHI works per instance.");
3501 Instruction *ScalarPredInst =
3502 cast<Instruction>(State.get(getOperand(0), *State.Lane));
3503 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
3504 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
3505 assert(PredicatingBB && "Predicated block has no single predecessor.");
3507 "operand must be VPReplicateRecipe");
3508
3509 // By current pack/unpack logic we need to generate only a single phi node: if
3510 // a vector value for the predicated instruction exists at this point it means
3511 // the instruction has vector users only, and a phi for the vector value is
3512 // needed. In this case the recipe of the predicated instruction is marked to
3513 // also do that packing, thereby "hoisting" the insert-element sequence.
3514 // Otherwise, a phi node for the scalar value is needed.
3515 if (State.hasVectorValue(getOperand(0))) {
3516 auto *VecI = cast<Instruction>(State.get(getOperand(0)));
3518 "Packed operands must generate an insertelement or insertvalue");
3519
3520 // If VectorI is a struct, it will be a sequence like:
3521 // %1 = insertvalue %unmodified, %x, 0
3522 // %2 = insertvalue %1, %y, 1
3523 // %VectorI = insertvalue %2, %z, 2
3524 // To get the unmodified vector we need to look through the chain.
3525 if (auto *StructTy = dyn_cast<StructType>(VecI->getType()))
3526 for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++)
3527 VecI = cast<InsertValueInst>(VecI->getOperand(0));
3528
3529 PHINode *VPhi = State.Builder.CreatePHI(VecI->getType(), 2);
3530 VPhi->addIncoming(VecI->getOperand(0), PredicatingBB); // Unmodified vector.
3531 VPhi->addIncoming(VecI, PredicatedBB); // New vector with inserted element.
3532 if (State.hasVectorValue(this))
3533 State.reset(this, VPhi);
3534 else
3535 State.set(this, VPhi);
3536 // NOTE: Currently we need to update the value of the operand, so the next
3537 // predicated iteration inserts its generated value in the correct vector.
3538 State.reset(getOperand(0), VPhi);
3539 } else {
3540 if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane())
3541 return;
3542
3543 Type *PredInstType = State.TypeAnalysis.inferScalarType(getOperand(0));
3544 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
3545 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
3546 PredicatingBB);
3547 Phi->addIncoming(ScalarPredInst, PredicatedBB);
3548 if (State.hasScalarValue(this, *State.Lane))
3549 State.reset(this, Phi, *State.Lane);
3550 else
3551 State.set(this, Phi, *State.Lane);
3552 // NOTE: Currently we need to update the value of the operand, so the next
3553 // predicated iteration inserts its generated value in the correct vector.
3554 State.reset(getOperand(0), Phi, *State.Lane);
3555 }
3556}
3557
3558#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3560 VPSlotTracker &SlotTracker) const {
3561 O << Indent << "PHI-PREDICATED-INSTRUCTION ";
3563 O << " = ";
3565}
3566#endif
3567
3569 VPCostContext &Ctx) const {
3571 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
3572 ->getAddressSpace();
3573 unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
3574 ? Instruction::Load
3575 : Instruction::Store;
3576
3577 if (!Consecutive) {
3578 // TODO: Using the original IR may not be accurate.
3579 // Currently, ARM will use the underlying IR to calculate gather/scatter
3580 // instruction cost.
3581 assert(!Reverse &&
3582 "Inconsecutive memory access should not have the order.");
3583
3585 Type *PtrTy = Ptr->getType();
3586
3587 // If the address value is uniform across all lanes, then the address can be
3588 // calculated with a scalar type and broadcast.
3590 PtrTy = toVectorTy(PtrTy, VF);
3591
3592 unsigned IID = isa<VPWidenLoadRecipe>(this) ? Intrinsic::masked_gather
3593 : isa<VPWidenStoreRecipe>(this) ? Intrinsic::masked_scatter
3594 : isa<VPWidenLoadEVLRecipe>(this) ? Intrinsic::vp_gather
3595 : Intrinsic::vp_scatter;
3596 return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
3597 Ctx.CostKind) +
3598 Ctx.TTI.getMemIntrinsicInstrCost(
3600 &Ingredient),
3601 Ctx.CostKind);
3602 }
3603
3605 if (IsMasked) {
3606 unsigned IID = isa<VPWidenLoadRecipe>(this) ? Intrinsic::masked_load
3607 : Intrinsic::masked_store;
3608 Cost += Ctx.TTI.getMemIntrinsicInstrCost(
3609 MemIntrinsicCostAttributes(IID, Ty, Alignment, AS), Ctx.CostKind);
3610 } else {
3611 TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
3613 : getOperand(1));
3614 Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
3615 OpInfo, &Ingredient);
3616 }
3617 return Cost;
3618}
3619
3621 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
3622 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
3623 bool CreateGather = !isConsecutive();
3624
3625 auto &Builder = State.Builder;
3626 Value *Mask = nullptr;
3627 if (auto *VPMask = getMask()) {
3628 // Mask reversal is only needed when an explicit mask is present: a missing
3629 // (null) mask is all-ones, and the reverse of an all-ones mask is unchanged.
3630 Mask = State.get(VPMask);
3631 if (isReverse())
3632 Mask = Builder.CreateVectorReverse(Mask, "reverse");
3633 }
3634
3635 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
3636 Value *NewLI;
3637 if (CreateGather) {
3638 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
3639 "wide.masked.gather");
3640 } else if (Mask) {
3641 NewLI =
3642 Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
3643 PoisonValue::get(DataTy), "wide.masked.load");
3644 } else {
3645 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
3646 }
3648 State.set(this, NewLI);
3649}
3650
3651#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3653 VPSlotTracker &SlotTracker) const {
3654 O << Indent << "WIDEN ";
3656 O << " = load ";
3658}
3659#endif
3660
3661/// Use all-true mask for reverse rather than actual mask, as it avoids a
3662/// dependence w/o affecting the result.
3664 Value *EVL, const Twine &Name) {
3665 VectorType *ValTy = cast<VectorType>(Operand->getType());
3666 Value *AllTrueMask =
3667 Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
3668 return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
3669 {Operand, AllTrueMask, EVL}, nullptr, Name);
3670}
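// Illustrative sketch (not part of the original source): for a
// <vscale x 4 x i32> operand %v and an explicit vector length %evl, the
// helper above is expected to emit roughly:
//   %alltrue    = splat of <vscale x 4 x i1> true
//   %vp.reverse = call <vscale x 4 x i32>
//       @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> %v,
//                                             <vscale x 4 x i1> %alltrue,
//                                             i32 %evl)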
3671
3673 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
3674 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
3675 bool CreateGather = !isConsecutive();
3676
3677 auto &Builder = State.Builder;
3678 CallInst *NewLI;
3679 Value *EVL = State.get(getEVL(), VPLane(0));
3680 Value *Addr = State.get(getAddr(), !CreateGather);
3681 Value *Mask = nullptr;
3682 if (VPValue *VPMask = getMask()) {
3683 Mask = State.get(VPMask);
3684 if (isReverse())
3685 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
3686 } else {
3687 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
3688 }
3689
3690 if (CreateGather) {
3691 NewLI =
3692 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
3693 nullptr, "wide.masked.gather");
3694 } else {
3695 NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
3696 {Addr, Mask, EVL}, nullptr, "vp.op.load");
3697 }
3698   NewLI->addParamAttr(
3699       0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
3700 applyMetadata(*NewLI);
3701 Instruction *Res = NewLI;
3702 State.set(this, Res);
3703}
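// Illustrative sketch (not part of the original source): for a consecutive
// i32 load with VF = vscale x 4, the vp.load path above is expected to
// produce roughly:
//   %vp.op.load = call <vscale x 4 x i32>
//       @llvm.vp.load.nxv4i32.p0(ptr align 4 %addr,
//                                <vscale x 4 x i1> %mask, i32 %evl)
// The gather path instead calls @llvm.vp.gather with a vector of pointers.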
3704
3706 VPCostContext &Ctx) const {
3707 if (!Consecutive || IsMasked)
3708 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
3709
3710   // We need to use getMemIntrinsicInstrCost() instead of getMemoryOpCost()
3711   // here because the EVL recipes use EVL to replace the tail mask, whereas
3712   // the legacy model always accounts for the cost of the mask.
3713   // TODO: Use getMemoryOpCost() instead of getMemIntrinsicInstrCost() once
3714   // we no longer need to compare against the legacy cost model.
3715   Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
3716   unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
3717 ->getAddressSpace();
3718 return Ctx.TTI.getMemIntrinsicInstrCost(
3719 MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS),
3720 Ctx.CostKind);
3721}
3722
3723#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3725 VPSlotTracker &SlotTracker) const {
3726 O << Indent << "WIDEN ";
3728 O << " = vp.load ";
3730}
3731#endif
3732
3734 VPValue *StoredVPValue = getStoredValue();
3735 bool CreateScatter = !isConsecutive();
3736
3737 auto &Builder = State.Builder;
3738
3739 Value *Mask = nullptr;
3740 if (auto *VPMask = getMask()) {
3741     // Mask reversal is only needed for real (non-null) masks; a null mask
3742     // means all-ones, and the reverse of an all-ones mask is still all-ones.
3743 Mask = State.get(VPMask);
3744 if (isReverse())
3745 Mask = Builder.CreateVectorReverse(Mask, "reverse");
3746 }
3747
3748 Value *StoredVal = State.get(StoredVPValue);
3749 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
3750 Instruction *NewSI = nullptr;
3751 if (CreateScatter)
3752 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
3753 else if (Mask)
3754 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
3755 else
3756 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
3757 applyMetadata(*NewSI);
3758}
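// Illustrative sketch (not part of the original source): a consecutive masked
// store of <4 x i32> produced by the path above is expected to look roughly
// like:
//   call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4,
//                                         <4 x i1> %mask)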
3759
3760#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3762 VPSlotTracker &SlotTracker) const {
3763 O << Indent << "WIDEN store ";
3765}
3766#endif
3767
3769 VPValue *StoredValue = getStoredValue();
3770 bool CreateScatter = !isConsecutive();
3771
3772 auto &Builder = State.Builder;
3773
3774 CallInst *NewSI = nullptr;
3775 Value *StoredVal = State.get(StoredValue);
3776 Value *EVL = State.get(getEVL(), VPLane(0));
3777 Value *Mask = nullptr;
3778 if (VPValue *VPMask = getMask()) {
3779 Mask = State.get(VPMask);
3780 if (isReverse())
3781 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
3782 } else {
3783 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
3784 }
3785 Value *Addr = State.get(getAddr(), !CreateScatter);
3786 if (CreateScatter) {
3787 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
3788 Intrinsic::vp_scatter,
3789 {StoredVal, Addr, Mask, EVL});
3790 } else {
3791 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
3792 Intrinsic::vp_store,
3793 {StoredVal, Addr, Mask, EVL});
3794 }
3795   NewSI->addParamAttr(
3796       1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
3797 applyMetadata(*NewSI);
3798}
3799
3801 VPCostContext &Ctx) const {
3802 if (!Consecutive || IsMasked)
3803 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
3804
3805   // We need to use getMemIntrinsicInstrCost() instead of getMemoryOpCost()
3806   // here because the EVL recipes use EVL to replace the tail mask, whereas
3807   // the legacy model always accounts for the cost of the mask.
3808   // TODO: Use getMemoryOpCost() instead of getMemIntrinsicInstrCost() once
3809   // we no longer need to compare against the legacy cost model.
3810   Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
3811   unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
3812 ->getAddressSpace();
3813 return Ctx.TTI.getMemIntrinsicInstrCost(
3814 MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS),
3815 Ctx.CostKind);
3816}
3817
3818#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3820 VPSlotTracker &SlotTracker) const {
3821 O << Indent << "WIDEN vp.store ";
3823}
3824#endif
3825
3827 VectorType *DstVTy, const DataLayout &DL) {
3828 // Verify that V is a vector type with same number of elements as DstVTy.
3829 auto VF = DstVTy->getElementCount();
3830 auto *SrcVecTy = cast<VectorType>(V->getType());
3831 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
3832 Type *SrcElemTy = SrcVecTy->getElementType();
3833 Type *DstElemTy = DstVTy->getElementType();
3834 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3835 "Vector elements must have same size");
3836
3837 // Do a direct cast if element types are castable.
3838 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3839 return Builder.CreateBitOrPointerCast(V, DstVTy);
3840 }
3841 // V cannot be directly casted to desired vector type.
3842 // May happen when V is a floating point vector but DstVTy is a vector of
3843 // pointers or vice-versa. Handle this using a two-step bitcast using an
3844 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
3845 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3846 "Only one type should be a pointer type");
3847 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3848 "Only one type should be a floating point type");
3849 Type *IntTy =
3850 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3851 auto *VecIntTy = VectorType::get(IntTy, VF);
3852 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3853 return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
3854}
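// Illustrative sketch (not part of the original source), assuming 64-bit
// pointers: casting <4 x double> to <4 x ptr> cannot be done with a single
// bitcast, so the helper above goes through an integer vector:
//   %tmp = bitcast <4 x double> %v to <4 x i64>
//   %res = inttoptr <4 x i64> %tmp to <4 x ptr>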
3855
3856/// Return a vector containing interleaved elements from multiple
3857/// smaller input vectors.
3859 const Twine &Name) {
3860 unsigned Factor = Vals.size();
3861 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
3862
3863 VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
3864#ifndef NDEBUG
3865 for (Value *Val : Vals)
3866 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
3867#endif
3868
3869 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
3870 // must use intrinsics to interleave.
3871 if (VecTy->isScalableTy()) {
3872 assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
3873 return Builder.CreateVectorInterleave(Vals, Name);
3874 }
3875
3876 // Fixed length. Start by concatenating all vectors into a wide vector.
3877 Value *WideVec = concatenateVectors(Builder, Vals);
3878
3879 // Interleave the elements into the wide vector.
3880 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
3881 return Builder.CreateShuffleVector(
3882 WideVec, createInterleaveMask(NumElts, Factor), Name);
3883}
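// Illustrative sketch (not part of the original source): interleaving two
// fixed-width <4 x i32> vectors %a = <a0,a1,a2,a3> and %b = <b0,b1,b2,b3>
// concatenates them into an <8 x i32> wide vector and shuffles it with the
// interleave mask <0,4,1,5,2,6,3,7>, yielding <a0,b0,a1,b1,a2,b2,a3,b3>.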
3884
3885// Try to vectorize the interleave group that \p Instr belongs to.
3886//
3887// E.g. Translate following interleaved load group (factor = 3):
3888// for (i = 0; i < N; i+=3) {
3889// R = Pic[i]; // Member of index 0
3890// G = Pic[i+1]; // Member of index 1
3891// B = Pic[i+2]; // Member of index 2
3892// ... // do something to R, G, B
3893// }
3894// To:
3895// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
3896// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
3897// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
3898// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
3899//
3900// Or translate following interleaved store group (factor = 3):
3901// for (i = 0; i < N; i+=3) {
3902// ... do something to R, G, B
3903// Pic[i] = R; // Member of index 0
3904// Pic[i+1] = G; // Member of index 1
3905// Pic[i+2] = B; // Member of index 2
3906// }
3907// To:
3908// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
3909// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
3910// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
3911// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
3912// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
3914 assert(!State.Lane && "Interleave group being replicated.");
3915 assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
3916 "Masking gaps for scalable vectors is not yet supported.");
3918 Instruction *Instr = Group->getInsertPos();
3919
3920 // Prepare for the vector type of the interleaved load/store.
3921 Type *ScalarTy = getLoadStoreType(Instr);
3922 unsigned InterleaveFactor = Group->getFactor();
3923 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
3924
3925 VPValue *BlockInMask = getMask();
3926 VPValue *Addr = getAddr();
3927 Value *ResAddr = State.get(Addr, VPLane(0));
3928
3929 auto CreateGroupMask = [&BlockInMask, &State,
3930 &InterleaveFactor](Value *MaskForGaps) -> Value * {
3931 if (State.VF.isScalable()) {
3932 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
3933 assert(InterleaveFactor <= 8 &&
3934 "Unsupported deinterleave factor for scalable vectors");
3935 auto *ResBlockInMask = State.get(BlockInMask);
3936 SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
3937 return interleaveVectors(State.Builder, Ops, "interleaved.mask");
3938 }
3939
3940 if (!BlockInMask)
3941 return MaskForGaps;
3942
3943 Value *ResBlockInMask = State.get(BlockInMask);
3944 Value *ShuffledMask = State.Builder.CreateShuffleVector(
3945 ResBlockInMask,
3946 createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
3947 "interleaved.mask");
3948 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
3949 ShuffledMask, MaskForGaps)
3950 : ShuffledMask;
3951 };
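  // Illustrative sketch (not part of the original source): for a fixed VF of
  // 4 and interleave factor 3, CreateGroupMask replicates a block mask
  // <m0,m1,m2,m3> with the shuffle mask <0,0,0,1,1,1,2,2,2,3,3,3>, producing
  // <m0,m0,m0,m1,m1,m1,m2,m2,m2,m3,m3,m3> so that all members of a tuple
  // share the predicate of their lane.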
3952
3953 const DataLayout &DL = Instr->getDataLayout();
3954 // Vectorize the interleaved load group.
3955 if (isa<LoadInst>(Instr)) {
3956 Value *MaskForGaps = nullptr;
3957 if (needsMaskForGaps()) {
3958 MaskForGaps =
3959 createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
3960 assert(MaskForGaps && "Mask for Gaps is required but it is null");
3961 }
3962
3963 Instruction *NewLoad;
3964 if (BlockInMask || MaskForGaps) {
3965 Value *GroupMask = CreateGroupMask(MaskForGaps);
3966 Value *PoisonVec = PoisonValue::get(VecTy);
3967 NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
3968 Group->getAlign(), GroupMask,
3969 PoisonVec, "wide.masked.vec");
3970 } else
3971 NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
3972 Group->getAlign(), "wide.vec");
3973 applyMetadata(*NewLoad);
3974 // TODO: Also manage existing metadata using VPIRMetadata.
3975 Group->addMetadata(NewLoad);
3976
3978 if (VecTy->isScalableTy()) {
3979 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
3980 // so must use intrinsics to deinterleave.
3981 assert(InterleaveFactor <= 8 &&
3982 "Unsupported deinterleave factor for scalable vectors");
3983 NewLoad = State.Builder.CreateIntrinsic(
3984 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
3985 NewLoad->getType(), NewLoad,
3986 /*FMFSource=*/nullptr, "strided.vec");
3987 }
3988
3989 auto CreateStridedVector = [&InterleaveFactor, &State,
3990 &NewLoad](unsigned Index) -> Value * {
3991 assert(Index < InterleaveFactor && "Illegal group index");
3992 if (State.VF.isScalable())
3993 return State.Builder.CreateExtractValue(NewLoad, Index);
3994
3995 // For fixed length VF, use shuffle to extract the sub-vectors from the
3996 // wide load.
3997 auto StrideMask =
3998 createStrideMask(Index, InterleaveFactor, State.VF.getFixedValue());
3999 return State.Builder.CreateShuffleVector(NewLoad, StrideMask,
4000 "strided.vec");
4001 };
4002
4003 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4004 Instruction *Member = Group->getMember(I);
4005
4006 // Skip the gaps in the group.
4007 if (!Member)
4008 continue;
4009
4010 Value *StridedVec = CreateStridedVector(I);
4011
4012       // If this member has a different type, cast the result type.
4013 if (Member->getType() != ScalarTy) {
4014 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
4015 StridedVec =
4016 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
4017 }
4018
4019 if (Group->isReverse())
4020 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
4021
4022 State.set(VPDefs[J], StridedVec);
4023 ++J;
4024 }
4025 return;
4026 }
4027
4028   // The sub-vector type for the current instruction.
4029 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4030
4031 // Vectorize the interleaved store group.
4032 Value *MaskForGaps =
4033 createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
4034 assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
4035 "Mismatch between NeedsMaskForGaps and MaskForGaps");
4036 ArrayRef<VPValue *> StoredValues = getStoredValues();
4037 // Collect the stored vector from each member.
4038 SmallVector<Value *, 4> StoredVecs;
4039 unsigned StoredIdx = 0;
4040 for (unsigned i = 0; i < InterleaveFactor; i++) {
4041 assert((Group->getMember(i) || MaskForGaps) &&
4042 "Fail to get a member from an interleaved store group");
4043 Instruction *Member = Group->getMember(i);
4044
4045 // Skip the gaps in the group.
4046 if (!Member) {
4047 Value *Undef = PoisonValue::get(SubVT);
4048 StoredVecs.push_back(Undef);
4049 continue;
4050 }
4051
4052 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4053 ++StoredIdx;
4054
4055 if (Group->isReverse())
4056 StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
4057
4058     // If this member has a different type, cast it to a unified type.
4059
4060 if (StoredVec->getType() != SubVT)
4061 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4062
4063 StoredVecs.push_back(StoredVec);
4064 }
4065
4066 // Interleave all the smaller vectors into one wider vector.
4067 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4068 Instruction *NewStoreInstr;
4069 if (BlockInMask || MaskForGaps) {
4070 Value *GroupMask = CreateGroupMask(MaskForGaps);
4071 NewStoreInstr = State.Builder.CreateMaskedStore(
4072 IVec, ResAddr, Group->getAlign(), GroupMask);
4073 } else
4074 NewStoreInstr =
4075 State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
4076
4077 applyMetadata(*NewStoreInstr);
4078 // TODO: Also manage existing metadata using VPIRMetadata.
4079 Group->addMetadata(NewStoreInstr);
4080}
4081
4082#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4084 VPSlotTracker &SlotTracker) const {
4086 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4087 IG->getInsertPos()->printAsOperand(O, false);
4088 O << ", ";
4090 VPValue *Mask = getMask();
4091 if (Mask) {
4092 O << ", ";
4093 Mask->printAsOperand(O, SlotTracker);
4094 }
4095
4096 unsigned OpIdx = 0;
4097 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4098 if (!IG->getMember(i))
4099 continue;
4100 if (getNumStoreOperands() > 0) {
4101 O << "\n" << Indent << " store ";
4103 O << " to index " << i;
4104 } else {
4105 O << "\n" << Indent << " ";
4107 O << " = load from index " << i;
4108 }
4109 ++OpIdx;
4110 }
4111}
4112#endif
4113
4115 assert(!State.Lane && "Interleave group being replicated.");
4116 assert(State.VF.isScalable() &&
4117 "Only support scalable VF for EVL tail-folding.");
4118   assert(!needsMaskForGaps() &&
4119          "Masking gaps for scalable vectors is not yet supported.");
4121 Instruction *Instr = Group->getInsertPos();
4122
4123 // Prepare for the vector type of the interleaved load/store.
4124 Type *ScalarTy = getLoadStoreType(Instr);
4125 unsigned InterleaveFactor = Group->getFactor();
4126 assert(InterleaveFactor <= 8 &&
4127 "Unsupported deinterleave/interleave factor for scalable vectors");
4128 ElementCount WideVF = State.VF * InterleaveFactor;
4129 auto *VecTy = VectorType::get(ScalarTy, WideVF);
4130
4131 VPValue *Addr = getAddr();
4132 Value *ResAddr = State.get(Addr, VPLane(0));
4133 Value *EVL = State.get(getEVL(), VPLane(0));
4134 Value *InterleaveEVL = State.Builder.CreateMul(
4135 EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl",
4136 /* NUW= */ true, /* NSW= */ true);
4137 LLVMContext &Ctx = State.Builder.getContext();
4138
4139 Value *GroupMask = nullptr;
4140 if (VPValue *BlockInMask = getMask()) {
4141 SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask));
4142 GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask");
4143 } else {
4144 GroupMask =
4145 State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
4146 }
4147
4148 // Vectorize the interleaved load group.
4149 if (isa<LoadInst>(Instr)) {
4150 CallInst *NewLoad = State.Builder.CreateIntrinsic(
4151 VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr,
4152 "wide.vp.load");
4153 NewLoad->addParamAttr(0,
4154 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4155
4156 applyMetadata(*NewLoad);
4157 // TODO: Also manage existing metadata using VPIRMetadata.
4158 Group->addMetadata(NewLoad);
4159
4160 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4161 // so must use intrinsics to deinterleave.
4162 NewLoad = State.Builder.CreateIntrinsic(
4163 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
4164 NewLoad->getType(), NewLoad,
4165 /*FMFSource=*/nullptr, "strided.vec");
4166
4167 const DataLayout &DL = Instr->getDataLayout();
4168 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4169 Instruction *Member = Group->getMember(I);
4170 // Skip the gaps in the group.
4171 if (!Member)
4172 continue;
4173
4174 Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
4175       // If this member has a different type, cast the result type.
4176 if (Member->getType() != ScalarTy) {
4177 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
4178 StridedVec =
4179 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
4180 }
4181
4182 State.set(getVPValue(J), StridedVec);
4183 ++J;
4184 }
4185 return;
4186 } // End for interleaved load.
4187
4188   // The sub-vector type for the current instruction.
4189 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4190 // Vectorize the interleaved store group.
4191 ArrayRef<VPValue *> StoredValues = getStoredValues();
4192 // Collect the stored vector from each member.
4193 SmallVector<Value *, 4> StoredVecs;
4194 const DataLayout &DL = Instr->getDataLayout();
4195 for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
4196 Instruction *Member = Group->getMember(I);
4197 // Skip the gaps in the group.
4198 if (!Member) {
4199 StoredVecs.push_back(PoisonValue::get(SubVT));
4200 continue;
4201 }
4202
4203 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4204     // If this member has a different type, cast it to a unified type.
4205 if (StoredVec->getType() != SubVT)
4206 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4207
4208 StoredVecs.push_back(StoredVec);
4209 ++StoredIdx;
4210 }
4211
4212 // Interleave all the smaller vectors into one wider vector.
4213 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4214 CallInst *NewStore =
4215 State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store,
4216 {IVec, ResAddr, GroupMask, InterleaveEVL});
4217 NewStore->addParamAttr(1,
4218 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4219
4220 applyMetadata(*NewStore);
4221 // TODO: Also manage existing metadata using VPIRMetadata.
4222 Group->addMetadata(NewStore);
4223}
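// Illustrative sketch (not part of the original source): for an interleaved
// load group with factor 2 and VF = vscale x 4, the code above scales the
// explicit vector length (%interleave.evl = %evl * 2), performs a single
// @llvm.vp.load of <vscale x 8 x i32>, and then splits the result into the
// two member vectors with @llvm.vector.deinterleave2.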
4224
4225#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4227 VPSlotTracker &SlotTracker) const {
4229 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4230 IG->getInsertPos()->printAsOperand(O, false);
4231 O << ", ";
4233 O << ", ";
4235 if (VPValue *Mask = getMask()) {
4236 O << ", ";
4237 Mask->printAsOperand(O, SlotTracker);
4238 }
4239
4240 unsigned OpIdx = 0;
4241 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4242 if (!IG->getMember(i))
4243 continue;
4244 if (getNumStoreOperands() > 0) {
4245 O << "\n" << Indent << " vp.store ";
4247 O << " to index " << i;
4248 } else {
4249 O << "\n" << Indent << " ";
4251 O << " = vp.load from index " << i;
4252 }
4253 ++OpIdx;
4254 }
4255}
4256#endif
4257
4259 VPCostContext &Ctx) const {
4260 Instruction *InsertPos = getInsertPos();
4261 // Find the VPValue index of the interleave group. We need to skip gaps.
4262 unsigned InsertPosIdx = 0;
4263 for (unsigned Idx = 0; IG->getFactor(); ++Idx)
4264 if (auto *Member = IG->getMember(Idx)) {
4265 if (Member == InsertPos)
4266 break;
4267 InsertPosIdx++;
4268 }
4269 Type *ValTy = Ctx.Types.inferScalarType(
4270 getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
4271 : getStoredValues()[InsertPosIdx]);
4272 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
4273 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
4274 ->getAddressSpace();
4275
4276 unsigned InterleaveFactor = IG->getFactor();
4277 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
4278
4279 // Holds the indices of existing members in the interleaved group.
4280   SmallVector<unsigned, 4> Indices;
4281   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
4282 if (IG->getMember(IF))
4283 Indices.push_back(IF);
4284
4285 // Calculate the cost of the whole interleaved group.
4286 InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
4287 InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
4288 IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
4289
4290 if (!IG->isReverse())
4291 return Cost;
4292
4293 return Cost + IG->getNumMembers() *
4294 Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
4295 VectorTy, VectorTy, {}, Ctx.CostKind,
4296 0);
4297}
4298
4299#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4301 VPSlotTracker &SlotTracker) const {
4302 O << Indent << "EMIT ";
4304 O << " = CANONICAL-INDUCTION ";
4306}
4307#endif
4308
4310 return vputils::onlyScalarValuesUsed(this) &&
4311 (!IsScalable || vputils::onlyFirstLaneUsed(this));
4312}
4313
4314#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4316 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4317 assert((getNumOperands() == 3 || getNumOperands() == 5) &&
4318 "unexpected number of operands");
4319 O << Indent << "EMIT ";
4321 O << " = WIDEN-POINTER-INDUCTION ";
4323 O << ", ";
4325 O << ", ";
4327 if (getNumOperands() == 5) {
4328 O << ", ";
4330 O << ", ";
4332 }
4333}
4334
4336 VPSlotTracker &SlotTracker) const {
4337 O << Indent << "EMIT ";
4339 O << " = EXPAND SCEV " << *Expr;
4340}
4341#endif
4342
4344 Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
4345 Type *STy = CanonicalIV->getType();
4346 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
4347 ElementCount VF = State.VF;
4348 Value *VStart = VF.isScalar()
4349 ? CanonicalIV
4350 : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
4351 Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this));
4352 if (VF.isVector()) {
4353 VStep = Builder.CreateVectorSplat(VF, VStep);
4354 VStep =
4355 Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
4356 }
4357 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
4358 State.set(this, CanonicalVectorIV);
4359}
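// Illustrative sketch (not part of the original source): for a fixed VF of 4
// and unroll part 0, a canonical IV value %civ becomes
//   vec.iv = <%civ, %civ, %civ, %civ> + <0, 1, 2, 3>
// so each lane holds the scalar induction value of its iteration.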
4360
4361#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4363 VPSlotTracker &SlotTracker) const {
4364 O << Indent << "EMIT ";
4366 O << " = WIDEN-CANONICAL-INDUCTION ";
4368}
4369#endif
4370
4372 auto &Builder = State.Builder;
4373 // Create a vector from the initial value.
4374 auto *VectorInit = getStartValue()->getLiveInIRValue();
4375
4376 Type *VecTy = State.VF.isScalar()
4377 ? VectorInit->getType()
4378 : VectorType::get(VectorInit->getType(), State.VF);
4379
4380 BasicBlock *VectorPH =
4381 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4382 if (State.VF.isVector()) {
4383 auto *IdxTy = Builder.getInt32Ty();
4384 auto *One = ConstantInt::get(IdxTy, 1);
4385 IRBuilder<>::InsertPointGuard Guard(Builder);
4386 Builder.SetInsertPoint(VectorPH->getTerminator());
4387 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
4388 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4389 VectorInit = Builder.CreateInsertElement(
4390 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
4391 }
4392
4393 // Create a phi node for the new recurrence.
4394 PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
4395 Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
4396 Phi->addIncoming(VectorInit, VectorPH);
4397 State.set(this, Phi);
4398}
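// Illustrative sketch (not part of the original source): for a fixed VF of 4,
// the recurrence start value %init is inserted into the last lane of the
// initial vector, i.e. vector.recur.init = <poison, poison, poison, %init>,
// which is later combined (e.g. via a vector splice) with the values produced
// inside the loop.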
4399
4402 VPCostContext &Ctx) const {
4403 if (VF.isScalar())
4404 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
4405
4406 return 0;
4407}
4408
4409#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4411 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4412 O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
4414 O << " = phi ";
4416}
4417#endif
4418
4420 // Reductions do not have to start at zero. They can start with
4421 // any loop invariant values.
4422 VPValue *StartVPV = getStartValue();
4423
4424 // In order to support recurrences we need to be able to vectorize Phi nodes.
4425 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4426 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4427 // this value when we vectorize all of the instructions that use the PHI.
4428 BasicBlock *VectorPH =
4429 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4430 bool ScalarPHI = State.VF.isScalar() || isInLoop();
4431 Value *StartV = State.get(StartVPV, ScalarPHI);
4432 Type *VecTy = StartV->getType();
4433
4434 BasicBlock *HeaderBB = State.CFG.PrevBB;
4435 assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
4436 "recipe must be in the vector loop header");
4437 auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
4438 Phi->insertBefore(HeaderBB->getFirstInsertionPt());
4439 State.set(this, Phi, isInLoop());
4440
4441 Phi->addIncoming(StartV, VectorPH);
4442}
4443
4444#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4446 VPSlotTracker &SlotTracker) const {
4447 O << Indent << "WIDEN-REDUCTION-PHI ";
4448
4450 O << " = phi ";
4452 if (getVFScaleFactor() > 1)
4453 O << " (VF scaled by 1/" << getVFScaleFactor() << ")";
4454}
4455#endif
4456
4458 Value *Op0 = State.get(getOperand(0));
4459 Type *VecTy = Op0->getType();
4460 Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
4461 State.set(this, VecPhi);
4462}
4463
4464#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4466 VPSlotTracker &SlotTracker) const {
4467 O << Indent << "WIDEN-PHI ";
4468
4470 O << " = phi ";
4472}
4473#endif
4474
4476 BasicBlock *VectorPH =
4477 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4478 Value *StartMask = State.get(getOperand(0));
4479 PHINode *Phi =
4480 State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
4481 Phi->addIncoming(StartMask, VectorPH);
4482 State.set(this, Phi);
4483}
4484
4485#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4487 VPSlotTracker &SlotTracker) const {
4488 O << Indent << "ACTIVE-LANE-MASK-PHI ";
4489
4491 O << " = phi ";
4493}
4494#endif
4495
4496#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4498 VPSlotTracker &SlotTracker) const {
4499 O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
4500
4502 O << " = phi ";
4504}
4505#endif
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static MCDisassembler::DecodeStatus addOperand(MCInst &Inst, const MCOperand &Opnd)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
iv users
Definition IVUsers.cpp:48
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file provides a LoopVectorizationPlanner class.
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static bool isOrdered(const Instruction *I)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file contains the declarations of different VPlan-related auxiliary helpers.
static Instruction * createReverseEVL(IRBuilderBase &Builder, Value *Operand, Value *EVL, const Twine &Name)
Use all-true mask for reverse rather than actual mask, as it avoids a dependence w/o affecting the re...
static Value * interleaveVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vals, const Twine &Name)
Return a vector containing interleaved elements from multiple smaller input vectors.
static InstructionCost getCostForIntrinsics(Intrinsic::ID ID, ArrayRef< const VPValue * > Operands, const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx)
Compute the cost for the intrinsic ID with Operands, produced by R.
static Value * createBitOrPointerCast(IRBuilderBase &Builder, Value *V, VectorType *DstVTy, const DataLayout &DL)
SmallVector< Value *, 2 > VectorParts
static bool isUsedByLoadStoreAddress(const VPUser *V)
Returns true if V is used as part of the address of another load or store.
static void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPLane &Lane, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
This file contains the declarations of the Vectorization Plan base classes:
static const uint32_t IV[8]
Definition blake3_impl.h:83
void printAsOperand(OutputBuffer &OB, Prec P=Prec::Default, bool StrictlyWorse=false) const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
static LLVM_ABI StringRef getPredicateName(Predicate P)
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
LLVM_ABI void print(raw_ostream &O) const
Print fast-math flags to O.
Definition Operator.cpp:272
void setAllowContract(bool B=true)
Definition FMF.h:90
bool noSignedZeros() const
Definition FMF.h:67
bool noInfs() const
Definition FMF.h:66
void setAllowReciprocal(bool B=true)
Definition FMF.h:87
bool allowReciprocal() const
Definition FMF.h:68
void setNoSignedZeros(bool B=true)
Definition FMF.h:84
bool allowReassoc() const
Flag queries.
Definition FMF.h:64
bool approxFunc() const
Definition FMF.h:70
void setNoNaNs(bool B=true)
Definition FMF.h:78
void setAllowReassoc(bool B=true)
Flag setters.
Definition FMF.h:75
bool noNaNs() const
Definition FMF.h:65
void setApproxFunc(bool B=true)
Definition FMF.h:93
void setNoInfs(bool B=true)
Definition FMF.h:81
bool allowContract() const
Definition FMF.h:69
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
bool willReturn() const
Determine if the function will return.
Definition Function.h:661
bool doesNotThrow() const
Determine if the function cannot unwind.
Definition Function.h:594
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:214
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2585
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:547
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2639
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2573
LLVM_ABI Value * CreateVectorSplice(Value *V1, Value *V2, int64_t Imm, const Twine &Name="")
Return a vector splice intrinsic if using scalable vectors, otherwise return a shufflevector.
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2632
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2651
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:562
Value * CreatePtrAdd(Value *Ptr, Value *Offset, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:2039
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition IRBuilder.h:567
LLVM_ABI Value * CreateVectorReverse(Value *V, const Twine &Name="")
Return a vector value that contains the vector V reversed.
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2336
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
LLVM_ABI CallInst * CreateOrReduce(Value *Src)
Create a vector int OR reduction intrinsic of the source vector.
Value * CreateLogicalAnd(Value *Cond1, Value *Cond2, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.h:1725
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:522
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2466
Value * CreateNot(Value *V, const Twine &Name="")
Definition IRBuilder.h:1808
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2332
Value * CreateCountTrailingZeroElems(Type *ResTy, Value *Mask, bool ZeroIsPoison=true, const Twine &Name="")
Create a call to llvm.experimental_cttz_elts.
Definition IRBuilder.h:1134
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1420
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition IRBuilder.h:1197
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2085
LLVM_ABI CallInst * CreateIntMaxReduce(Value *Src, bool IsSigned=false)
Create a vector integer max reduction intrinsic of the source vector.
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1403
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition IRBuilder.h:507
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1708
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2344
LLVM_ABI CallInst * CreateIntMinReduce(Value *Src, bool IsSigned=false)
Create a vector integer min reduction intrinsic of the source vector.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2442
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1573
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1437
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2794
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isUnaryOp() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
The group of interleaved loads/stores sharing the same stride and close to each other.
uint32_t getFactor() const
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
bool isReverse() const
InstTy * getInsertPos() const
void addMetadata(InstTy *NewInst) const
Add metadata (e.g.
Align getAlign() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Information for memory intrinsic cost model.
Root of the metadata hierarchy.
Definition Metadata.h:64
LLVM_ABI void print(raw_ostream &OS, const Module *M=nullptr, bool IsForDebug=false) const
Print.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
static bool isSignedRecurrenceKind(RecurKind Kind)
Returns true if recurrece kind is a signed redux kind.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isFindLastIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
This class provides computation of slot numbers for LLVM Assembly writing.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
LLVM_ABI InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
@ TCC_Free
Expected to fold away in lowering.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Reverse
Reverse the order of the vector.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:280
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
value_op_iterator value_op_end()
Definition User.h:314
void setOperand(unsigned i, Value *Val)
Definition User.h:238
Value * getOperand(unsigned i) const
Definition User.h:233
value_op_iterator value_op_begin()
Definition User.h:311
void execute(VPTransformState &State) override
Generate the active lane mask phi of the vector loop.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
RecipeListTy & getRecipeList()
Returns a reference to the list of recipes.
Definition VPlan.h:4053
iterator end()
Definition VPlan.h:4037
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition VPlan.h:4066
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenMemoryRecipe.
VPValue * getIncomingValue(unsigned Idx) const
Return incoming value number Idx.
Definition VPlan.h:2578
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2573
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:81
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:204
VPlan * getPlan()
Definition VPlan.cpp:173
void printAsOperand(raw_ostream &OS, bool PrintType=false) const
Definition VPlan.h:349
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPBranchOnMaskRecipe.
void execute(VPTransformState &State) override
Generate the extraction of the appropriate bit from the block mask and the conditional branch.
VPlan-based builder utility analogous to IRBuilder.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
This class augments a recipe with a set of VPValues defined by the recipe.
Definition VPlanValue.h:332
LLVM_ABI_FOR_TEST void dump() const
Dump the VPDef to stderr (for debugging).
Definition VPlan.cpp:110
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:453
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:426
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:438
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:448
unsigned getVPDefID() const
Definition VPlanValue.h:458
VPIRValue * getStartValue() const
Definition VPlan.h:3801
VPValue * getStepValue() const
Definition VPlan.h:3802
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void decompose()
Insert the recipes of the expression back into the VPlan, directly before the current recipe.
bool isSingleScalar() const
Returns true if the result of this VPExpressionRecipe is a single-scalar.
bool mayHaveSideEffects() const
Returns true if this expression contains recipes that may have side effects.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Compute the cost of this recipe either using a recipe's specialized implementation or using the legac...
bool mayReadOrWriteMemory() const
Returns true if this expression contains recipes that may read from or write to memory.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this header phi recipe.
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2102
void execute(VPTransformState &State) override
Produce a vectorized histogram operation.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPHistogramRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getMask() const
Return the mask operand if one was provided, or a null pointer if all lanes should be executed uncond...
Definition VPlan.h:1853
Class to record and manage LLVM IR flags.
Definition VPlan.h:608
FastMathFlagsTy FMFs
Definition VPlan.h:695
ReductionFlagsTy ReductionFlags
Definition VPlan.h:697
LLVM_ABI_FOR_TEST bool flagsValidForOpcode(unsigned Opcode) const
Returns true if the set flags are valid for Opcode.
WrapFlagsTy WrapFlags
Definition VPlan.h:689
CmpInst::Predicate CmpPredicate
Definition VPlan.h:688
void printFlags(raw_ostream &O) const
GEPNoWrapFlags GEPFlags
Definition VPlan.h:693
bool hasFastMathFlags() const
Returns true if the recipe has fast-math flags.
Definition VPlan.h:882
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
bool isReductionOrdered() const
Definition VPlan.h:932
TruncFlagsTy TruncFlags
Definition VPlan.h:690
CmpInst::Predicate getPredicate() const
Definition VPlan.h:859
ExactFlagsTy ExactFlags
Definition VPlan.h:692
bool hasNoSignedWrap() const
Definition VPlan.h:909
void intersectFlags(const VPIRFlags &Other)
Only keep flags also present in Other.
GEPNoWrapFlags getGEPNoWrapFlags() const
Definition VPlan.h:874
bool hasPredicate() const
Returns true if the recipe has a comparison predicate.
Definition VPlan.h:877
DisjointFlagsTy DisjointFlags
Definition VPlan.h:691
unsigned AllFlags
Definition VPlan.h:698
bool hasNoUnsignedWrap() const
Definition VPlan.h:898
FCmpFlagsTy FCmpFlags
Definition VPlan.h:696
NonNegFlagsTy NonNegFlags
Definition VPlan.h:694
bool isReductionInLoop() const
Definition VPlan.h:938
void applyFlags(Instruction &I) const
Apply the IR flags to I.
Definition VPlan.h:816
RecurKind getRecurKind() const
Definition VPlan.h:926
Instruction & getInstruction() const
Definition VPlan.h:1507
void extractLastLaneOfLastPartOfFirstOperand(VPBuilder &Builder)
Update the recipe's first operand to the last lane of the last part of the operand using Builder.
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPIRInstruction.
VPIRInstruction(Instruction &I)
VPIRInstruction::create() should be used to create VPIRInstructions, as subclasses may need to be cre...
Definition VPlan.h:1482
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
VPIRMetadata()=default
void print(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print metadata with node IDs.
void applyMetadata(Instruction &I) const
Add all metadata to I.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the instruction.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPInstruction.
static unsigned getNumOperandsForOpcode(unsigned Opcode)
Return the number of operands determined by the opcode of the VPInstruction.
bool doesGeneratePerAllLanes() const
Returns true if this VPInstruction generates scalar values for all lanes.
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1188
@ ComputeAnyOfResult
Compute the final result of a AnyOf reduction with select(cmp(),x,y), where one of (x,...
Definition VPlan.h:1133
@ WideIVStep
Scale the first operand (vector step) by the second operand (scalar-step).
Definition VPlan.h:1178
@ ResumeForEpilogue
Explicit user for the resume phi of the canonical induction in the main VPlan, used by the epilogue v...
Definition VPlan.h:1191
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1130
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1182
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1125
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1122
@ VScale
Returns the value for vscale.
Definition VPlan.h:1193
@ CanonicalIVIncrementForPart
Definition VPlan.h:1106
bool hasResult() const
Definition VPlan.h:1257
bool opcodeMayReadOrWriteFromMemory() const
Returns true if the underlying opcode may read from or write to memory.
LLVM_DUMP_METHOD void dump() const
Print the VPInstruction to dbgs() (for debugging).
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the VPInstruction to O.
StringRef getName() const
Returns the symbolic name assigned to the VPInstruction.
Definition VPlan.h:1298
unsigned getOpcode() const
Definition VPlan.h:1241
VPInstruction(unsigned Opcode, ArrayRef< VPValue * > Operands, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
bool isVectorToScalar() const
Returns true if this VPInstruction produces a scalar value from a vector, e.g.
bool isSingleScalar() const
Returns true if this VPInstruction's operands are single scalars and the result is also a single scal...
void execute(VPTransformState &State) override
Generate the instruction.
bool usesFirstPartOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first part of operand Op.
bool needsMaskForGaps() const
Return true if the access needs a mask because of the gaps.
Definition VPlan.h:2690
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this recipe.
Instruction * getInsertPos() const
Definition VPlan.h:2694
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2692
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2684
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2713
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:2678
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:2788
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:2801
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:2751
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
static VPLane getLastLaneForVF(const ElementCount &VF)
static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset)
static VPLane getFirstLane()
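A minimal sketch of the VPLane factories above (VF is the plan's vectorization factor; the offset semantics of getLaneFromEnd are assumed, with 1 naming the last lane):
  static void laneExamples(const ElementCount &VF) {
    VPLane First = VPLane::getFirstLane();         // lane 0
    VPLane Last = VPLane::getLastLaneForVF(VF);    // last lane, even for scalable VF
    VPLane Penult = VPLane::getLaneFromEnd(VF, 2); // assumed: second lane from the end
    (void)First; (void)Last; (void)Penult;
  }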
virtual const VPRecipeBase * getAsRecipe() const =0
Return a VPRecipeBase* to the current object.
virtual unsigned getNumIncoming() const
Returns the number of incoming values, which is also the number of incoming blocks.
Definition VPlan.h:1397
void removeIncomingValueFor(VPBlockBase *IncomingBlock) const
Removes the incoming value for IncomingBlock, which must be a predecessor.
const VPBasicBlock * getIncomingBlock(unsigned Idx) const
Returns the incoming block with index Idx.
Definition VPlan.h:4144
detail::zippy< llvm::detail::zip_first, VPUser::const_operand_range, const_incoming_blocks_range > incoming_values_and_blocks() const
Returns an iterator range over pairs of incoming values and corresponding incoming blocks.
Definition VPlan.h:1422
VPValue * getIncomingValue(unsigned Idx) const
Returns the incoming VPValue with index Idx.
Definition VPlan.h:1389
void printPhiOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the recipe.
void execute(VPTransformState &State) override
Generates phi nodes for live-outs (from a replicate region) as needed to retain SSA form.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:387
bool mayReadFromMemory() const
Returns true if the recipe may read from memory.
bool mayHaveSideEffects() const
Returns true if the recipe may have side-effects.
virtual void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const =0
Each concrete VPRecipe prints itself, without printing common information, like debug info or metadat...
VPRegionBlock * getRegion()
Definition VPlan.h:4305
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override final
Print the recipe, delegating to printRecipe().
bool isPhi() const
Returns true for PHI-like recipes.
bool mayWriteToMemory() const
Returns true if the recipe may write to memory.
virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const
Compute the cost of this recipe either using a recipe's specialized implementation or using the legac...
VPBasicBlock * getParent()
Definition VPlan.h:408
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:479
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this recipe, taking into account if the cost computation should be skipped and the...
bool isScalarCast() const
Return true if the recipe is a scalar cast.
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
void moveAfter(VPRecipeBase *MovePos)
Unlink this recipe from its current VPBasicBlock and insert it into the VPBasicBlock that MovePos liv...
VPRecipeBase(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:398
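A minimal sketch of the insertion/removal API above (Def, UserR, and Dead are hypothetical recipes in the same VPlan):
  static void hoistBeforeUser(VPRecipeBase *Def, VPRecipeBase *UserR,
                              VPRecipeBase *Dead) {
    // Unlink Def from its current block and re-insert it just before UserR.
    Def->moveBefore(*UserR->getParent(), UserR->getIterator());
    // Unlink Dead from its block and delete it.
    Dead->eraseFromParent();
  }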
friend class VPValue
Definition VPlanValue.h:212
void execute(VPTransformState &State) override
Generate the reduction in the loop.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:2951
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2495
bool isInLoop() const
Returns true if the phi is part of an in-loop reduction.
Definition VPlan.h:2519
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool isConditional() const
Return true if the in-loop reduction is conditional.
Definition VPlan.h:2893
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of VPReductionRecipe.
VPValue * getVecOp() const
The VPValue of the vector value to be reduced.
Definition VPlan.h:2904
VPValue * getCondOp() const
The VPValue of the condition for the block.
Definition VPlan.h:2906
RecurKind getRecurrenceKind() const
Return the recurrence kind for the in-loop reduction.
Definition VPlan.h:2889
bool isPartialReduction() const
Returns true if the reduction outputs a vector with a scaled down VF.
Definition VPlan.h:2895
VPValue * getChainOp() const
The VPValue of the scalar Chain being accumulated.
Definition VPlan.h:2902
bool isInLoop() const
Returns true if the reduction is in-loop.
Definition VPlan.h:2897
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the reduction in the loop.
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4188
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4256
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2973
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool isSingleScalar() const
Definition VPlan.h:3014
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPReplicateRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getOpcode() const
Definition VPlan.h:3043
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
VPValue * getStepValue() const
Definition VPlan.h:3868
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the scalarized versions of the phi node as needed by their users.
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:531
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:594
LLVM_ABI_FOR_TEST LLVM_DUMP_METHOD void dump() const
Print this VPSingleDefRecipe to dbgs() (for debugging).
VPSingleDefRecipe(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:533
This class can be used to assign names to VPValues.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
Helper to access the operand that contains the unroll part for this recipe after unrolling.
Definition VPlan.h:1020
VPValue * getUnrollPartOperand(const VPUser &U) const
Return the VPValue operand containing the unroll part or null if there is no such operand.
unsigned getUnrollPart(const VPUser &U) const
Return the unroll part.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:229
void printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the operands to O.
Definition VPlan.cpp:1428
operand_range operands()
Definition VPlanValue.h:297
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:273
unsigned getNumOperands() const
Definition VPlanValue.h:267
operand_iterator op_begin()
Definition VPlanValue.h:293
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:268
virtual bool usesFirstLaneOnly(const VPValue *Op) const
Returns true if the VPUser only uses the first lane of operand Op.
Definition VPlanValue.h:312
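A minimal sketch using the operand accessors above (U is any VPUser; Old and New are hypothetical VPValues):
  static void replaceOperandUses(VPUser &U, VPValue *Old, VPValue *New) {
    // Rewrite every operand slot that currently references Old.
    for (unsigned I = 0, E = U.getNumOperands(); I != E; ++I)
      if (U.getOperand(I) == Old)
        U.setOperand(I, New);
  }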
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:45
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:133
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1382
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:119
void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const
Definition VPlan.cpp:1424
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:72
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1385
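A minimal sketch combining the VPValue queries above (V and Repl are hypothetical values in the same plan):
  static void foldAway(VPValue *V, VPValue *Repl) {
    // Live-ins have no defining recipe, so guard the erase below.
    VPRecipeBase *DefR = V->getDefiningRecipe();
    V->replaceAllUsesWith(Repl);
    if (DefR)
      DefR->eraseFromParent();
  }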
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getSourceElementType() const
Definition VPlan.h:2007
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
operand_range args()
Definition VPlan.h:1809
Function * getCalledScalarFunction() const
Definition VPlan.h:1805
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCallRecipe.
void execute(VPTransformState &State) override
Produce a widened version of the call instruction.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate a canonical vector induction variable of the vector loop, with start = {<Part*VF,...
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getResultType() const
Returns the result type of the cast.
Definition VPlan.h:1659
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Produce widened copies of the cast.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCastRecipe.
void execute(VPTransformState &State) override
Generate the gep nodes.
Type * getSourceElementType() const
Definition VPlan.h:1905
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2165
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2168
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2263
TruncInst * getTruncInst()
Returns the first defined value as a TruncInst if it is one, or nullptr otherwise.
Definition VPlan.h:2278
Type * getScalarType() const
Returns the scalar type of the induction.
Definition VPlan.h:2287
bool isCanonical() const
Returns true if the induction is canonical, i.e.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Intrinsic::ID getVectorIntrinsicID() const
Return the ID of the intrinsic.
Definition VPlan.h:1741
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
StringRef getIntrinsicName() const
Return the name of the intrinsic as a string.
LLVM_ABI_FOR_TEST bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the VPUser only uses the first lane of operand Op.
Type * getResultType() const
Return the scalar return type of the intrinsic.
Definition VPlan.h:1744
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Produce a widened version of the vector intrinsic.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this vector intrinsic.
bool IsMasked
Whether the memory access is masked.
Definition VPlan.h:3298
bool Reverse
Whether the consecutive accessed addresses are in reverse order.
Definition VPlan.h:3295
bool isConsecutive() const
Return whether the loaded-from / stored-to addresses are consecutive.
Definition VPlan.h:3338
Instruction & Ingredient
Definition VPlan.h:3286
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenMemoryRecipe.
bool Consecutive
Whether the accessed addresses are consecutive.
Definition VPlan.h:3292
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3352
Align Alignment
Alignment information for this memory access.
Definition VPlan.h:3289
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:3345
bool isReverse() const
Return whether the consecutive loaded/stored addresses are in reverse order.
Definition VPlan.h:3342
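A minimal sketch of how the consecutive/reverse queries above classify an access (MemR is a hypothetical VPWidenMemoryRecipe inspected during cost modeling):
  static const char *classifyAccess(const VPWidenMemoryRecipe &MemR) {
    if (!MemR.isConsecutive())
      return "gather/scatter";        // non-consecutive addresses
    return MemR.isReverse() ? "reversed contiguous" : "contiguous";
  }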
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool onlyScalarsGenerated(bool IsScalable)
Returns true if only scalar values will be generated.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
void execute(VPTransformState &State) override
Produce a widened instruction using the opcode and operands of the recipe, processing State....
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getUF() const
Definition VPlan.h:4535
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1022
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:397
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1106
void mutateType(Type *Ty)
Mutate the type of this Value to be of the specified type.
Definition Value.h:838
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
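A minimal sketch of the ElementCount queries above (VF is a fixed or scalable vectorization factor):
  static unsigned knownLaneCount(ElementCount VF) {
    ElementCount Half = VF.divideCoefficientBy(2); // halves fixed and scalable VFs alike
    (void)Half;
    if (VF.isScalable())
      return VF.getKnownMinValue(); // only a lower bound; true count is a vscale multiple
    return VF.getFixedValue();      // exact lane count for a fixed-width VF
  }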
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
iterator erase(iterator where)
Definition ilist.h:204
pointer remove(iterator &IT)
Definition ilist.h:188
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor)
Returns the corresponding llvm.vector.deinterleaveN intrinsic for factor N.
LLVM_ABI StringRef getBaseName(ID id)
Return the LLVM name for an intrinsic, without encoded types for overloading, such as "llvm....
bool match(Val *V, const Pattern &P)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isAddressSCEVForCost(const SCEV *Addr, ScalarEvolution &SE, const Loop *L)
Returns true if Addr is an address SCEV that can be passed to TTI::getAddressComputationCost,...
bool onlyFirstPartUsed(const VPValue *Def)
Returns true if only the first part of Def is used.
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
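A minimal sketch of createSimpleReduction (assuming Builder is positioned where the final reduction value is needed and Vec holds the vector of partial results):
  static Value *finalizeAddReduction(IRBuilderBase &Builder, Value *Vec) {
    // Collapse the lanes of an integer add reduction to a single scalar.
    return createSimpleReduction(Builder, Vec, RecurKind::Add);
  }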
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:829
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2544
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2198
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2303
auto cast_or_null(const Y &Val)
Definition Casting.h:714
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI Value * createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, Value *Right)
Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
LLVM_ABI Constant * createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Type * toVectorizedTy(Type *Ty, ElementCount EC)
A helper for converting to vectorized types.
cl::opt< unsigned > ForceTargetInstructionCost
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
@ Other
Any other memory.
Definition ModRef.h:68
bool canVectorizeTy(Type *Ty)
Returns true if Ty is a valid vector element type, void, or an unpacked literal struct where all elem...
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
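A minimal sketch of the shuffle-mask helpers above for a factor-2 interleave group with a fixed VF of 4 (the masks in the comments follow from the definitions above and are illustrative):
  static void interleaveMaskExamples() {
    // Member 0 of the group out of the wide load: {0, 2, 4, 6}.
    SmallVector<int, 16> Strided = createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
    // Re-interleave two concatenated 4-element vectors: {0, 4, 1, 5, 2, 6, 3, 7}.
    SmallVector<int, 16> Interleaved = createInterleaveMask(/*VF=*/4, /*NumVecs=*/2);
    (void)Strided; (void)Interleaved;
  }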
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Mul
Product of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
DWARFExpression::Operation Op
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
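A minimal sketch of the load/store helpers above (I is assumed to be a LoadInst or StoreInst):
  static void describeAccess(const Value *I) {
    Type *AccessTy = getLoadStoreType(I);              // type loaded or stored
    const Value *Ptr = getLoadStorePointerOperand(I);  // address operand
    Align A = getLoadStoreAlignment(I);                // alignment of the access
    (void)AccessTy; (void)Ptr; (void)A;
  }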
LLVM_ABI Value * createOrderedReduction(IRBuilderBase &B, RecurKind RdxKind, Value *Src, Value *Start)
Create an ordered reduction intrinsic using the given recurrence kind RdxKind.
ArrayRef< Type * > getContainedTypes(Type *const &Ty)
Returns the types contained in Ty.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
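A minimal sketch combining the type helpers above (ScalarTy is the scalar result type of a recipe, VF the chosen vectorization factor):
  static Type *widenResultType(Type *ScalarTy, ElementCount VF) {
    if (!canVectorizeTy(ScalarTy))
      return nullptr;                      // unsupported element or aggregate type
    return toVectorizedTy(ScalarTy, VF);   // also handles structs of vectorizable elements
  }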
LLVM_ABI Value * createAnyOfReduction(IRBuilderBase &B, Value *Src, Value *InitVal, PHINode *OrigPhi)
Create a reduction of the given vector Src for a reduction of kind RecurKind::AnyOf.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Struct to hold various analysis needed for cost computations.
TargetTransformInfo::OperandValueInfo getOperandInfo(VPValue *V) const
Returns the OperandInfo for V, if it is a live-in.
Definition VPlan.cpp:1737
TargetTransformInfo::TargetCostKind CostKind
VPTypeAnalysis Types
const TargetTransformInfo & TTI
void execute(VPTransformState &State) override
Generate the phi nodes.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this first-order recurrence phi recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
An overlay for VPIRInstructions wrapping PHI nodes, enabling convenient use of cast/dyn_cast/isa and exec...
Definition VPlan.h:1545
PHINode & getIRPhi()
Definition VPlan.h:1553
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:184
Value * getValue() const
Returns the underlying IR value.
Definition VPlanValue.h:190
void execute(VPTransformState &State) override
Generate the instruction.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
A pure-virtual common base class for recipes defining a single VPValue and using IR flags.
Definition VPlan.h:974
InstructionCost getCostForRecipeWithOpcode(unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const
Compute the cost for this recipe for VF, using Opcode and Ctx.
VPRecipeWithIRFlags(const unsigned char SC, ArrayRef< VPValue * > Operands, const VPIRFlags &Flags, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:975
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:202
SmallDenseMap< const VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
struct llvm::VPTransformState::CFGState CFG
Value * get(const VPValue *Def, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def if IsScalar is false, otherwise return the gen...
Definition VPlan.cpp:275
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
ElementCount VF
The chosen Vectorization Factor of the loop being vectorized.
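A minimal sketch of the execute() pattern that uses the VPTransformState members above (R is a hypothetical two-operand recipe producing a widened add; VPTransformState::set is assumed to exist as the counterpart of the get() shown above):
  static void executeWidenedAdd(VPSingleDefRecipe &R, VPTransformState &State) {
    Value *A = State.get(R.getOperand(0));             // generated vector for operand 0
    Value *B = State.get(R.getOperand(1));             // generated vector for operand 1
    Value *Sum = State.Builder.CreateAdd(A, B, "wide.add");
    State.set(&R, Sum);                                // publish the result for R's users
  }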
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Generate the wide load or gather.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenLoadEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3430
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate a wide load or gather.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition VPlan.h:3513
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Generate the wide store or scatter.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenStoreEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3516
void execute(VPTransformState &State) override
Generate a wide store or scatter.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition VPlan.h:3476