1//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains implementations for different VPlan recipes.
11///
12//===----------------------------------------------------------------------===//
13
15#include "VPlan.h"
16#include "VPlanAnalysis.h"
17#include "VPlanHelpers.h"
18#include "VPlanPatternMatch.h"
19#include "VPlanUtils.h"
20#include "llvm/ADT/STLExtras.h"
22#include "llvm/ADT/Twine.h"
26#include "llvm/IR/BasicBlock.h"
27#include "llvm/IR/IRBuilder.h"
28#include "llvm/IR/Instruction.h"
30#include "llvm/IR/Intrinsics.h"
31#include "llvm/IR/Type.h"
32#include "llvm/IR/Value.h"
35#include "llvm/Support/Debug.h"
40#include <cassert>
41
42using namespace llvm;
43using namespace llvm::VPlanPatternMatch;
44
46
47#define LV_NAME "loop-vectorize"
48#define DEBUG_TYPE LV_NAME
49
50bool VPRecipeBase::mayWriteToMemory() const {
51 switch (getVPDefID()) {
52 case VPExpressionSC:
53 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
54 case VPInstructionSC:
55 return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
56 case VPInterleaveEVLSC:
57 case VPInterleaveSC:
58 return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
59 case VPWidenStoreEVLSC:
60 case VPWidenStoreSC:
61 return true;
62 case VPReplicateSC:
63 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
64 ->mayWriteToMemory();
65 case VPWidenCallSC:
66 return !cast<VPWidenCallRecipe>(this)
67 ->getCalledScalarFunction()
68 ->onlyReadsMemory();
69 case VPWidenIntrinsicSC:
70 return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
71 case VPCanonicalIVPHISC:
72 case VPBranchOnMaskSC:
73 case VPDerivedIVSC:
74 case VPFirstOrderRecurrencePHISC:
75 case VPReductionPHISC:
76 case VPScalarIVStepsSC:
77 case VPPredInstPHISC:
78 return false;
79 case VPBlendSC:
80 case VPReductionEVLSC:
81 case VPReductionSC:
82 case VPVectorPointerSC:
83 case VPWidenCanonicalIVSC:
84 case VPWidenCastSC:
85 case VPWidenGEPSC:
86 case VPWidenIntOrFpInductionSC:
87 case VPWidenLoadEVLSC:
88 case VPWidenLoadSC:
89 case VPWidenPHISC:
90 case VPWidenPointerInductionSC:
91 case VPWidenSC:
92 case VPWidenSelectSC: {
93 const Instruction *I =
94 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
95 (void)I;
96 assert((!I || !I->mayWriteToMemory()) &&
97 "underlying instruction may write to memory");
98 return false;
99 }
100 default:
101 return true;
102 }
103}
104
105bool VPRecipeBase::mayReadFromMemory() const {
106 switch (getVPDefID()) {
107 case VPExpressionSC:
108 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
109 case VPInstructionSC:
110 return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
111 case VPWidenLoadEVLSC:
112 case VPWidenLoadSC:
113 return true;
114 case VPReplicateSC:
115 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
116 ->mayReadFromMemory();
117 case VPWidenCallSC:
118 return !cast<VPWidenCallRecipe>(this)
119 ->getCalledScalarFunction()
120 ->onlyWritesMemory();
121 case VPWidenIntrinsicSC:
122 return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
123 case VPBranchOnMaskSC:
124 case VPDerivedIVSC:
125 case VPFirstOrderRecurrencePHISC:
126 case VPPredInstPHISC:
127 case VPScalarIVStepsSC:
128 case VPWidenStoreEVLSC:
129 case VPWidenStoreSC:
130 return false;
131 case VPBlendSC:
132 case VPReductionEVLSC:
133 case VPReductionSC:
134 case VPVectorPointerSC:
135 case VPWidenCanonicalIVSC:
136 case VPWidenCastSC:
137 case VPWidenGEPSC:
138 case VPWidenIntOrFpInductionSC:
139 case VPWidenPHISC:
140 case VPWidenPointerInductionSC:
141 case VPWidenSC:
142 case VPWidenSelectSC: {
143 const Instruction *I =
144 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
145 (void)I;
146 assert((!I || !I->mayReadFromMemory()) &&
147 "underlying instruction may read from memory");
148 return false;
149 }
150 default:
151 // FIXME: Return false if the recipe represents an interleaved store.
152 return true;
153 }
154}
155
156bool VPRecipeBase::mayHaveSideEffects() const {
157 switch (getVPDefID()) {
158 case VPExpressionSC:
159 return cast<VPExpressionRecipe>(this)->mayHaveSideEffects();
160 case VPDerivedIVSC:
161 case VPFirstOrderRecurrencePHISC:
162 case VPPredInstPHISC:
163 case VPVectorEndPointerSC:
164 return false;
165 case VPInstructionSC:
166 return mayWriteToMemory();
167 case VPWidenCallSC: {
168 Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
169 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
170 }
171 case VPWidenIntrinsicSC:
172 return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
173 case VPBlendSC:
174 case VPReductionEVLSC:
175 case VPPartialReductionSC:
176 case VPReductionSC:
177 case VPScalarIVStepsSC:
178 case VPVectorPointerSC:
179 case VPWidenCanonicalIVSC:
180 case VPWidenCastSC:
181 case VPWidenGEPSC:
182 case VPWidenIntOrFpInductionSC:
183 case VPWidenPHISC:
184 case VPWidenPointerInductionSC:
185 case VPWidenSC:
186 case VPWidenSelectSC: {
187 const Instruction *I =
188 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
189 (void)I;
190 assert((!I || !I->mayHaveSideEffects()) &&
191 "underlying instruction has side-effects");
192 return false;
193 }
194 case VPInterleaveEVLSC:
195 case VPInterleaveSC:
196 return mayWriteToMemory();
197 case VPWidenLoadEVLSC:
198 case VPWidenLoadSC:
199 case VPWidenStoreEVLSC:
200 case VPWidenStoreSC:
201 assert(
202 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
204 "mayHaveSideffects result for ingredient differs from this "
205 "implementation");
206 return mayWriteToMemory();
207 case VPReplicateSC: {
208 auto *R = cast<VPReplicateRecipe>(this);
209 return R->getUnderlyingInstr()->mayHaveSideEffects();
210 }
211 default:
212 return true;
213 }
214}
215
216void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
217 assert(!Parent && "Recipe already in some VPBasicBlock");
218 assert(InsertPos->getParent() &&
219 "Insertion position not in any VPBasicBlock");
220 InsertPos->getParent()->insert(this, InsertPos->getIterator());
221}
222
223void VPRecipeBase::insertBefore(VPBasicBlock &BB,
224 iplist<VPRecipeBase>::iterator I) {
225 assert(!Parent && "Recipe already in some VPBasicBlock");
226 assert(I == BB.end() || I->getParent() == &BB);
227 BB.insert(this, I);
228}
229
230void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
231 assert(!Parent && "Recipe already in some VPBasicBlock");
232 assert(InsertPos->getParent() &&
233 "Insertion position not in any VPBasicBlock");
234 InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
235}
236
237void VPRecipeBase::removeFromParent() {
238 assert(getParent() && "Recipe not in any VPBasicBlock");
239 getParent()->getRecipeList().remove(getIterator());
240 Parent = nullptr;
241}
242
243iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
244 assert(getParent() && "Recipe not in any VPBasicBlock");
245 return getParent()->getRecipeList().erase(getIterator());
246}
247
248void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
249 removeFromParent();
250 insertAfter(InsertPos);
251}
252
258
259InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
260 // Get the underlying instruction for the recipe, if there is one. It is used
261 // to
262 // * decide if cost computation should be skipped for this recipe,
263 // * apply forced target instruction cost.
264 Instruction *UI = nullptr;
265 if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
266 UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
267 else if (auto *IG = dyn_cast<VPInterleaveBase>(this))
268 UI = IG->getInsertPos();
269 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
270 UI = &WidenMem->getIngredient();
271
272 InstructionCost RecipeCost;
273 if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
274 RecipeCost = 0;
275 } else {
276 RecipeCost = computeCost(VF, Ctx);
277 if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
278 RecipeCost.isValid())
279 RecipeCost = InstructionCost(ForceTargetInstructionCost);
280 }
281
282 LLVM_DEBUG({
283 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
284 dump();
285 });
286 return RecipeCost;
287}
288
290 VPCostContext &Ctx) const {
291 llvm_unreachable("subclasses should implement computeCost");
292}
293
295 return (getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC) ||
297}
298
300 auto *VPI = dyn_cast<VPInstruction>(this);
301 return VPI && Instruction::isCast(VPI->getOpcode());
302}
303
306 VPCostContext &Ctx) const {
307 std::optional<unsigned> Opcode;
308 VPValue *Op = getVecOp();
309 uint64_t MulConst;
310 // If the partial reduction is predicated, a select will be operand 1.
311 // If it isn't predicated and the mul isn't operating on a constant, then it
312 // should have been turned into a VPExpressionRecipe.
313 // FIXME: Replace the entire function with this once all partial reduction
314 // variants are bundled into VPExpressionRecipe.
316 !match(Op, m_Mul(m_VPValue(), m_ConstantInt(MulConst)))) {
317 auto *PhiType = Ctx.Types.inferScalarType(getChainOp());
318 auto *InputType = Ctx.Types.inferScalarType(getVecOp());
319 return Ctx.TTI.getPartialReductionCost(getOpcode(), InputType, InputType,
320 PhiType, VF, TTI::PR_None,
321 TTI::PR_None, {}, Ctx.CostKind);
322 }
323
324 VPRecipeBase *OpR = Op->getDefiningRecipe();
325 Type *InputTypeA = nullptr, *InputTypeB = nullptr;
326 TTI::PartialReductionExtendKind ExtAType = TTI::PR_None,
327 ExtBType = TTI::PR_None;
328
329 auto GetExtendKind = [](VPRecipeBase *R) {
330 if (!R)
331 return TTI::PR_None;
332 auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(R);
333 if (!WidenCastR)
334 return TTI::PR_None;
335 if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt)
336 return TTI::PR_ZeroExtend;
337 if (WidenCastR->getOpcode() == Instruction::CastOps::SExt)
338 return TTI::PR_SignExtend;
339 return TTI::PR_None;
340 };
341
342 // Pick out opcode, type/ext information and use sub side effects from a widen
343 // recipe.
344 auto HandleWiden = [&](VPWidenRecipe *Widen) {
345 if (match(Widen, m_Sub(m_ZeroInt(), m_VPValue(Op)))) {
346 Widen = dyn_cast<VPWidenRecipe>(Op->getDefiningRecipe());
347 }
348 Opcode = Widen->getOpcode();
349 VPRecipeBase *ExtAR = Widen->getOperand(0)->getDefiningRecipe();
350 VPRecipeBase *ExtBR = Widen->getOperand(1)->getDefiningRecipe();
351 InputTypeA = Ctx.Types.inferScalarType(ExtAR ? ExtAR->getOperand(0)
352 : Widen->getOperand(0));
353 InputTypeB = Ctx.Types.inferScalarType(ExtBR ? ExtBR->getOperand(0)
354 : Widen->getOperand(1));
355 ExtAType = GetExtendKind(ExtAR);
356 ExtBType = GetExtendKind(ExtBR);
357
358 using namespace VPlanPatternMatch;
359 const APInt *C;
360 if (!ExtBR && match(Widen->getOperand(1), m_APInt(C)) &&
361 canConstantBeExtended(C, InputTypeA, ExtAType)) {
362 InputTypeB = InputTypeA;
363 ExtBType = ExtAType;
364 }
365 };
366
367 if (isa<VPWidenCastRecipe>(OpR)) {
368 InputTypeA = Ctx.Types.inferScalarType(OpR->getOperand(0));
369 ExtAType = GetExtendKind(OpR);
370 } else if (isa<VPReductionPHIRecipe>(OpR)) {
371 auto RedPhiOp1R = getOperand(1)->getDefiningRecipe();
372 if (isa<VPWidenCastRecipe>(RedPhiOp1R)) {
373 InputTypeA = Ctx.Types.inferScalarType(RedPhiOp1R->getOperand(0));
374 ExtAType = GetExtendKind(RedPhiOp1R);
375 } else if (auto Widen = dyn_cast<VPWidenRecipe>(RedPhiOp1R))
376 HandleWiden(Widen);
377 } else if (auto Widen = dyn_cast<VPWidenRecipe>(OpR)) {
378 HandleWiden(Widen);
379 } else if (auto Reduction = dyn_cast<VPPartialReductionRecipe>(OpR)) {
380 return Reduction->computeCost(VF, Ctx);
381 }
382 auto *PhiType = Ctx.Types.inferScalarType(getOperand(1));
383 return Ctx.TTI.getPartialReductionCost(getOpcode(), InputTypeA, InputTypeB,
384 PhiType, VF, ExtAType, ExtBType,
385 Opcode, Ctx.CostKind);
386}
387
389 auto &Builder = State.Builder;
390
391 assert(getOpcode() == Instruction::Add &&
392 "Unhandled partial reduction opcode");
393
394 Value *BinOpVal = State.get(getOperand(1));
395 Value *PhiVal = State.get(getOperand(0));
396 assert(PhiVal && BinOpVal && "Phi and Mul must be set");
397
398 Type *RetTy = PhiVal->getType();
399
400 CallInst *V =
401 Builder.CreateIntrinsic(RetTy, Intrinsic::vector_partial_reduce_add,
402 {PhiVal, BinOpVal}, nullptr, "partial.reduce");
403
404 State.set(this, V);
405}
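// Illustrative sketch of the emitted call (values chosen for exposition, not
// taken from the source): for a dot-product style loop with i8 inputs
// accumulated into i32 and VF = 16, the accumulator phi is <4 x i32> and the
// code above emits roughly
//   %partial.reduce = call <4 x i32> @llvm.vector.partial.reduce.add(
//                         <4 x i32> %phi, <16 x i32> %bin.op)
// folding the 16 wide products into the 4 accumulator lanes.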
406
407#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
409 VPSlotTracker &SlotTracker) const {
410 O << Indent << "PARTIAL-REDUCE ";
412 O << " = " << Instruction::getOpcodeName(getOpcode()) << " ";
414}
415#endif
416
418 assert(OpType == Other.OpType && "OpType must match");
419 switch (OpType) {
420 case OperationType::OverflowingBinOp:
421 WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
422 WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
423 break;
424 case OperationType::Trunc:
425 TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
426 TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
427 break;
428 case OperationType::DisjointOp:
429 DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
430 break;
431 case OperationType::PossiblyExactOp:
432 ExactFlags.IsExact &= Other.ExactFlags.IsExact;
433 break;
434 case OperationType::GEPOp:
435 GEPFlags &= Other.GEPFlags;
436 break;
437 case OperationType::FPMathOp:
438 FMFs.NoNaNs &= Other.FMFs.NoNaNs;
439 FMFs.NoInfs &= Other.FMFs.NoInfs;
440 break;
441 case OperationType::NonNegOp:
442 NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
443 break;
444 case OperationType::Cmp:
445 assert(CmpPredicate == Other.CmpPredicate && "Cannot drop CmpPredicate");
446 break;
447 case OperationType::Other:
448 assert(AllFlags == Other.AllFlags && "Cannot drop other flags");
449 break;
450 }
451}
452
454 assert(OpType == OperationType::FPMathOp &&
455 "recipe doesn't have fast math flags");
456 FastMathFlags Res;
457 Res.setAllowReassoc(FMFs.AllowReassoc);
458 Res.setNoNaNs(FMFs.NoNaNs);
459 Res.setNoInfs(FMFs.NoInfs);
460 Res.setNoSignedZeros(FMFs.NoSignedZeros);
461 Res.setAllowReciprocal(FMFs.AllowReciprocal);
462 Res.setAllowContract(FMFs.AllowContract);
463 Res.setApproxFunc(FMFs.ApproxFunc);
464 return Res;
465}
466
467#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
469#endif
470
471template <unsigned PartOpIdx>
472VPValue *
474 if (U.getNumOperands() == PartOpIdx + 1)
475 return U.getOperand(PartOpIdx);
476 return nullptr;
477}
478
479template <unsigned PartOpIdx>
481 if (auto *UnrollPartOp = getUnrollPartOperand(U))
482 return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue();
483 return 0;
484}
485
486namespace llvm {
487template class VPUnrollPartAccessor<1>;
488template class VPUnrollPartAccessor<2>;
489template class VPUnrollPartAccessor<3>;
490}
491
493 const VPIRFlags &Flags, DebugLoc DL,
494 const Twine &Name)
495 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL),
496 VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {
498 "Set flags not supported for the provided opcode");
499 assert((getNumOperandsForOpcode(Opcode) == -1u ||
500 getNumOperandsForOpcode(Opcode) == getNumOperands()) &&
501 "number of operands does not match opcode");
502}
503
504#ifndef NDEBUG
505unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
506 if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
507 return 1;
508
509 if (Instruction::isBinaryOp(Opcode))
510 return 2;
511
512 switch (Opcode) {
515 return 0;
516 case Instruction::Alloca:
517 case Instruction::ExtractValue:
518 case Instruction::Freeze:
519 case Instruction::Load:
533 return 1;
534 case Instruction::ICmp:
535 case Instruction::FCmp:
536 case Instruction::Store:
544 return 2;
545 case Instruction::Select:
549 return 3;
551 return 4;
552 case Instruction::Call:
553 case Instruction::GetElementPtr:
554 case Instruction::PHI:
555 case Instruction::Switch:
556 // Cannot determine the number of operands from the opcode.
557 return -1u;
558 }
559 llvm_unreachable("all cases should be handled above");
560}
561#endif
562
566
567bool VPInstruction::canGenerateScalarForFirstLane() const {
569 return true;
571 return true;
572 switch (Opcode) {
573 case Instruction::Freeze:
574 case Instruction::ICmp:
575 case Instruction::PHI:
576 case Instruction::Select:
585 return true;
586 default:
587 return false;
588 }
589}
590
591/// Create a conditional branch using \p Cond branching to the successors of \p
592/// VPBB. Note that the first successor is always forward (i.e. not created yet)
593/// while the second successor may already have been created (if it is a header
594/// block and VPBB is a latch).
596 VPTransformState &State) {
597 // Replace the temporary unreachable terminator with a new conditional
598 // branch, hooking it up to backward destination (header) for latch blocks
599 // now, and to forward destination(s) later when they are created.
600 // Second successor may be backwards - iff it is already in VPBB2IRBB.
601 VPBasicBlock *SecondVPSucc = cast<VPBasicBlock>(VPBB->getSuccessors()[1]);
602 BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc);
603 BasicBlock *IRBB = State.CFG.VPBB2IRBB[VPBB];
604 BranchInst *CondBr = State.Builder.CreateCondBr(Cond, IRBB, SecondIRSucc);
605 // First successor is always forward, reset it to nullptr
606 CondBr->setSuccessor(0, nullptr);
608 return CondBr;
609}
610
611Value *VPInstruction::generate(VPTransformState &State) {
612 IRBuilderBase &Builder = State.Builder;
613
615 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
616 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
617 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
618 auto *Res =
619 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
620 if (auto *I = dyn_cast<Instruction>(Res))
621 applyFlags(*I);
622 return Res;
623 }
624
625 switch (getOpcode()) {
626 case VPInstruction::Not: {
627 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
628 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
629 return Builder.CreateNot(A, Name);
630 }
631 case Instruction::ExtractElement: {
632 assert(State.VF.isVector() && "Only extract elements from vectors");
633 if (getOperand(1)->isLiveIn()) {
634 unsigned IdxToExtract =
635 cast<ConstantInt>(getOperand(1)->getLiveInIRValue())->getZExtValue();
636 return State.get(getOperand(0), VPLane(IdxToExtract));
637 }
638 Value *Vec = State.get(getOperand(0));
639 Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
640 return Builder.CreateExtractElement(Vec, Idx, Name);
641 }
642 case Instruction::Freeze: {
644 return Builder.CreateFreeze(Op, Name);
645 }
646 case Instruction::FCmp:
647 case Instruction::ICmp: {
648 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
649 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
650 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
651 return Builder.CreateCmp(getPredicate(), A, B, Name);
652 }
653 case Instruction::PHI: {
654 llvm_unreachable("should be handled by VPPhi::execute");
655 }
656 case Instruction::Select: {
657 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
658 Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
659 Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
660 Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
661 return Builder.CreateSelect(Cond, Op1, Op2, Name);
662 }
664 // Get first lane of vector induction variable.
665 Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
666 // Get the original loop tripcount.
667 Value *ScalarTC = State.get(getOperand(1), VPLane(0));
668
669 // If this part of the active lane mask is scalar, generate the CMP directly
670 // to avoid unnecessary extracts.
671 if (State.VF.isScalar())
672 return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
673 Name);
674
675 auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
676 auto PredTy = VectorType::get(
677 Int1Ty, State.VF * cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
678 ->getZExtValue());
679 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
680 {PredTy, ScalarTC->getType()},
681 {VIVElem0, ScalarTC}, nullptr, Name);
682 }
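// Illustrative example for the ActiveLaneMask lowering above: with the first
// IV lane at 8, a trip count of 10 and a 4-lane predicate,
// @llvm.get.active.lane.mask returns <1, 1, 0, 0>, i.e. lane i is active iff
// 8 + i < 10 (unsigned compare).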
684 // Generate code to combine the previous and current values in vector v3.
685 //
686 // vector.ph:
687 // v_init = vector(..., ..., ..., a[-1])
688 // br vector.body
689 //
690 // vector.body
691 // i = phi [0, vector.ph], [i+4, vector.body]
692 // v1 = phi [v_init, vector.ph], [v2, vector.body]
693 // v2 = a[i, i+1, i+2, i+3];
694 // v3 = vector(v1(3), v2(0, 1, 2))
695
696 auto *V1 = State.get(getOperand(0));
697 if (!V1->getType()->isVectorTy())
698 return V1;
699 Value *V2 = State.get(getOperand(1));
700 return Builder.CreateVectorSplice(V1, V2, -1, Name);
701 }
703 unsigned UF = getParent()->getPlan()->getUF();
704 Value *ScalarTC = State.get(getOperand(0), VPLane(0));
705 Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
706 Value *Sub = Builder.CreateSub(ScalarTC, Step);
707 Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
708 Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
709 return Builder.CreateSelect(Cmp, Sub, Zero);
710 }
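// Illustrative example for the computation above: with a scalar trip count of
// 10 and VF * UF = 4, Sub is 6 and Cmp is true, so the select yields 6; when
// the trip count does not exceed VF * UF the result is 0, which avoids
// wrapping in the unsigned subtraction.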
712 // TODO: Restructure this code with an explicit remainder loop so that
713 // vsetvli can be outside of the main loop.
714 Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
715 // Compute EVL
716 assert(AVL->getType()->isIntegerTy() &&
717 "Requested vector length should be an integer.");
718
719 assert(State.VF.isScalable() && "Expected scalable vector factor.");
720 Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
721
722 Value *EVL = State.Builder.CreateIntrinsic(
723 State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
724 {AVL, VFArg, State.Builder.getTrue()});
725 return EVL;
726 }
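// Note on the intrinsic above: the returned EVL is never larger than AVL or
// vscale * VF, and equals AVL once the remaining trip count fits into a
// single vector; for larger AVLs targets such as RISC-V may return a smaller
// value to balance the remaining iterations.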
728 unsigned Part = getUnrollPart(*this);
729 auto *IV = State.get(getOperand(0), VPLane(0));
730 assert(Part != 0 && "Must have a positive part");
731 // The canonical IV is incremented by the vectorization factor (num of
732 // SIMD elements) times the unroll part.
733 Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
734 return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
735 hasNoSignedWrap());
736 }
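// Illustrative example: with a fixed VF of 4, unroll part 2 adds a step of 8
// to the part-0 canonical IV, giving each part its own offset into the
// current vector iteration.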
738 Value *Cond = State.get(getOperand(0), VPLane(0));
739 auto *Br = createCondBranch(Cond, getParent(), State);
740 applyMetadata(*Br);
741 return Br;
742 }
744 // First create the compare.
745 Value *IV = State.get(getOperand(0), /*IsScalar*/ true);
746 Value *TC = State.get(getOperand(1), /*IsScalar*/ true);
747 Value *Cond = Builder.CreateICmpEQ(IV, TC);
748 return createCondBranch(Cond, getParent(), State);
749 }
751 return Builder.CreateVectorSplat(
752 State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
753 }
755 // For struct types, we need to build a new 'wide' struct type, where each
756 // element is widened, i.e., we create a struct of vectors.
757 auto *StructTy =
759 Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF));
760 for (const auto &[LaneIndex, Op] : enumerate(operands())) {
761 for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
762 FieldIndex++) {
763 Value *ScalarValue =
764 Builder.CreateExtractValue(State.get(Op, true), FieldIndex);
765 Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex);
766 VectorValue =
767 Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex);
768 Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex);
769 }
770 }
771 return Res;
772 }
774 auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0));
775 auto NumOfElements = ElementCount::getFixed(getNumOperands());
776 Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements));
777 for (const auto &[Idx, Op] : enumerate(operands()))
778 Res = State.Builder.CreateInsertElement(Res, State.get(Op, true),
779 State.Builder.getInt32(Idx));
780 return Res;
781 }
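// For example, a buildvector of four single-scalar operands %a, %b, %c, %d
// yields a <4 x ty> value with %a..%d inserted at lanes 0 through 3.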
783 if (State.VF.isScalar())
784 return State.get(getOperand(0), true);
785 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
787 // If this start vector is scaled then it should produce a vector with fewer
788 // elements than the VF.
789 ElementCount VF = State.VF.divideCoefficientBy(
790 cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue());
791 auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true));
792 Constant *Zero = Builder.getInt32(0);
793 return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true),
794 Zero);
795 }
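// Illustrative example: with VF = 8 and a scale factor of 4 in operand 2, the
// start vector has 2 lanes; lane 0 holds the start value (operand 0) and the
// remaining lane holds the neutral value (operand 1).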
797 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
798 // and will be removed by breaking up the recipe further.
799 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
800 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
801 Value *ReducedPartRdx = State.get(getOperand(2));
802 for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx)
803 ReducedPartRdx = Builder.CreateBinOp(
806 State.get(getOperand(Idx)), ReducedPartRdx, "bin.rdx");
807 return createAnyOfReduction(Builder, ReducedPartRdx,
808 State.get(getOperand(1), VPLane(0)), OrigPhi);
809 }
811 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
812 // and will be removed by breaking up the recipe further.
813 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
814 // Get its reduction variable descriptor.
815 RecurKind RK = PhiR->getRecurrenceKind();
817 "Unexpected reduction kind");
818 assert(!PhiR->isInLoop() &&
819 "In-loop FindLastIV reduction is not supported yet");
820
821 // The recipe's operands are the reduction phi, the start value, the
822 // sentinel value, followed by one operand for each part of the reduction.
823 unsigned UF = getNumOperands() - 3;
824 Value *ReducedPartRdx = State.get(getOperand(3));
825 RecurKind MinMaxKind;
828 MinMaxKind = IsSigned ? RecurKind::SMax : RecurKind::UMax;
829 else
830 MinMaxKind = IsSigned ? RecurKind::SMin : RecurKind::UMin;
831 for (unsigned Part = 1; Part < UF; ++Part)
832 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
833 State.get(getOperand(3 + Part)));
834
835 Value *Start = State.get(getOperand(1), true);
837 return createFindLastIVReduction(Builder, ReducedPartRdx, RK, Start,
838 Sentinel);
839 }
841 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
842 // and will be removed by breaking up the recipe further.
843 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
844 // Get its reduction variable descriptor.
845
846 RecurKind RK = PhiR->getRecurrenceKind();
848 "should be handled by ComputeFindIVResult");
849
850 // The recipe's operands are the reduction phi, followed by one operand for
851 // each part of the reduction.
852 unsigned UF = getNumOperands() - 1;
853 VectorParts RdxParts(UF);
854 for (unsigned Part = 0; Part < UF; ++Part)
855 RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop());
856
857 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
858 if (hasFastMathFlags())
860
861 // Reduce all of the unrolled parts into a single vector.
862 Value *ReducedPartRdx = RdxParts[0];
863 if (PhiR->isOrdered()) {
864 ReducedPartRdx = RdxParts[UF - 1];
865 } else {
866 // Floating-point operations should have some FMF to enable the reduction.
867 for (unsigned Part = 1; Part < UF; ++Part) {
868 Value *RdxPart = RdxParts[Part];
870 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
871 else {
873 // For sub-recurrences, each unrolled part's reduction variable is already
874 // negative, so we need to do: reduce.add(-acc_uf0 + -acc_uf1)
875 if (RK == RecurKind::Sub)
876 Opcode = Instruction::Add;
877 else
878 Opcode =
880 ReducedPartRdx =
881 Builder.CreateBinOp(Opcode, RdxPart, ReducedPartRdx, "bin.rdx");
882 }
883 }
884 }
885
886 // Create the reduction after the loop. Note that inloop reductions create
887 // the target reduction in the loop using a Reduction recipe.
888 if (State.VF.isVector() && !PhiR->isInLoop()) {
889 // TODO: Support in-order reductions based on the recurrence descriptor.
890 // All ops in the reduction inherit fast-math-flags from the recurrence
891 // descriptor.
892 ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK);
893 }
894
895 return ReducedPartRdx;
896 }
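// In short: ordered reductions take the value of the last part as-is,
// unordered ones combine the parts pairwise, and for vector VFs that are not
// in-loop reductions the combined vector is then reduced to a single scalar.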
900 unsigned Offset =
902 Value *Res;
903 if (State.VF.isVector()) {
904 assert(Offset <= State.VF.getKnownMinValue() &&
905 "invalid offset to extract from");
906 // Extract lane VF - Offset from the operand.
907 Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
908 } else {
909 assert(Offset <= 1 && "invalid offset to extract from");
910 Res = State.get(getOperand(0));
911 }
913 Res->setName(Name);
914 return Res;
915 }
917 Value *A = State.get(getOperand(0));
918 Value *B = State.get(getOperand(1));
919 return Builder.CreateLogicalAnd(A, B, Name);
920 }
923 "can only generate first lane for PtrAdd");
924 Value *Ptr = State.get(getOperand(0), VPLane(0));
925 Value *Addend = State.get(getOperand(1), VPLane(0));
926 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
927 }
929 Value *Ptr =
931 Value *Addend = State.get(getOperand(1));
932 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
935 Value *Res = Builder.CreateFreeze(State.get(getOperand(0)));
936 for (VPValue *Op : drop_begin(operands()))
937 Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op)));
938 return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
939 }
941 Value *LaneToExtract = State.get(getOperand(0), true);
942 Type *IdxTy = State.TypeAnalysis.inferScalarType(getOperand(0));
943 Value *Res = nullptr;
944 Value *RuntimeVF = getRuntimeVF(State.Builder, IdxTy, State.VF);
945
946 for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
947 Value *VectorStart =
948 Builder.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Idx - 1));
949 Value *VectorIdx = Idx == 1
950 ? LaneToExtract
951 : Builder.CreateSub(LaneToExtract, VectorStart);
952 Value *Ext = State.VF.isScalar()
953 ? State.get(getOperand(Idx))
954 : Builder.CreateExtractElement(
955 State.get(getOperand(Idx)), VectorIdx);
956 if (Res) {
957 Value *Cmp = Builder.CreateICmpUGE(LaneToExtract, VectorStart);
958 Res = Builder.CreateSelect(Cmp, Ext, Res);
959 } else {
960 Res = Ext;
961 }
962 }
963 return Res;
964 }
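// Illustrative example: with two unrolled parts and a fixed VF of 4,
// extracting lane 6 compares 6 >= 4, so the result is taken from the second
// part at vector index 6 - 4 = 2.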
966 if (getNumOperands() == 1) {
967 Value *Mask = State.get(getOperand(0));
968 return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
969 true, Name);
970 }
971 // If there are multiple operands, create a chain of selects to pick the
972 // first operand with an active lane and add the number of lanes of the
973 // preceding operands.
974 Value *RuntimeVF =
975 getRuntimeVF(State.Builder, State.Builder.getInt64Ty(), State.VF);
976 unsigned LastOpIdx = getNumOperands() - 1;
977 Value *Res = nullptr;
978 for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
979 Value *TrailingZeros =
980 State.VF.isScalar()
981 ? Builder.CreateZExt(
982 Builder.CreateICmpEQ(State.get(getOperand(Idx)),
983 Builder.getFalse()),
984 Builder.getInt64Ty())
985 : Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(),
986 State.get(getOperand(Idx)),
987 true, Name);
988 Value *Current = Builder.CreateAdd(
989 Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros);
990 if (Res) {
991 Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF);
992 Res = Builder.CreateSelect(Cmp, Current, Res);
993 } else {
994 Res = Current;
995 }
996 }
997
998 return Res;
999 }
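// Illustrative example: with two parts of VF 4 where part 0 is all-false and
// part 1 is <0, 1, 0, 0>, part 0's trailing-zero count equals the runtime VF,
// so the select keeps part 1's result 1 * 4 + 1 = 5 as the first active lane.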
1001 return State.get(getOperand(0), true);
1002 default:
1003 llvm_unreachable("Unsupported opcode for instruction");
1004 }
1005}
1006
1008 unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
1009 Type *ScalarTy = Ctx.Types.inferScalarType(this);
1010 Type *ResultTy = VF.isVector() ? toVectorTy(ScalarTy, VF) : ScalarTy;
1011 switch (Opcode) {
1012 case Instruction::FNeg:
1013 return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, Ctx.CostKind);
1014 case Instruction::UDiv:
1015 case Instruction::SDiv:
1016 case Instruction::SRem:
1017 case Instruction::URem:
1018 case Instruction::Add:
1019 case Instruction::FAdd:
1020 case Instruction::Sub:
1021 case Instruction::FSub:
1022 case Instruction::Mul:
1023 case Instruction::FMul:
1024 case Instruction::FDiv:
1025 case Instruction::FRem:
1026 case Instruction::Shl:
1027 case Instruction::LShr:
1028 case Instruction::AShr:
1029 case Instruction::And:
1030 case Instruction::Or:
1031 case Instruction::Xor: {
1034
1035 if (VF.isVector()) {
1036 // Certain instructions can be cheaper to vectorize if they have a
1037 // constant second vector operand. One example of this is shifts on x86.
1038 VPValue *RHS = getOperand(1);
1039 RHSInfo = Ctx.getOperandInfo(RHS);
1040
1041 if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
1044 }
1045
1048 if (CtxI)
1049 Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
1050 return Ctx.TTI.getArithmeticInstrCost(
1051 Opcode, ResultTy, Ctx.CostKind,
1052 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1053 RHSInfo, Operands, CtxI, &Ctx.TLI);
1054 }
1055 case Instruction::Freeze:
1056 // This opcode is unknown. Assume that it is the same as 'mul'.
1057 return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, ResultTy,
1058 Ctx.CostKind);
1059 case Instruction::ExtractValue:
1060 return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
1061 Ctx.CostKind);
1062 case Instruction::ICmp:
1063 case Instruction::FCmp: {
1064 Type *ScalarOpTy = Ctx.Types.inferScalarType(getOperand(0));
1065 Type *OpTy = VF.isVector() ? toVectorTy(ScalarOpTy, VF) : ScalarOpTy;
1067 return Ctx.TTI.getCmpSelInstrCost(
1068 Opcode, OpTy, CmpInst::makeCmpResultType(OpTy), getPredicate(),
1069 Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None},
1070 {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
1071 }
1072 }
1073 llvm_unreachable("called for unsupported opcode");
1074}
1075
1077 VPCostContext &Ctx) const {
1079 if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) {
1080 // TODO: Compute cost for VPInstructions without underlying values once
1081 // the legacy cost model has been retired.
1082 return 0;
1083 }
1084
1086 "Should only generate a vector value or single scalar, not scalars "
1087 "for all lanes.");
1089 getOpcode(),
1091 }
1092
1093 switch (getOpcode()) {
1094 case Instruction::Select: {
1095 // TODO: It may be possible to improve this by analyzing where the
1096 // condition operand comes from.
1098 auto *CondTy = Ctx.Types.inferScalarType(getOperand(0));
1099 auto *VecTy = Ctx.Types.inferScalarType(getOperand(1));
1100 if (!vputils::onlyFirstLaneUsed(this)) {
1101 CondTy = toVectorTy(CondTy, VF);
1102 VecTy = toVectorTy(VecTy, VF);
1103 }
1104 return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred,
1105 Ctx.CostKind);
1106 }
1107 case Instruction::ExtractElement:
1109 if (VF.isScalar()) {
1110 // ExtractLane with VF=1 handles extraction across multiple
1111 // parts.
1112 return 0;
1113 }
1114
1115 // Add on the cost of extracting the element.
1116 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1117 return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
1118 Ctx.CostKind);
1119 }
1120 case VPInstruction::AnyOf: {
1121 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1122 return Ctx.TTI.getArithmeticReductionCost(
1123 Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
1124 }
1126 Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
1127 if (VF.isScalar())
1128 return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
1130 CmpInst::ICMP_EQ, Ctx.CostKind);
1131 // Calculate the cost of determining the lane index.
1132 auto *PredTy = toVectorTy(ScalarTy, VF);
1133 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
1134 Type::getInt64Ty(Ctx.LLVMCtx),
1135 {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
1136 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1137 }
1139 assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
1141 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
1142 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1143
1144 return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
1145 cast<VectorType>(VectorTy),
1146 cast<VectorType>(VectorTy), Mask,
1147 Ctx.CostKind, VF.getKnownMinValue() - 1);
1148 }
1150 Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
1151 unsigned Multiplier =
1152 cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue();
1153 Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
1154 IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
1155 {ArgTy, ArgTy});
1156 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1157 }
1159 Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0));
1160 Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
1161 Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
1162 IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
1163 I32Ty, {Arg0Ty, I32Ty, I1Ty});
1164 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1165 }
1167 // Add on the cost of extracting the element.
1168 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1169 return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
1170 VecTy, Ctx.CostKind, 0);
1171 }
1173 if (VF == ElementCount::getScalable(1))
1175 [[fallthrough]];
1176 default:
1177 // TODO: Compute cost for other VPInstructions once the legacy cost model
1178 // has been retired.
1179 assert(!getUnderlyingValue() &&
1180 "unexpected VPInstruction with underlying value");
1181 return 0;
1182 }
1183}
1184
1197
1199 switch (getOpcode()) {
1200 case Instruction::PHI:
1204 return true;
1205 default:
1206 return isScalarCast();
1207 }
1208}
1209
1211 assert(!State.Lane && "VPInstruction executing a Lane");
1212 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1213 assert(flagsValidForOpcode(getOpcode()) &&
1214 "Set flags not supported for the provided opcode");
1215 if (hasFastMathFlags())
1216 State.Builder.setFastMathFlags(getFastMathFlags());
1217 Value *GeneratedValue = generate(State);
1218 if (!hasResult())
1219 return;
1220 assert(GeneratedValue && "generate must produce a value");
1221 bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
1224 assert((((GeneratedValue->getType()->isVectorTy() ||
1225 GeneratedValue->getType()->isStructTy()) ==
1226 !GeneratesPerFirstLaneOnly) ||
1227 State.VF.isScalar()) &&
1228 "scalar value but not only first lane defined");
1229 State.set(this, GeneratedValue,
1230 /*IsScalar*/ GeneratesPerFirstLaneOnly);
1231}
1232
1235 return false;
1236 switch (getOpcode()) {
1237 case Instruction::ExtractElement:
1238 case Instruction::Freeze:
1239 case Instruction::FCmp:
1240 case Instruction::ICmp:
1241 case Instruction::Select:
1242 case Instruction::PHI:
1257 case VPInstruction::Not:
1265 return false;
1266 default:
1267 return true;
1268 }
1269}
1270
1272 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1274 return vputils::onlyFirstLaneUsed(this);
1275
1276 switch (getOpcode()) {
1277 default:
1278 return false;
1279 case Instruction::ExtractElement:
1280 return Op == getOperand(1);
1281 case Instruction::PHI:
1282 return true;
1283 case Instruction::FCmp:
1284 case Instruction::ICmp:
1285 case Instruction::Select:
1286 case Instruction::Or:
1287 case Instruction::Freeze:
1288 case VPInstruction::Not:
1289 // TODO: Cover additional opcodes.
1290 return vputils::onlyFirstLaneUsed(this);
1299 return true;
1302 // Before replicating by VF, Build(Struct)Vector uses all lanes of the
1303 // operand; after replicating its operands, only the first lane is used.
1304 // Before replicating, it will have only a single operand.
1305 return getNumOperands() > 1;
1307 return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
1309 // WidePtrAdd supports scalar and vector base addresses.
1310 return false;
1313 return Op == getOperand(1);
1315 return Op == getOperand(0);
1316 };
1317 llvm_unreachable("switch should return");
1318}
1319
1321 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1323 return vputils::onlyFirstPartUsed(this);
1324
1325 switch (getOpcode()) {
1326 default:
1327 return false;
1328 case Instruction::FCmp:
1329 case Instruction::ICmp:
1330 case Instruction::Select:
1331 return vputils::onlyFirstPartUsed(this);
1335 return true;
1336 };
1337 llvm_unreachable("switch should return");
1338}
1339
1340#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1342 VPSlotTracker SlotTracker(getParent()->getPlan());
1343 print(dbgs(), "", SlotTracker);
1344}
1345
1347 VPSlotTracker &SlotTracker) const {
1348 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1349
1350 if (hasResult()) {
1352 O << " = ";
1353 }
1354
1355 switch (getOpcode()) {
1356 case VPInstruction::Not:
1357 O << "not";
1358 break;
1360 O << "combined load";
1361 break;
1363 O << "combined store";
1364 break;
1366 O << "active lane mask";
1367 break;
1369 O << "EXPLICIT-VECTOR-LENGTH";
1370 break;
1372 O << "first-order splice";
1373 break;
1375 O << "branch-on-cond";
1376 break;
1378 O << "TC > VF ? TC - VF : 0";
1379 break;
1381 O << "VF * Part +";
1382 break;
1384 O << "branch-on-count";
1385 break;
1387 O << "broadcast";
1388 break;
1390 O << "buildstructvector";
1391 break;
1393 O << "buildvector";
1394 break;
1396 O << "extract-lane";
1397 break;
1399 O << "extract-last-element";
1400 break;
1402 O << "extract-last-lane-per-part";
1403 break;
1405 O << "extract-penultimate-element";
1406 break;
1408 O << "compute-anyof-result";
1409 break;
1411 O << "compute-find-iv-result";
1412 break;
1414 O << "compute-reduction-result";
1415 break;
1417 O << "logical-and";
1418 break;
1420 O << "ptradd";
1421 break;
1423 O << "wide-ptradd";
1424 break;
1426 O << "any-of";
1427 break;
1429 O << "first-active-lane";
1430 break;
1432 O << "reduction-start-vector";
1433 break;
1435 O << "resume-for-epilogue";
1436 break;
1438 O << "unpack";
1439 break;
1440 default:
1442 }
1443
1444 printFlags(O);
1446
1447 if (auto DL = getDebugLoc()) {
1448 O << ", !dbg ";
1449 DL.print(O);
1450 }
1451}
1452#endif
1453
1455 State.setDebugLocFrom(getDebugLoc());
1456 if (isScalarCast()) {
1457 Value *Op = State.get(getOperand(0), VPLane(0));
1458 Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
1459 Op, ResultTy);
1460 State.set(this, Cast, VPLane(0));
1461 return;
1462 }
1463 switch (getOpcode()) {
1465 Value *StepVector =
1466 State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF));
1467 State.set(this, StepVector);
1468 break;
1469 }
1470 case VPInstruction::VScale: {
1471 Value *VScale = State.Builder.CreateVScale(ResultTy);
1472 State.set(this, VScale, true);
1473 break;
1474 }
1475
1476 default:
1477 llvm_unreachable("opcode not implemented yet");
1478 }
1479}
1480
1481#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1483 VPSlotTracker &SlotTracker) const {
1484 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1486 O << " = ";
1487
1488 switch (getOpcode()) {
1490 O << "wide-iv-step ";
1492 break;
1494 O << "step-vector " << *ResultTy;
1495 break;
1497 O << "vscale " << *ResultTy;
1498 break;
1499 default:
1500 assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
1503 O << " to " << *ResultTy;
1504 }
1505}
1506#endif
1507
1509 State.setDebugLocFrom(getDebugLoc());
1510 PHINode *NewPhi = State.Builder.CreatePHI(
1511 State.TypeAnalysis.inferScalarType(this), 2, getName());
1512 unsigned NumIncoming = getNumIncoming();
1513 if (getParent() != getParent()->getPlan()->getScalarPreheader()) {
1514 // TODO: Fixup all incoming values of header phis once recipes defining them
1515 // are introduced.
1516 NumIncoming = 1;
1517 }
1518 for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
1519 Value *IncV = State.get(getIncomingValue(Idx), VPLane(0));
1520 BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(getIncomingBlock(Idx));
1521 NewPhi->addIncoming(IncV, PredBB);
1522 }
1523 State.set(this, NewPhi, VPLane(0));
1524}
1525
1526#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1527void VPPhi::print(raw_ostream &O, const Twine &Indent,
1528 VPSlotTracker &SlotTracker) const {
1529 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1531 O << " = phi ";
1533}
1534#endif
1535
1536VPIRInstruction *VPIRInstruction::create(Instruction &I) {
1537 if (auto *Phi = dyn_cast<PHINode>(&I))
1538 return new VPIRPhi(*Phi);
1539 return new VPIRInstruction(I);
1540}
1541
1543 assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
1544 "PHINodes must be handled by VPIRPhi");
1545 // Advance the insert point after the wrapped IR instruction. This allows
1546 // interleaving VPIRInstructions and other recipes.
1547 State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
1548}
1549
1551 VPCostContext &Ctx) const {
1552 // The recipe wraps an existing IR instruction on the border of VPlan's scope,
1553 // hence it does not contribute to the cost-modeling for the VPlan.
1554 return 0;
1555}
1556
1559 "can only update exiting operands to phi nodes");
1560 assert(getNumOperands() > 0 && "must have at least one operand");
1561 VPValue *Exiting = getOperand(0);
1562 if (Exiting->isLiveIn())
1563 return;
1564
1565 Exiting = Builder.createNaryOp(VPInstruction::ExtractLastElement, {Exiting});
1566 setOperand(0, Exiting);
1567}
1568
1569#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1571 VPSlotTracker &SlotTracker) const {
1572 O << Indent << "IR " << I;
1573}
1574#endif
1575
1577 PHINode *Phi = &getIRPhi();
1578 for (const auto &[Idx, Op] : enumerate(operands())) {
1579 VPValue *ExitValue = Op;
1580 auto Lane = vputils::isSingleScalar(ExitValue)
1582 : VPLane::getLastLaneForVF(State.VF);
1583 VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
1584 auto *PredVPBB = Pred->getExitingBasicBlock();
1585 BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
1586 // Set insertion point in PredBB in case an extract needs to be generated.
1587 // TODO: Model extracts explicitly.
1588 State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
1589 Value *V = State.get(ExitValue, VPLane(Lane));
1590 // If there is no existing block for PredBB in the phi, add a new incoming
1591 // value. Otherwise update the existing incoming value for PredBB.
1592 if (Phi->getBasicBlockIndex(PredBB) == -1)
1593 Phi->addIncoming(V, PredBB);
1594 else
1595 Phi->setIncomingValueForBlock(PredBB, V);
1596 }
1597
1598 // Advance the insert point after the wrapped IR instruction. This allows
1599 // interleaving VPIRInstructions and other recipes.
1600 State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator()));
1601}
1602
1604 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1605 assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
1606 "Number of phi operands must match number of predecessors");
1607 unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock);
1608 R->removeOperand(Position);
1609}
1610
1611#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1613 VPSlotTracker &SlotTracker) const {
1614 interleaveComma(enumerate(getAsRecipe()->operands()), O,
1615 [this, &O, &SlotTracker](auto Op) {
1616 O << "[ ";
1617 Op.value()->printAsOperand(O, SlotTracker);
1618 O << ", ";
1619 getIncomingBlock(Op.index())->printAsOperand(O);
1620 O << " ]";
1621 });
1622}
1623#endif
1624
1625#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1626void VPIRPhi::print(raw_ostream &O, const Twine &Indent,
1627 VPSlotTracker &SlotTracker) const {
1629
1630 if (getNumOperands() != 0) {
1631 O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
1633 [&O, &SlotTracker](auto Op) {
1634 std::get<0>(Op)->printAsOperand(O, SlotTracker);
1635 O << " from ";
1636 std::get<1>(Op)->printAsOperand(O);
1637 });
1638 O << ")";
1639 }
1640}
1641#endif
1642
1644 : VPIRMetadata(I) {
1645 if (!LVer || !isa<LoadInst, StoreInst>(&I))
1646 return;
1647 const auto &[AliasScopeMD, NoAliasMD] = LVer->getNoAliasMetadataFor(&I);
1648 if (AliasScopeMD)
1649 Metadata.emplace_back(LLVMContext::MD_alias_scope, AliasScopeMD);
1650 if (NoAliasMD)
1651 Metadata.emplace_back(LLVMContext::MD_noalias, NoAliasMD);
1652}
1653
1655 for (const auto &[Kind, Node] : Metadata)
1656 I.setMetadata(Kind, Node);
1657}
1658
1660 SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection;
1661 for (const auto &[KindA, MDA] : Metadata) {
1662 for (const auto &[KindB, MDB] : Other.Metadata) {
1663 if (KindA == KindB && MDA == MDB) {
1664 MetadataIntersection.emplace_back(KindA, MDA);
1665 break;
1666 }
1667 }
1668 }
1669 Metadata = std::move(MetadataIntersection);
1670}
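// Keeping only the kind/node pairs common to both recipes is the conservative
// choice: e.g. when two memory recipes are merged, an !alias.scope or
// !noalias node survives only if both original accesses carried it.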
1671
1673 assert(State.VF.isVector() && "not widening");
1674 assert(Variant != nullptr && "Can't create vector function.");
1675
1676 FunctionType *VFTy = Variant->getFunctionType();
1677 // Add return type if intrinsic is overloaded on it.
1679 for (const auto &I : enumerate(args())) {
1680 Value *Arg;
1681 // Some vectorized function variants may also take a scalar argument,
1682 // e.g. linear parameters for pointers. This needs to be the scalar value
1683 // from the start of the respective part when interleaving.
1684 if (!VFTy->getParamType(I.index())->isVectorTy())
1685 Arg = State.get(I.value(), VPLane(0));
1686 else
1687 Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
1688 Args.push_back(Arg);
1689 }
1690
1693 if (CI)
1694 CI->getOperandBundlesAsDefs(OpBundles);
1695
1696 CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
1697 applyFlags(*V);
1698 applyMetadata(*V);
1699 V->setCallingConv(Variant->getCallingConv());
1700
1701 if (!V->getType()->isVoidTy())
1702 State.set(this, V);
1703}
1704
1706 VPCostContext &Ctx) const {
1707 return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
1708 Variant->getFunctionType()->params(),
1709 Ctx.CostKind);
1710}
1711
1712#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1714 VPSlotTracker &SlotTracker) const {
1715 O << Indent << "WIDEN-CALL ";
1716
1717 Function *CalledFn = getCalledScalarFunction();
1718 if (CalledFn->getReturnType()->isVoidTy())
1719 O << "void ";
1720 else {
1722 O << " = ";
1723 }
1724
1725 O << "call";
1726 printFlags(O);
1727 O << " @" << CalledFn->getName() << "(";
1728 interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
1729 Op->printAsOperand(O, SlotTracker);
1730 });
1731 O << ")";
1732
1733 O << " (using library function";
1734 if (Variant->hasName())
1735 O << ": " << Variant->getName();
1736 O << ")";
1737}
1738#endif
1739
1741 assert(State.VF.isVector() && "not widening");
1742
1743 SmallVector<Type *, 2> TysForDecl;
1744 // Add return type if intrinsic is overloaded on it.
1745 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI))
1746 TysForDecl.push_back(VectorType::get(getResultType(), State.VF));
1748 for (const auto &I : enumerate(operands())) {
1749 // Some intrinsics have a scalar argument - don't replace it with a
1750 // vector.
1751 Value *Arg;
1752 if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
1753 State.TTI))
1754 Arg = State.get(I.value(), VPLane(0));
1755 else
1756 Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
1757 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
1758 State.TTI))
1759 TysForDecl.push_back(Arg->getType());
1760 Args.push_back(Arg);
1761 }
1762
1763 // Use vector version of the intrinsic.
1764 Module *M = State.Builder.GetInsertBlock()->getModule();
1765 Function *VectorF =
1766 Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
1767 assert(VectorF &&
1768 "Can't retrieve vector intrinsic or vector-predication intrinsics.");
1769
1772 if (CI)
1773 CI->getOperandBundlesAsDefs(OpBundles);
1774
1775 CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
1776
1777 applyFlags(*V);
1778 applyMetadata(*V);
1779
1780 if (!V->getType()->isVoidTy())
1781 State.set(this, V);
1782}
1783
1784/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R.
1787 const VPRecipeWithIRFlags &R,
1788 ElementCount VF,
1789 VPCostContext &Ctx) {
1790 // Some backends analyze intrinsic arguments to determine cost. Use the
1791 // underlying value for the operand if it has one. Otherwise try to use the
1792 // operand of the underlying call instruction, if there is one. Otherwise
1793 // clear Arguments.
1794 // TODO: Rework TTI interface to be independent of concrete IR values.
1796 for (const auto &[Idx, Op] : enumerate(Operands)) {
1797 auto *V = Op->getUnderlyingValue();
1798 if (!V) {
1799 if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) {
1800 Arguments.push_back(UI->getArgOperand(Idx));
1801 continue;
1802 }
1803 Arguments.clear();
1804 break;
1805 }
1806 Arguments.push_back(V);
1807 }
1808
1809 Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
1810 Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
1811 SmallVector<Type *> ParamTys;
1812 for (const VPValue *Op : Operands) {
1813 ParamTys.push_back(VF.isVector()
1814 ? toVectorTy(Ctx.Types.inferScalarType(Op), VF)
1815 : Ctx.Types.inferScalarType(Op));
1816 }
1817
1818 // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
1819 FastMathFlags FMF =
1820 R.hasFastMathFlags() ? R.getFastMathFlags() : FastMathFlags();
1821 IntrinsicCostAttributes CostAttrs(
1822 ID, RetTy, Arguments, ParamTys, FMF,
1823 dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()),
1824 InstructionCost::getInvalid(), &Ctx.TLI);
1825 return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
1826}
1827
1829 VPCostContext &Ctx) const {
1831 return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx);
1832}
1833
1835 return Intrinsic::getBaseName(VectorIntrinsicID);
1836}
1837
1839 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1840 return all_of(enumerate(operands()), [this, &Op](const auto &X) {
1841 auto [Idx, V] = X;
1843 Idx, nullptr);
1844 });
1845}
1846
1847#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1849 VPSlotTracker &SlotTracker) const {
1850 O << Indent << "WIDEN-INTRINSIC ";
1851 if (ResultTy->isVoidTy()) {
1852 O << "void ";
1853 } else {
1855 O << " = ";
1856 }
1857
1858 O << "call";
1859 printFlags(O);
1860 O << getIntrinsicName() << "(";
1861
1863 Op->printAsOperand(O, SlotTracker);
1864 });
1865 O << ")";
1866}
1867#endif
1868
1870 IRBuilderBase &Builder = State.Builder;
1871
1872 Value *Address = State.get(getOperand(0));
1873 Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
1874 VectorType *VTy = cast<VectorType>(Address->getType());
1875
1876 // The histogram intrinsic requires a mask even if the recipe doesn't;
1877 // if the mask operand was omitted then all lanes should be executed and
1878 // we just need to synthesize an all-true mask.
1879 Value *Mask = nullptr;
1880 if (VPValue *VPMask = getMask())
1881 Mask = State.get(VPMask);
1882 else
1883 Mask =
1884 Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));
1885
1886 // If this is a subtract, we want to invert the increment amount. We may
1887 // add a separate intrinsic in future, but for now we'll try this.
1888 if (Opcode == Instruction::Sub)
1889 IncAmt = Builder.CreateNeg(IncAmt);
1890 else
1891 assert(Opcode == Instruction::Add && "only add or sub supported for now");
1892
1893 State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
1894 {VTy, IncAmt->getType()},
1895 {Address, IncAmt, Mask});
1896}
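// Illustrative IR for the lowering above: widening "buckets[idx[i]] += 1"
// emits roughly
//   call void @llvm.experimental.vector.histogram.add(
//            <VF x ptr> %bucket.ptrs, i32 1, <VF x i1> %mask)
// which remains correct when several lanes hit the same bucket in one vector
// iteration.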
1897
1899 VPCostContext &Ctx) const {
1900 // FIXME: Take the gather and scatter into account as well. For now we're
1901 // generating the same cost as the fallback path, but we'll likely
1902 // need to create a new TTI method for determining the cost, including
1903 // whether we can use base + vec-of-smaller-indices or just
1904 // vec-of-pointers.
1905 assert(VF.isVector() && "Invalid VF for histogram cost");
1906 Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
1907 VPValue *IncAmt = getOperand(1);
1908 Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
1909 VectorType *VTy = VectorType::get(IncTy, VF);
1910
1911 // Assume that a non-constant update value (or a constant != 1) requires
1912 // a multiply, and add that into the cost.
1913 InstructionCost MulCost =
1914 Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
1915 if (IncAmt->isLiveIn()) {
1917
1918 if (CI && CI->getZExtValue() == 1)
1919 MulCost = TTI::TCC_Free;
1920 }
1921
1922 // Find the cost of the histogram operation itself.
1923 Type *PtrTy = VectorType::get(AddressTy, VF);
1924 Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
1925 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
1926 Type::getVoidTy(Ctx.LLVMCtx),
1927 {PtrTy, IncTy, MaskTy});
1928
1929 // Add the costs together with the add/sub operation.
1930 return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
1931 Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
1932}
1933
1934#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1936 VPSlotTracker &SlotTracker) const {
1937 O << Indent << "WIDEN-HISTOGRAM buckets: ";
1939
1940 if (Opcode == Instruction::Sub)
1941 O << ", dec: ";
1942 else {
1943 assert(Opcode == Instruction::Add);
1944 O << ", inc: ";
1945 }
1947
1948 if (VPValue *Mask = getMask()) {
1949 O << ", mask: ";
1950 Mask->printAsOperand(O, SlotTracker);
1951 }
1952}
1953
1955 VPSlotTracker &SlotTracker) const {
1956 O << Indent << "WIDEN-SELECT ";
1958 O << " = select ";
1959 printFlags(O);
1961 O << ", ";
1963 O << ", ";
1965 O << (isInvariantCond() ? " (condition is loop invariant)" : "");
1966}
1967#endif
1968
1970 // The condition can be loop invariant but still defined inside the
1971 // loop. This means that we can't just use the original 'cond' value.
1972 // We have to take the 'vectorized' value and pick the first lane.
1973 // Instcombine will make this a no-op.
1974 Value *Cond = State.get(getCond(), isInvariantCond());
1975
1976 Value *Op0 = State.get(getOperand(1));
1977 Value *Op1 = State.get(getOperand(2));
1978 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
1979 State.set(this, Sel);
1980 if (auto *I = dyn_cast<Instruction>(Sel)) {
1982 applyFlags(*I);
1983 applyMetadata(*I);
1984 }
1985}
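// Rough example with assumed names and VF = 4: a scalar
//   %r = select i1 %c, i32 %a, i32 %b
// with a loop-invariant condition is widened to
//   %r.v = select i1 %c, <4 x i32> %a.v, <4 x i32> %b.v
// while a loop-varying condition uses a <4 x i1> condition vector instead.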
1986
1988 VPCostContext &Ctx) const {
1990 bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
1991 Type *ScalarTy = Ctx.Types.inferScalarType(this);
1992 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1993
1994 VPValue *Op0, *Op1;
1995 if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
1996 (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||
1997 match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {
1998 // select x, y, false --> x & y
1999 // select x, true, y --> x | y
2000 const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0);
2001 const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1);
2002
2004 if (all_of(operands(),
2005 [](VPValue *Op) { return Op->getUnderlyingValue(); }))
2006 Operands.append(SI->op_begin(), SI->op_end());
2007 bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
2008 return Ctx.TTI.getArithmeticInstrCost(
2009 IsLogicalOr ? Instruction::Or : Instruction::And, VectorTy,
2010 Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
2011 }
2012
2013 Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
2014 if (!ScalarCond)
2015 CondTy = VectorType::get(CondTy, VF);
2016
2018 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
2019 Pred = Cmp->getPredicate();
2020 return Ctx.TTI.getCmpSelInstrCost(
2021 Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
2022 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
2023}
2024
2025VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
2026 AllowReassoc = FMF.allowReassoc();
2027 NoNaNs = FMF.noNaNs();
2028 NoInfs = FMF.noInfs();
2029 NoSignedZeros = FMF.noSignedZeros();
2030 AllowReciprocal = FMF.allowReciprocal();
2031 AllowContract = FMF.allowContract();
2032 ApproxFunc = FMF.approxFunc();
2033}
2034
2035#if !defined(NDEBUG)
2036bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
2037 switch (OpType) {
2038 case OperationType::OverflowingBinOp:
2039 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
2040 Opcode == Instruction::Mul ||
2041 Opcode == VPInstruction::CanonicalIVIncrementForPart;
2042 case OperationType::Trunc:
2043 return Opcode == Instruction::Trunc;
2044 case OperationType::DisjointOp:
2045 return Opcode == Instruction::Or;
2046 case OperationType::PossiblyExactOp:
2047 return Opcode == Instruction::AShr;
2048 case OperationType::GEPOp:
2049 return Opcode == Instruction::GetElementPtr ||
2050 Opcode == VPInstruction::PtrAdd ||
2051 Opcode == VPInstruction::WidePtrAdd;
2052 case OperationType::FPMathOp:
2053 return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
2054 Opcode == Instruction::FSub || Opcode == Instruction::FNeg ||
2055 Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
2056 Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc ||
2057 Opcode == Instruction::FCmp || Opcode == Instruction::Select ||
2058 Opcode == VPInstruction::WideIVStep ||
2061 case OperationType::NonNegOp:
2062 return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP;
2063 case OperationType::Cmp:
2064 return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
2065 case OperationType::Other:
2066 return true;
2067 }
2068 llvm_unreachable("Unknown OperationType enum");
2069}
2070#endif
2071
2072#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2074 switch (OpType) {
2075 case OperationType::Cmp:
2077 break;
2078 case OperationType::DisjointOp:
2079 if (DisjointFlags.IsDisjoint)
2080 O << " disjoint";
2081 break;
2082 case OperationType::PossiblyExactOp:
2083 if (ExactFlags.IsExact)
2084 O << " exact";
2085 break;
2086 case OperationType::OverflowingBinOp:
2087 if (WrapFlags.HasNUW)
2088 O << " nuw";
2089 if (WrapFlags.HasNSW)
2090 O << " nsw";
2091 break;
2092 case OperationType::Trunc:
2093 if (TruncFlags.HasNUW)
2094 O << " nuw";
2095 if (TruncFlags.HasNSW)
2096 O << " nsw";
2097 break;
2098 case OperationType::FPMathOp:
2100 break;
2101 case OperationType::GEPOp:
2102 if (GEPFlags.isInBounds())
2103 O << " inbounds";
2104 else if (GEPFlags.hasNoUnsignedSignedWrap())
2105 O << " nusw";
2106 if (GEPFlags.hasNoUnsignedWrap())
2107 O << " nuw";
2108 break;
2109 case OperationType::NonNegOp:
2110 if (NonNegFlags.NonNeg)
2111 O << " nneg";
2112 break;
2113 case OperationType::Other:
2114 break;
2115 }
2116 O << " ";
2117}
2118#endif
2119
2121 auto &Builder = State.Builder;
2122 switch (Opcode) {
2123 case Instruction::Call:
2124 case Instruction::Br:
2125 case Instruction::PHI:
2126 case Instruction::GetElementPtr:
2127 case Instruction::Select:
2128 llvm_unreachable("This instruction is handled by a different recipe.");
2129 case Instruction::UDiv:
2130 case Instruction::SDiv:
2131 case Instruction::SRem:
2132 case Instruction::URem:
2133 case Instruction::Add:
2134 case Instruction::FAdd:
2135 case Instruction::Sub:
2136 case Instruction::FSub:
2137 case Instruction::FNeg:
2138 case Instruction::Mul:
2139 case Instruction::FMul:
2140 case Instruction::FDiv:
2141 case Instruction::FRem:
2142 case Instruction::Shl:
2143 case Instruction::LShr:
2144 case Instruction::AShr:
2145 case Instruction::And:
2146 case Instruction::Or:
2147 case Instruction::Xor: {
2148 // Just widen unops and binops.
2150 for (VPValue *VPOp : operands())
2151 Ops.push_back(State.get(VPOp));
2152
2153 Value *V = Builder.CreateNAryOp(Opcode, Ops);
2154
2155 if (auto *VecOp = dyn_cast<Instruction>(V)) {
2156 applyFlags(*VecOp);
2157 applyMetadata(*VecOp);
2158 }
2159
2160 // Use this vector value for all users of the original instruction.
2161 State.set(this, V);
2162 break;
2163 }
2164 case Instruction::ExtractValue: {
2165 assert(getNumOperands() == 2 && "expected single level extractvalue");
2166 Value *Op = State.get(getOperand(0));
2167 auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
2168 Value *Extract = Builder.CreateExtractValue(Op, CI->getZExtValue());
2169 State.set(this, Extract);
2170 break;
2171 }
2172 case Instruction::Freeze: {
2173 Value *Op = State.get(getOperand(0));
2174 Value *Freeze = Builder.CreateFreeze(Op);
2175 State.set(this, Freeze);
2176 break;
2177 }
2178 case Instruction::ICmp:
2179 case Instruction::FCmp: {
2180 // Widen compares. Generate vector compares.
2181 bool FCmp = Opcode == Instruction::FCmp;
2182 Value *A = State.get(getOperand(0));
2183 Value *B = State.get(getOperand(1));
2184 Value *C = nullptr;
2185 if (FCmp) {
2186 // Propagate fast math flags.
2187 C = Builder.CreateFCmpFMF(
2188 getPredicate(), A, B,
2190 } else {
2191 C = Builder.CreateICmp(getPredicate(), A, B);
2192 }
2193 if (auto *I = dyn_cast<Instruction>(C))
2194 applyMetadata(*I);
2195 State.set(this, C);
2196 break;
2197 }
2198 default:
2199 // This instruction is not vectorized by simple widening.
2200 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
2201 << Instruction::getOpcodeName(Opcode));
2202 llvm_unreachable("Unhandled instruction!");
2203 } // end of switch.
2204
2205#if !defined(NDEBUG)
2206 // Verify that VPlan type inference results agree with the type of the
2207 // generated values.
2208 assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
2209 State.get(this)->getType() &&
2210 "inferred type and type from generated instructions do not match");
2211#endif
2212}
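// Sketch of the common binop path above (assumed names): a scalar
//   %add = add nsw i32 %x, %y
// becomes
//   %add.v = add nsw <VF x i32> %x.v, %y.v
// with wrapping/FP flags and metadata copied onto the widened instruction.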
2213
2215 VPCostContext &Ctx) const {
2216 switch (Opcode) {
2217 case Instruction::UDiv:
2218 case Instruction::SDiv:
2219 case Instruction::SRem:
2220 case Instruction::URem:
2221 // If the div/rem operation isn't safe to speculate and requires
2222 // predication, then the only way we can even create a vplan is to insert
2223 // a select on the second input operand to ensure we use the value of 1
2224 // for the inactive lanes. The select will be costed separately.
2225 case Instruction::FNeg:
2226 case Instruction::Add:
2227 case Instruction::FAdd:
2228 case Instruction::Sub:
2229 case Instruction::FSub:
2230 case Instruction::Mul:
2231 case Instruction::FMul:
2232 case Instruction::FDiv:
2233 case Instruction::FRem:
2234 case Instruction::Shl:
2235 case Instruction::LShr:
2236 case Instruction::AShr:
2237 case Instruction::And:
2238 case Instruction::Or:
2239 case Instruction::Xor:
2240 case Instruction::Freeze:
2241 case Instruction::ExtractValue:
2242 case Instruction::ICmp:
2243 case Instruction::FCmp:
2244 return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
2245 default:
2246 llvm_unreachable("Unsupported opcode for instruction");
2247 }
2248}
2249
2250#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2252 VPSlotTracker &SlotTracker) const {
2253 O << Indent << "WIDEN ";
2255 O << " = " << Instruction::getOpcodeName(Opcode);
2256 printFlags(O);
2258}
2259#endif
2260
2262 auto &Builder = State.Builder;
2263 /// Vectorize casts.
2264 assert(State.VF.isVector() && "Not vectorizing?");
2265 Type *DestTy = VectorType::get(getResultType(), State.VF);
2266 VPValue *Op = getOperand(0);
2267 Value *A = State.get(Op);
2268 Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
2269 State.set(this, Cast);
2270 if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
2271 applyFlags(*CastOp);
2272 applyMetadata(*CastOp);
2273 }
2274}
2275
2277 VPCostContext &Ctx) const {
2278 // TODO: In some cases, VPWidenCastRecipes are created but not considered in
2279 // the legacy cost model, including truncates/extends when evaluating a
2280 // reduction in a smaller type.
2281 if (!getUnderlyingValue())
2282 return 0;
2284 // Computes the CastContextHint from a recipe that may access memory.
2284 auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
2285 if (VF.isScalar())
2287 if (isa<VPInterleaveBase>(R))
2289 if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R))
2290 return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
2292 const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
2293 if (WidenMemoryRecipe == nullptr)
2295 if (!WidenMemoryRecipe->isConsecutive())
2297 if (WidenMemoryRecipe->isReverse())
2299 if (WidenMemoryRecipe->isMasked())
2302 };
2303
2304 VPValue *Operand = getOperand(0);
2306 // For Trunc/FPTrunc, get the context from the only user.
2307 if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
2309 if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(*user_begin()))
2310 CCH = ComputeCCH(StoreRecipe);
2311 }
2312 // For Z/Sext, get the context from the operand.
2313 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
2314 Opcode == Instruction::FPExt) {
2315 if (Operand->isLiveIn())
2317 else if (Operand->getDefiningRecipe())
2318 CCH = ComputeCCH(Operand->getDefiningRecipe());
2319 }
2320
2321 auto *SrcTy =
2322 cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(Operand), VF));
2323 auto *DestTy = cast<VectorType>(toVectorTy(getResultType(), VF));
2324 // Arm TTI will use the underlying instruction to determine the cost.
2325 return Ctx.TTI.getCastInstrCost(
2326 Opcode, DestTy, SrcTy, CCH, Ctx.CostKind,
2328}
2329
2330#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2332 VPSlotTracker &SlotTracker) const {
2333 O << Indent << "WIDEN-CAST ";
2335 O << " = " << Instruction::getOpcodeName(Opcode);
2336 printFlags(O);
2338 O << " to " << *getResultType();
2339}
2340#endif
2341
2343 VPCostContext &Ctx) const {
2344 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
2345}
2346
2347/// A helper function that returns an integer or floating-point constant with
2348/// value C.
2350 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
2351 : ConstantFP::get(Ty, C);
2352}
2353
2354#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2356 VPSlotTracker &SlotTracker) const {
2357 O << Indent;
2359 O << " = WIDEN-INDUCTION ";
2361
2362 if (auto *TI = getTruncInst())
2363 O << " (truncated to " << *TI->getType() << ")";
2364}
2365#endif
2366
2368 // The step may be defined by a recipe in the preheader (e.g. if it requires
2369 // SCEV expansion), but for the canonical induction the step is required to be
2370 // 1, which is represented as live-in.
2372 return false;
2375 auto *CanIV = getRegion()->getCanonicalIV();
2376 return StartC && StartC->isZero() && StepC && StepC->isOne() &&
2377 getScalarType() == CanIV->getScalarType();
2378}
2379
2380#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2382 VPSlotTracker &SlotTracker) const {
2383 O << Indent;
2385 O << " = DERIVED-IV ";
2386 getStartValue()->printAsOperand(O, SlotTracker);
2387 O << " + ";
2388 getOperand(1)->printAsOperand(O, SlotTracker);
2389 O << " * ";
2390 getStepValue()->printAsOperand(O, SlotTracker);
2391}
2392#endif
2393
2395 // Fast-math-flags propagate from the original induction instruction.
2396 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
2397 if (hasFastMathFlags())
2398 State.Builder.setFastMathFlags(getFastMathFlags());
2399
2400 // Compute scalar induction steps. The first operand is the scalar induction
2401 // variable on which to base the steps; the step operand gives the size of each step.
2402
2403 Value *BaseIV = State.get(getOperand(0), VPLane(0));
2404 Value *Step = State.get(getStepValue(), VPLane(0));
2405 IRBuilderBase &Builder = State.Builder;
2406
2407 // Ensure step has the same type as that of scalar IV.
2408 Type *BaseIVTy = BaseIV->getType()->getScalarType();
2409 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
2410
2411 // We build scalar steps for both integer and floating-point induction
2412 // variables. Here, we determine the kind of arithmetic we will perform.
2415 if (BaseIVTy->isIntegerTy()) {
2416 AddOp = Instruction::Add;
2417 MulOp = Instruction::Mul;
2418 } else {
2419 AddOp = InductionOpcode;
2420 MulOp = Instruction::FMul;
2421 }
2422
2423 // Determine the number of scalars we need to generate for each unroll
2424 // iteration.
2425 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
2426 // Compute the scalar steps and save the results in State.
2427 Type *IntStepTy =
2428 IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
2429 Type *VecIVTy = nullptr;
2430 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2431 if (!FirstLaneOnly && State.VF.isScalable()) {
2432 VecIVTy = VectorType::get(BaseIVTy, State.VF);
2433 UnitStepVec =
2434 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2435 SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2436 SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
2437 }
2438
2439 unsigned StartLane = 0;
2440 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2441 if (State.Lane) {
2442 StartLane = State.Lane->getKnownLane();
2443 EndLane = StartLane + 1;
2444 }
2445 Value *StartIdx0;
2446 if (getUnrollPart(*this) == 0)
2447 StartIdx0 = ConstantInt::get(IntStepTy, 0);
2448 else {
2449 StartIdx0 = State.get(getOperand(2), true);
2450 if (getUnrollPart(*this) != 1) {
2451 StartIdx0 =
2452 Builder.CreateMul(StartIdx0, ConstantInt::get(StartIdx0->getType(),
2453 getUnrollPart(*this)));
2454 }
2455 StartIdx0 = Builder.CreateSExtOrTrunc(StartIdx0, IntStepTy);
2456 }
2457
2458 if (!FirstLaneOnly && State.VF.isScalable()) {
2459 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2460 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2461 if (BaseIVTy->isFloatingPointTy())
2462 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2463 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2464 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2465 State.set(this, Add);
2466 // It's useful to record the lane values too for the known minimum number
2467 // of elements so we do those below. This improves the code quality when
2468 // trying to extract the first element, for example.
2469 }
2470
2471 if (BaseIVTy->isFloatingPointTy())
2472 StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
2473
2474 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
2475 Value *StartIdx = Builder.CreateBinOp(
2476 AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
2477 // The step returned by `createStepForVF` is a runtime-evaluated value
2478 // when VF is scalable. Otherwise, it should be folded into a Constant.
2479 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2480 "Expected StartIdx to be folded to a constant when VF is not "
2481 "scalable");
2482 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2483 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
2484 State.set(this, Add, VPLane(Lane));
2485 }
2486}
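// In effect, for each generated lane l the value is (assumed notation):
//   Steps[l] = BaseIV + (StartIdx0 + l) * Step
// using add/mul for integer IVs and the induction's FP opcode plus fmul for
// floating-point IVs; for scalable VFs a step-vector based form of the same
// expression is also materialized above.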
2487
2488#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2490 VPSlotTracker &SlotTracker) const {
2491 O << Indent;
2493 O << " = SCALAR-STEPS ";
2495}
2496#endif
2497
2499 assert(State.VF.isVector() && "not widening");
2500 // Construct a vector GEP by widening the operands of the scalar GEP as
2501 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
2502 // results in a vector of pointers when at least one operand of the GEP
2503 // is vector-typed. Thus, to keep the representation compact, we only use
2504 // vector-typed operands for loop-varying values.
2505
2506 if (areAllOperandsInvariant()) {
2507 // If we are vectorizing, but the GEP has only loop-invariant operands,
2508 // the GEP we build (by only using vector-typed operands for
2509 // loop-varying values) would be a scalar pointer. Thus, to ensure we
2510 // produce a vector of pointers, we need to either arbitrarily pick an
2511 // operand to broadcast, or broadcast a clone of the original GEP.
2512 // Here, we broadcast a clone of the original.
2513 //
2514 // TODO: If at some point we decide to scalarize instructions having
2515 // loop-invariant operands, this special case will no longer be
2516 // required. We would add the scalarization decision to
2517 // collectLoopScalars() and teach getVectorValue() to broadcast
2518 // the lane-zero scalar value.
2520 for (unsigned I = 0, E = getNumOperands(); I != E; I++)
2521 Ops.push_back(State.get(getOperand(I), VPLane(0)));
2522
2523 auto *NewGEP =
2524 State.Builder.CreateGEP(getSourceElementType(), Ops[0], drop_begin(Ops),
2525 "", getGEPNoWrapFlags());
2526 Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
2527 State.set(this, Splat);
2528 } else {
2529 // If the GEP has at least one loop-varying operand, we are sure to
2530 // produce a vector of pointers unless VF is scalar.
2531 // The pointer operand of the new GEP. If it's loop-invariant, we
2532 // won't broadcast it.
2533 auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant());
2534
2535 // Collect all the indices for the new GEP. If any index is
2536 // loop-invariant, we won't broadcast it.
2538 for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
2539 VPValue *Operand = getOperand(I);
2540 Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1)));
2541 }
2542
2543 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
2544 // but it should be a vector, otherwise.
2545 auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices,
2546 "", getGEPNoWrapFlags());
2547 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
2548 "NewGEP is not a pointer vector");
2549 State.set(this, NewGEP);
2550 }
2551}
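// Rough example (assumed types): with an invariant base and a widened index,
//   %g = getelementptr inbounds i32, ptr %base, <4 x i64> %idx.v
// yields a <4 x ptr> result, whereas a fully loop-invariant GEP is built once
// from the lane-0 operands and then splatted to all lanes.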
2552
2553#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2555 VPSlotTracker &SlotTracker) const {
2556 O << Indent << "WIDEN-GEP ";
2557 O << (isPointerLoopInvariant() ? "Inv" : "Var");
2558 for (size_t I = 0; I < getNumOperands() - 1; ++I)
2559 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
2560
2561 O << " ";
2563 O << " = getelementptr";
2564 printFlags(O);
2566}
2567#endif
2568
2569static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, bool IsUnitStride,
2570 unsigned CurrentPart, IRBuilderBase &Builder) {
2571 // Use i32 for the gep index type when the value is constant,
2572 // or query DataLayout for a more suitable index type otherwise.
2573 const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
2574 return !IsUnitStride || (IsScalable && (IsReverse || CurrentPart > 0))
2575 ? DL.getIndexType(Builder.getPtrTy(0))
2576 : Builder.getInt32Ty();
2577}
2578
2580 auto &Builder = State.Builder;
2581 unsigned CurrentPart = getUnrollPart(*this);
2582 bool IsUnitStride = Stride == 1 || Stride == -1;
2583 Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true,
2584 IsUnitStride, CurrentPart, Builder);
2585
2586 // The wide store needs to start at the last vector element.
2587 Value *RunTimeVF = State.get(getVFValue(), VPLane(0));
2588 if (IndexTy != RunTimeVF->getType())
2589 RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy);
2590 // NumElt = Stride * CurrentPart * RunTimeVF
2591 Value *NumElt = Builder.CreateMul(
2592 ConstantInt::get(IndexTy, Stride * (int64_t)CurrentPart), RunTimeVF);
2593 // LastLane = Stride * (RunTimeVF - 1)
2594 Value *LastLane = Builder.CreateSub(RunTimeVF, ConstantInt::get(IndexTy, 1));
2595 if (Stride != 1)
2596 LastLane = Builder.CreateMul(ConstantInt::get(IndexTy, Stride), LastLane);
2597 Value *Ptr = State.get(getOperand(0), VPLane(0));
2598 Value *ResultPtr =
2599 Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags());
2600 ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "",
2601 getGEPNoWrapFlags());
2602
2603 State.set(this, ResultPtr, /*IsScalar*/ true);
2604}
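// The two GEPs above compute, in pointer arithmetic terms (sketch):
//   ResultPtr = Ptr + Stride * Part * RunTimeVF + Stride * (RunTimeVF - 1)
// i.e. for the common reverse case (Stride == -1) the address of the element
// accessed by the final lane, which becomes the start of the wide access.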
2605
2606#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2608 VPSlotTracker &SlotTracker) const {
2609 O << Indent;
2611 O << " = vector-end-pointer";
2612 printFlags(O);
2614}
2615#endif
2616
2618 auto &Builder = State.Builder;
2619 unsigned CurrentPart = getUnrollPart(*this);
2620 Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false,
2621 /*IsUnitStride*/ true, CurrentPart, Builder);
2622 Value *Ptr = State.get(getOperand(0), VPLane(0));
2623
2624 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
2625 Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Increment,
2626 "", getGEPNoWrapFlags());
2627
2628 State.set(this, ResultPtr, /*IsScalar*/ true);
2629}
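// Equivalent pointer arithmetic (sketch): ResultPtr = Ptr + Part * VF
// elements of the source element type, so each unrolled part advances the
// base pointer by one vector's worth of elements.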
2630
2631#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2633 VPSlotTracker &SlotTracker) const {
2634 O << Indent;
2636 O << " = vector-pointer ";
2637
2639}
2640#endif
2641
2643 VPCostContext &Ctx) const {
2644 // Handle cases where only the first lane is used the same way as the legacy
2645 // cost model.
2647 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
2648
2649 Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
2650 Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
2651 return (getNumIncomingValues() - 1) *
2652 Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
2653 CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
2654}
2655
2656#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2658 VPSlotTracker &SlotTracker) const {
2659 O << Indent << "BLEND ";
2661 O << " =";
2662 if (getNumIncomingValues() == 1) {
2663 // Not a User of any mask: not really blending, this is a
2664 // single-predecessor phi.
2665 O << " ";
2666 getIncomingValue(0)->printAsOperand(O, SlotTracker);
2667 } else {
2668 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
2669 O << " ";
2670 getIncomingValue(I)->printAsOperand(O, SlotTracker);
2671 if (I == 0)
2672 continue;
2673 O << "/";
2674 getMask(I)->printAsOperand(O, SlotTracker);
2675 }
2676 }
2677}
2678#endif
2679
2681 assert(!State.Lane && "Reduction being replicated.");
2682 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2685 "In-loop AnyOf reductions aren't currently supported");
2686 // Propagate the fast-math flags carried by the underlying instruction.
2687 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
2688 State.Builder.setFastMathFlags(getFastMathFlags());
2689 Value *NewVecOp = State.get(getVecOp());
2690 if (VPValue *Cond = getCondOp()) {
2691 Value *NewCond = State.get(Cond, State.VF.isScalar());
2692 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
2693 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
2694
2695 Value *Start = getRecurrenceIdentity(Kind, ElementTy, getFastMathFlags());
2696 if (State.VF.isVector())
2697 Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
2698
2699 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
2700 NewVecOp = Select;
2701 }
2702 Value *NewRed;
2703 Value *NextInChain;
2704 if (IsOrdered) {
2705 if (State.VF.isVector())
2706 NewRed =
2707 createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain);
2708 else
2709 NewRed = State.Builder.CreateBinOp(
2711 PrevInChain, NewVecOp);
2712 PrevInChain = NewRed;
2713 NextInChain = NewRed;
2714 } else {
2715 PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2716 NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind);
2718 NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain);
2719 else
2720 NextInChain = State.Builder.CreateBinOp(
2722 PrevInChain, NewRed);
2723 }
2724 State.set(this, NextInChain, /*IsScalar*/ true);
2725}
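// Sketch of a conditional, unordered add reduction (assumed names and VF):
//   %sel = select <4 x i1> %cond, <4 x i32> %vec, <4 x i32> zeroinitializer
//   %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %sel)
//   %chain.next = add i32 %chain, %rdx
// Masked-off lanes contribute the recurrence identity (0 for add), so they do
// not change the running value.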
2726
2728 assert(!State.Lane && "Reduction being replicated.");
2729
2730 auto &Builder = State.Builder;
2731 // Propagate the fast-math flags carried by the underlying instruction.
2732 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
2733 Builder.setFastMathFlags(getFastMathFlags());
2734
2736 Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
2737 Value *VecOp = State.get(getVecOp());
2738 Value *EVL = State.get(getEVL(), VPLane(0));
2739
2740 Value *Mask;
2741 if (VPValue *CondOp = getCondOp())
2742 Mask = State.get(CondOp);
2743 else
2744 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2745
2746 Value *NewRed;
2747 if (isOrdered()) {
2748 NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL);
2749 } else {
2750 NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL);
2752 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
2753 else
2754 NewRed = Builder.CreateBinOp(
2756 Prev);
2757 }
2758 State.set(this, NewRed, /*IsScalar*/ true);
2759}
2760
2762 VPCostContext &Ctx) const {
2763 RecurKind RdxKind = getRecurrenceKind();
2764 Type *ElementTy = Ctx.Types.inferScalarType(this);
2765 auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
2766 unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
2768 std::optional<FastMathFlags> OptionalFMF =
2769 ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
2770
2771 // TODO: Support any-of reductions.
2772 assert(
2774 ForceTargetInstructionCost.getNumOccurrences() > 0) &&
2775 "Any-of reduction not implemented in VPlan-based cost model currently.");
2776
2777 // Note that TTI should model the cost of moving the result to the scalar
2778 // register and the BinOp cost in getMinMaxReductionCost().
2781 return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
2782 }
2783
2784 // Note that TTI should model the cost of moving the result to the scalar
2785 // register and the BinOp cost in getArithmeticReductionCost().
2786 return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
2787 Ctx.CostKind);
2788}
2789
2791 ExpressionTypes ExpressionType,
2792 ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
2793 : VPSingleDefRecipe(VPDef::VPExpressionSC, {}, {}),
2794 ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) {
2795 assert(!ExpressionRecipes.empty() && "Nothing to combine?");
2796 assert(
2797 none_of(ExpressionRecipes,
2798 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
2799 "expression cannot contain recipes with side-effects");
2800
2801 // Maintain a copy of the expression recipes as a set of users.
2802 SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
2803 for (auto *R : ExpressionRecipes)
2804 ExpressionRecipesAsSetOfUsers.insert(R);
2805
2806 // Recipes in the expression, except the last one, must only be used by
2807 // (other) recipes inside the expression. If there are other users, external
2808 // to the expression, use a clone of the recipe for external users.
2809 for (VPSingleDefRecipe *R : reverse(ExpressionRecipes)) {
2810 if (R != ExpressionRecipes.back() &&
2811 any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
2812 return !ExpressionRecipesAsSetOfUsers.contains(U);
2813 })) {
2814 // There are users outside of the expression. Clone the recipe and use the
2815 // clone for those external users.
2816 VPSingleDefRecipe *CopyForExtUsers = R->clone();
2817 R->replaceUsesWithIf(CopyForExtUsers, [&ExpressionRecipesAsSetOfUsers](
2818 VPUser &U, unsigned) {
2819 return !ExpressionRecipesAsSetOfUsers.contains(&U);
2820 });
2821 CopyForExtUsers->insertBefore(R);
2822 }
2823 if (R->getParent())
2824 R->removeFromParent();
2825 }
2826
2827 // Internalize all external operands to the expression recipes. To do so,
2828 // create new temporary VPValues for all operands defined by a recipe outside
2829 // the expression. The original operands are added as operands of the
2830 // VPExpressionRecipe itself.
2831 for (auto *R : ExpressionRecipes) {
2832 for (const auto &[Idx, Op] : enumerate(R->operands())) {
2833 auto *Def = Op->getDefiningRecipe();
2834 if (Def && ExpressionRecipesAsSetOfUsers.contains(Def))
2835 continue;
2836 addOperand(Op);
2837 LiveInPlaceholders.push_back(new VPValue());
2838 }
2839 }
2840
2841 // Replace each external operand with the first one created for it in
2842 // LiveInPlaceholders.
2843 for (auto *R : ExpressionRecipes)
2844 for (auto const &[LiveIn, Tmp] : zip(operands(), LiveInPlaceholders))
2845 R->replaceUsesOfWith(LiveIn, Tmp);
2846}
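// For example (illustrative), bundling the recipes for a zext, a mul and a
// reduce.add into one VPExpressionRecipe detaches those recipes from their
// block, re-routes any users external to the bundle to clones, and rewires the
// recipes' external operands through temporary placeholder VPValues so the
// expression behaves as a single recipe taking the original operands.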
2847
2849 for (auto *R : ExpressionRecipes)
2850 // Since the list could contain duplicates, make sure the recipe hasn't
2851 // already been inserted.
2852 if (!R->getParent())
2853 R->insertBefore(this);
2854
2855 for (const auto &[Idx, Op] : enumerate(operands()))
2856 LiveInPlaceholders[Idx]->replaceAllUsesWith(Op);
2857
2858 replaceAllUsesWith(ExpressionRecipes.back());
2859 ExpressionRecipes.clear();
2860}
2861
2863 VPCostContext &Ctx) const {
2864 Type *RedTy = Ctx.Types.inferScalarType(this);
2865 auto *SrcVecTy = cast<VectorType>(
2866 toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
2867 assert(RedTy->isIntegerTy() &&
2868 "VPExpressionRecipe only supports integer types currently.");
2869 unsigned Opcode = RecurrenceDescriptor::getOpcode(
2870 cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
2871 switch (ExpressionType) {
2872 case ExpressionTypes::ExtendedReduction: {
2873 unsigned Opcode = RecurrenceDescriptor::getOpcode(
2874 cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
2875 auto *ExtR = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
2876 return isa<VPPartialReductionRecipe>(ExpressionRecipes.back())
2877 ? Ctx.TTI.getPartialReductionCost(
2878 Opcode, Ctx.Types.inferScalarType(getOperand(0)), nullptr,
2879 RedTy, VF,
2881 ExtR->getOpcode()),
2882 TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind)
2883 : Ctx.TTI.getExtendedReductionCost(
2884 Opcode, ExtR->getOpcode() == Instruction::ZExt, RedTy,
2885 SrcVecTy, std::nullopt, Ctx.CostKind);
2886 }
2887 case ExpressionTypes::MulAccReduction:
2888 return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
2889 Ctx.CostKind);
2890
2891 case ExpressionTypes::ExtNegatedMulAccReduction:
2892 assert(Opcode == Instruction::Add && "Unexpected opcode");
2893 Opcode = Instruction::Sub;
2894 [[fallthrough]];
2895 case ExpressionTypes::ExtMulAccReduction: {
2896 if (isa<VPPartialReductionRecipe>(ExpressionRecipes.back())) {
2897 auto *Ext0R = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
2898 auto *Ext1R = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
2899 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
2900 return Ctx.TTI.getPartialReductionCost(
2901 Opcode, Ctx.Types.inferScalarType(getOperand(0)),
2902 Ctx.Types.inferScalarType(getOperand(1)), RedTy, VF,
2904 Ext0R->getOpcode()),
2906 Ext1R->getOpcode()),
2907 Mul->getOpcode(), Ctx.CostKind);
2908 }
2909 return Ctx.TTI.getMulAccReductionCost(
2910 cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
2911 Instruction::ZExt,
2912 Opcode, RedTy, SrcVecTy, Ctx.CostKind);
2913 }
2914 }
2915 llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
2916}
2917
2919 return any_of(ExpressionRecipes, [](VPSingleDefRecipe *R) {
2920 return R->mayReadFromMemory() || R->mayWriteToMemory();
2921 });
2922}
2923
2925 assert(
2926 none_of(ExpressionRecipes,
2927 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
2928 "expression cannot contain recipes with side-effects");
2929 return false;
2930}
2931
2933 // Cannot use vputils::isSingleScalar(), because all external operands
2934 // of the expression will be live-ins while bundled.
2935 return isa<VPReductionRecipe>(ExpressionRecipes.back()) &&
2936 !isa<VPPartialReductionRecipe>(ExpressionRecipes.back());
2937}
2938
2939#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2940
2942 VPSlotTracker &SlotTracker) const {
2943 O << Indent << "EXPRESSION ";
2945 O << " = ";
2946 auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
2947 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
2948 bool IsPartialReduction = isa<VPPartialReductionRecipe>(Red);
2949
2950 switch (ExpressionType) {
2951 case ExpressionTypes::ExtendedReduction: {
2953 O << " + " << (IsPartialReduction ? "partial." : "") << "reduce.";
2954 O << Instruction::getOpcodeName(Opcode) << " (";
2956 Red->printFlags(O);
2957
2958 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
2959 O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
2960 << *Ext0->getResultType();
2961 if (Red->isConditional()) {
2962 O << ", ";
2963 Red->getCondOp()->printAsOperand(O, SlotTracker);
2964 }
2965 O << ")";
2966 break;
2967 }
2968 case ExpressionTypes::ExtNegatedMulAccReduction: {
2970 O << " + " << (IsPartialReduction ? "partial." : "") << "reduce.";
2972 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
2973 << " (sub (0, mul";
2974 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
2975 Mul->printFlags(O);
2976 O << "(";
2978 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
2979 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
2980 << *Ext0->getResultType() << "), (";
2982 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
2983 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
2984 << *Ext1->getResultType() << ")";
2985 if (Red->isConditional()) {
2986 O << ", ";
2987 Red->getCondOp()->printAsOperand(O, SlotTracker);
2988 }
2989 O << "))";
2990 break;
2991 }
2992 case ExpressionTypes::MulAccReduction:
2993 case ExpressionTypes::ExtMulAccReduction: {
2995 O << " + " << (IsPartialReduction ? "partial." : "") << "reduce.";
2997 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
2998 << " (";
2999 O << "mul";
3000 bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
3001 auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
3002 : ExpressionRecipes[0]);
3003 Mul->printFlags(O);
3004 if (IsExtended)
3005 O << "(";
3007 if (IsExtended) {
3008 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3009 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3010 << *Ext0->getResultType() << "), (";
3011 } else {
3012 O << ", ";
3013 }
3015 if (IsExtended) {
3016 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3017 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3018 << *Ext1->getResultType() << ")";
3019 }
3020 if (Red->isConditional()) {
3021 O << ", ";
3022 Red->getCondOp()->printAsOperand(O, SlotTracker);
3023 }
3024 O << ")";
3025 break;
3026 }
3027 }
3028}
3029
3031 VPSlotTracker &SlotTracker) const {
3032 O << Indent << "REDUCE ";
3034 O << " = ";
3036 O << " +";
3037 printFlags(O);
3038 O << " reduce."
3041 << " (";
3043 if (isConditional()) {
3044 O << ", ";
3046 }
3047 O << ")";
3048}
3049
3051 VPSlotTracker &SlotTracker) const {
3052 O << Indent << "REDUCE ";
3054 O << " = ";
3056 O << " +";
3057 printFlags(O);
3058 O << " vp.reduce."
3061 << " (";
3063 O << ", ";
3065 if (isConditional()) {
3066 O << ", ";
3068 }
3069 O << ")";
3070}
3071
3072#endif
3073
3074/// A helper function to scalarize a single Instruction in the innermost loop.
3075/// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue
3076/// operands from \p RepRecipe instead of \p Instr's operands.
3077static void scalarizeInstruction(const Instruction *Instr,
3078 VPReplicateRecipe *RepRecipe,
3079 const VPLane &Lane, VPTransformState &State) {
3080 assert((!Instr->getType()->isAggregateType() ||
3081 canVectorizeTy(Instr->getType())) &&
3082 "Expected vectorizable or non-aggregate type.");
3083
3084 // Does this instruction return a value?
3085 bool IsVoidRetTy = Instr->getType()->isVoidTy();
3086
3087 Instruction *Cloned = Instr->clone();
3088 if (!IsVoidRetTy) {
3089 Cloned->setName(Instr->getName() + ".cloned");
3090 Type *ResultTy = State.TypeAnalysis.inferScalarType(RepRecipe);
3091 // The operands of the replicate recipe may have been narrowed, resulting in
3092 // a narrower result type. Update the type of the cloned instruction to the
3093 // correct type.
3094 if (ResultTy != Cloned->getType())
3095 Cloned->mutateType(ResultTy);
3096 }
3097
3098 RepRecipe->applyFlags(*Cloned);
3099 RepRecipe->applyMetadata(*Cloned);
3100
3101 if (RepRecipe->hasPredicate())
3102 cast<CmpInst>(Cloned)->setPredicate(RepRecipe->getPredicate());
3103
3104 if (auto DL = RepRecipe->getDebugLoc())
3105 State.setDebugLocFrom(DL);
3106
3107 // Replace the operands of the cloned instructions with their scalar
3108 // equivalents in the new loop.
3109 for (const auto &I : enumerate(RepRecipe->operands())) {
3110 auto InputLane = Lane;
3111 VPValue *Operand = I.value();
3112 if (vputils::isSingleScalar(Operand))
3113 InputLane = VPLane::getFirstLane();
3114 Cloned->setOperand(I.index(), State.get(Operand, InputLane));
3115 }
3116
3117 // Place the cloned scalar in the new loop.
3118 State.Builder.Insert(Cloned);
3119
3120 State.set(RepRecipe, Cloned, Lane);
3121
3122 // If we just cloned a new assumption, add it the assumption cache.
3123 if (auto *II = dyn_cast<AssumeInst>(Cloned))
3124 State.AC->registerAssumption(II);
3125
3126 assert(
3127 (RepRecipe->getRegion() ||
3128 !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
3129 all_of(RepRecipe->operands(),
3130 [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
3131 "Expected a recipe is either within a region or all of its operands "
3132 "are defined outside the vectorized region.");
3133}
3134
3137
3138 if (!State.Lane) {
3139 assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
3140 "must have already been unrolled");
3141 scalarizeInstruction(UI, this, VPLane(0), State);
3142 return;
3143 }
3144
3145 assert((State.VF.isScalar() || !isSingleScalar()) &&
3146 "uniform recipe shouldn't be predicated");
3147 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
3148 scalarizeInstruction(UI, this, *State.Lane, State);
3149 // Insert scalar instance packing it into a vector.
3150 if (State.VF.isVector() && shouldPack()) {
3151 Value *WideValue =
3152 State.Lane->isFirstLane()
3153 ? PoisonValue::get(toVectorizedTy(UI->getType(), State.VF))
3154 : State.get(this);
3155 State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
3156 *State.Lane));
3157 }
3158}
3159
3161 // Find if the recipe is used by a widened recipe via an intervening
3162 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
3163 return any_of(users(), [](const VPUser *U) {
3164 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
3165 return !vputils::onlyScalarValuesUsed(PredR);
3166 return false;
3167 });
3168}
3169
3170/// Returns true if \p Ptr is a pointer computation for which the legacy cost
3171/// model computes a SCEV expression when computing the address cost.
3173 auto *PtrR = Ptr->getDefiningRecipe();
3174 if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
3176 Instruction::GetElementPtr) ||
3177 isa<VPWidenGEPRecipe>(PtrR) ||
3179 return false;
3180
3181 // We are looking for a GEP where all indices are either loop invariant or
3182 // inductions.
3183 for (VPValue *Opd : drop_begin(PtrR->operands())) {
3184 if (!Opd->isDefinedOutsideLoopRegions() &&
3186 return false;
3187 }
3188
3189 return true;
3190}
3191
3192/// Returns true if \p V is used as part of the address of another load or
3193/// store.
3194static bool isUsedByLoadStoreAddress(const VPUser *V) {
3196 SmallVector<const VPUser *> WorkList = {V};
3197
3198 while (!WorkList.empty()) {
3199 auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
3200 if (!Cur || !Seen.insert(Cur).second)
3201 continue;
3202
3203 auto *Blend = dyn_cast<VPBlendRecipe>(Cur);
3204 // Skip blends that use V only through a compare by checking if any incoming
3205 // value was already visited.
3206 if (Blend && none_of(seq<unsigned>(0, Blend->getNumIncomingValues()),
3207 [&](unsigned I) {
3208 return Seen.contains(
3209 Blend->getIncomingValue(I)->getDefiningRecipe());
3210 }))
3211 continue;
3212
3213 for (VPUser *U : Cur->users()) {
3214 if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
3215 if (InterleaveR->getAddr() == Cur)
3216 return true;
3217 if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
3218 if (RepR->getOpcode() == Instruction::Load &&
3219 RepR->getOperand(0) == Cur)
3220 return true;
3221 if (RepR->getOpcode() == Instruction::Store &&
3222 RepR->getOperand(1) == Cur)
3223 return true;
3224 }
3225 if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
3226 if (MemR->getAddr() == Cur && MemR->isConsecutive())
3227 return true;
3228 }
3229 }
3230
3231 // The legacy cost model only supports scalarizing loads/stores with phi
3232 // addresses if the phi is directly used as the load/store address. Don't
3233 // traverse further for blends.
3234 if (Blend)
3235 continue;
3236
3237 append_range(WorkList, Cur->users());
3238 }
3239 return false;
3240}
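// For instance, a scalarized load whose result feeds (possibly via GEPs) the
// address operand of another load or store is considered address-forming
// here, which lets the replication cost code below skip assigning
// scalarization overhead for it, matching the legacy cost model.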
3241
3243 VPCostContext &Ctx) const {
3245 // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
3246 // transform; avoid computing its cost multiple times for now.
3247 Ctx.SkipCostComputation.insert(UI);
3248
3249 if (VF.isScalable() && !isSingleScalar())
3251
3252 switch (UI->getOpcode()) {
3253 case Instruction::GetElementPtr:
3254 // We mark this instruction as zero-cost because the cost of GEPs in
3255 // vectorized code depends on whether the corresponding memory instruction
3256 // is scalarized or not. Therefore, we handle GEPs with the memory
3257 // instruction cost.
3258 return 0;
3259 case Instruction::Call: {
3260 auto *CalledFn =
3262
3265 for (const VPValue *ArgOp : ArgOps)
3266 Tys.push_back(Ctx.Types.inferScalarType(ArgOp));
3267
3268 if (CalledFn->isIntrinsic())
3269 // Various pseudo-intrinsics with costs of 0 are scalarized instead of
3270 // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early.
3271 switch (CalledFn->getIntrinsicID()) {
3272 case Intrinsic::assume:
3273 case Intrinsic::lifetime_end:
3274 case Intrinsic::lifetime_start:
3275 case Intrinsic::sideeffect:
3276 case Intrinsic::pseudoprobe:
3277 case Intrinsic::experimental_noalias_scope_decl: {
3278 assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
3279 ElementCount::getFixed(1), Ctx) == 0 &&
3280 "scalarizing intrinsic should be free");
3281 return InstructionCost(0);
3282 }
3283 default:
3284 break;
3285 }
3286
3287 Type *ResultTy = Ctx.Types.inferScalarType(this);
3288 InstructionCost ScalarCallCost =
3289 Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
3290 if (isSingleScalar()) {
3291 if (CalledFn->isIntrinsic())
3292 ScalarCallCost = std::min(
3293 ScalarCallCost,
3294 getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
3295 ElementCount::getFixed(1), Ctx));
3296 return ScalarCallCost;
3297 }
3298
3299 return ScalarCallCost * VF.getFixedValue() +
3300 Ctx.getScalarizationOverhead(ResultTy, ArgOps, VF);
3301 }
3302 case Instruction::Add:
3303 case Instruction::Sub:
3304 case Instruction::FAdd:
3305 case Instruction::FSub:
3306 case Instruction::Mul:
3307 case Instruction::FMul:
3308 case Instruction::FDiv:
3309 case Instruction::FRem:
3310 case Instruction::Shl:
3311 case Instruction::LShr:
3312 case Instruction::AShr:
3313 case Instruction::And:
3314 case Instruction::Or:
3315 case Instruction::Xor:
3316 case Instruction::ICmp:
3317 case Instruction::FCmp:
3319 Ctx) *
3320 (isSingleScalar() ? 1 : VF.getFixedValue());
3321 case Instruction::SDiv:
3322 case Instruction::UDiv:
3323 case Instruction::SRem:
3324 case Instruction::URem: {
3325 InstructionCost ScalarCost =
3327 if (isSingleScalar())
3328 return ScalarCost;
3329
3330 ScalarCost = ScalarCost * VF.getFixedValue() +
3331 Ctx.getScalarizationOverhead(Ctx.Types.inferScalarType(this),
3332 to_vector(operands()), VF);
3333 // If the recipe is not predicated (i.e. not in a replicate region), return
3334 // the scalar cost. Otherwise handle predicated cost.
3335 if (!getRegion()->isReplicator())
3336 return ScalarCost;
3337
3338 // Account for the phi nodes that we will create.
3339 ScalarCost += VF.getFixedValue() *
3340 Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
3341 // Scale the cost by the probability of executing the predicated blocks.
3342 // This assumes the predicated block for each vector lane is equally
3343 // likely.
3344 ScalarCost /= getPredBlockCostDivisor(Ctx.CostKind);
3345 return ScalarCost;
3346 }
3347 case Instruction::Load:
3348 case Instruction::Store: {
3349 // TODO: See getMemInstScalarizationCost for how to handle replicating and
3350 // predicated cases.
3351 const VPRegionBlock *ParentRegion = getRegion();
3352 if (ParentRegion && ParentRegion->isReplicator())
3353 break;
3354
3355 bool IsLoad = UI->getOpcode() == Instruction::Load;
3356 const VPValue *PtrOp = getOperand(!IsLoad);
3357 // TODO: Handle cases where we need to pass a SCEV to
3358 // getAddressComputationCost.
3359 if (shouldUseAddressAccessSCEV(PtrOp))
3360 break;
3361
3362 Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
3363 Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
3364 const Align Alignment = getLoadStoreAlignment(UI);
3365 unsigned AS = cast<PointerType>(ScalarPtrTy)->getAddressSpace();
3367 InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
3368 UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
3369
3370 Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
3371 bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
3372 bool UsedByLoadStoreAddress =
3373 !PreferVectorizedAddressing && isUsedByLoadStoreAddress(this);
3374 InstructionCost ScalarCost =
3375 ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
3376 PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE,
3377 nullptr, Ctx.CostKind);
3378 if (isSingleScalar())
3379 return ScalarCost;
3380
3381 SmallVector<const VPValue *> OpsToScalarize;
3382 Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
3383 // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
3384 // don't assign scalarization overhead in general, if the target prefers
3385 // vectorized addressing or the loaded value is used as part of an address
3386 // of another load or store.
3387 if (!UsedByLoadStoreAddress) {
3388 bool EfficientVectorLoadStore =
3389 Ctx.TTI.supportsEfficientVectorElementLoadStore();
3390 if (!(IsLoad && !PreferVectorizedAddressing) &&
3391 !(!IsLoad && EfficientVectorLoadStore))
3392 append_range(OpsToScalarize, operands());
3393
3394 if (!EfficientVectorLoadStore)
3395 ResultTy = Ctx.Types.inferScalarType(this);
3396 }
3397
3398 return (ScalarCost * VF.getFixedValue()) +
3399 Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
3400 }
3401 }
3402
3403 return Ctx.getLegacyCost(UI, VF);
3404}
3405
3406#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3408 VPSlotTracker &SlotTracker) const {
3409 O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");
3410
3411 if (!getUnderlyingInstr()->getType()->isVoidTy()) {
3413 O << " = ";
3414 }
3415 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
3416 O << "call";
3417 printFlags(O);
3418 O << "@" << CB->getCalledFunction()->getName() << "(";
3420 O, [&O, &SlotTracker](VPValue *Op) {
3421 Op->printAsOperand(O, SlotTracker);
3422 });
3423 O << ")";
3424 } else {
3426 printFlags(O);
3428 }
3429
3430 if (shouldPack())
3431 O << " (S->V)";
3432}
3433#endif
3434
3436 assert(State.Lane && "Branch on Mask works only on single instance.");
3437
3438 VPValue *BlockInMask = getOperand(0);
3439 Value *ConditionBit = State.get(BlockInMask, *State.Lane);
3440
3441 // Replace the temporary unreachable terminator with a new conditional branch,
3442 // whose two destinations will be set later when they are created.
3443 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
3444 assert(isa<UnreachableInst>(CurrentTerminator) &&
3445 "Expected to replace unreachable terminator with conditional branch.");
3446 auto CondBr =
3447 State.Builder.CreateCondBr(ConditionBit, State.CFG.PrevBB, nullptr);
3448 CondBr->setSuccessor(0, nullptr);
3449 CurrentTerminator->eraseFromParent();
3450}
3451
3453 VPCostContext &Ctx) const {
3454 // The legacy cost model doesn't assign costs to branches for individual
3455 // replicate regions. Match the current behavior in the VPlan cost model for
3456 // now.
3457 return 0;
3458}
3459
3461 assert(State.Lane && "Predicated instruction PHI works per instance.");
3462 Instruction *ScalarPredInst =
3463 cast<Instruction>(State.get(getOperand(0), *State.Lane));
3464 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
3465 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
3466 assert(PredicatingBB && "Predicated block has no single predecessor.");
3468 "operand must be VPReplicateRecipe");
3469
3470 // By current pack/unpack logic we need to generate only a single phi node: if
3471 // a vector value for the predicated instruction exists at this point it means
3472 // the instruction has vector users only, and a phi for the vector value is
3473 // needed. In this case the recipe of the predicated instruction is marked to
3474 // also do that packing, thereby "hoisting" the insert-element sequence.
3475 // Otherwise, a phi node for the scalar value is needed.
3476 if (State.hasVectorValue(getOperand(0))) {
3477 auto *VecI = cast<Instruction>(State.get(getOperand(0)));
3479 "Packed operands must generate an insertelement or insertvalue");
3480
3481 // If VectorI is a struct, it will be a sequence like:
3482 // %1 = insertvalue %unmodified, %x, 0
3483 // %2 = insertvalue %1, %y, 1
3484 // %VectorI = insertvalue %2, %z, 2
3485 // To get the unmodified vector we need to look through the chain.
3486 if (auto *StructTy = dyn_cast<StructType>(VecI->getType()))
3487 for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++)
3488 VecI = cast<InsertValueInst>(VecI->getOperand(0));
3489
3490 PHINode *VPhi = State.Builder.CreatePHI(VecI->getType(), 2);
3491 VPhi->addIncoming(VecI->getOperand(0), PredicatingBB); // Unmodified vector.
3492 VPhi->addIncoming(VecI, PredicatedBB); // New vector with inserted element.
3493 if (State.hasVectorValue(this))
3494 State.reset(this, VPhi);
3495 else
3496 State.set(this, VPhi);
3497 // NOTE: Currently we need to update the value of the operand, so the next
3498 // predicated iteration inserts its generated value in the correct vector.
3499 State.reset(getOperand(0), VPhi);
3500 } else {
3501 if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane())
3502 return;
3503
3504 Type *PredInstType = State.TypeAnalysis.inferScalarType(getOperand(0));
3505 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
3506 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
3507 PredicatingBB);
3508 Phi->addIncoming(ScalarPredInst, PredicatedBB);
3509 if (State.hasScalarValue(this, *State.Lane))
3510 State.reset(this, Phi, *State.Lane);
3511 else
3512 State.set(this, Phi, *State.Lane);
3513 // NOTE: Currently we need to update the value of the operand, so the next
3514 // predicated iteration inserts its generated value in the correct vector.
3515 State.reset(getOperand(0), Phi, *State.Lane);
3516 }
3517}
3518
3519#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3521 VPSlotTracker &SlotTracker) const {
3522 O << Indent << "PHI-PREDICATED-INSTRUCTION ";
3524 O << " = ";
3526}
3527#endif
3528
3530 VPCostContext &Ctx) const {
3532 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
3533 ->getAddressSpace();
3534 unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
3535 ? Instruction::Load
3536 : Instruction::Store;
3537
3538 if (!Consecutive) {
3539 // TODO: Using the original IR may not be accurate.
3540 // Currently, ARM uses the underlying IR to calculate the gather/scatter
3541 // instruction cost.
3542 assert(!Reverse &&
3543 "Inconsecutive memory access should not have the order.");
3544
3546 Type *PtrTy = Ptr->getType();
3547
3548 // If the address value is uniform across all lanes, then the address can be
3549 // calculated with scalar type and broadcast.
3551 PtrTy = toVectorTy(PtrTy, VF);
3552
3553 return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
3554 Ctx.CostKind) +
3555 Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment,
3556 Ctx.CostKind, &Ingredient);
3557 }
3558
3560 if (IsMasked) {
3561 Cost +=
3562 Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind);
3563 } else {
3564 TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
3566 : getOperand(1));
3567 Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
3568 OpInfo, &Ingredient);
3569 }
3570 if (!Reverse)
3571 return Cost;
3572
3573 return Cost += Ctx.TTI.getShuffleCost(
3575 cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
3576}
3577
3579 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
3580 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
3581 bool CreateGather = !isConsecutive();
3582
3583 auto &Builder = State.Builder;
3584 Value *Mask = nullptr;
3585 if (auto *VPMask = getMask()) {
3586 // Mask reversal is only needed for non-all-one (null) masks, as reverse
3587 // of a null all-one mask is a null mask.
3588 Mask = State.get(VPMask);
3589 if (isReverse())
3590 Mask = Builder.CreateVectorReverse(Mask, "reverse");
3591 }
3592
3593 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
3594 Value *NewLI;
3595 if (CreateGather) {
3596 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
3597 "wide.masked.gather");
3598 } else if (Mask) {
3599 NewLI =
3600 Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
3601 PoisonValue::get(DataTy), "wide.masked.load");
3602 } else {
3603 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
3604 }
3606 if (Reverse)
3607 NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
3608 State.set(this, NewLI);
3609}
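// Sketch of the consecutive, masked case above (assumed names, VF and
// alignment):
//   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(
//       ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> poison)
// Non-consecutive accesses use @llvm.masked.gather instead, and reverse
// accesses are followed by a vector reverse of the loaded value.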
3610
3611#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3613 VPSlotTracker &SlotTracker) const {
3614 O << Indent << "WIDEN ";
3616 O << " = load ";
3618}
3619#endif
3620
3621 /// Use an all-true mask for the reverse rather than the actual mask, as it
3622 /// avoids a dependence without affecting the result.
3624 Value *EVL, const Twine &Name) {
3625 VectorType *ValTy = cast<VectorType>(Operand->getType());
3626 Value *AllTrueMask =
3627 Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
3628 return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
3629 {Operand, AllTrueMask, EVL}, nullptr, Name);
3630}
3631
3633 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
3634 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
3635 bool CreateGather = !isConsecutive();
3636
3637 auto &Builder = State.Builder;
3638 CallInst *NewLI;
3639 Value *EVL = State.get(getEVL(), VPLane(0));
3640 Value *Addr = State.get(getAddr(), !CreateGather);
3641 Value *Mask = nullptr;
3642 if (VPValue *VPMask = getMask()) {
3643 Mask = State.get(VPMask);
3644 if (isReverse())
3645 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
3646 } else {
3647 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
3648 }
3649
3650 if (CreateGather) {
3651 NewLI =
3652 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
3653 nullptr, "wide.masked.gather");
3654 } else {
3655 NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
3656 {Addr, Mask, EVL}, nullptr, "vp.op.load");
3657 }
3658 NewLI->addParamAttr(
3660 applyMetadata(*NewLI);
3661 Instruction *Res = NewLI;
3662 if (isReverse())
3663 Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
3664 State.set(this, Res);
3665}
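// Illustrative IR for the consecutive, non-reversed case, assuming a
// <vscale x 4 x i32> type and an alignment of 4:
//   %vp.op.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(
//                     ptr align 4 %addr, <vscale x 4 x i1> %mask, i32 %evl)
// A reversed access additionally wraps the mask and the loaded value in
// @llvm.experimental.vp.reverse calls, as sketched for createReverseEVL above.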
3666
3668 VPCostContext &Ctx) const {
3669 if (!Consecutive || IsMasked)
3670 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
3671
3672 // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
3673 // because EVL recipes use the EVL to replace the tail mask, whereas the
3674 // legacy model always accounts for the cost of the mask.
3675 // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
3676 // longer need to compare against the legacy cost model.
3678 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
3679 ->getAddressSpace();
3680 InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
3681 Instruction::Load, Ty, Alignment, AS, Ctx.CostKind);
3682 if (!Reverse)
3683 return Cost;
3684
3685 return Cost + Ctx.TTI.getShuffleCost(
3687 cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
3688}
3689
3690#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3692 VPSlotTracker &SlotTracker) const {
3693 O << Indent << "WIDEN ";
3695 O << " = vp.load ";
3697}
3698#endif
3699
3701 VPValue *StoredVPValue = getStoredValue();
3702 bool CreateScatter = !isConsecutive();
3703
3704 auto &Builder = State.Builder;
3705
3706 Value *Mask = nullptr;
3707 if (auto *VPMask = getMask()) {
3709 // Mask reversal is only needed when a mask is present: a null mask stands
3710 // for an all-ones mask, and the reverse of an all-ones mask is unchanged.
3710 Mask = State.get(VPMask);
3711 if (isReverse())
3712 Mask = Builder.CreateVectorReverse(Mask, "reverse");
3713 }
3714
3715 Value *StoredVal = State.get(StoredVPValue);
3716 if (isReverse()) {
3717 // If we store to reverse consecutive memory locations, then we need
3718 // to reverse the order of elements in the stored value.
3719 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
3720 // We don't want to update the value in the map as it might be used in
3721 // another expression. So don't call resetVectorValue(StoredVal).
3722 }
3723 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
3724 Instruction *NewSI = nullptr;
3725 if (CreateScatter)
3726 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
3727 else if (Mask)
3728 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
3729 else
3730 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
3731 applyMetadata(*NewSI);
3732}
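// Illustrative IR for the three cases above, assuming a <4 x i32> stored value
// and an alignment of 4:
//   scatter  : call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs,
//                                                        i32 4, <4 x i1> %mask)
//   masked   : call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr,
//                                                    i32 4, <4 x i1> %mask)
//   unmasked : store <4 x i32> %val, ptr %addr, align 4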
3733
3734#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3736 VPSlotTracker &SlotTracker) const {
3737 O << Indent << "WIDEN store ";
3739}
3740#endif
3741
3743 VPValue *StoredValue = getStoredValue();
3744 bool CreateScatter = !isConsecutive();
3745
3746 auto &Builder = State.Builder;
3747
3748 CallInst *NewSI = nullptr;
3749 Value *StoredVal = State.get(StoredValue);
3750 Value *EVL = State.get(getEVL(), VPLane(0));
3751 if (isReverse())
3752 StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
3753 Value *Mask = nullptr;
3754 if (VPValue *VPMask = getMask()) {
3755 Mask = State.get(VPMask);
3756 if (isReverse())
3757 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
3758 } else {
3759 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
3760 }
3761 Value *Addr = State.get(getAddr(), !CreateScatter);
3762 if (CreateScatter) {
3763 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
3764 Intrinsic::vp_scatter,
3765 {StoredVal, Addr, Mask, EVL});
3766 } else {
3767 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
3768 Intrinsic::vp_store,
3769 {StoredVal, Addr, Mask, EVL});
3770 }
3771 NewSI->addParamAttr(
3773 applyMetadata(*NewSI);
3774}
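// Illustrative IR for the consecutive case, assuming a <vscale x 4 x i32>
// stored value and an alignment of 4:
//   call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %val, ptr align 4 %addr,
//                                       <vscale x 4 x i1> %mask, i32 %evl)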
3775
3777 VPCostContext &Ctx) const {
3778 if (!Consecutive || IsMasked)
3779 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
3780
3781 // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
3782 // because EVL recipes use the EVL to replace the tail mask, whereas the
3783 // legacy model always accounts for the cost of the mask.
3784 // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
3785 // longer need to compare against the legacy cost model.
3787 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
3788 ->getAddressSpace();
3789 InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
3790 Instruction::Store, Ty, Alignment, AS, Ctx.CostKind);
3791 if (!Reverse)
3792 return Cost;
3793
3794 return Cost + Ctx.TTI.getShuffleCost(
3796 cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
3797}
3798
3799#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3801 VPSlotTracker &SlotTracker) const {
3802 O << Indent << "WIDEN vp.store ";
3804}
3805#endif
3806
3808 VectorType *DstVTy, const DataLayout &DL) {
3809 // Verify that V is a vector type with the same number of elements as DstVTy.
3810 auto VF = DstVTy->getElementCount();
3811 auto *SrcVecTy = cast<VectorType>(V->getType());
3812 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
3813 Type *SrcElemTy = SrcVecTy->getElementType();
3814 Type *DstElemTy = DstVTy->getElementType();
3815 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3816 "Vector elements must have same size");
3817
3818 // Do a direct cast if element types are castable.
3819 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3820 return Builder.CreateBitOrPointerCast(V, DstVTy);
3821 }
3822 // V cannot be cast directly to the desired vector type. This may happen
3823 // when V is a floating-point vector but DstVTy is a vector of pointers, or
3824 // vice versa. Handle this with a two-step cast through an intermediate
3825 // integer type, i.e. Ptr <-> Int <-> Float.
3826 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3827 "Only one type should be a pointer type");
3828 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3829 "Only one type should be a floating point type");
3830 Type *IntTy =
3831 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3832 auto *VecIntTy = VectorType::get(IntTy, VF);
3833 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3834 return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
3835}
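// Example of the two-step case, assuming 64-bit pointers: casting a
// <4 x double> to a <4 x ptr> goes through <4 x i64>:
//   %v.int = bitcast <4 x double> %v to <4 x i64>
//   %v.ptr = inttoptr <4 x i64> %v.int to <4 x ptr>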
3836
3837/// Return a vector containing interleaved elements from multiple
3838/// smaller input vectors.
3840 const Twine &Name) {
3841 unsigned Factor = Vals.size();
3842 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
3843
3844 VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
3845#ifndef NDEBUG
3846 for (Value *Val : Vals)
3847 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
3848#endif
3849
3850 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
3851 // must use intrinsics to interleave.
3852 if (VecTy->isScalableTy()) {
3853 assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
3854 return Builder.CreateVectorInterleave(Vals, Name);
3855 }
3856
3857 // Fixed length. Start by concatenating all vectors into a wide vector.
3858 Value *WideVec = concatenateVectors(Builder, Vals);
3859
3860 // Interleave the elements into the wide vector.
3861 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
3862 return Builder.CreateShuffleVector(
3863 WideVec, createInterleaveMask(NumElts, Factor), Name);
3864}
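// For example, interleaving two fixed <4 x i32> inputs produces (illustrative):
//   %wide = shufflevector <4 x i32> %a, <4 x i32> %b,
//                         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
//   %ilv  = shufflevector <8 x i32> %wide, <8 x i32> poison,
//                         <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
// whereas two <vscale x 4 x i32> inputs use
//   @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b).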
3865
3866// Try to vectorize the interleave group that \p Instr belongs to.
3867//
3868// E.g. Translate the following interleaved load group (factor = 3):
3869// for (i = 0; i < N; i+=3) {
3870// R = Pic[i]; // Member of index 0
3871// G = Pic[i+1]; // Member of index 1
3872// B = Pic[i+2]; // Member of index 2
3873// ... // do something to R, G, B
3874// }
3875// To:
3876// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
3877// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
3878// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
3879// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
3880//
3881// Or translate the following interleaved store group (factor = 3):
3882// for (i = 0; i < N; i+=3) {
3883// ... do something to R, G, B
3884// Pic[i] = R; // Member of index 0
3885// Pic[i+1] = G; // Member of index 1
3886// Pic[i+2] = B; // Member of index 2
3887// }
3888// To:
3889// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
3890// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
3891// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
3892// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
3893// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
3895 assert(!State.Lane && "Interleave group being replicated.");
3896 assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
3897 "Masking gaps for scalable vectors is not yet supported.");
3899 Instruction *Instr = Group->getInsertPos();
3900
3901 // Prepare for the vector type of the interleaved load/store.
3902 Type *ScalarTy = getLoadStoreType(Instr);
3903 unsigned InterleaveFactor = Group->getFactor();
3904 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
3905
3906 VPValue *BlockInMask = getMask();
3907 VPValue *Addr = getAddr();
3908 Value *ResAddr = State.get(Addr, VPLane(0));
3909
3910 auto CreateGroupMask = [&BlockInMask, &State,
3911 &InterleaveFactor](Value *MaskForGaps) -> Value * {
3912 if (State.VF.isScalable()) {
3913 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
3914 assert(InterleaveFactor <= 8 &&
3915 "Unsupported deinterleave factor for scalable vectors");
3916 auto *ResBlockInMask = State.get(BlockInMask);
3917 SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
3918 return interleaveVectors(State.Builder, Ops, "interleaved.mask");
3919 }
3920
3921 if (!BlockInMask)
3922 return MaskForGaps;
3923
3924 Value *ResBlockInMask = State.get(BlockInMask);
3925 Value *ShuffledMask = State.Builder.CreateShuffleVector(
3926 ResBlockInMask,
3927 createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
3928 "interleaved.mask");
3929 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
3930 ShuffledMask, MaskForGaps)
3931 : ShuffledMask;
3932 };
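  // Illustrative shape of the fixed-width path above, assuming VF = 4 and an
  // interleave factor of 3: createReplicatedMask yields
  // <0,0,0,1,1,1,2,2,2,3,3,3>, i.e. each lane of the block mask is repeated
  // once per group member before being AND'ed with any gap mask.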
3933
3934 const DataLayout &DL = Instr->getDataLayout();
3935 // Vectorize the interleaved load group.
3936 if (isa<LoadInst>(Instr)) {
3937 Value *MaskForGaps = nullptr;
3938 if (needsMaskForGaps()) {
3939 MaskForGaps =
3940 createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
3941 assert(MaskForGaps && "Mask for Gaps is required but it is null");
3942 }
3943
3944 Instruction *NewLoad;
3945 if (BlockInMask || MaskForGaps) {
3946 Value *GroupMask = CreateGroupMask(MaskForGaps);
3947 Value *PoisonVec = PoisonValue::get(VecTy);
3948 NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
3949 Group->getAlign(), GroupMask,
3950 PoisonVec, "wide.masked.vec");
3951 } else
3952 NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
3953 Group->getAlign(), "wide.vec");
3954 applyMetadata(*NewLoad);
3955 // TODO: Also manage existing metadata using VPIRMetadata.
3956 Group->addMetadata(NewLoad);
3957
3959 if (VecTy->isScalableTy()) {
3960 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
3961 // so must use intrinsics to deinterleave.
3962 assert(InterleaveFactor <= 8 &&
3963 "Unsupported deinterleave factor for scalable vectors");
3964 NewLoad = State.Builder.CreateIntrinsic(
3965 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
3966 NewLoad->getType(), NewLoad,
3967 /*FMFSource=*/nullptr, "strided.vec");
3968 }
3969
3970 auto CreateStridedVector = [&InterleaveFactor, &State,
3971 &NewLoad](unsigned Index) -> Value * {
3972 assert(Index < InterleaveFactor && "Illegal group index");
3973 if (State.VF.isScalable())
3974 return State.Builder.CreateExtractValue(NewLoad, Index);
3975
3976 // For fixed length VF, use shuffle to extract the sub-vectors from the
3977 // wide load.
3978 auto StrideMask =
3979 createStrideMask(Index, InterleaveFactor, State.VF.getFixedValue());
3980 return State.Builder.CreateShuffleVector(NewLoad, StrideMask,
3981 "strided.vec");
3982 };
3983
3984 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
3985 Instruction *Member = Group->getMember(I);
3986
3987 // Skip the gaps in the group.
3988 if (!Member)
3989 continue;
3990
3991 Value *StridedVec = CreateStridedVector(I);
3992
3994 // If this member has a different type, cast the result to that type.
3994 if (Member->getType() != ScalarTy) {
3995 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
3996 StridedVec =
3997 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
3998 }
3999
4000 if (Group->isReverse())
4001 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
4002
4003 State.set(VPDefs[J], StridedVec);
4004 ++J;
4005 }
4006 return;
4007 }
4008
4009 // The sub-vector type for the current instruction.
4010 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4011
4012 // Vectorize the interleaved store group.
4013 Value *MaskForGaps =
4014 createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
4015 assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
4016 "Mismatch between NeedsMaskForGaps and MaskForGaps");
4017 ArrayRef<VPValue *> StoredValues = getStoredValues();
4018 // Collect the stored vector from each member.
4019 SmallVector<Value *, 4> StoredVecs;
4020 unsigned StoredIdx = 0;
4021 for (unsigned i = 0; i < InterleaveFactor; i++) {
4022 assert((Group->getMember(i) || MaskForGaps) &&
4023 "Failed to get a member from an interleaved store group");
4024 Instruction *Member = Group->getMember(i);
4025
4026 // Skip the gaps in the group.
4027 if (!Member) {
4028 Value *Undef = PoisonValue::get(SubVT);
4029 StoredVecs.push_back(Undef);
4030 continue;
4031 }
4032
4033 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4034 ++StoredIdx;
4035
4036 if (Group->isReverse())
4037 StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
4038
4039 // If this member has a different type, cast it to the unified type.
4040
4041 if (StoredVec->getType() != SubVT)
4042 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4043
4044 StoredVecs.push_back(StoredVec);
4045 }
4046
4047 // Interleave all the smaller vectors into one wider vector.
4048 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4049 Instruction *NewStoreInstr;
4050 if (BlockInMask || MaskForGaps) {
4051 Value *GroupMask = CreateGroupMask(MaskForGaps);
4052 NewStoreInstr = State.Builder.CreateMaskedStore(
4053 IVec, ResAddr, Group->getAlign(), GroupMask);
4054 } else
4055 NewStoreInstr =
4056 State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
4057
4058 applyMetadata(*NewStoreInstr);
4059 // TODO: Also manage existing metadata using VPIRMetadata.
4060 Group->addMetadata(NewStoreInstr);
4061}
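// For scalable VFs the same transform is expressed with interleave and
// deinterleave intrinsics instead of shuffles; e.g. a factor-2 load group with
// an assumed <vscale x 4 x i32> member type becomes (illustrative):
//   %wide.vec    = load <vscale x 8 x i32>, ptr %addr, align 4
//   %strided.vec = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
//                      @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.vec)
//   %member0     = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %strided.vec, 0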
4062
4063#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4065 VPSlotTracker &SlotTracker) const {
4067 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4068 IG->getInsertPos()->printAsOperand(O, false);
4069 O << ", ";
4071 VPValue *Mask = getMask();
4072 if (Mask) {
4073 O << ", ";
4074 Mask->printAsOperand(O, SlotTracker);
4075 }
4076
4077 unsigned OpIdx = 0;
4078 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4079 if (!IG->getMember(i))
4080 continue;
4081 if (getNumStoreOperands() > 0) {
4082 O << "\n" << Indent << " store ";
4084 O << " to index " << i;
4085 } else {
4086 O << "\n" << Indent << " ";
4088 O << " = load from index " << i;
4089 }
4090 ++OpIdx;
4091 }
4092}
4093#endif
4094
4096 assert(!State.Lane && "Interleave group being replicated.");
4097 assert(State.VF.isScalable() &&
4098 "Only support scalable VF for EVL tail-folding.");
4100 "Masking gaps for scalable vectors is not yet supported.");
4102 Instruction *Instr = Group->getInsertPos();
4103
4104 // Prepare for the vector type of the interleaved load/store.
4105 Type *ScalarTy = getLoadStoreType(Instr);
4106 unsigned InterleaveFactor = Group->getFactor();
4107 assert(InterleaveFactor <= 8 &&
4108 "Unsupported deinterleave/interleave factor for scalable vectors");
4109 ElementCount WideVF = State.VF * InterleaveFactor;
4110 auto *VecTy = VectorType::get(ScalarTy, WideVF);
4111
4112 VPValue *Addr = getAddr();
4113 Value *ResAddr = State.get(Addr, VPLane(0));
4114 Value *EVL = State.get(getEVL(), VPLane(0));
4115 Value *InterleaveEVL = State.Builder.CreateMul(
4116 EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl",
4117 /* NUW= */ true, /* NSW= */ true);
4118 LLVMContext &Ctx = State.Builder.getContext();
4119
4120 Value *GroupMask = nullptr;
4121 if (VPValue *BlockInMask = getMask()) {
4122 SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask));
4123 GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask");
4124 } else {
4125 GroupMask =
4126 State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
4127 }
4128
4129 // Vectorize the interleaved load group.
4130 if (isa<LoadInst>(Instr)) {
4131 CallInst *NewLoad = State.Builder.CreateIntrinsic(
4132 VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr,
4133 "wide.vp.load");
4134 NewLoad->addParamAttr(0,
4135 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4136
4137 applyMetadata(*NewLoad);
4138 // TODO: Also manage existing metadata using VPIRMetadata.
4139 Group->addMetadata(NewLoad);
4140
4141 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4142 // so must use intrinsics to deinterleave.
4143 NewLoad = State.Builder.CreateIntrinsic(
4144 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
4145 NewLoad->getType(), NewLoad,
4146 /*FMFSource=*/nullptr, "strided.vec");
4147
4148 const DataLayout &DL = Instr->getDataLayout();
4149 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4150 Instruction *Member = Group->getMember(I);
4151 // Skip the gaps in the group.
4152 if (!Member)
4153 continue;
4154
4155 Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
4157 // If this member has a different type, cast the result to that type.
4157 if (Member->getType() != ScalarTy) {
4158 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
4159 StridedVec =
4160 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
4161 }
4162
4163 State.set(getVPValue(J), StridedVec);
4164 ++J;
4165 }
4166 return;
4167 } // End for interleaved load.
4168
4169 // The sub-vector type for the current instruction.
4170 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4171 // Vectorize the interleaved store group.
4172 ArrayRef<VPValue *> StoredValues = getStoredValues();
4173 // Collect the stored vector from each member.
4174 SmallVector<Value *, 4> StoredVecs;
4175 const DataLayout &DL = Instr->getDataLayout();
4176 for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
4177 Instruction *Member = Group->getMember(I);
4178 // Skip the gaps in the group.
4179 if (!Member) {
4180 StoredVecs.push_back(PoisonValue::get(SubVT));
4181 continue;
4182 }
4183
4184 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4185 // If this member has a different type, cast it to the unified type.
4186 if (StoredVec->getType() != SubVT)
4187 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4188
4189 StoredVecs.push_back(StoredVec);
4190 ++StoredIdx;
4191 }
4192
4193 // Interleave all the smaller vectors into one wider vector.
4194 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4195 CallInst *NewStore =
4196 State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store,
4197 {IVec, ResAddr, GroupMask, InterleaveEVL});
4198 NewStore->addParamAttr(1,
4199 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4200
4201 applyMetadata(*NewStore);
4202 // TODO: Also manage existing metadata using VPIRMetadata.
4203 Group->addMetadata(NewStore);
4204}
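// Illustrative IR for an EVL-folded factor-2 load group with an assumed
// <vscale x 4 x i32> member type (names are not from an actual run):
//   %interleave.evl   = mul nuw nsw i32 %evl, 2
//   %interleaved.mask = call <vscale x 8 x i1>
//                           @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> %m, <vscale x 4 x i1> %m)
//   %wide.vp.load     = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(
//                           ptr align 4 %addr, <vscale x 8 x i1> %interleaved.mask, i32 %interleave.evl)
//   %strided.vec      = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
//                           @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.vp.load)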
4205
4206#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4208 VPSlotTracker &SlotTracker) const {
4210 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4211 IG->getInsertPos()->printAsOperand(O, false);
4212 O << ", ";
4214 O << ", ";
4216 if (VPValue *Mask = getMask()) {
4217 O << ", ";
4218 Mask->printAsOperand(O, SlotTracker);
4219 }
4220
4221 unsigned OpIdx = 0;
4222 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4223 if (!IG->getMember(i))
4224 continue;
4225 if (getNumStoreOperands() > 0) {
4226 O << "\n" << Indent << " vp.store ";
4228 O << " to index " << i;
4229 } else {
4230 O << "\n" << Indent << " ";
4232 O << " = vp.load from index " << i;
4233 }
4234 ++OpIdx;
4235 }
4236}
4237#endif
4238
4240 VPCostContext &Ctx) const {
4241 Instruction *InsertPos = getInsertPos();
4242 // Find the VPValue index of the interleave group. We need to skip gaps.
4243 unsigned InsertPosIdx = 0;
4244 for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
4245 if (auto *Member = IG->getMember(Idx)) {
4246 if (Member == InsertPos)
4247 break;
4248 InsertPosIdx++;
4249 }
4250 Type *ValTy = Ctx.Types.inferScalarType(
4251 getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
4252 : getStoredValues()[InsertPosIdx]);
4253 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
4254 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
4255 ->getAddressSpace();
4256
4257 unsigned InterleaveFactor = IG->getFactor();
4258 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
4259
4260 // Holds the indices of existing members in the interleaved group.
4262 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
4263 if (IG->getMember(IF))
4264 Indices.push_back(IF);
4265
4266 // Calculate the cost of the whole interleaved group.
4267 InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
4268 InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
4269 IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
4270
4271 if (!IG->isReverse())
4272 return Cost;
4273
4274 return Cost + IG->getNumMembers() *
4275 Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
4276 VectorTy, VectorTy, {}, Ctx.CostKind,
4277 0);
4278}
4279
4280#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4282 VPSlotTracker &SlotTracker) const {
4283 O << Indent << "EMIT ";
4285 O << " = CANONICAL-INDUCTION ";
4287}
4288#endif
4289
4291 return IsScalarAfterVectorization &&
4292 (!IsScalable || vputils::onlyFirstLaneUsed(this));
4293}
4294
4295#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4297 VPSlotTracker &SlotTracker) const {
4298 assert((getNumOperands() == 3 || getNumOperands() == 5) &&
4299 "unexpected number of operands");
4300 O << Indent << "EMIT ";
4302 O << " = WIDEN-POINTER-INDUCTION ";
4304 O << ", ";
4306 O << ", ";
4308 if (getNumOperands() == 5) {
4309 O << ", ";
4311 O << ", ";
4313 }
4314}
4315
4317 VPSlotTracker &SlotTracker) const {
4318 O << Indent << "EMIT ";
4320 O << " = EXPAND SCEV " << *Expr;
4321}
4322#endif
4323
4325 Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
4326 Type *STy = CanonicalIV->getType();
4327 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
4328 ElementCount VF = State.VF;
4329 Value *VStart = VF.isScalar()
4330 ? CanonicalIV
4331 : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
4332 Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this));
4333 if (VF.isVector()) {
4334 VStep = Builder.CreateVectorSplat(VF, VStep);
4335 VStep =
4336 Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
4337 }
4338 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
4339 State.set(this, CanonicalVectorIV);
4340}
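// For example, with a fixed VF of 4 and an i64 canonical IV this materializes
// (illustrative; unroll part 0):
//   %broadcast = ... splat of the scalar canonical IV ...
//   %vec.iv    = add <4 x i64> %broadcast, <i64 0, i64 1, i64 2, i64 3>
// while unroll part 1 of an unrolled plan would add <i64 4, i64 5, i64 6, i64 7>
// instead.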
4341
4342#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4344 VPSlotTracker &SlotTracker) const {
4345 O << Indent << "EMIT ";
4347 O << " = WIDEN-CANONICAL-INDUCTION ";
4349}
4350#endif
4351
4353 auto &Builder = State.Builder;
4354 // Create a vector from the initial value.
4355 auto *VectorInit = getStartValue()->getLiveInIRValue();
4356
4357 Type *VecTy = State.VF.isScalar()
4358 ? VectorInit->getType()
4359 : VectorType::get(VectorInit->getType(), State.VF);
4360
4361 BasicBlock *VectorPH =
4362 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4363 if (State.VF.isVector()) {
4364 auto *IdxTy = Builder.getInt32Ty();
4365 auto *One = ConstantInt::get(IdxTy, 1);
4366 IRBuilder<>::InsertPointGuard Guard(Builder);
4367 Builder.SetInsertPoint(VectorPH->getTerminator());
4368 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
4369 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4370 VectorInit = Builder.CreateInsertElement(
4371 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
4372 }
4373
4374 // Create a phi node for the new recurrence.
4375 PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
4376 Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
4377 Phi->addIncoming(VectorInit, VectorPH);
4378 State.set(this, Phi);
4379}
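// Illustrative result for a fixed VF of 4 with an assumed i32 recurrence
// (names not taken from an actual run):
//   vector.ph:
//     %vector.recur.init = insertelement <4 x i32> poison, i32 %pre.load, i32 3
//   vector.body:
//     %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ ... ]
// The backedge incoming value is filled in later, once the users of the
// recurrence have been vectorized.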
4380
4383 VPCostContext &Ctx) const {
4384 if (VF.isScalar())
4385 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
4386
4387 return 0;
4388}
4389
4390#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4392 VPSlotTracker &SlotTracker) const {
4393 O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
4395 O << " = phi ";
4397}
4398#endif
4399
4401 // Reductions do not have to start at zero. They can start with
4402 // any loop-invariant value.
4403 VPValue *StartVPV = getStartValue();
4404
4405 // In order to support recurrences we need to be able to vectorize Phi nodes.
4406 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4407 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4408 // this value when we vectorize all of the instructions that use the PHI.
4409 BasicBlock *VectorPH =
4410 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4411 bool ScalarPHI = State.VF.isScalar() || IsInLoop;
4412 Value *StartV = State.get(StartVPV, ScalarPHI);
4413 Type *VecTy = StartV->getType();
4414
4415 BasicBlock *HeaderBB = State.CFG.PrevBB;
4416 assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
4417 "recipe must be in the vector loop header");
4418 auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
4419 Phi->insertBefore(HeaderBB->getFirstInsertionPt());
4420 State.set(this, Phi, IsInLoop);
4421
4422 Phi->addIncoming(StartV, VectorPH);
4423}
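// Illustrative result for a not-in-loop reduction with a fixed VF of 4 and an
// assumed i32 reduction type:
//   %vec.phi = phi <4 x i32> [ %start.value, %vector.ph ], [ ... ]
// Only the start incoming value is added here; the backedge value is hooked up
// later, after the loop body has been generated.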
4424
4425#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4427 VPSlotTracker &SlotTracker) const {
4428 O << Indent << "WIDEN-REDUCTION-PHI ";
4429
4431 O << " = phi ";
4433 if (VFScaleFactor != 1)
4434 O << " (VF scaled by 1/" << VFScaleFactor << ")";
4435}
4436#endif
4437
4439 Value *Op0 = State.get(getOperand(0));
4440 Type *VecTy = Op0->getType();
4441 Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
4442 State.set(this, VecPhi);
4443}
4444
4445#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4447 VPSlotTracker &SlotTracker) const {
4448 O << Indent << "WIDEN-PHI ";
4449
4451 O << " = phi ";
4453}
4454#endif
4455
4456// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
4457// remove VPActiveLaneMaskPHIRecipe.
4459 BasicBlock *VectorPH =
4460 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4461 Value *StartMask = State.get(getOperand(0));
4462 PHINode *Phi =
4463 State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
4464 Phi->addIncoming(StartMask, VectorPH);
4465 State.set(this, Phi);
4466}
4467
4468#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4470 VPSlotTracker &SlotTracker) const {
4471 O << Indent << "ACTIVE-LANE-MASK-PHI ";
4472
4474 O << " = phi ";
4476}
4477#endif
4478
4479#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4481 VPSlotTracker &SlotTracker) const {
4482 O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
4483
4485 O << " = phi ";
4487}
4488#endif
static SDValue Widen(SelectionDAG *CurDAG, SDValue N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static MCDisassembler::DecodeStatus addOperand(MCInst &Inst, const MCOperand &Opnd)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file provides a LoopVectorizationPlanner class.
#define I(x, y, z)
Definition MD5.cpp:58
static bool isOrdered(const Instruction *I)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
if(PassOpts->AAPipeline)
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file contains the declarations of different VPlan-related auxiliary helpers.
static Instruction * createReverseEVL(IRBuilderBase &Builder, Value *Operand, Value *EVL, const Twine &Name)
Use all-true mask for reverse rather than actual mask, as it avoids a dependence w/o affecting the re...
static Value * interleaveVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vals, const Twine &Name)
Return a vector containing interleaved elements from multiple smaller input vectors.
static InstructionCost getCostForIntrinsics(Intrinsic::ID ID, ArrayRef< const VPValue * > Operands, const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx)
Compute the cost for the intrinsic ID with Operands, produced by R.
static Value * createBitOrPointerCast(IRBuilderBase &Builder, Value *V, VectorType *DstVTy, const DataLayout &DL)
static Type * getGEPIndexTy(bool IsScalable, bool IsReverse, bool IsUnitStride, unsigned CurrentPart, IRBuilderBase &Builder)
SmallVector< Value *, 2 > VectorParts
static bool isUsedByLoadStoreAddress(const VPUser *V)
Returns true if V is used as part of the address of another load or store.
static void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPLane &Lane, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
static bool shouldUseAddressAccessSCEV(const VPValue *Ptr)
Returns true if Ptr is a pointer computation for which the legacy cost model computes a SCEV expressi...
static Constant * getSignedIntOrFpConstant(Type *Ty, int64_t C)
A helper function that returns an integer or floating-point constant with value C.
static BranchInst * createCondBranch(Value *Cond, VPBasicBlock *VPBB, VPTransformState &State)
Create a conditional branch using Cond branching to the successors of VPBB.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
This file contains the declarations of the Vectorization Plan base classes:
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
Conditional or Unconditional Branch instruction.
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
static LLVM_ABI StringRef getPredicateName(Predicate P)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:163
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
A debug info location.
Definition DebugLoc.h:124
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:325
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:313
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:310
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:321
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
void setAllowContract(bool B=true)
Definition FMF.h:90
bool noSignedZeros() const
Definition FMF.h:67
bool noInfs() const
Definition FMF.h:66
void setAllowReciprocal(bool B=true)
Definition FMF.h:87
bool allowReciprocal() const
Definition FMF.h:68
LLVM_ABI void print(raw_ostream &O) const
Print fast-math flags to O.
Definition Operator.cpp:271
void setNoSignedZeros(bool B=true)
Definition FMF.h:84
bool allowReassoc() const
Flag queries.
Definition FMF.h:64
bool approxFunc() const
Definition FMF.h:70
void setNoNaNs(bool B=true)
Definition FMF.h:78
void setAllowReassoc(bool B=true)
Flag setters.
Definition FMF.h:75
bool noNaNs() const
Definition FMF.h:65
void setApproxFunc(bool B=true)
Definition FMF.h:93
void setNoInfs(bool B=true)
Definition FMF.h:81
bool allowContract() const
Definition FMF.h:69
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
bool willReturn() const
Determine if the function will return.
Definition Function.h:661
bool doesNotThrow() const
Determine if the function cannot unwind.
Definition Function.h:594
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:214
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2579
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2633
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2567
LLVM_ABI Value * CreateVectorSplice(Value *V1, Value *V2, int64_t Imm, const Twine &Name="")
Return a vector splice intrinsic if using scalable vectors, otherwise return a shufflevector.
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2626
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2645
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:562
Value * CreatePtrAdd(Value *Ptr, Value *Offset, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:2039
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition IRBuilder.h:567
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2336
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
Value * CreateLogicalAnd(Value *Cond1, Value *Cond2, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.h:1725
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:522
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2466
Value * CreateNot(Value *V, const Twine &Name="")
Definition IRBuilder.h:1808
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2332
Value * CreateCountTrailingZeroElems(Type *ResTy, Value *Mask, bool ZeroIsPoison=true, const Twine &Name="")
Create a call to llvm.experimental_cttz_elts.
Definition IRBuilder.h:1134
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1420
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2085
LLVMContext & getContext() const
Definition IRBuilder.h:203
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1403
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition IRBuilder.h:507
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1708
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2442
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1437
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isUnaryOp() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
The group of interleaved loads/stores sharing the same stride and close to each other.
uint32_t getFactor() const
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
bool isReverse() const
InstTy * getInsertPos() const
void addMetadata(InstTy *NewInst) const
Add metadata (e.g.
Align getAlign() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
This class emits a version of the loop where run-time checks ensure that may-alias pointers can't ove...
std::pair< MDNode *, MDNode * > getNoAliasMetadataFor(const Instruction *OrigInst) const
Returns a pair containing the alias_scope and noalias metadata nodes for OrigInst,...
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
static bool isSignedRecurrenceKind(RecurKind Kind)
Returns true if recurrece kind is a signed redux kind.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isFindLastIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents the LLVM 'select' instruction.
This class provides computation of slot numbers for LLVM Assembly writing.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
@ TCC_Free
Expected to fold away in lowering.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Reverse
Reverse the order of the vector.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:281
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
value_op_iterator value_op_end()
Definition User.h:313
void setOperand(unsigned i, Value *Val)
Definition User.h:237
Value * getOperand(unsigned i) const
Definition User.h:232
value_op_iterator value_op_begin()
Definition User.h:310
void execute(VPTransformState &State) override
Generate the active lane mask phi of the vector loop.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3815
RecipeListTy & getRecipeList()
Returns a reference to the list of recipes.
Definition VPlan.h:3868
iterator end()
Definition VPlan.h:3852
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition VPlan.h:3881
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenMemoryRecipe.
VPValue * getIncomingValue(unsigned Idx) const
Return incoming value number Idx.
Definition VPlan.h:2435
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2430
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:80
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:203
VPlan * getPlan()
Definition VPlan.cpp:165
void printAsOperand(raw_ostream &OS, bool PrintType=false) const
Definition VPlan.h:348
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:197
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPBranchOnMaskRecipe.
void execute(VPTransformState &State) override
Generate the extraction of the appropriate bit from the block mask and the conditional branch.
VPlan-based builder utility analogous to IRBuilder.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
This class augments a recipe with a set of VPValues defined by the recipe.
Definition VPlanValue.h:302
void dump() const
Dump the VPDef to stderr (for debugging).
Definition VPlan.cpp:126
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:424
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:419
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:397
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:409
unsigned getVPDefID() const
Definition VPlanValue.h:429
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getStepValue() const
Definition VPlan.h:3692
VPValue * getStartValue() const
Definition VPlan.h:3691
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void decompose()
Insert the recipes of the expression back into the VPlan, directly before the current recipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool isSingleScalar() const
Returns true if the result of this VPExpressionRecipe is a single-scalar.
bool mayHaveSideEffects() const
Returns true if this expression contains recipes that may have side effects.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Compute the cost of this recipe either using a recipe's specialized implementation or using the legac...
bool mayReadOrWriteMemory() const
Returns true if this expression contains recipes that may read from or write to memory.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this header phi recipe.
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2018
void execute(VPTransformState &State) override
Produce a vectorized histogram operation.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPHistogramRecipe.
VPValue * getMask() const
Return the mask operand if one was provided, or a null pointer if all lanes should be executed uncond...
Definition VPlan.h:1714
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Class to record and manage LLVM IR flags.
Definition VPlan.h:596
FastMathFlagsTy FMFs
Definition VPlan.h:660
bool flagsValidForOpcode(unsigned Opcode) const
Returns true if the set flags are valid for Opcode.
WrapFlagsTy WrapFlags
Definition VPlan.h:654
CmpInst::Predicate CmpPredicate
Definition VPlan.h:653
void printFlags(raw_ostream &O) const
GEPNoWrapFlags GEPFlags
Definition VPlan.h:658
bool hasFastMathFlags() const
Returns true if the recipe has fast-math flags.
Definition VPlan.h:818
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
TruncFlagsTy TruncFlags
Definition VPlan.h:655
CmpInst::Predicate getPredicate() const
Definition VPlan.h:800
ExactFlagsTy ExactFlags
Definition VPlan.h:657
bool hasNoSignedWrap() const
Definition VPlan.h:842
void intersectFlags(const VPIRFlags &Other)
Only keep flags also present in Other.
GEPNoWrapFlags getGEPNoWrapFlags() const
Definition VPlan.h:812
bool hasPredicate() const
Returns true if the recipe has a comparison predicate.
Definition VPlan.h:815
DisjointFlagsTy DisjointFlags
Definition VPlan.h:656
unsigned AllFlags
Definition VPlan.h:661
bool hasNoUnsignedWrap() const
Definition VPlan.h:831
NonNegFlagsTy NonNegFlags
Definition VPlan.h:659
void applyFlags(Instruction &I) const
Apply the IR flags to I.
Definition VPlan.h:763
Instruction & getInstruction() const
Definition VPlan.h:1379
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void extractLastLaneOfFirstOperand(VPBuilder &Builder)
Update the recipes first operand to the last lane of the operand using Builder.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPIRInstruction.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPIRInstruction(Instruction &I)
VPIRInstruction::create() should be used to create VPIRInstructions, as subclasses may need to be cre...
Definition VPlan.h:1354
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetada object with MD, keeping only metadata nodes that are common to both.
void applyMetadata(Instruction &I) const
Add all metadata to I.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the instruction.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPInstruction.
VPInstruction(unsigned Opcode, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Definition VPlan.h:1104
bool doesGeneratePerAllLanes() const
Returns true if this VPInstruction generates scalar values for all lanes.
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1063
@ ComputeAnyOfResult
Compute the final result of a AnyOf reduction with select(cmp(),x,y), where one of (x,...
Definition VPlan.h:1017
@ WideIVStep
Scale the first operand (vector step) by the second operand (scalar-step).
Definition VPlan.h:1053
@ ResumeForEpilogue
Explicit user for the resume phi of the canonical induction in the main VPlan, used by the epilogue v...
Definition VPlan.h:1066
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1014
@ FirstOrderRecurrenceSplice
Definition VPlan.h:985
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1057
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1009
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1006
@ VScale
Returns the value for vscale.
Definition VPlan.h:1068
@ CanonicalIVIncrementForPart
Definition VPlan.h:999
@ CalculateTripCountMinusVF
Definition VPlan.h:997
bool hasResult() const
Definition VPlan.h:1143
bool opcodeMayReadOrWriteFromMemory() const
Returns true if the underlying opcode may read from or write to memory.
LLVM_DUMP_METHOD void dump() const
Print the VPInstruction to dbgs() (for debugging).
StringRef getName() const
Returns the symbolic name assigned to the VPInstruction.
Definition VPlan.h:1183
unsigned getOpcode() const
Definition VPlan.h:1123
bool onlyFirstPartUsed(const VPValue *Op) const override
Returns true if the recipe only uses the first part of operand Op.
bool isVectorToScalar() const
Returns true if this VPInstruction produces a scalar value from a vector, e.g.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the VPInstruction to O.
bool onlyFirstLaneUsed(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
bool isSingleScalar() const
Returns true if this VPInstruction's operands are single scalars and the result is also a single scal...
void execute(VPTransformState &State) override
Generate the instruction.
bool needsMaskForGaps() const
Return true if the access needs a mask because of the gaps.
Definition VPlan.h:2545
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this recipe.
Instruction * getInsertPos() const
Definition VPlan.h:2549
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2547
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2539
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2568
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:2533
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:2642
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:2661
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:2612
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
static VPLane getLastLaneForVF(const ElementCount &VF)
static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset)
static VPLane getFirstLane()
void execute(VPTransformState &State) override
Generate the reduction in the loop.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPPartialReductionRecipe.
unsigned getOpcode() const
Get the binary op's opcode.
Definition VPlan.h:2812
virtual const VPRecipeBase * getAsRecipe() const =0
Return a VPRecipeBase* to the current object.
virtual unsigned getNumIncoming() const
Returns the number of incoming values, which equals the number of incoming blocks.
Definition VPlan.h:1269
void removeIncomingValueFor(VPBlockBase *IncomingBlock) const
Removes the incoming value for IncomingBlock, which must be a predecessor.
const VPBasicBlock * getIncomingBlock(unsigned Idx) const
Returns the incoming block with index Idx.
Definition VPlan.h:3959
detail::zippy< llvm::detail::zip_first, VPUser::const_operand_range, const_incoming_blocks_range > incoming_values_and_blocks() const
Returns an iterator range over pairs of incoming values and corresponding incoming blocks.
Definition VPlan.h:1294
VPValue * getIncomingValue(unsigned Idx) const
Returns the incoming VPValue with index Idx.
Definition VPlan.h:1261
void printPhiOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the recipe.
void execute(VPTransformState &State) override
Generates phi nodes for live-outs (from a replicate region) as needed to retain SSA form.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:386
bool mayReadFromMemory() const
Returns true if the recipe may read from memory.
bool mayHaveSideEffects() const
Returns true if the recipe may have side-effects.
VPRegionBlock * getRegion()
Definition VPlan.h:4114
bool isPhi() const
Returns true for PHI-like recipes.
bool mayWriteToMemory() const
Returns true if the recipe may write to memory.
virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const
Compute the cost of this recipe either using a recipe's specialized implementation or using the legac...
VPBasicBlock * getParent()
Definition VPlan.h:407
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:478
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this recipe, taking into account if the cost computation should be skipped and the...
bool isScalarCast() const
Return true if the recipe is a scalar cast.
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
void moveAfter(VPRecipeBase *MovePos)
Unlink this recipe from its current VPBasicBlock and insert it into the VPBasicBlock that MovePos liv...
VPRecipeBase(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:397
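The placement helpers above (insertBefore/insertAfter, moveBefore/moveAfter, removeFromParent, eraseFromParent) manage a recipe's position in the intrusive list of its VPBasicBlock. A minimal illustrative sketch, not taken from this file, assuming OldR and NewR are VPRecipeBase pointers and OldR's results are no longer used:
  static void swapInReplacement(VPRecipeBase *OldR, VPRecipeBase *NewR) {
    NewR->insertBefore(OldR);   // link the unlinked NewR just before OldR
    OldR->eraseFromParent();    // unlink OldR from its VPBasicBlock and delete it
  }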
void execute(VPTransformState &State) override
Generate the reduction in the loop.
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:2857
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool isConditional() const
Return true if the in-loop reduction is conditional.
Definition VPlan.h:2751
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of VPReductionRecipe.
VPValue * getVecOp() const
The VPValue of the vector value to be reduced.
Definition VPlan.h:2755
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getCondOp() const
The VPValue of the condition for the block.
Definition VPlan.h:2757
RecurKind getRecurrenceKind() const
Return the recurrence kind for the in-loop reduction.
Definition VPlan.h:2747
VPValue * getChainOp() const
The VPValue of the scalar Chain being accumulated.
Definition VPlan.h:2753
void execute(VPTransformState &State) override
Generate the reduction in the loop.
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4003
bool isReplicator() const
An indicator of whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4071
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2872
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool isSingleScalar() const
Definition VPlan.h:2917
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPReplicateRecipe.
unsigned getOpcode() const
Definition VPlan.h:2946
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getStepValue() const
Definition VPlan.h:3757
void execute(VPTransformState &State) override
Generate the scalarized versions of the phi node as needed by their users.
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:517
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:582
LLVM_DUMP_METHOD void dump() const
Print this VPSingleDefRecipe to dbgs() (for debugging).
VPSingleDefRecipe(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:519
This class can be used to assign names to VPValues.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
Helper to access the operand that contains the unroll part for this recipe after unrolling.
Definition VPlan.h:926
VPValue * getUnrollPartOperand(const VPUser &U) const
Return the VPValue operand containing the unroll part or null if there is no such operand.
unsigned getUnrollPart(const VPUser &U) const
Return the unroll part.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:199
void printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the operands to O.
Definition VPlan.cpp:1436
operand_range operands()
Definition VPlanValue.h:267
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:243
unsigned getNumOperands() const
Definition VPlanValue.h:237
operand_iterator op_begin()
Definition VPlanValue.h:263
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:238
virtual bool onlyFirstLaneUsed(const VPValue *Op) const
Returns true if the VPUser only uses the first lane of operand Op.
Definition VPlanValue.h:282
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:48
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1390
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:135
friend class VPExpressionRecipe
Definition VPlanValue.h:53
void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const
Definition VPlan.cpp:1432
bool hasMoreThanOneUniqueUser() const
Returns true if the value has more than one unique user.
Definition VPlanValue.h:140
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition VPlanValue.h:176
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:85
VPValue(const unsigned char SC, Value *UV=nullptr, VPDef *Def=nullptr)
Definition VPlan.cpp:98
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1393
user_iterator user_begin()
Definition VPlanValue.h:130
unsigned getNumUsers() const
Definition VPlanValue.h:113
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition VPlanValue.h:171
user_range users()
Definition VPlanValue.h:134
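A minimal sketch of walking the VPValue def-use API summarized above; V is an assumed VPValue pointer and the body only illustrates which accessor applies in each case:
  static void classifyVPValue(VPValue *V) {
    if (VPRecipeBase *DefR = V->getDefiningRecipe()) {
      (void)DefR;                          // V is produced by a recipe in the plan
    } else if (V->isLiveIn()) {
      Value *IRV = V->getLiveInIRValue();  // V wraps an IR value from outside the plan
      (void)IRV;
    }
    for (VPUser *U : V->users())
      (void)U;                             // users() may report the same user repeatedly
  }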
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getSourceElementType() const
Definition VPlan.h:1918
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
operand_range args()
Definition VPlan.h:1671
Function * getCalledScalarFunction() const
Definition VPlan.h:1667
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCallRecipe.
void execute(VPTransformState &State) override
Produce a widened version of the call instruction.
void execute(VPTransformState &State) override
Generate a canonical vector induction variable of the vector loop, with start = {<Part*VF,...
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getResultType() const
Returns the result type of the cast.
Definition VPlan.h:1540
void execute(VPTransformState &State) override
Produce widened copies of the cast.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCastRecipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the gep nodes.
Type * getSourceElementType() const
Definition VPlan.h:1815
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2074
TruncInst * getTruncInst()
Returns the first defined value as a TruncInst if it is one, or nullptr otherwise.
Definition VPlan.h:2185
Type * getScalarType() const
Returns the scalar type of the induction.
Definition VPlan.h:2194
bool isCanonical() const
Returns true if the induction is canonical, i.e.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool onlyFirstLaneUsed(const VPValue *Op) const override
Returns true if the VPUser only uses the first lane of operand Op.
Intrinsic::ID getVectorIntrinsicID() const
Return the ID of the intrinsic.
Definition VPlan.h:1605
StringRef getIntrinsicName() const
Return the name of the intrinsic as a string.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getResultType() const
Return the scalar return type of the intrinsic.
Definition VPlan.h:1608
void execute(VPTransformState &State) override
Produce a widened version of the vector intrinsic.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this vector intrinsic.
bool IsMasked
Whether the memory access is masked.
Definition VPlan.h:3192
bool Reverse
Whether the consecutive accessed addresses are in reverse order.
Definition VPlan.h:3189
bool isConsecutive() const
Return whether the loaded-from / stored-to addresses are consecutive.
Definition VPlan.h:3229
Instruction & Ingredient
Definition VPlan.h:3180
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenMemoryRecipe.
bool Consecutive
Whether the accessed addresses are consecutive.
Definition VPlan.h:3186
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3243
Align Alignment
Alignment information for this memory access.
Definition VPlan.h:3183
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:3236
bool isReverse() const
Return whether the consecutive loaded/stored addresses are in reverse order.
Definition VPlan.h:3233
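A minimal sketch of querying a widened memory access through the accessors above; MemR is an assumed VPWidenMemoryRecipe reference:
  static bool needsReversedAccess(const VPWidenMemoryRecipe &MemR) {
    VPValue *Addr = MemR.getAddr();        // address the access starts from
    VPValue *Mask = MemR.getMask();        // nullptr when the access is unmasked
    (void)Addr; (void)Mask;
    return MemR.isConsecutive() && MemR.isReverse();
  }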
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool onlyScalarsGenerated(bool IsScalable)
Returns true if only scalar values will be generated.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1443
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
void execute(VPTransformState &State) override
Produce a widened instruction using the opcode and operands of the recipe, processing State....
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getUF() const
Definition VPlan.h:4353
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1027
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:390
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1099
void mutateType(Type *Ty)
Mutate the type of this Value to be of the specified type.
Definition Value.h:838
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:201
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:169
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:253
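A minimal sketch of the ElementCount and VectorType queries listed above, assuming Ctx is an available LLVMContext; the numbers in the comments are for a scalable VF of vscale x 4:
  static Type *makeWideI32Ty(LLVMContext &Ctx) {
    ElementCount VF = ElementCount::getScalable(4);
    unsigned MinLanes = VF.getKnownMinValue();       // 4
    bool UsesVScale = VF.isScalable();               // true
    ElementCount Half = VF.divideCoefficientBy(2);   // vscale x 2
    (void)MinLanes; (void)UsesVScale; (void)Half;
    return VectorType::get(Type::getInt32Ty(Ctx), VF);  // <vscale x 4 x i32>
  }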
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
iterator erase(iterator where)
Definition ilist.h:204
pointer remove(iterator &IT)
Definition ilist.h:188
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor)
Returns the corresponding llvm.vector.deinterleaveN intrinsic for factor N.
LLVM_ABI StringRef getBaseName(ID id)
Return the LLVM name for an intrinsic, without encoded types for overloading, such as "llvm....
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
bool match(Val *V, const Pattern &P)
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
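A minimal sketch of combining the VPlanPatternMatch helpers listed above; V is an assumed VPValue from an existing plan and Step receives the matched constant:
  static bool isMulByConstant(VPValue *V, const APInt *&Step) {
    using namespace llvm::VPlanPatternMatch;
    // Matches "mul <any VPValue>, <constant integer>" and binds the constant to Step.
    return match(V, m_Mul(m_VPValue(), m_APInt(Step)));
  }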
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool onlyFirstPartUsed(const VPValue *Def)
Returns true if only the first part of Def is used.
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:829
LLVM_ABI Value * createFindLastIVReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind, Value *Start, Value *Sentinel)
Create a reduction of the given vector Src for a reduction of the kind RecurKind::FindLastIV.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2231
auto cast_or_null(const Y &Val)
Definition Casting.h:714
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
LLVM_ABI Value * createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, Value *Right)
Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
LLVM_ABI Constant * createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Type * toVectorizedTy(Type *Ty, ElementCount EC)
A helper for converting to vectorized types.
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant C can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1734
cl::opt< unsigned > ForceTargetInstructionCost
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
@ Other
Any other memory.
Definition ModRef.h:68
bool canVectorizeTy(Type *Ty)
Returns true if Ty is a valid vector element type, void, or an unpacked literal struct where all elem...
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
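For illustration, the mask helpers described above produce (for small factors): createInterleaveMask(4, 2) gives <0,4,1,5,2,6,3,7>, createStrideMask(0, 2, 4) gives <0,2,4,6>, and createReplicatedMask(2, 4) gives <0,0,1,1,2,2,3,3>. A minimal sketch:
  static SmallVector<int, 16> exampleInterleaveMask() {
    // Lays out two 4-element vectors lane-by-lane: <0,4,1,5,2,6,3,7>.
    return createInterleaveMask(/*VF=*/4, /*NumVecs=*/2);
  }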
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Mul
Product of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
DWARFExpression::Operation Op
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
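A minimal sketch of materializing VF-dependent runtime values with getRuntimeVF and createStepForVF; B is an assumed IRBuilder positioned where the values are needed and VF is the chosen element count:
  static Value *emitTwoTimesVF(IRBuilderBase &B, ElementCount VF) {
    Value *RuntimeVF = getRuntimeVF(B, B.getInt64Ty(), VF);     // VF, scaled by vscale if scalable
    (void)RuntimeVF;
    return createStepForVF(B, B.getInt64Ty(), VF, /*Step=*/2);  // i64 value equal to 2 * VF
  }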
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
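A minimal sketch combining the casting utilities (isa/cast/dyn_cast and friends) with the load/store helpers listed in this index; I is an assumed llvm::Instruction pointer:
  static void inspectMemoryInstruction(Instruction *I) {
    if (auto *Store = dyn_cast<StoreInst>(I)) {   // null if I is not a store
      Align A = getLoadStoreAlignment(Store);     // alignment of the access
      (void)A;
    }
    if (isa<LoadInst>(I))                         // predicate-only check, no cast
      (void)getLoadStoreType(I);                  // type being loaded
  }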
LLVM_ABI Value * createOrderedReduction(IRBuilderBase &B, RecurKind RdxKind, Value *Src, Value *Start)
Create an ordered reduction intrinsic using the given recurrence kind RdxKind.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
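A minimal sketch of how the range helpers above (all_of, drop_begin, enumerate, seq, and similar) are typically combined; Ops is an assumed list of recipe operands:
  static bool allButFirstAreLiveIn(ArrayRef<VPValue *> Ops) {
    for (auto [Idx, Op] : enumerate(Ops)) {       // pairs each operand with its index
      (void)Idx; (void)Op;
    }
    return all_of(drop_begin(Ops),                // skip operand 0
                  [](const VPValue *Op) { return Op->isLiveIn(); });
  }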
unsigned getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind)
A helper function that returns the divisor to apply to the cost of a predicated block.
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
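A minimal sketch of the type-widening helpers above; Ctx is an assumed LLVMContext and VF the chosen vectorization factor:
  static Type *widenFloatTy(LLVMContext &Ctx, ElementCount VF) {
    Type *Scalar = Type::getFloatTy(Ctx);
    assert(canVectorizeTy(Scalar) && "float is a valid vector element type");
    return toVectorTy(Scalar, VF);                // <VF x float>, scalable if VF is
  }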
LLVM_ABI Value * createAnyOfReduction(IRBuilderBase &B, Value *Src, Value *InitVal, PHINode *OrigPhi)
Create a reduction of the given vector Src for a reduction of kind RecurKind::AnyOf.
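A minimal sketch of collapsing vectorized partial results into a final scalar with the reduction helpers above; B and Vec are assumed to come from the surrounding codegen context, and the commented ordered form names the strict-FP alternative:
  static Value *finishIntegerSum(IRBuilderBase &B, Value *Vec) {
    // Equivalent to an llvm.vector.reduce.add over all lanes of Vec.
    return createSimpleReduction(B, Vec, RecurKind::Add);
    // Strict FP reductions would instead use, e.g.:
    //   createOrderedReduction(B, RecurKind::FAdd, Vec, StartValue);
  }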
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Struct to hold various analysis needed for cost computations.
void execute(VPTransformState &State) override
Generate the phi nodes.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this first-order recurrence phi recipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
An overlay for VPIRInstructions wrapping PHI nodes, enabling convenient use of cast/dyn_cast/isa and exec...
Definition VPlan.h:1416
PHINode & getIRPhi()
Definition VPlan.h:1424
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the instruction.
A pure-virtual common base class for recipes defining a single VPValue and using IR flags.
Definition VPlan.h:871
InstructionCost getCostForRecipeWithOpcode(unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const
Compute the cost for this recipe for VF, using Opcode and Ctx.
VPRecipeWithIRFlags(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:872
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
Value * get(const VPValue *Def, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def if IsScalar is false, otherwise return the gen...
Definition VPlan.cpp:267
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
ElementCount VF
The chosen Vectorization Factor of the loop being vectorized.
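A minimal sketch of how execute() implementations typically consume the VPTransformState members above, assuming R has two integer-vector operands; this is illustrative, not the implementation of any particular recipe:
  static Value *emitWideAdd(VPRecipeBase &R, VPTransformState &State) {
    Value *LHS = State.get(R.getOperand(0));      // vector generated for operand 0
    Value *RHS = State.get(R.getOperand(1));      // vector generated for operand 1
    return State.Builder.CreateAdd(LHS, RHS);     // emit the widened IR instruction
  }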
void execute(VPTransformState &State) override
Generate the wide load or gather.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenLoadEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3319
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate a wide load or gather.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool isInvariantCond() const
Definition VPlan.h:1760
VPValue * getCond() const
Definition VPlan.h:1756
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenSelectRecipe.
void execute(VPTransformState &State) override
Produce a widened version of the select instruction.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition VPlan.h:3400
void execute(VPTransformState &State) override
Generate the wide store or scatter.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenStoreEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3403
void execute(VPTransformState &State) override
Generate a wide store or scatter.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition VPlan.h:3364
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.