VPlanRecipes.cpp
1//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains implementations for different VPlan recipes.
11///
12//===----------------------------------------------------------------------===//
13
15#include "VPlan.h"
16#include "VPlanAnalysis.h"
17#include "VPlanHelpers.h"
18#include "VPlanPatternMatch.h"
19#include "VPlanUtils.h"
20#include "llvm/ADT/STLExtras.h"
22#include "llvm/ADT/Twine.h"
26#include "llvm/IR/BasicBlock.h"
27#include "llvm/IR/IRBuilder.h"
28#include "llvm/IR/Instruction.h"
30#include "llvm/IR/Intrinsics.h"
31#include "llvm/IR/Type.h"
32#include "llvm/IR/Value.h"
35#include "llvm/Support/Debug.h"
40#include <cassert>
41
42using namespace llvm;
43using namespace llvm::VPlanPatternMatch;
44
46
47#define LV_NAME "loop-vectorize"
48#define DEBUG_TYPE LV_NAME
49
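// Recipes are classified by their VPDef ID below; any recipe kind not handled
// explicitly is conservatively assumed to write to memory.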
50bool VPRecipeBase::mayWriteToMemory() const {
 51 switch (getVPDefID()) {
52 case VPExpressionSC:
53 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
54 case VPInstructionSC:
55 return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
56 case VPInterleaveEVLSC:
57 case VPInterleaveSC:
58 return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
59 case VPWidenStoreEVLSC:
60 case VPWidenStoreSC:
61 return true;
62 case VPReplicateSC:
63 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
64 ->mayWriteToMemory();
65 case VPWidenCallSC:
66 return !cast<VPWidenCallRecipe>(this)
67 ->getCalledScalarFunction()
68 ->onlyReadsMemory();
69 case VPWidenIntrinsicSC:
70 return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
71 case VPCanonicalIVPHISC:
72 case VPBranchOnMaskSC:
73 case VPFirstOrderRecurrencePHISC:
74 case VPReductionPHISC:
75 case VPScalarIVStepsSC:
76 case VPPredInstPHISC:
77 return false;
78 case VPBlendSC:
79 case VPReductionEVLSC:
80 case VPReductionSC:
81 case VPVectorPointerSC:
82 case VPWidenCanonicalIVSC:
83 case VPWidenCastSC:
84 case VPWidenGEPSC:
85 case VPWidenIntOrFpInductionSC:
86 case VPWidenLoadEVLSC:
87 case VPWidenLoadSC:
88 case VPWidenPHISC:
89 case VPWidenSC:
90 case VPWidenSelectSC: {
91 const Instruction *I =
92 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
93 (void)I;
94 assert((!I || !I->mayWriteToMemory()) &&
95 "underlying instruction may write to memory");
96 return false;
97 }
98 default:
99 return true;
100 }
101}
102
103bool VPRecipeBase::mayReadFromMemory() const {
 104 switch (getVPDefID()) {
105 case VPExpressionSC:
106 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
107 case VPInstructionSC:
108 return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
109 case VPWidenLoadEVLSC:
110 case VPWidenLoadSC:
111 return true;
112 case VPReplicateSC:
113 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
114 ->mayReadFromMemory();
115 case VPWidenCallSC:
116 return !cast<VPWidenCallRecipe>(this)
117 ->getCalledScalarFunction()
118 ->onlyWritesMemory();
119 case VPWidenIntrinsicSC:
120 return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
121 case VPBranchOnMaskSC:
122 case VPFirstOrderRecurrencePHISC:
123 case VPPredInstPHISC:
124 case VPScalarIVStepsSC:
125 case VPWidenStoreEVLSC:
126 case VPWidenStoreSC:
127 return false;
128 case VPBlendSC:
129 case VPReductionEVLSC:
130 case VPReductionSC:
131 case VPVectorPointerSC:
132 case VPWidenCanonicalIVSC:
133 case VPWidenCastSC:
134 case VPWidenGEPSC:
135 case VPWidenIntOrFpInductionSC:
136 case VPWidenPHISC:
137 case VPWidenSC:
138 case VPWidenSelectSC: {
139 const Instruction *I =
140 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
141 (void)I;
142 assert((!I || !I->mayReadFromMemory()) &&
143 "underlying instruction may read from memory");
144 return false;
145 }
146 default:
147 // FIXME: Return false if the recipe represents an interleaved store.
148 return true;
149 }
150}
151
152bool VPRecipeBase::mayHaveSideEffects() const {
 153 switch (getVPDefID()) {
154 case VPExpressionSC:
155 return cast<VPExpressionRecipe>(this)->mayHaveSideEffects();
156 case VPDerivedIVSC:
157 case VPFirstOrderRecurrencePHISC:
158 case VPPredInstPHISC:
159 case VPVectorEndPointerSC:
160 return false;
161 case VPInstructionSC:
162 return mayWriteToMemory();
163 case VPWidenCallSC: {
164 Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
165 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
166 }
167 case VPWidenIntrinsicSC:
168 return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
169 case VPBlendSC:
170 case VPReductionEVLSC:
171 case VPReductionSC:
172 case VPScalarIVStepsSC:
173 case VPVectorPointerSC:
174 case VPWidenCanonicalIVSC:
175 case VPWidenCastSC:
176 case VPWidenGEPSC:
177 case VPWidenIntOrFpInductionSC:
178 case VPWidenPHISC:
179 case VPWidenPointerInductionSC:
180 case VPWidenSC:
181 case VPWidenSelectSC: {
182 const Instruction *I =
183 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
184 (void)I;
185 assert((!I || !I->mayHaveSideEffects()) &&
186 "underlying instruction has side-effects");
187 return false;
188 }
189 case VPInterleaveEVLSC:
190 case VPInterleaveSC:
191 return mayWriteToMemory();
192 case VPWidenLoadEVLSC:
193 case VPWidenLoadSC:
194 case VPWidenStoreEVLSC:
195 case VPWidenStoreSC:
 196 assert(
 197 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
 198 mayWriteToMemory() &&
 199 "mayHaveSideEffects result for ingredient differs from this "
 200 "implementation");
201 return mayWriteToMemory();
202 case VPReplicateSC: {
203 auto *R = cast<VPReplicateRecipe>(this);
204 return R->getUnderlyingInstr()->mayHaveSideEffects();
205 }
206 default:
207 return true;
208 }
209}
210
211void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
 212 assert(!Parent && "Recipe already in some VPBasicBlock");
213 assert(InsertPos->getParent() &&
214 "Insertion position not in any VPBasicBlock");
215 InsertPos->getParent()->insert(this, InsertPos->getIterator());
216}
217
218void VPRecipeBase::insertBefore(VPBasicBlock &BB,
 219                                iplist<VPRecipeBase>::iterator I) {
 220 assert(!Parent && "Recipe already in some VPBasicBlock");
221 assert(I == BB.end() || I->getParent() == &BB);
222 BB.insert(this, I);
223}
224
225void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
 226 assert(!Parent && "Recipe already in some VPBasicBlock");
227 assert(InsertPos->getParent() &&
228 "Insertion position not in any VPBasicBlock");
229 InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
230}
231
232void VPRecipeBase::removeFromParent() {
 233 assert(getParent() && "Recipe not in any VPBasicBlock");
 234 getParent()->getRecipeList().remove(getIterator());
 235 Parent = nullptr;
236}
237
238iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
 239 assert(getParent() && "Recipe not in any VPBasicBlock");
 240 return getParent()->getRecipeList().erase(getIterator());
241}
242
243void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
 244 removeFromParent();
 245 insertAfter(InsertPos);
246}
247
253
254InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
 255 // Get the underlying instruction for the recipe, if there is one. It is
 256 // used to:
 257 // * decide if cost computation should be skipped for this recipe,
 258 // * apply forced target instruction cost.
259 Instruction *UI = nullptr;
260 if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
261 UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
262 else if (auto *IG = dyn_cast<VPInterleaveBase>(this))
263 UI = IG->getInsertPos();
264 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
265 UI = &WidenMem->getIngredient();
266
267 InstructionCost RecipeCost;
268 if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
269 RecipeCost = 0;
270 } else {
271 RecipeCost = computeCost(VF, Ctx);
 272 if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
 273 RecipeCost.isValid())
 274 RecipeCost = InstructionCost(ForceTargetInstructionCost.getValue());
 275 }
276
277 LLVM_DEBUG({
278 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
279 dump();
280 });
281 return RecipeCost;
282}
283
284InstructionCost VPRecipeBase::computeCost(ElementCount VF,
 285                                          VPCostContext &Ctx) const {
286 llvm_unreachable("subclasses should implement computeCost");
287}
288
290 return (getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC) ||
292}
293
294bool VPRecipeBase::isScalarCast() const {
 295 auto *VPI = dyn_cast<VPInstruction>(this);
296 return VPI && Instruction::isCast(VPI->getOpcode());
297}
298
299InstructionCost
300VPPartialReductionRecipe::computeCost(ElementCount VF,
 301                                      VPCostContext &Ctx) const {
302 std::optional<unsigned> Opcode;
303 VPValue *Op = getOperand(0);
304 VPRecipeBase *OpR = Op->getDefiningRecipe();
305
306 // If the partial reduction is predicated, a select will be operand 0
308 OpR = Op->getDefiningRecipe();
309 }
310
 311 Type *InputTypeA = nullptr, *InputTypeB = nullptr;
 312 TTI::PartialReductionExtendKind ExtAType = TTI::PR_None,
 313 ExtBType = TTI::PR_None;
314
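// Map a defining recipe to the extend kind it applies: a widen-cast recipe
// using ZExt/SExt means zero/sign extension, anything else means no extend.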
315 auto GetExtendKind = [](VPRecipeBase *R) {
316 if (!R)
317 return TTI::PR_None;
318 auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(R);
319 if (!WidenCastR)
320 return TTI::PR_None;
321 if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt)
322 return TTI::PR_ZeroExtend;
323 if (WidenCastR->getOpcode() == Instruction::CastOps::SExt)
324 return TTI::PR_SignExtend;
325 return TTI::PR_None;
326 };
327
328 // Pick out opcode, type/ext information and use sub side effects from a widen
329 // recipe.
330 auto HandleWiden = [&](VPWidenRecipe *Widen) {
331 if (match(Widen, m_Sub(m_ZeroInt(), m_VPValue(Op)))) {
332 Widen = dyn_cast<VPWidenRecipe>(Op->getDefiningRecipe());
333 }
334 Opcode = Widen->getOpcode();
335 VPRecipeBase *ExtAR = Widen->getOperand(0)->getDefiningRecipe();
336 VPRecipeBase *ExtBR = Widen->getOperand(1)->getDefiningRecipe();
337 InputTypeA = Ctx.Types.inferScalarType(ExtAR ? ExtAR->getOperand(0)
338 : Widen->getOperand(0));
339 InputTypeB = Ctx.Types.inferScalarType(ExtBR ? ExtBR->getOperand(0)
340 : Widen->getOperand(1));
341 ExtAType = GetExtendKind(ExtAR);
342 ExtBType = GetExtendKind(ExtBR);
343
344 using namespace VPlanPatternMatch;
345 const APInt *C;
346 if (!ExtBR && match(Widen->getOperand(1), m_APInt(C)) &&
347 canConstantBeExtended(C, InputTypeA, ExtAType)) {
348 InputTypeB = InputTypeA;
349 ExtBType = ExtAType;
350 }
351 };
352
353 if (isa<VPWidenCastRecipe>(OpR)) {
354 InputTypeA = Ctx.Types.inferScalarType(OpR->getOperand(0));
355 ExtAType = GetExtendKind(OpR);
356 } else if (isa<VPReductionPHIRecipe>(OpR)) {
357 auto RedPhiOp1R = getOperand(1)->getDefiningRecipe();
358 if (isa<VPWidenCastRecipe>(RedPhiOp1R)) {
359 InputTypeA = Ctx.Types.inferScalarType(RedPhiOp1R->getOperand(0));
360 ExtAType = GetExtendKind(RedPhiOp1R);
361 } else if (auto Widen = dyn_cast<VPWidenRecipe>(RedPhiOp1R))
362 HandleWiden(Widen);
363 } else if (auto Widen = dyn_cast<VPWidenRecipe>(OpR)) {
364 HandleWiden(Widen);
365 } else if (auto Reduction = dyn_cast<VPPartialReductionRecipe>(OpR)) {
366 return Reduction->computeCost(VF, Ctx);
367 }
368 auto *PhiType = Ctx.Types.inferScalarType(getOperand(1));
369 return Ctx.TTI.getPartialReductionCost(getOpcode(), InputTypeA, InputTypeB,
370 PhiType, VF, ExtAType, ExtBType,
371 Opcode, Ctx.CostKind);
372}
373
374void VPPartialReductionRecipe::execute(VPTransformState &State) {
 375 auto &Builder = State.Builder;
376
377 assert(getOpcode() == Instruction::Add &&
378 "Unhandled partial reduction opcode");
379
380 Value *BinOpVal = State.get(getOperand(1));
381 Value *PhiVal = State.get(getOperand(0));
382 assert(PhiVal && BinOpVal && "Phi and Mul must be set");
383
384 Type *RetTy = PhiVal->getType();
385
386 CallInst *V =
387 Builder.CreateIntrinsic(RetTy, Intrinsic::vector_partial_reduce_add,
388 {PhiVal, BinOpVal}, nullptr, "partial.reduce");
389
390 State.set(this, V);
391}
392
393#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
395 VPSlotTracker &SlotTracker) const {
396 O << Indent << "PARTIAL-REDUCE ";
398 O << " = " << Instruction::getOpcodeName(getOpcode()) << " ";
400}
401#endif
402
403void VPIRFlags::intersectFlags(const VPIRFlags &Other) {
 404 assert(OpType == Other.OpType && "OpType must match");
405 switch (OpType) {
406 case OperationType::OverflowingBinOp:
407 WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
408 WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
409 break;
410 case OperationType::Trunc:
411 TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
412 TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
413 break;
414 case OperationType::DisjointOp:
415 DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
416 break;
417 case OperationType::PossiblyExactOp:
418 ExactFlags.IsExact &= Other.ExactFlags.IsExact;
419 break;
420 case OperationType::GEPOp:
421 GEPFlags &= Other.GEPFlags;
422 break;
423 case OperationType::FPMathOp:
424 FMFs.NoNaNs &= Other.FMFs.NoNaNs;
425 FMFs.NoInfs &= Other.FMFs.NoInfs;
426 break;
427 case OperationType::NonNegOp:
428 NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
429 break;
430 case OperationType::Cmp:
431 assert(CmpPredicate == Other.CmpPredicate && "Cannot drop CmpPredicate");
432 break;
433 case OperationType::Other:
434 assert(AllFlags == Other.AllFlags && "Cannot drop other flags");
435 break;
436 }
437}
438
439FastMathFlags VPIRFlags::getFastMathFlags() const {
 440 assert(OpType == OperationType::FPMathOp &&
441 "recipe doesn't have fast math flags");
442 FastMathFlags Res;
443 Res.setAllowReassoc(FMFs.AllowReassoc);
444 Res.setNoNaNs(FMFs.NoNaNs);
445 Res.setNoInfs(FMFs.NoInfs);
446 Res.setNoSignedZeros(FMFs.NoSignedZeros);
447 Res.setAllowReciprocal(FMFs.AllowReciprocal);
448 Res.setAllowContract(FMFs.AllowContract);
449 Res.setApproxFunc(FMFs.ApproxFunc);
450 return Res;
451}
452
453#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
455#endif
456
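// The unroll part is encoded as an optional trailing operand: if the user has
// exactly PartOpIdx + 1 operands, the last operand holds the part number as a
// constant; otherwise the part defaults to 0.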
457template <unsigned PartOpIdx>
458VPValue *
460 if (U.getNumOperands() == PartOpIdx + 1)
461 return U.getOperand(PartOpIdx);
462 return nullptr;
463}
464
465template <unsigned PartOpIdx>
467 if (auto *UnrollPartOp = getUnrollPartOperand(U))
468 return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue();
469 return 0;
470}
471
472namespace llvm {
473template class VPUnrollPartAccessor<1>;
474template class VPUnrollPartAccessor<2>;
475template class VPUnrollPartAccessor<3>;
476}
477
479 const VPIRFlags &Flags, DebugLoc DL,
480 const Twine &Name)
481 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL),
482 VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {
484 "Set flags not supported for the provided opcode");
485 assert((getNumOperandsForOpcode(Opcode) == -1u ||
486 getNumOperandsForOpcode(Opcode) == getNumOperands()) &&
487 "number of operands does not match opcode");
488}
489
490#ifndef NDEBUG
491unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
492 if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
493 return 1;
494
495 if (Instruction::isBinaryOp(Opcode))
496 return 2;
497
498 switch (Opcode) {
501 return 0;
502 case Instruction::Alloca:
503 case Instruction::ExtractValue:
504 case Instruction::Freeze:
505 case Instruction::Load:
519 return 1;
520 case Instruction::ICmp:
521 case Instruction::FCmp:
522 case Instruction::Store:
530 return 2;
531 case Instruction::Select:
535 return 3;
537 return 4;
538 case Instruction::Call:
539 case Instruction::GetElementPtr:
540 case Instruction::PHI:
541 case Instruction::Switch:
542 // Cannot determine the number of operands from the opcode.
543 return -1u;
544 }
545 llvm_unreachable("all cases should be handled above");
546}
547#endif
548
552
553bool VPInstruction::canGenerateScalarForFirstLane() const {
555 return true;
557 return true;
558 switch (Opcode) {
559 case Instruction::Freeze:
560 case Instruction::ICmp:
561 case Instruction::PHI:
562 case Instruction::Select:
571 return true;
572 default:
573 return false;
574 }
575}
576
577/// Create a conditional branch using \p Cond branching to the successors of \p
578/// VPBB. Note that the first successor is always forward (i.e. not created yet)
579/// while the second successor may already have been created (if it is a header
580/// block and VPBB is a latch).
581static BranchInst *createCondBranch(Value *Cond, VPBasicBlock *VPBB,
 582                                    VPTransformState &State) {
583 // Replace the temporary unreachable terminator with a new conditional
584 // branch, hooking it up to backward destination (header) for latch blocks
585 // now, and to forward destination(s) later when they are created.
586 // Second successor may be backwards - iff it is already in VPBB2IRBB.
587 VPBasicBlock *SecondVPSucc = cast<VPBasicBlock>(VPBB->getSuccessors()[1]);
588 BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc);
589 BasicBlock *IRBB = State.CFG.VPBB2IRBB[VPBB];
590 BranchInst *CondBr = State.Builder.CreateCondBr(Cond, IRBB, SecondIRSucc);
591 // First successor is always forward, reset it to nullptr
592 CondBr->setSuccessor(0, nullptr);
594 return CondBr;
595}
596
597Value *VPInstruction::generate(VPTransformState &State) {
598 IRBuilderBase &Builder = State.Builder;
599
 600 if (Instruction::isBinaryOp(getOpcode())) {
 601 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
602 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
603 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
604 auto *Res =
605 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
606 if (auto *I = dyn_cast<Instruction>(Res))
607 applyFlags(*I);
608 return Res;
609 }
610
611 switch (getOpcode()) {
612 case VPInstruction::Not: {
613 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
614 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
615 return Builder.CreateNot(A, Name);
616 }
617 case Instruction::ExtractElement: {
618 assert(State.VF.isVector() && "Only extract elements from vectors");
619 if (getOperand(1)->isLiveIn()) {
620 unsigned IdxToExtract =
621 cast<ConstantInt>(getOperand(1)->getLiveInIRValue())->getZExtValue();
622 return State.get(getOperand(0), VPLane(IdxToExtract));
623 }
624 Value *Vec = State.get(getOperand(0));
625 Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
626 return Builder.CreateExtractElement(Vec, Idx, Name);
627 }
628 case Instruction::Freeze: {
630 return Builder.CreateFreeze(Op, Name);
631 }
632 case Instruction::FCmp:
633 case Instruction::ICmp: {
634 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
635 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
636 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
637 return Builder.CreateCmp(getPredicate(), A, B, Name);
638 }
639 case Instruction::PHI: {
640 llvm_unreachable("should be handled by VPPhi::execute");
641 }
642 case Instruction::Select: {
643 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
644 Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
645 Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
646 Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
647 return Builder.CreateSelect(Cond, Op1, Op2, Name);
648 }
650 // Get first lane of vector induction variable.
651 Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
652 // Get the original loop tripcount.
653 Value *ScalarTC = State.get(getOperand(1), VPLane(0));
654
655 // If this part of the active lane mask is scalar, generate the CMP directly
656 // to avoid unnecessary extracts.
657 if (State.VF.isScalar())
658 return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
659 Name);
660
661 auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
662 auto PredTy = VectorType::get(
663 Int1Ty, State.VF * cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
664 ->getZExtValue());
665 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
666 {PredTy, ScalarTC->getType()},
667 {VIVElem0, ScalarTC}, nullptr, Name);
668 }
670 // Generate code to combine the previous and current values in vector v3.
671 //
672 // vector.ph:
673 // v_init = vector(..., ..., ..., a[-1])
674 // br vector.body
675 //
676 // vector.body
677 // i = phi [0, vector.ph], [i+4, vector.body]
678 // v1 = phi [v_init, vector.ph], [v2, vector.body]
679 // v2 = a[i, i+1, i+2, i+3];
680 // v3 = vector(v1(3), v2(0, 1, 2))
681
682 auto *V1 = State.get(getOperand(0));
683 if (!V1->getType()->isVectorTy())
684 return V1;
685 Value *V2 = State.get(getOperand(1));
686 return Builder.CreateVectorSplice(V1, V2, -1, Name);
687 }
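// Compute TC > Step ? TC - Step : 0, where Step = VF * UF; the select guards
// against unsigned wrap when TC <= Step.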
689 unsigned UF = getParent()->getPlan()->getUF();
690 Value *ScalarTC = State.get(getOperand(0), VPLane(0));
691 Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
692 Value *Sub = Builder.CreateSub(ScalarTC, Step);
693 Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
694 Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
695 return Builder.CreateSelect(Cmp, Sub, Zero);
696 }
698 // TODO: Restructure this code with an explicit remainder loop, vsetvli can
699 // be outside of the main loop.
700 Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
701 // Compute EVL
702 assert(AVL->getType()->isIntegerTy() &&
703 "Requested vector length should be an integer.");
704
705 assert(State.VF.isScalable() && "Expected scalable vector factor.");
706 Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
707
708 Value *EVL = State.Builder.CreateIntrinsic(
709 State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
710 {AVL, VFArg, State.Builder.getTrue()});
711 return EVL;
712 }
714 unsigned Part = getUnrollPart(*this);
715 auto *IV = State.get(getOperand(0), VPLane(0));
716 assert(Part != 0 && "Must have a positive part");
717 // The canonical IV is incremented by the vectorization factor (num of
718 // SIMD elements) times the unroll part.
719 Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
 720 return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
 721 hasNoSignedWrap());
 722 }
724 Value *Cond = State.get(getOperand(0), VPLane(0));
725 auto *Br = createCondBranch(Cond, getParent(), State);
726 applyMetadata(*Br);
727 return Br;
728 }
730 // First create the compare.
731 Value *IV = State.get(getOperand(0), /*IsScalar*/ true);
732 Value *TC = State.get(getOperand(1), /*IsScalar*/ true);
733 Value *Cond = Builder.CreateICmpEQ(IV, TC);
734 return createCondBranch(Cond, getParent(), State);
735 }
737 return Builder.CreateVectorSplat(
738 State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
739 }
741 // For struct types, we need to build a new 'wide' struct type, where each
742 // element is widened, i.e., we create a struct of vectors.
743 auto *StructTy =
745 Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF));
746 for (const auto &[LaneIndex, Op] : enumerate(operands())) {
747 for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
748 FieldIndex++) {
749 Value *ScalarValue =
750 Builder.CreateExtractValue(State.get(Op, true), FieldIndex);
751 Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex);
752 VectorValue =
753 Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex);
754 Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex);
755 }
756 }
757 return Res;
758 }
760 auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0));
761 auto NumOfElements = ElementCount::getFixed(getNumOperands());
762 Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements));
763 for (const auto &[Idx, Op] : enumerate(operands()))
764 Res = State.Builder.CreateInsertElement(Res, State.get(Op, true),
765 State.Builder.getInt32(Idx));
766 return Res;
767 }
769 if (State.VF.isScalar())
770 return State.get(getOperand(0), true);
771 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
773 // If this start vector is scaled then it should produce a vector with fewer
774 // elements than the VF.
775 ElementCount VF = State.VF.divideCoefficientBy(
776 cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue());
777 auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true));
778 Constant *Zero = Builder.getInt32(0);
779 return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true),
780 Zero);
781 }
783 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
784 // and will be removed by breaking up the recipe further.
785 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
786 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
787 Value *ReducedPartRdx = State.get(getOperand(2));
788 for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx)
789 ReducedPartRdx = Builder.CreateBinOp(
792 State.get(getOperand(Idx)), ReducedPartRdx, "bin.rdx");
793 return createAnyOfReduction(Builder, ReducedPartRdx,
794 State.get(getOperand(1), VPLane(0)), OrigPhi);
795 }
797 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
798 // and will be removed by breaking up the recipe further.
799 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
800 // Get its reduction variable descriptor.
801 RecurKind RK = PhiR->getRecurrenceKind();
803 "Unexpected reduction kind");
804 assert(!PhiR->isInLoop() &&
805 "In-loop FindLastIV reduction is not supported yet");
806
807 // The recipe's operands are the reduction phi, the start value, the
808 // sentinel value, followed by one operand for each part of the reduction.
809 unsigned UF = getNumOperands() - 3;
810 Value *ReducedPartRdx = State.get(getOperand(3));
811 RecurKind MinMaxKind;
814 MinMaxKind = IsSigned ? RecurKind::SMax : RecurKind::UMax;
815 else
816 MinMaxKind = IsSigned ? RecurKind::SMin : RecurKind::UMin;
817 for (unsigned Part = 1; Part < UF; ++Part)
818 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
819 State.get(getOperand(3 + Part)));
820
821 Value *Start = State.get(getOperand(1), true);
823 return createFindLastIVReduction(Builder, ReducedPartRdx, RK, Start,
824 Sentinel);
825 }
827 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
828 // and will be removed by breaking up the recipe further.
829 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
830 // Get its reduction variable descriptor.
831
832 RecurKind RK = PhiR->getRecurrenceKind();
834 "should be handled by ComputeFindIVResult");
835
836 // The recipe's operands are the reduction phi, followed by one operand for
837 // each part of the reduction.
838 unsigned UF = getNumOperands() - 1;
839 VectorParts RdxParts(UF);
840 for (unsigned Part = 0; Part < UF; ++Part)
841 RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop());
842
843 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
844 if (hasFastMathFlags())
846
847 // Reduce all of the unrolled parts into a single vector.
848 Value *ReducedPartRdx = RdxParts[0];
849 if (PhiR->isOrdered()) {
850 ReducedPartRdx = RdxParts[UF - 1];
851 } else {
852 // Floating-point operations should have some FMF to enable the reduction.
853 for (unsigned Part = 1; Part < UF; ++Part) {
854 Value *RdxPart = RdxParts[Part];
856 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
857 else {
 859 // For sub-recurrences, each unrolled part's reduction variable is already
 860 // negated, so we need to do: reduce.add(-acc_uf0 + -acc_uf1)
861 if (RK == RecurKind::Sub)
862 Opcode = Instruction::Add;
863 else
864 Opcode =
866 ReducedPartRdx =
867 Builder.CreateBinOp(Opcode, RdxPart, ReducedPartRdx, "bin.rdx");
868 }
869 }
870 }
871
872 // Create the reduction after the loop. Note that inloop reductions create
873 // the target reduction in the loop using a Reduction recipe.
874 if (State.VF.isVector() && !PhiR->isInLoop()) {
875 // TODO: Support in-order reductions based on the recurrence descriptor.
876 // All ops in the reduction inherit fast-math-flags from the recurrence
877 // descriptor.
878 ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK);
879 }
880
881 return ReducedPartRdx;
882 }
886 unsigned Offset =
888 Value *Res;
889 if (State.VF.isVector()) {
890 assert(Offset <= State.VF.getKnownMinValue() &&
891 "invalid offset to extract from");
892 // Extract lane VF - Offset from the operand.
893 Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
894 } else {
895 assert(Offset <= 1 && "invalid offset to extract from");
896 Res = State.get(getOperand(0));
897 }
899 Res->setName(Name);
900 return Res;
901 }
903 Value *A = State.get(getOperand(0));
904 Value *B = State.get(getOperand(1));
905 return Builder.CreateLogicalAnd(A, B, Name);
906 }
909 "can only generate first lane for PtrAdd");
910 Value *Ptr = State.get(getOperand(0), VPLane(0));
911 Value *Addend = State.get(getOperand(1), VPLane(0));
912 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
913 }
915 Value *Ptr =
917 Value *Addend = State.get(getOperand(1));
918 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
919 }
921 Value *Res = Builder.CreateFreeze(State.get(getOperand(0)));
922 for (VPValue *Op : drop_begin(operands()))
923 Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op)));
924 return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
925 }
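// Extract a dynamically chosen lane across all unrolled parts: operand 0 is
// the lane index, the remaining operands are the per-part vectors, and a chain
// of selects picks the part whose index range contains the requested lane.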
927 Value *LaneToExtract = State.get(getOperand(0), true);
928 Type *IdxTy = State.TypeAnalysis.inferScalarType(getOperand(0));
929 Value *Res = nullptr;
930 Value *RuntimeVF = getRuntimeVF(State.Builder, IdxTy, State.VF);
931
932 for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
933 Value *VectorStart =
934 Builder.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Idx - 1));
935 Value *VectorIdx = Idx == 1
936 ? LaneToExtract
937 : Builder.CreateSub(LaneToExtract, VectorStart);
938 Value *Ext = State.VF.isScalar()
939 ? State.get(getOperand(Idx))
940 : Builder.CreateExtractElement(
941 State.get(getOperand(Idx)), VectorIdx);
942 if (Res) {
943 Value *Cmp = Builder.CreateICmpUGE(LaneToExtract, VectorStart);
944 Res = Builder.CreateSelect(Cmp, Ext, Res);
945 } else {
946 Res = Ext;
947 }
948 }
949 return Res;
950 }
952 if (getNumOperands() == 1) {
953 Value *Mask = State.get(getOperand(0));
954 return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
955 true, Name);
956 }
957 // If there are multiple operands, create a chain of selects to pick the
958 // first operand with an active lane and add the number of lanes of the
959 // preceding operands.
960 Value *RuntimeVF =
961 getRuntimeVF(State.Builder, State.Builder.getInt64Ty(), State.VF);
962 unsigned LastOpIdx = getNumOperands() - 1;
963 Value *Res = nullptr;
964 for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
965 Value *TrailingZeros =
966 State.VF.isScalar()
967 ? Builder.CreateZExt(
968 Builder.CreateICmpEQ(State.get(getOperand(Idx)),
969 Builder.getFalse()),
970 Builder.getInt64Ty())
971 : Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(),
972 State.get(getOperand(Idx)),
973 true, Name);
974 Value *Current = Builder.CreateAdd(
975 Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros);
976 if (Res) {
977 Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF);
978 Res = Builder.CreateSelect(Cmp, Current, Res);
979 } else {
980 Res = Current;
981 }
982 }
983
984 return Res;
985 }
987 return State.get(getOperand(0), true);
988 default:
989 llvm_unreachable("Unsupported opcode for instruction");
990 }
991}
992
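// Cost computation for VPInstructions whose opcode is a plain LLVM IR opcode;
// the inferred scalar result type is widened to a vector type for vector VFs
// before querying TTI.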
994 unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
995 Type *ScalarTy = Ctx.Types.inferScalarType(this);
996 Type *ResultTy = VF.isVector() ? toVectorTy(ScalarTy, VF) : ScalarTy;
997 switch (Opcode) {
998 case Instruction::FNeg:
999 return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, Ctx.CostKind);
1000 case Instruction::UDiv:
1001 case Instruction::SDiv:
1002 case Instruction::SRem:
1003 case Instruction::URem:
1004 case Instruction::Add:
1005 case Instruction::FAdd:
1006 case Instruction::Sub:
1007 case Instruction::FSub:
1008 case Instruction::Mul:
1009 case Instruction::FMul:
1010 case Instruction::FDiv:
1011 case Instruction::FRem:
1012 case Instruction::Shl:
1013 case Instruction::LShr:
1014 case Instruction::AShr:
1015 case Instruction::And:
1016 case Instruction::Or:
1017 case Instruction::Xor: {
1020
1021 if (VF.isVector()) {
1022 // Certain instructions can be cheaper to vectorize if they have a
1023 // constant second vector operand. One example of this are shifts on x86.
1024 VPValue *RHS = getOperand(1);
1025 RHSInfo = Ctx.getOperandInfo(RHS);
1026
1027 if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
1030 }
1031
1034 if (CtxI)
1035 Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
1036 return Ctx.TTI.getArithmeticInstrCost(
1037 Opcode, ResultTy, Ctx.CostKind,
1038 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1039 RHSInfo, Operands, CtxI, &Ctx.TLI);
1040 }
1041 case Instruction::Freeze:
1042 // This opcode is unknown. Assume that it is the same as 'mul'.
1043 return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, ResultTy,
1044 Ctx.CostKind);
1045 case Instruction::ExtractValue:
1046 return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
1047 Ctx.CostKind);
1048 case Instruction::ICmp:
1049 case Instruction::FCmp: {
1050 Type *ScalarOpTy = Ctx.Types.inferScalarType(getOperand(0));
1051 Type *OpTy = VF.isVector() ? toVectorTy(ScalarOpTy, VF) : ScalarOpTy;
1053 return Ctx.TTI.getCmpSelInstrCost(
1054 Opcode, OpTy, CmpInst::makeCmpResultType(OpTy), getPredicate(),
1055 Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None},
1056 {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
1057 }
1058 }
1059 llvm_unreachable("called for unsupported opcode");
1060}
1061
1062InstructionCost VPInstruction::computeCost(ElementCount VF,
 1063                                           VPCostContext &Ctx) const {
1065 if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) {
1066 // TODO: Compute cost for VPInstructions without underlying values once
1067 // the legacy cost model has been retired.
1068 return 0;
1069 }
1070
1072 "Should only generate a vector value or single scalar, not scalars "
1073 "for all lanes.");
1075 getOpcode(),
1077 }
1078
1079 switch (getOpcode()) {
1080 case Instruction::Select: {
1081 // TODO: It may be possible to improve this by analyzing where the
1082 // condition operand comes from.
1084 auto *CondTy = Ctx.Types.inferScalarType(getOperand(0));
1085 auto *VecTy = Ctx.Types.inferScalarType(getOperand(1));
1086 if (!vputils::onlyFirstLaneUsed(this)) {
1087 CondTy = toVectorTy(CondTy, VF);
1088 VecTy = toVectorTy(VecTy, VF);
1089 }
1090 return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred,
1091 Ctx.CostKind);
1092 }
1093 case Instruction::ExtractElement:
1095 if (VF.isScalar()) {
1096 // ExtractLane with VF=1 takes care of handling extracting across multiple
1097 // parts.
1098 return 0;
1099 }
1100
1101 // Add on the cost of extracting the element.
1102 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1103 return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
1104 Ctx.CostKind);
1105 }
1106 case VPInstruction::AnyOf: {
1107 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1108 return Ctx.TTI.getArithmeticReductionCost(
1109 Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
1110 }
1112 Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
1113 if (VF.isScalar())
1114 return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
1116 CmpInst::ICMP_EQ, Ctx.CostKind);
1117 // Calculate the cost of determining the lane index.
1118 auto *PredTy = toVectorTy(ScalarTy, VF);
1119 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
1120 Type::getInt64Ty(Ctx.LLVMCtx),
1121 {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
1122 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1123 }
1125 assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
1127 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
1128 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1129
1130 return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
1131 cast<VectorType>(VectorTy),
1132 cast<VectorType>(VectorTy), Mask,
1133 Ctx.CostKind, VF.getKnownMinValue() - 1);
1134 }
1136 Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
1137 unsigned Multiplier =
1138 cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue();
1139 Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
1140 IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
1141 {ArgTy, ArgTy});
1142 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1143 }
1145 Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0));
1146 Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
1147 Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
1148 IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
1149 I32Ty, {Arg0Ty, I32Ty, I1Ty});
1150 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1151 }
1153 // Add on the cost of extracting the element.
1154 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1155 return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
1156 VecTy, Ctx.CostKind, 0);
1157 }
1159 if (VF == ElementCount::getScalable(1))
1161 [[fallthrough]];
1162 default:
 1163 // TODO: Compute cost for other VPInstructions once the legacy cost model
 1164 // has been retired.
 1165 assert(!getUnderlyingValue() &&
 1166 "unexpected VPInstruction with underlying value");
1167 return 0;
1168 }
1169}
1170
1183
1185 switch (getOpcode()) {
1186 case Instruction::PHI:
1190 return true;
1191 default:
1192 return isScalarCast();
1193 }
1194}
1195
1196void VPInstruction::execute(VPTransformState &State) {
 1197 assert(!State.Lane && "VPInstruction executing a Lane");
1198 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1200 "Set flags not supported for the provided opcode");
1201 if (hasFastMathFlags())
1202 State.Builder.setFastMathFlags(getFastMathFlags());
1203 Value *GeneratedValue = generate(State);
1204 if (!hasResult())
1205 return;
1206 assert(GeneratedValue && "generate must produce a value");
1207 bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
1210 assert((((GeneratedValue->getType()->isVectorTy() ||
1211 GeneratedValue->getType()->isStructTy()) ==
1212 !GeneratesPerFirstLaneOnly) ||
1213 State.VF.isScalar()) &&
1214 "scalar value but not only first lane defined");
1215 State.set(this, GeneratedValue,
1216 /*IsScalar*/ GeneratesPerFirstLaneOnly);
1217}
1218
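// Conservative, opcode-based memory-effect query for VPInstructions: opcodes
// not known to be free of memory accesses are treated as touching memory.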
1221 return false;
1222 switch (getOpcode()) {
1223 case Instruction::ExtractElement:
1224 case Instruction::Freeze:
1225 case Instruction::FCmp:
1226 case Instruction::ICmp:
1227 case Instruction::Select:
1228 case Instruction::PHI:
1243 case VPInstruction::Not:
1251 return false;
1252 default:
1253 return true;
1254 }
1255}
1256
1258 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1260 return vputils::onlyFirstLaneUsed(this);
1261
1262 switch (getOpcode()) {
1263 default:
1264 return false;
1265 case Instruction::ExtractElement:
1266 return Op == getOperand(1);
1267 case Instruction::PHI:
1268 return true;
1269 case Instruction::FCmp:
1270 case Instruction::ICmp:
1271 case Instruction::Select:
1272 case Instruction::Or:
1273 case Instruction::Freeze:
1274 case VPInstruction::Not:
1275 // TODO: Cover additional opcodes.
1276 return vputils::onlyFirstLaneUsed(this);
1285 return true;
1288 // Before replicating by VF, Build(Struct)Vector uses all lanes of the
1289 // operand, after replicating its operands only the first lane is used.
1290 // Before replicating, it will have only a single operand.
1291 return getNumOperands() > 1;
1293 return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
1295 // WidePtrAdd supports scalar and vector base addresses.
1296 return false;
1299 return Op == getOperand(1);
1301 return Op == getOperand(0);
1302 };
1303 llvm_unreachable("switch should return");
1304}
1305
1307 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1309 return vputils::onlyFirstPartUsed(this);
1310
1311 switch (getOpcode()) {
1312 default:
1313 return false;
1314 case Instruction::FCmp:
1315 case Instruction::ICmp:
1316 case Instruction::Select:
1317 return vputils::onlyFirstPartUsed(this);
1321 return true;
1322 };
1323 llvm_unreachable("switch should return");
1324}
1325
1326#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1328 VPSlotTracker SlotTracker(getParent()->getPlan());
1329 print(dbgs(), "", SlotTracker);
1330}
1331
1333 VPSlotTracker &SlotTracker) const {
1334 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1335
1336 if (hasResult()) {
1338 O << " = ";
1339 }
1340
1341 switch (getOpcode()) {
1342 case VPInstruction::Not:
1343 O << "not";
1344 break;
1346 O << "combined load";
1347 break;
1349 O << "combined store";
1350 break;
1352 O << "active lane mask";
1353 break;
1355 O << "EXPLICIT-VECTOR-LENGTH";
1356 break;
1358 O << "first-order splice";
1359 break;
1361 O << "branch-on-cond";
1362 break;
1364 O << "TC > VF ? TC - VF : 0";
1365 break;
1367 O << "VF * Part +";
1368 break;
1370 O << "branch-on-count";
1371 break;
1373 O << "broadcast";
1374 break;
1376 O << "buildstructvector";
1377 break;
1379 O << "buildvector";
1380 break;
1382 O << "extract-lane";
1383 break;
1385 O << "extract-last-element";
1386 break;
1388 O << "extract-last-lane-per-part";
1389 break;
1391 O << "extract-penultimate-element";
1392 break;
1394 O << "compute-anyof-result";
1395 break;
1397 O << "compute-find-iv-result";
1398 break;
1400 O << "compute-reduction-result";
1401 break;
1403 O << "logical-and";
1404 break;
1406 O << "ptradd";
1407 break;
1409 O << "wide-ptradd";
1410 break;
1412 O << "any-of";
1413 break;
1415 O << "first-active-lane";
1416 break;
1418 O << "reduction-start-vector";
1419 break;
1421 O << "resume-for-epilogue";
1422 break;
1424 O << "unpack";
1425 break;
1426 default:
1428 }
1429
1430 printFlags(O);
1432
1433 if (auto DL = getDebugLoc()) {
1434 O << ", !dbg ";
1435 DL.print(O);
1436 }
1437}
1438#endif
1439
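// VPInstructionWithType carries an explicit result type (ResultTy), used below
// when emitting scalar casts, step-vector, and vscale values.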
1441 State.setDebugLocFrom(getDebugLoc());
1442 if (isScalarCast()) {
1443 Value *Op = State.get(getOperand(0), VPLane(0));
1444 Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
1445 Op, ResultTy);
1446 State.set(this, Cast, VPLane(0));
1447 return;
1448 }
1449 switch (getOpcode()) {
1451 Value *StepVector =
1452 State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF));
1453 State.set(this, StepVector);
1454 break;
1455 }
1456 case VPInstruction::VScale: {
1457 Value *VScale = State.Builder.CreateVScale(ResultTy);
1458 State.set(this, VScale, true);
1459 break;
1460 }
1461
1462 default:
1463 llvm_unreachable("opcode not implemented yet");
1464 }
1465}
1466
1467#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1469 VPSlotTracker &SlotTracker) const {
1470 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1472 O << " = ";
1473
1474 switch (getOpcode()) {
1476 O << "wide-iv-step ";
1478 break;
1480 O << "step-vector " << *ResultTy;
1481 break;
1483 O << "vscale " << *ResultTy;
1484 break;
1485 default:
1486 assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
1489 O << " to " << *ResultTy;
1490 }
1491}
1492#endif
1493
1495 State.setDebugLocFrom(getDebugLoc());
1496 PHINode *NewPhi = State.Builder.CreatePHI(
1497 State.TypeAnalysis.inferScalarType(this), 2, getName());
1498 unsigned NumIncoming = getNumIncoming();
1499 if (getParent() != getParent()->getPlan()->getScalarPreheader()) {
1500 // TODO: Fixup all incoming values of header phis once recipes defining them
1501 // are introduced.
1502 NumIncoming = 1;
1503 }
1504 for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
1505 Value *IncV = State.get(getIncomingValue(Idx), VPLane(0));
1506 BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(getIncomingBlock(Idx));
1507 NewPhi->addIncoming(IncV, PredBB);
1508 }
1509 State.set(this, NewPhi, VPLane(0));
1510}
1511
1512#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1513void VPPhi::print(raw_ostream &O, const Twine &Indent,
1514 VPSlotTracker &SlotTracker) const {
1515 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1517 O << " = phi ";
1519}
1520#endif
1521
1522VPIRInstruction *VPIRInstruction::create(Instruction &I) {
1523 if (auto *Phi = dyn_cast<PHINode>(&I))
1524 return new VPIRPhi(*Phi);
1525 return new VPIRInstruction(I);
1526}
1527
1528void VPIRInstruction::execute(VPTransformState &State) {
 1529 assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
1530 "PHINodes must be handled by VPIRPhi");
1531 // Advance the insert point after the wrapped IR instruction. This allows
1532 // interleaving VPIRInstructions and other recipes.
1533 State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
1534}
1535
1536InstructionCost VPIRInstruction::computeCost(ElementCount VF,
 1537                                             VPCostContext &Ctx) const {
1538 // The recipe wraps an existing IR instruction on the border of VPlan's scope,
1539 // hence it does not contribute to the cost-modeling for the VPlan.
1540 return 0;
1541}
1542
1545 "can only update exiting operands to phi nodes");
1546 assert(getNumOperands() > 0 && "must have at least one operand");
1547 VPValue *Exiting = getOperand(0);
1548 if (Exiting->isLiveIn())
1549 return;
1550
1551 Exiting = Builder.createNaryOp(VPInstruction::ExtractLastElement, {Exiting});
1552 setOperand(0, Exiting);
1553}
1554
1555#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1557 VPSlotTracker &SlotTracker) const {
1558 O << Indent << "IR " << I;
1559}
1560#endif
1561
1563 PHINode *Phi = &getIRPhi();
1564 for (const auto &[Idx, Op] : enumerate(operands())) {
1565 VPValue *ExitValue = Op;
1566 auto Lane = vputils::isSingleScalar(ExitValue)
1568 : VPLane::getLastLaneForVF(State.VF);
1569 VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
1570 auto *PredVPBB = Pred->getExitingBasicBlock();
1571 BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
1572 // Set insertion point in PredBB in case an extract needs to be generated.
1573 // TODO: Model extracts explicitly.
1574 State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
1575 Value *V = State.get(ExitValue, VPLane(Lane));
1576 // If there is no existing block for PredBB in the phi, add a new incoming
1577 // value. Otherwise update the existing incoming value for PredBB.
1578 if (Phi->getBasicBlockIndex(PredBB) == -1)
1579 Phi->addIncoming(V, PredBB);
1580 else
1581 Phi->setIncomingValueForBlock(PredBB, V);
1582 }
1583
1584 // Advance the insert point after the wrapped IR instruction. This allows
1585 // interleaving VPIRInstructions and other recipes.
1586 State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator()));
1587}
1588
1590 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1591 assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
1592 "Number of phi operands must match number of predecessors");
1593 unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock);
1594 R->removeOperand(Position);
1595}
1596
1597#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1599 VPSlotTracker &SlotTracker) const {
1600 interleaveComma(enumerate(getAsRecipe()->operands()), O,
1601 [this, &O, &SlotTracker](auto Op) {
1602 O << "[ ";
1603 Op.value()->printAsOperand(O, SlotTracker);
1604 O << ", ";
1605 getIncomingBlock(Op.index())->printAsOperand(O);
1606 O << " ]";
1607 });
1608}
1609#endif
1610
1611#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1612void VPIRPhi::print(raw_ostream &O, const Twine &Indent,
1613 VPSlotTracker &SlotTracker) const {
1615
1616 if (getNumOperands() != 0) {
1617 O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
1619 [&O, &SlotTracker](auto Op) {
1620 std::get<0>(Op)->printAsOperand(O, SlotTracker);
1621 O << " from ";
1622 std::get<1>(Op)->printAsOperand(O);
1623 });
1624 O << ")";
1625 }
1626}
1627#endif
1628
1629VPIRMetadata::VPIRMetadata(Instruction &I, LoopVersioning *LVer)
 1630 : VPIRMetadata(I) {
1631 if (!LVer || !isa<LoadInst, StoreInst>(&I))
1632 return;
1633 const auto &[AliasScopeMD, NoAliasMD] = LVer->getNoAliasMetadataFor(&I);
1634 if (AliasScopeMD)
1635 Metadata.emplace_back(LLVMContext::MD_alias_scope, AliasScopeMD);
1636 if (NoAliasMD)
1637 Metadata.emplace_back(LLVMContext::MD_noalias, NoAliasMD);
1638}
1639
1641 for (const auto &[Kind, Node] : Metadata)
1642 I.setMetadata(Kind, Node);
1643}
1644
1646 SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection;
1647 for (const auto &[KindA, MDA] : Metadata) {
1648 for (const auto &[KindB, MDB] : Other.Metadata) {
1649 if (KindA == KindB && MDA == MDB) {
1650 MetadataIntersection.emplace_back(KindA, MDA);
1651 break;
1652 }
1653 }
1654 }
1655 Metadata = std::move(MetadataIntersection);
1656}
1657
1659 assert(State.VF.isVector() && "not widening");
1660 assert(Variant != nullptr && "Can't create vector function.");
1661
1662 FunctionType *VFTy = Variant->getFunctionType();
1663 // Add return type if intrinsic is overloaded on it.
1665 for (const auto &I : enumerate(args())) {
1666 Value *Arg;
1667 // Some vectorized function variants may also take a scalar argument,
1668 // e.g. linear parameters for pointers. This needs to be the scalar value
1669 // from the start of the respective part when interleaving.
1670 if (!VFTy->getParamType(I.index())->isVectorTy())
1671 Arg = State.get(I.value(), VPLane(0));
1672 else
1673 Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
1674 Args.push_back(Arg);
1675 }
1676
1679 if (CI)
1680 CI->getOperandBundlesAsDefs(OpBundles);
1681
1682 CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
1683 applyFlags(*V);
1684 applyMetadata(*V);
1685 V->setCallingConv(Variant->getCallingConv());
1686
1687 if (!V->getType()->isVoidTy())
1688 State.set(this, V);
1689}
1690
1691InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
 1692                                               VPCostContext &Ctx) const {
1693 return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
1694 Variant->getFunctionType()->params(),
1695 Ctx.CostKind);
1696}
1697
1698#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1700 VPSlotTracker &SlotTracker) const {
1701 O << Indent << "WIDEN-CALL ";
1702
1703 Function *CalledFn = getCalledScalarFunction();
1704 if (CalledFn->getReturnType()->isVoidTy())
1705 O << "void ";
1706 else {
1708 O << " = ";
1709 }
1710
1711 O << "call";
1712 printFlags(O);
1713 O << " @" << CalledFn->getName() << "(";
1714 interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
1715 Op->printAsOperand(O, SlotTracker);
1716 });
1717 O << ")";
1718
1719 O << " (using library function";
1720 if (Variant->hasName())
1721 O << ": " << Variant->getName();
1722 O << ")";
1723}
1724#endif
1725
1727 assert(State.VF.isVector() && "not widening");
1728
1729 SmallVector<Type *, 2> TysForDecl;
1730 // Add return type if intrinsic is overloaded on it.
1731 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI))
1732 TysForDecl.push_back(VectorType::get(getResultType(), State.VF));
1734 for (const auto &I : enumerate(operands())) {
1735 // Some intrinsics have a scalar argument - don't replace it with a
1736 // vector.
1737 Value *Arg;
1738 if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
1739 State.TTI))
1740 Arg = State.get(I.value(), VPLane(0));
1741 else
1742 Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
1743 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
1744 State.TTI))
1745 TysForDecl.push_back(Arg->getType());
1746 Args.push_back(Arg);
1747 }
1748
1749 // Use vector version of the intrinsic.
1750 Module *M = State.Builder.GetInsertBlock()->getModule();
1751 Function *VectorF =
1752 Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
1753 assert(VectorF &&
1754 "Can't retrieve vector intrinsic or vector-predication intrinsics.");
1755
1758 if (CI)
1759 CI->getOperandBundlesAsDefs(OpBundles);
1760
1761 CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
1762
1763 applyFlags(*V);
1764 applyMetadata(*V);
1765
1766 if (!V->getType()->isVoidTy())
1767 State.set(this, V);
1768}
1769
1770/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R.
1773 const VPRecipeWithIRFlags &R,
1774 ElementCount VF,
1775 VPCostContext &Ctx) {
1776 // Some backends analyze intrinsic arguments to determine cost. Use the
1777 // underlying value for the operand if it has one. Otherwise try to use the
1778 // operand of the underlying call instruction, if there is one. Otherwise
1779 // clear Arguments.
1780 // TODO: Rework TTI interface to be independent of concrete IR values.
1782 for (const auto &[Idx, Op] : enumerate(Operands)) {
1783 auto *V = Op->getUnderlyingValue();
1784 if (!V) {
1785 if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) {
1786 Arguments.push_back(UI->getArgOperand(Idx));
1787 continue;
1788 }
1789 Arguments.clear();
1790 break;
1791 }
1792 Arguments.push_back(V);
1793 }
1794
1795 Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
1796 Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
1797 SmallVector<Type *> ParamTys;
1798 for (const VPValue *Op : Operands) {
1799 ParamTys.push_back(VF.isVector()
1800 ? toVectorTy(Ctx.Types.inferScalarType(Op), VF)
1801 : Ctx.Types.inferScalarType(Op));
1802 }
1803
1804 // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
1805 FastMathFlags FMF =
1806 R.hasFastMathFlags() ? R.getFastMathFlags() : FastMathFlags();
1807 IntrinsicCostAttributes CostAttrs(
1808 ID, RetTy, Arguments, ParamTys, FMF,
1809 dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()),
1810 InstructionCost::getInvalid(), &Ctx.TLI);
1811 return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
1812}
1813
1814InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
 1815                                                    VPCostContext &Ctx) const {
1817 return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx);
1818}
1819
1821 return Intrinsic::getBaseName(VectorIntrinsicID);
1822}
1823
1825 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1826 return all_of(enumerate(operands()), [this, &Op](const auto &X) {
1827 auto [Idx, V] = X;
1829 Idx, nullptr);
1830 });
1831}
1832
1833#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1835 VPSlotTracker &SlotTracker) const {
1836 O << Indent << "WIDEN-INTRINSIC ";
1837 if (ResultTy->isVoidTy()) {
1838 O << "void ";
1839 } else {
1841 O << " = ";
1842 }
1843
1844 O << "call";
1845 printFlags(O);
1846 O << getIntrinsicName() << "(";
1847
1849 Op->printAsOperand(O, SlotTracker);
1850 });
1851 O << ")";
1852}
1853#endif
1854
1856 IRBuilderBase &Builder = State.Builder;
1857
1858 Value *Address = State.get(getOperand(0));
1859 Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
1860 VectorType *VTy = cast<VectorType>(Address->getType());
1861
1862 // The histogram intrinsic requires a mask even if the recipe doesn't;
1863 // if the mask operand was omitted then all lanes should be executed and
1864 // we just need to synthesize an all-true mask.
1865 Value *Mask = nullptr;
1866 if (VPValue *VPMask = getMask())
1867 Mask = State.get(VPMask);
1868 else
1869 Mask =
1870 Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));
1871
1872 // If this is a subtract, we want to invert the increment amount. We may
 1873 // add a separate intrinsic in the future, but for now we'll try this.
1874 if (Opcode == Instruction::Sub)
1875 IncAmt = Builder.CreateNeg(IncAmt);
1876 else
1877 assert(Opcode == Instruction::Add && "only add or sub supported for now");
1878
1879 State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
1880 {VTy, IncAmt->getType()},
1881 {Address, IncAmt, Mask});
1882}
1883
1884InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
 1885                                               VPCostContext &Ctx) const {
1886 // FIXME: Take the gather and scatter into account as well. For now we're
1887 // generating the same cost as the fallback path, but we'll likely
1888 // need to create a new TTI method for determining the cost, including
1889 // whether we can use base + vec-of-smaller-indices or just
1890 // vec-of-pointers.
1891 assert(VF.isVector() && "Invalid VF for histogram cost");
1892 Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
1893 VPValue *IncAmt = getOperand(1);
1894 Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
1895 VectorType *VTy = VectorType::get(IncTy, VF);
1896
1897 // Assume that a non-constant update value (or a constant != 1) requires
1898 // a multiply, and add that into the cost.
1899 InstructionCost MulCost =
1900 Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
1901 if (IncAmt->isLiveIn()) {
1903
1904 if (CI && CI->getZExtValue() == 1)
1905 MulCost = TTI::TCC_Free;
1906 }
1907
1908 // Find the cost of the histogram operation itself.
1909 Type *PtrTy = VectorType::get(AddressTy, VF);
1910 Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
1911 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
1912 Type::getVoidTy(Ctx.LLVMCtx),
1913 {PtrTy, IncTy, MaskTy});
1914
1915 // Add the costs together with the add/sub operation.
1916 return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
1917 Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
1918}
1919
1920#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1922 VPSlotTracker &SlotTracker) const {
1923 O << Indent << "WIDEN-HISTOGRAM buckets: ";
1925
1926 if (Opcode == Instruction::Sub)
1927 O << ", dec: ";
1928 else {
1929 assert(Opcode == Instruction::Add);
1930 O << ", inc: ";
1931 }
1933
1934 if (VPValue *Mask = getMask()) {
1935 O << ", mask: ";
1936 Mask->printAsOperand(O, SlotTracker);
1937 }
1938}
1939
1941 VPSlotTracker &SlotTracker) const {
1942 O << Indent << "WIDEN-SELECT ";
1944 O << " = select ";
1945 printFlags(O);
1947 O << ", ";
1949 O << ", ";
1951 O << (isInvariantCond() ? " (condition is loop invariant)" : "");
1952}
1953#endif
1954
1956 // The condition can be loop invariant but still defined inside the
1957 // loop. This means that we can't just use the original 'cond' value.
1958 // We have to take the 'vectorized' value and pick the first lane.
1959 // Instcombine will make this a no-op.
1960 Value *Cond = State.get(getCond(), isInvariantCond());
1961
1962 Value *Op0 = State.get(getOperand(1));
1963 Value *Op1 = State.get(getOperand(2));
1964 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
1965 State.set(this, Sel);
1966 if (auto *I = dyn_cast<Instruction>(Sel)) {
1968 applyFlags(*I);
1969 applyMetadata(*I);
1970 }
1971}
1972
1973 InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
1974 VPCostContext &Ctx) const {
1975 SelectInst *SI = cast<SelectInst>(getUnderlyingValue());
1976 bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
1977 Type *ScalarTy = Ctx.Types.inferScalarType(this);
1978 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1979
1980 VPValue *Op0, *Op1;
1981 if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
1982 (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||
1983 match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {
1984 // select x, y, false --> x & y
1985 // select x, true, y --> x | y
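// Such i1 selects are just logical and/or, so cost them as the corresponding
// bitwise operation rather than as a select.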
1986 const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0);
1987 const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1);
1988
1989 SmallVector<const Value *> Operands;
1990 if (all_of(operands(),
1991 [](VPValue *Op) { return Op->getUnderlyingValue(); }))
1992 Operands.append(SI->op_begin(), SI->op_end());
1993 bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
1994 return Ctx.TTI.getArithmeticInstrCost(
1995 IsLogicalOr ? Instruction::Or : Instruction::And, VectorTy,
1996 Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
1997 }
1998
1999 Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
2000 if (!ScalarCond)
2001 CondTy = VectorType::get(CondTy, VF);
2002
2003 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
2004 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
2005 Pred = Cmp->getPredicate();
2006 return Ctx.TTI.getCmpSelInstrCost(
2007 Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
2008 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
2009}
2010
2011VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
2012 AllowReassoc = FMF.allowReassoc();
2013 NoNaNs = FMF.noNaNs();
2014 NoInfs = FMF.noInfs();
2015 NoSignedZeros = FMF.noSignedZeros();
2016 AllowReciprocal = FMF.allowReciprocal();
2017 AllowContract = FMF.allowContract();
2018 ApproxFunc = FMF.approxFunc();
2019}
2020
2021#if !defined(NDEBUG)
2022bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
2023 switch (OpType) {
2024 case OperationType::OverflowingBinOp:
2025 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
2026 Opcode == Instruction::Mul ||
2027 Opcode == VPInstruction::CanonicalIVIncrementForPart;
2028 case OperationType::Trunc:
2029 return Opcode == Instruction::Trunc;
2030 case OperationType::DisjointOp:
2031 return Opcode == Instruction::Or;
2032 case OperationType::PossiblyExactOp:
2033 return Opcode == Instruction::AShr;
2034 case OperationType::GEPOp:
2035 return Opcode == Instruction::GetElementPtr ||
2036 Opcode == VPInstruction::PtrAdd ||
2037 Opcode == VPInstruction::WidePtrAdd;
2038 case OperationType::FPMathOp:
2039 return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
2040 Opcode == Instruction::FSub || Opcode == Instruction::FNeg ||
2041 Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
2042 Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc ||
2043 Opcode == Instruction::FCmp || Opcode == Instruction::Select ||
2044 Opcode == VPInstruction::WideIVStep ||
2047 case OperationType::NonNegOp:
2048 return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP;
2049 case OperationType::Cmp:
2050 return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
2051 case OperationType::Other:
2052 return true;
2053 }
2054 llvm_unreachable("Unknown OperationType enum");
2055}
2056#endif
2057
2058#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2060 switch (OpType) {
2061 case OperationType::Cmp:
2063 break;
2064 case OperationType::DisjointOp:
2065 if (DisjointFlags.IsDisjoint)
2066 O << " disjoint";
2067 break;
2068 case OperationType::PossiblyExactOp:
2069 if (ExactFlags.IsExact)
2070 O << " exact";
2071 break;
2072 case OperationType::OverflowingBinOp:
2073 if (WrapFlags.HasNUW)
2074 O << " nuw";
2075 if (WrapFlags.HasNSW)
2076 O << " nsw";
2077 break;
2078 case OperationType::Trunc:
2079 if (TruncFlags.HasNUW)
2080 O << " nuw";
2081 if (TruncFlags.HasNSW)
2082 O << " nsw";
2083 break;
2084 case OperationType::FPMathOp:
2086 break;
2087 case OperationType::GEPOp:
2088 if (GEPFlags.isInBounds())
2089 O << " inbounds";
2090 else if (GEPFlags.hasNoUnsignedSignedWrap())
2091 O << " nusw";
2092 if (GEPFlags.hasNoUnsignedWrap())
2093 O << " nuw";
2094 break;
2095 case OperationType::NonNegOp:
2096 if (NonNegFlags.NonNeg)
2097 O << " nneg";
2098 break;
2099 case OperationType::Other:
2100 break;
2101 }
2102 O << " ";
2103}
2104#endif
2105
2107 auto &Builder = State.Builder;
2108 switch (Opcode) {
2109 case Instruction::Call:
2110 case Instruction::Br:
2111 case Instruction::PHI:
2112 case Instruction::GetElementPtr:
2113 case Instruction::Select:
2114 llvm_unreachable("This instruction is handled by a different recipe.");
2115 case Instruction::UDiv:
2116 case Instruction::SDiv:
2117 case Instruction::SRem:
2118 case Instruction::URem:
2119 case Instruction::Add:
2120 case Instruction::FAdd:
2121 case Instruction::Sub:
2122 case Instruction::FSub:
2123 case Instruction::FNeg:
2124 case Instruction::Mul:
2125 case Instruction::FMul:
2126 case Instruction::FDiv:
2127 case Instruction::FRem:
2128 case Instruction::Shl:
2129 case Instruction::LShr:
2130 case Instruction::AShr:
2131 case Instruction::And:
2132 case Instruction::Or:
2133 case Instruction::Xor: {
2134 // Just widen unops and binops.
2135 SmallVector<Value *, 2> Ops;
2136 for (VPValue *VPOp : operands())
2137 Ops.push_back(State.get(VPOp));
2138
2139 Value *V = Builder.CreateNAryOp(Opcode, Ops);
2140
2141 if (auto *VecOp = dyn_cast<Instruction>(V)) {
2142 applyFlags(*VecOp);
2143 applyMetadata(*VecOp);
2144 }
2145
2146 // Use this vector value for all users of the original instruction.
2147 State.set(this, V);
2148 break;
2149 }
2150 case Instruction::ExtractValue: {
2151 assert(getNumOperands() == 2 && "expected single level extractvalue");
2152 Value *Op = State.get(getOperand(0));
2153 auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
2154 Value *Extract = Builder.CreateExtractValue(Op, CI->getZExtValue());
2155 State.set(this, Extract);
2156 break;
2157 }
2158 case Instruction::Freeze: {
2159 Value *Op = State.get(getOperand(0));
2160 Value *Freeze = Builder.CreateFreeze(Op);
2161 State.set(this, Freeze);
2162 break;
2163 }
2164 case Instruction::ICmp:
2165 case Instruction::FCmp: {
2166 // Widen compares. Generate vector compares.
2167 bool FCmp = Opcode == Instruction::FCmp;
2168 Value *A = State.get(getOperand(0));
2169 Value *B = State.get(getOperand(1));
2170 Value *C = nullptr;
2171 if (FCmp) {
2172 // Propagate fast math flags.
2173 C = Builder.CreateFCmpFMF(
2174 getPredicate(), A, B,
2175 getFastMathFlags());
2176 } else {
2177 C = Builder.CreateICmp(getPredicate(), A, B);
2178 }
2179 if (auto *I = dyn_cast<Instruction>(C))
2180 applyMetadata(*I);
2181 State.set(this, C);
2182 break;
2183 }
2184 default:
2185 // This instruction is not vectorized by simple widening.
2186 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
2187 << Instruction::getOpcodeName(Opcode));
2188 llvm_unreachable("Unhandled instruction!");
2189 } // end of switch.
2190
2191#if !defined(NDEBUG)
2192 // Verify that VPlan type inference results agree with the type of the
2193 // generated values.
2194 assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
2195 State.get(this)->getType() &&
2196 "inferred type and type from generated instructions do not match");
2197#endif
2198}
2199
2201 VPCostContext &Ctx) const {
2202 switch (Opcode) {
2203 case Instruction::UDiv:
2204 case Instruction::SDiv:
2205 case Instruction::SRem:
2206 case Instruction::URem:
2207 // If the div/rem operation isn't safe to speculate and requires
2208 // predication, then the only way we can even create a vplan is to insert
2209 // a select on the second input operand to ensure we use the value of 1
2210 // for the inactive lanes. The select will be costed separately.
2211 case Instruction::FNeg:
2212 case Instruction::Add:
2213 case Instruction::FAdd:
2214 case Instruction::Sub:
2215 case Instruction::FSub:
2216 case Instruction::Mul:
2217 case Instruction::FMul:
2218 case Instruction::FDiv:
2219 case Instruction::FRem:
2220 case Instruction::Shl:
2221 case Instruction::LShr:
2222 case Instruction::AShr:
2223 case Instruction::And:
2224 case Instruction::Or:
2225 case Instruction::Xor:
2226 case Instruction::Freeze:
2227 case Instruction::ExtractValue:
2228 case Instruction::ICmp:
2229 case Instruction::FCmp:
2230 return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
2231 default:
2232 llvm_unreachable("Unsupported opcode for instruction");
2233 }
2234}
2235
2236#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2238 VPSlotTracker &SlotTracker) const {
2239 O << Indent << "WIDEN ";
2241 O << " = " << Instruction::getOpcodeName(Opcode);
2242 printFlags(O);
2244}
2245#endif
2246
2248 auto &Builder = State.Builder;
2249 // Vectorize casts.
2250 assert(State.VF.isVector() && "Not vectorizing?");
2251 Type *DestTy = VectorType::get(getResultType(), State.VF);
2252 VPValue *Op = getOperand(0);
2253 Value *A = State.get(Op);
2254 Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
2255 State.set(this, Cast);
2256 if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
2257 applyFlags(*CastOp);
2258 applyMetadata(*CastOp);
2259 }
2260}
2261
2263 VPCostContext &Ctx) const {
2264 // TODO: In some cases, VPWidenCastRecipes are created but not considered in
2265 // the legacy cost model, including truncates/extends when evaluating a
2266 // reduction in a smaller type.
2267 if (!getUnderlyingValue())
2268 return 0;
2269 // Computes the CastContextHint from a recipe that may access memory.
2270 auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
2271 if (VF.isScalar())
2273 if (isa<VPInterleaveBase>(R))
2275 if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R))
2276 return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
2278 const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
2279 if (WidenMemoryRecipe == nullptr)
2281 if (!WidenMemoryRecipe->isConsecutive())
2283 if (WidenMemoryRecipe->isReverse())
2285 if (WidenMemoryRecipe->isMasked())
2288 };
2289
2290 VPValue *Operand = getOperand(0);
2292 // For Trunc/FPTrunc, get the context from the only user.
2293 if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
2295 if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(*user_begin()))
2296 CCH = ComputeCCH(StoreRecipe);
2297 }
2298 // For Z/Sext, get the context from the operand.
2299 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
2300 Opcode == Instruction::FPExt) {
2301 if (Operand->isLiveIn())
2303 else if (Operand->getDefiningRecipe())
2304 CCH = ComputeCCH(Operand->getDefiningRecipe());
2305 }
2306
2307 auto *SrcTy =
2308 cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(Operand), VF));
2309 auto *DestTy = cast<VectorType>(toVectorTy(getResultType(), VF));
2310 // Arm TTI will use the underlying instruction to determine the cost.
2311 return Ctx.TTI.getCastInstrCost(
2312 Opcode, DestTy, SrcTy, CCH, Ctx.CostKind,
2314}
2315
2316#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2318 VPSlotTracker &SlotTracker) const {
2319 O << Indent << "WIDEN-CAST ";
2321 O << " = " << Instruction::getOpcodeName(Opcode);
2322 printFlags(O);
2324 O << " to " << *getResultType();
2325}
2326#endif
2327
2329 VPCostContext &Ctx) const {
2330 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
2331}
2332
2333/// A helper function that returns an integer or floating-point constant with
2334/// value C.
2336 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
2337 : ConstantFP::get(Ty, C);
2338}
2339
2340#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2342 VPSlotTracker &SlotTracker) const {
2343 O << Indent;
2345 O << " = WIDEN-INDUCTION ";
2347
2348 if (auto *TI = getTruncInst())
2349 O << " (truncated to " << *TI->getType() << ")";
2350}
2351#endif
2352
2353 bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
2354 // The step may be defined by a recipe in the preheader (e.g. if it requires
2355 // SCEV expansion), but for the canonical induction the step is required to be
2356 // 1, which is represented as live-in.
2357 if (getStepValue()->getDefiningRecipe())
2358 return false;
2359 auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
2360 auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
2361 auto *CanIV = getRegion()->getCanonicalIV();
2362 return StartC && StartC->isZero() && StepC && StepC->isOne() &&
2363 getScalarType() == CanIV->getScalarType();
2364}
2365
2366#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2368 VPSlotTracker &SlotTracker) const {
2369 O << Indent;
2371 O << " = DERIVED-IV ";
2372 getStartValue()->printAsOperand(O, SlotTracker);
2373 O << " + ";
2374 getOperand(1)->printAsOperand(O, SlotTracker);
2375 O << " * ";
2376 getStepValue()->printAsOperand(O, SlotTracker);
2377}
2378#endif
2379
2381 // Fast-math-flags propagate from the original induction instruction.
2382 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
2383 if (hasFastMathFlags())
2384 State.Builder.setFastMathFlags(getFastMathFlags());
2385
2386 // Compute scalar induction steps: BaseIV is the scalar induction variable on
2387 // which to base the steps, and Step is the size of the step.
2388
2389 Value *BaseIV = State.get(getOperand(0), VPLane(0));
2390 Value *Step = State.get(getStepValue(), VPLane(0));
2391 IRBuilderBase &Builder = State.Builder;
2392
2393 // Ensure step has the same type as that of scalar IV.
2394 Type *BaseIVTy = BaseIV->getType()->getScalarType();
2395 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
2396
2397 // We build scalar steps for both integer and floating-point induction
2398 // variables. Here, we determine the kind of arithmetic we will perform.
2401 if (BaseIVTy->isIntegerTy()) {
2402 AddOp = Instruction::Add;
2403 MulOp = Instruction::Mul;
2404 } else {
2405 AddOp = InductionOpcode;
2406 MulOp = Instruction::FMul;
2407 }
2408
2409 // Determine the number of scalars we need to generate for each unroll
2410 // iteration.
2411 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
2412 // Compute the scalar steps and save the results in State.
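// Conceptually, lane L of unroll part P receives BaseIV + (StartIdx0 + L) * Step,
// where StartIdx0 (computed below) encodes the lane offset contributed by the
// unroll part.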
2413 Type *IntStepTy =
2414 IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
2415 Type *VecIVTy = nullptr;
2416 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2417 if (!FirstLaneOnly && State.VF.isScalable()) {
2418 VecIVTy = VectorType::get(BaseIVTy, State.VF);
2419 UnitStepVec =
2420 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2421 SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2422 SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
2423 }
2424
2425 unsigned StartLane = 0;
2426 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2427 if (State.Lane) {
2428 StartLane = State.Lane->getKnownLane();
2429 EndLane = StartLane + 1;
2430 }
2431 Value *StartIdx0;
2432 if (getUnrollPart(*this) == 0)
2433 StartIdx0 = ConstantInt::get(IntStepTy, 0);
2434 else {
2435 StartIdx0 = State.get(getOperand(2), true);
2436 if (getUnrollPart(*this) != 1) {
2437 StartIdx0 =
2438 Builder.CreateMul(StartIdx0, ConstantInt::get(StartIdx0->getType(),
2439 getUnrollPart(*this)));
2440 }
2441 StartIdx0 = Builder.CreateSExtOrTrunc(StartIdx0, IntStepTy);
2442 }
2443
2444 if (!FirstLaneOnly && State.VF.isScalable()) {
2445 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2446 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2447 if (BaseIVTy->isFloatingPointTy())
2448 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2449 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2450 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2451 State.set(this, Add);
2452 // It's also useful to record the per-lane values for the known minimum
2453 // number of elements, so we do that below. This improves the code quality
2454 // when extracting the first element, for example.
2455 }
2456
2457 if (BaseIVTy->isFloatingPointTy())
2458 StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
2459
2460 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
2461 Value *StartIdx = Builder.CreateBinOp(
2462 AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
2463 // The step returned by `createStepForVF` is a runtime-evaluated value
2464 // when VF is scalable. Otherwise, it should be folded into a Constant.
2465 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2466 "Expected StartIdx to be folded to a constant when VF is not "
2467 "scalable");
2468 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2469 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
2470 State.set(this, Add, VPLane(Lane));
2471 }
2472}
2473
2474#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2476 VPSlotTracker &SlotTracker) const {
2477 O << Indent;
2479 O << " = SCALAR-STEPS ";
2481}
2482#endif
2483
2485 assert(State.VF.isVector() && "not widening");
2486 // Construct a vector GEP by widening the operands of the scalar GEP as
2487 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
2488 // results in a vector of pointers when at least one operand of the GEP
2489 // is vector-typed. Thus, to keep the representation compact, we only use
2490 // vector-typed operands for loop-varying values.
2491
2492 if (areAllOperandsInvariant()) {
2493 // If we are vectorizing, but the GEP has only loop-invariant operands,
2494 // the GEP we build (by only using vector-typed operands for
2495 // loop-varying values) would be a scalar pointer. Thus, to ensure we
2496 // produce a vector of pointers, we need to either arbitrarily pick an
2497 // operand to broadcast, or broadcast a clone of the original GEP.
2498 // Here, we broadcast a clone of the original.
2499 //
2500 // TODO: If at some point we decide to scalarize instructions having
2501 // loop-invariant operands, this special case will no longer be
2502 // required. We would add the scalarization decision to
2503 // collectLoopScalars() and teach getVectorValue() to broadcast
2504 // the lane-zero scalar value.
2506 for (unsigned I = 0, E = getNumOperands(); I != E; I++)
2507 Ops.push_back(State.get(getOperand(I), VPLane(0)));
2508
2509 auto *NewGEP =
2510 State.Builder.CreateGEP(getSourceElementType(), Ops[0], drop_begin(Ops),
2511 "", getGEPNoWrapFlags());
2512 Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
2513 State.set(this, Splat);
2514 } else {
2515 // If the GEP has at least one loop-varying operand, we are sure to
2516 // produce a vector of pointers unless VF is scalar.
2517 // The pointer operand of the new GEP. If it's loop-invariant, we
2518 // won't broadcast it.
2519 auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant());
2520
2521 // Collect all the indices for the new GEP. If any index is
2522 // loop-invariant, we won't broadcast it.
2524 for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
2525 VPValue *Operand = getOperand(I);
2526 Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1)));
2527 }
2528
2529 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
2530 // but it should be a vector, otherwise.
2531 auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices,
2532 "", getGEPNoWrapFlags());
2533 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
2534 "NewGEP is not a pointer vector");
2535 State.set(this, NewGEP);
2536 }
2537}
2538
2539#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2541 VPSlotTracker &SlotTracker) const {
2542 O << Indent << "WIDEN-GEP ";
2543 O << (isPointerLoopInvariant() ? "Inv" : "Var");
2544 for (size_t I = 0; I < getNumOperands() - 1; ++I)
2545 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
2546
2547 O << " ";
2549 O << " = getelementptr";
2550 printFlags(O);
2552}
2553#endif
2554
2555static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, bool IsUnitStride,
2556 unsigned CurrentPart, IRBuilderBase &Builder) {
2557 // Use i32 for the gep index type when the value is constant,
2558 // or query DataLayout for a more suitable index type otherwise.
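// For example, a fixed-width unit-stride access folds to a constant index, so
// i32 suffices; non-unit strides, and scalable VFs that are reversed or belong
// to a later unroll part, compute the index at run time and use the DataLayout
// index type instead.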
2559 const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
2560 return !IsUnitStride || (IsScalable && (IsReverse || CurrentPart > 0))
2561 ? DL.getIndexType(Builder.getPtrTy(0))
2562 : Builder.getInt32Ty();
2563}
2564
2566 auto &Builder = State.Builder;
2567 unsigned CurrentPart = getUnrollPart(*this);
2568 bool IsUnitStride = Stride == 1 || Stride == -1;
2569 Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true,
2570 IsUnitStride, CurrentPart, Builder);
2571
2572 // The wide store needs to start at the last vector element.
2573 Value *RunTimeVF = State.get(getVFValue(), VPLane(0));
2574 if (IndexTy != RunTimeVF->getType())
2575 RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy);
2576 // NumElt = Stride * CurrentPart * RunTimeVF
2577 Value *NumElt = Builder.CreateMul(
2578 ConstantInt::get(IndexTy, Stride * (int64_t)CurrentPart), RunTimeVF);
2579 // LastLane = Stride * (RunTimeVF - 1)
2580 Value *LastLane = Builder.CreateSub(RunTimeVF, ConstantInt::get(IndexTy, 1));
2581 if (Stride != 1)
2582 LastLane = Builder.CreateMul(ConstantInt::get(IndexTy, Stride), LastLane);
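// Altogether the result is Ptr advanced by Stride * CurrentPart * VF plus
// Stride * (VF - 1) elements, e.g. for Stride == -1 it points at the element
// accessed by the last lane, which is where the forward wide access must start.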
2583 Value *Ptr = State.get(getOperand(0), VPLane(0));
2584 Value *ResultPtr =
2585 Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags());
2586 ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "",
2588
2589 State.set(this, ResultPtr, /*IsScalar*/ true);
2590}
2591
2592#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2594 VPSlotTracker &SlotTracker) const {
2595 O << Indent;
2597 O << " = vector-end-pointer";
2598 printFlags(O);
2600}
2601#endif
2602
2604 auto &Builder = State.Builder;
2605 unsigned CurrentPart = getUnrollPart(*this);
2606 Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false,
2607 /*IsUnitStride*/ true, CurrentPart, Builder);
2608 Value *Ptr = State.get(getOperand(0), VPLane(0));
2609
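// Each unroll part advances the pointer by CurrentPart * VF elements, so part 0
// reuses the base pointer and later parts start at their own chunk.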
2610 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
2611 Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Increment,
2612 "", getGEPNoWrapFlags());
2613
2614 State.set(this, ResultPtr, /*IsScalar*/ true);
2615}
2616
2617#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2619 VPSlotTracker &SlotTracker) const {
2620 O << Indent;
2622 O << " = vector-pointer ";
2623
2625}
2626#endif
2627
2629 VPCostContext &Ctx) const {
2630 // Handle cases where only the first lane is used the same way as the legacy
2631 // cost model.
2633 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
2634
2635 Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
2636 Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
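// A blend of N incoming values is lowered to N - 1 vector selects, so cost it
// as that many selects on the result type.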
2637 return (getNumIncomingValues() - 1) *
2638 Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
2639 CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
2640}
2641
2642#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2644 VPSlotTracker &SlotTracker) const {
2645 O << Indent << "BLEND ";
2647 O << " =";
2648 if (getNumIncomingValues() == 1) {
2649 // Not a User of any mask: not really blending, this is a
2650 // single-predecessor phi.
2651 O << " ";
2652 getIncomingValue(0)->printAsOperand(O, SlotTracker);
2653 } else {
2654 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
2655 O << " ";
2656 getIncomingValue(I)->printAsOperand(O, SlotTracker);
2657 if (I == 0)
2658 continue;
2659 O << "/";
2660 getMask(I)->printAsOperand(O, SlotTracker);
2661 }
2662 }
2663}
2664#endif
2665
2667 assert(!State.Lane && "Reduction being replicated.");
2668 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2669 RecurKind Kind = getRecurrenceKind();
2670 assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
2671 "In-loop AnyOf reductions aren't currently supported");
2672 // Propagate the fast-math flags carried by the underlying instruction.
2673 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
2674 State.Builder.setFastMathFlags(getFastMathFlags());
2675 Value *NewVecOp = State.get(getVecOp());
2676 if (VPValue *Cond = getCondOp()) {
2677 Value *NewCond = State.get(Cond, State.VF.isScalar());
2678 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
2679 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
2680
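// Masked-off lanes are replaced with the reduction identity so that they do
// not affect the reduced value.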
2681 Value *Start = getRecurrenceIdentity(Kind, ElementTy, getFastMathFlags());
2682 if (State.VF.isVector())
2683 Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
2684
2685 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
2686 NewVecOp = Select;
2687 }
2688 Value *NewRed;
2689 Value *NextInChain;
2690 if (IsOrdered) {
2691 if (State.VF.isVector())
2692 NewRed =
2693 createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain);
2694 else
2695 NewRed = State.Builder.CreateBinOp(
2697 PrevInChain, NewVecOp);
2698 PrevInChain = NewRed;
2699 NextInChain = NewRed;
2700 } else {
2701 PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2702 NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind);
2704 NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain);
2705 else
2706 NextInChain = State.Builder.CreateBinOp(
2708 PrevInChain, NewRed);
2709 }
2710 State.set(this, NextInChain, /*IsScalar*/ true);
2711}
2712
2714 assert(!State.Lane && "Reduction being replicated.");
2715
2716 auto &Builder = State.Builder;
2717 // Propagate the fast-math flags carried by the underlying instruction.
2718 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
2719 Builder.setFastMathFlags(getFastMathFlags());
2720
2721 RecurKind Kind = getRecurrenceKind();
2722 Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
2723 Value *VecOp = State.get(getVecOp());
2724 Value *EVL = State.get(getEVL(), VPLane(0));
2725
2726 Value *Mask;
2727 if (VPValue *CondOp = getCondOp())
2728 Mask = State.get(CondOp);
2729 else
2730 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2731
2732 Value *NewRed;
2733 if (isOrdered()) {
2734 NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL);
2735 } else {
2736 NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL);
2738 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
2739 else
2740 NewRed = Builder.CreateBinOp(
2742 Prev);
2743 }
2744 State.set(this, NewRed, /*IsScalar*/ true);
2745}
2746
2748 VPCostContext &Ctx) const {
2749 RecurKind RdxKind = getRecurrenceKind();
2750 Type *ElementTy = Ctx.Types.inferScalarType(this);
2751 auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
2752 unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
2753 FastMathFlags FMFs = getFastMathFlags();
2754 std::optional<FastMathFlags> OptionalFMF =
2755 ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
2756
2757 // TODO: Support any-of reductions.
2758 assert(
2760 ForceTargetInstructionCost.getNumOccurrences() > 0) &&
2761 "Any-of reduction not implemented in VPlan-based cost model currently.");
2762
2763 // Note that TTI should model the cost of moving the result to a scalar
2764 // register and the BinOp cost in getMinMaxReductionCost().
2767 return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
2768 }
2769
2770 // Note that TTI should model the cost of moving the result to a scalar
2771 // register and the BinOp cost in getArithmeticReductionCost().
2772 return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
2773 Ctx.CostKind);
2774}
2775
2777 ExpressionTypes ExpressionType,
2778 ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
2779 : VPSingleDefRecipe(VPDef::VPExpressionSC, {}, {}),
2780 ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) {
2781 assert(!ExpressionRecipes.empty() && "Nothing to combine?");
2782 assert(
2783 none_of(ExpressionRecipes,
2784 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
2785 "expression cannot contain recipes with side-effects");
2786
2787 // Maintain a copy of the expression recipes as a set of users.
2788 SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
2789 for (auto *R : ExpressionRecipes)
2790 ExpressionRecipesAsSetOfUsers.insert(R);
2791
2792 // Recipes in the expression, except the last one, must only be used by
2793 // (other) recipes inside the expression. If there are other users, external
2794 // to the expression, use a clone of the recipe for external users.
2795 for (VPSingleDefRecipe *R : reverse(ExpressionRecipes)) {
2796 if (R != ExpressionRecipes.back() &&
2797 any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
2798 return !ExpressionRecipesAsSetOfUsers.contains(U);
2799 })) {
2800 // There are users outside of the expression. Clone the recipe and use the
2801 // clone for those external users.
2802 VPSingleDefRecipe *CopyForExtUsers = R->clone();
2803 R->replaceUsesWithIf(CopyForExtUsers, [&ExpressionRecipesAsSetOfUsers](
2804 VPUser &U, unsigned) {
2805 return !ExpressionRecipesAsSetOfUsers.contains(&U);
2806 });
2807 CopyForExtUsers->insertBefore(R);
2808 }
2809 if (R->getParent())
2810 R->removeFromParent();
2811 }
2812
2813 // Internalize all external operands to the expression recipes. To do so,
2814 // create new temporary VPValues for all operands defined by a recipe outside
2815 // the expression. The original operands are added as operands of the
2816 // VPExpressionRecipe itself.
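// For example, when bundling an extend-multiply-reduce chain, the operands of
// the extends that are defined outside the bundle become operands of the
// VPExpressionRecipe itself, and placeholder VPValues stand in for them inside
// the bundled recipes.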
2817 for (auto *R : ExpressionRecipes) {
2818 for (const auto &[Idx, Op] : enumerate(R->operands())) {
2819 auto *Def = Op->getDefiningRecipe();
2820 if (Def && ExpressionRecipesAsSetOfUsers.contains(Def))
2821 continue;
2822 addOperand(Op);
2823 LiveInPlaceholders.push_back(new VPValue());
2824 }
2825 }
2826
2827 // Replace each external operand with the first one created for it in
2828 // LiveInPlaceholders.
2829 for (auto *R : ExpressionRecipes)
2830 for (auto const &[LiveIn, Tmp] : zip(operands(), LiveInPlaceholders))
2831 R->replaceUsesOfWith(LiveIn, Tmp);
2832}
2833
2834 void VPExpressionRecipe::decompose() {
2835 for (auto *R : ExpressionRecipes)
2836 // Since the list could contain duplicates, make sure the recipe hasn't
2837 // already been inserted.
2838 if (!R->getParent())
2839 R->insertBefore(this);
2840
2841 for (const auto &[Idx, Op] : enumerate(operands()))
2842 LiveInPlaceholders[Idx]->replaceAllUsesWith(Op);
2843
2844 replaceAllUsesWith(ExpressionRecipes.back());
2845 ExpressionRecipes.clear();
2846}
2847
2849 VPCostContext &Ctx) const {
2850 Type *RedTy = Ctx.Types.inferScalarType(this);
2851 auto *SrcVecTy = cast<VectorType>(
2852 toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
2853 assert(RedTy->isIntegerTy() &&
2854 "VPExpressionRecipe only supports integer types currently.");
2855 unsigned Opcode = RecurrenceDescriptor::getOpcode(
2856 cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
2857 switch (ExpressionType) {
2858 case ExpressionTypes::ExtendedReduction: {
2859 return Ctx.TTI.getExtendedReductionCost(
2860 Opcode,
2861 cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
2862 Instruction::ZExt,
2863 RedTy, SrcVecTy, std::nullopt, Ctx.CostKind);
2864 }
2865 case ExpressionTypes::MulAccReduction:
2866 return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
2867 Ctx.CostKind);
2868
2869 case ExpressionTypes::ExtNegatedMulAccReduction:
2870 assert(Opcode == Instruction::Add && "Unexpected opcode");
2871 Opcode = Instruction::Sub;
2872 [[fallthrough]];
2873 case ExpressionTypes::ExtMulAccReduction: {
2874 return Ctx.TTI.getMulAccReductionCost(
2875 cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
2876 Instruction::ZExt,
2877 Opcode, RedTy, SrcVecTy, Ctx.CostKind);
2878 }
2879 }
2880 llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
2881}
2882
2884 return any_of(ExpressionRecipes, [](VPSingleDefRecipe *R) {
2885 return R->mayReadFromMemory() || R->mayWriteToMemory();
2886 });
2887}
2888
2890 assert(
2891 none_of(ExpressionRecipes,
2892 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
2893 "expression cannot contain recipes with side-effects");
2894 return false;
2895}
2896
2898 // Cannot use vputils::isSingleScalar(), because all external operands
2899 // of the expression will be live-ins while bundled.
2900 return isa<VPReductionRecipe>(ExpressionRecipes.back()) &&
2901 !isa<VPPartialReductionRecipe>(ExpressionRecipes.back());
2902}
2903
2904#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2905
2907 VPSlotTracker &SlotTracker) const {
2908 O << Indent << "EXPRESSION ";
2910 O << " = ";
2911 auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
2912 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
2913
2914 switch (ExpressionType) {
2915 case ExpressionTypes::ExtendedReduction: {
2917 O << " +";
2918 O << " reduce." << Instruction::getOpcodeName(Opcode) << " (";
2920 Red->printFlags(O);
2921
2922 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
2923 O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
2924 << *Ext0->getResultType();
2925 if (Red->isConditional()) {
2926 O << ", ";
2927 Red->getCondOp()->printAsOperand(O, SlotTracker);
2928 }
2929 O << ")";
2930 break;
2931 }
2932 case ExpressionTypes::ExtNegatedMulAccReduction: {
2934 O << " + reduce."
2936 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
2937 << " (sub (0, mul";
2938 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
2939 Mul->printFlags(O);
2940 O << "(";
2942 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
2943 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
2944 << *Ext0->getResultType() << "), (";
2946 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
2947 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
2948 << *Ext1->getResultType() << ")";
2949 if (Red->isConditional()) {
2950 O << ", ";
2951 Red->getCondOp()->printAsOperand(O, SlotTracker);
2952 }
2953 O << "))";
2954 break;
2955 }
2956 case ExpressionTypes::MulAccReduction:
2957 case ExpressionTypes::ExtMulAccReduction: {
2959 O << " + ";
2960 O << "reduce."
2962 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
2963 << " (";
2964 O << "mul";
2965 bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
2966 auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
2967 : ExpressionRecipes[0]);
2968 Mul->printFlags(O);
2969 if (IsExtended)
2970 O << "(";
2972 if (IsExtended) {
2973 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
2974 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
2975 << *Ext0->getResultType() << "), (";
2976 } else {
2977 O << ", ";
2978 }
2980 if (IsExtended) {
2981 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
2982 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
2983 << *Ext1->getResultType() << ")";
2984 }
2985 if (Red->isConditional()) {
2986 O << ", ";
2987 Red->getCondOp()->printAsOperand(O, SlotTracker);
2988 }
2989 O << ")";
2990 break;
2991 }
2992 }
2993}
2994
2996 VPSlotTracker &SlotTracker) const {
2997 O << Indent << "REDUCE ";
2999 O << " = ";
3001 O << " +";
3002 printFlags(O);
3003 O << " reduce."
3006 << " (";
3008 if (isConditional()) {
3009 O << ", ";
3011 }
3012 O << ")";
3013}
3014
3016 VPSlotTracker &SlotTracker) const {
3017 O << Indent << "REDUCE ";
3019 O << " = ";
3021 O << " +";
3022 printFlags(O);
3023 O << " vp.reduce."
3026 << " (";
3028 O << ", ";
3030 if (isConditional()) {
3031 O << ", ";
3033 }
3034 O << ")";
3035}
3036
3037#endif
3038
3039/// A helper function to scalarize a single Instruction in the innermost loop.
3040/// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue
3041/// operands from \p RepRecipe instead of \p Instr's operands.
3042static void scalarizeInstruction(const Instruction *Instr,
3043 VPReplicateRecipe *RepRecipe,
3044 const VPLane &Lane, VPTransformState &State) {
3045 assert((!Instr->getType()->isAggregateType() ||
3046 canVectorizeTy(Instr->getType())) &&
3047 "Expected vectorizable or non-aggregate type.");
3048
3049 // Does this instruction return a value?
3050 bool IsVoidRetTy = Instr->getType()->isVoidTy();
3051
3052 Instruction *Cloned = Instr->clone();
3053 if (!IsVoidRetTy) {
3054 Cloned->setName(Instr->getName() + ".cloned");
3055 Type *ResultTy = State.TypeAnalysis.inferScalarType(RepRecipe);
3056 // The operands of the replicate recipe may have been narrowed, resulting in
3057 // a narrower result type. Update the type of the cloned instruction to the
3058 // correct type.
3059 if (ResultTy != Cloned->getType())
3060 Cloned->mutateType(ResultTy);
3061 }
3062
3063 RepRecipe->applyFlags(*Cloned);
3064 RepRecipe->applyMetadata(*Cloned);
3065
3066 if (RepRecipe->hasPredicate())
3067 cast<CmpInst>(Cloned)->setPredicate(RepRecipe->getPredicate());
3068
3069 if (auto DL = RepRecipe->getDebugLoc())
3070 State.setDebugLocFrom(DL);
3071
3072 // Replace the operands of the cloned instructions with their scalar
3073 // equivalents in the new loop.
3074 for (const auto &I : enumerate(RepRecipe->operands())) {
3075 auto InputLane = Lane;
3076 VPValue *Operand = I.value();
3077 if (vputils::isSingleScalar(Operand))
3078 InputLane = VPLane::getFirstLane();
3079 Cloned->setOperand(I.index(), State.get(Operand, InputLane));
3080 }
3081
3082 // Place the cloned scalar in the new loop.
3083 State.Builder.Insert(Cloned);
3084
3085 State.set(RepRecipe, Cloned, Lane);
3086
3087 // If we just cloned a new assumption, add it to the assumption cache.
3088 if (auto *II = dyn_cast<AssumeInst>(Cloned))
3089 State.AC->registerAssumption(II);
3090
3091 assert(
3092 (RepRecipe->getRegion() ||
3093 !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
3094 all_of(RepRecipe->operands(),
3095 [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
3096 "Expected a recipe is either within a region or all of its operands "
3097 "are defined outside the vectorized region.");
3098}
3099
3102
3103 if (!State.Lane) {
3104 assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
3105 "must have already been unrolled");
3106 scalarizeInstruction(UI, this, VPLane(0), State);
3107 return;
3108 }
3109
3110 assert((State.VF.isScalar() || !isSingleScalar()) &&
3111 "uniform recipe shouldn't be predicated");
3112 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
3113 scalarizeInstruction(UI, this, *State.Lane, State);
3114 // Insert scalar instance packing it into a vector.
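// The first lane starts from a poison vector; each subsequent lane inserts its
// scalar into the value built so far.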
3115 if (State.VF.isVector() && shouldPack()) {
3116 Value *WideValue =
3117 State.Lane->isFirstLane()
3118 ? PoisonValue::get(toVectorizedTy(UI->getType(), State.VF))
3119 : State.get(this);
3120 State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
3121 *State.Lane));
3122 }
3123}
3124
3126 // Find if the recipe is used by a widened recipe via an intervening
3127 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
3128 return any_of(users(), [](const VPUser *U) {
3129 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
3130 return !vputils::onlyScalarValuesUsed(PredR);
3131 return false;
3132 });
3133}
3134
3135/// Returns true if \p Ptr is a pointer computation for which the legacy cost
3136/// model computes a SCEV expression when computing the address cost.
3138 auto *PtrR = Ptr->getDefiningRecipe();
3139 if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
3141 Instruction::GetElementPtr) ||
3142 isa<VPWidenGEPRecipe>(PtrR) ||
3144 return false;
3145
3146 // We are looking for a GEP where all indices are either loop invariant or
3147 // inductions.
3148 for (VPValue *Opd : drop_begin(PtrR->operands())) {
3149 if (!Opd->isDefinedOutsideLoopRegions() &&
3151 return false;
3152 }
3153
3154 return true;
3155}
3156
3157/// Returns true if \p V is used as part of the address of another load or
3158/// store.
3159static bool isUsedByLoadStoreAddress(const VPUser *V) {
3160 SmallPtrSet<const VPRecipeBase *, 4> Seen;
3161 SmallVector<const VPUser *> WorkList = {V};
3162
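// Worklist-driven walk over the transitive users of V, looking for a load or
// store that uses a value derived from V as its address.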
3163 while (!WorkList.empty()) {
3164 auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
3165 if (!Cur || !Seen.insert(Cur).second)
3166 continue;
3167
3168 auto *Blend = dyn_cast<VPBlendRecipe>(Cur);
3169 // Skip blends that use V only through a compare (i.e. only via the mask),
3170 // which we detect by checking whether any incoming value has been visited.
3171 if (Blend && none_of(seq<unsigned>(0, Blend->getNumIncomingValues()),
3172 [&](unsigned I) {
3173 return Seen.contains(
3174 Blend->getIncomingValue(I)->getDefiningRecipe());
3175 }))
3176 continue;
3177
3178 for (VPUser *U : Cur->users()) {
3179 if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
3180 if (InterleaveR->getAddr() == Cur)
3181 return true;
3182 if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
3183 if (RepR->getOpcode() == Instruction::Load &&
3184 RepR->getOperand(0) == Cur)
3185 return true;
3186 if (RepR->getOpcode() == Instruction::Store &&
3187 RepR->getOperand(1) == Cur)
3188 return true;
3189 }
3190 if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
3191 if (MemR->getAddr() == Cur && MemR->isConsecutive())
3192 return true;
3193 }
3194 }
3195
3196 // The legacy cost model only supports scalarized loads/stores with phi
3197 // addresses when the phi is directly used as the load/store address. Don't
3198 // traverse further for blends.
3199 if (Blend)
3200 continue;
3201
3202 append_range(WorkList, Cur->users());
3203 }
3204 return false;
3205}
3206
3208 VPCostContext &Ctx) const {
3210 // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
3211 // transform; avoid computing its cost multiple times for now.
3212 Ctx.SkipCostComputation.insert(UI);
3213
3214 if (VF.isScalable() && !isSingleScalar())
3216
3217 switch (UI->getOpcode()) {
3218 case Instruction::GetElementPtr:
3219 // We mark this instruction as zero-cost because the cost of GEPs in
3220 // vectorized code depends on whether the corresponding memory instruction
3221 // is scalarized or not. Therefore, we handle GEPs with the memory
3222 // instruction cost.
3223 return 0;
3224 case Instruction::Call: {
3225 auto *CalledFn =
3227
3230 for (const VPValue *ArgOp : ArgOps)
3231 Tys.push_back(Ctx.Types.inferScalarType(ArgOp));
3232
3233 if (CalledFn->isIntrinsic())
3234 // Various pseudo-intrinsics with costs of 0 are scalarized instead of
3235 // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early.
3236 switch (CalledFn->getIntrinsicID()) {
3237 case Intrinsic::assume:
3238 case Intrinsic::lifetime_end:
3239 case Intrinsic::lifetime_start:
3240 case Intrinsic::sideeffect:
3241 case Intrinsic::pseudoprobe:
3242 case Intrinsic::experimental_noalias_scope_decl: {
3243 assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
3244 ElementCount::getFixed(1), Ctx) == 0 &&
3245 "scalarizing intrinsic should be free");
3246 return InstructionCost(0);
3247 }
3248 default:
3249 break;
3250 }
3251
3252 Type *ResultTy = Ctx.Types.inferScalarType(this);
3253 InstructionCost ScalarCallCost =
3254 Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
3255 if (isSingleScalar()) {
3256 if (CalledFn->isIntrinsic())
3257 ScalarCallCost = std::min(
3258 ScalarCallCost,
3259 getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
3260 ElementCount::getFixed(1), Ctx));
3261 return ScalarCallCost;
3262 }
3263
3264 return ScalarCallCost * VF.getFixedValue() +
3265 Ctx.getScalarizationOverhead(ResultTy, ArgOps, VF);
3266 }
3267 case Instruction::Add:
3268 case Instruction::Sub:
3269 case Instruction::FAdd:
3270 case Instruction::FSub:
3271 case Instruction::Mul:
3272 case Instruction::FMul:
3273 case Instruction::FDiv:
3274 case Instruction::FRem:
3275 case Instruction::Shl:
3276 case Instruction::LShr:
3277 case Instruction::AShr:
3278 case Instruction::And:
3279 case Instruction::Or:
3280 case Instruction::Xor:
3281 case Instruction::ICmp:
3282 case Instruction::FCmp:
3284 Ctx) *
3285 (isSingleScalar() ? 1 : VF.getFixedValue());
3286 case Instruction::SDiv:
3287 case Instruction::UDiv:
3288 case Instruction::SRem:
3289 case Instruction::URem: {
3290 InstructionCost ScalarCost =
3292 if (isSingleScalar())
3293 return ScalarCost;
3294
3295 ScalarCost = ScalarCost * VF.getFixedValue() +
3296 Ctx.getScalarizationOverhead(Ctx.Types.inferScalarType(this),
3297 to_vector(operands()), VF);
3298 // If the recipe is not predicated (i.e. not in a replicate region), return
3299 // the scalar cost. Otherwise handle predicated cost.
3300 if (!getRegion()->isReplicator())
3301 return ScalarCost;
3302
3303 // Account for the phi nodes that we will create.
3304 ScalarCost += VF.getFixedValue() *
3305 Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
3306 // Scale the cost by the probability of executing the predicated blocks.
3307 // This assumes the predicated block for each vector lane is equally
3308 // likely.
3309 ScalarCost /= getPredBlockCostDivisor(Ctx.CostKind);
3310 return ScalarCost;
3311 }
3312 case Instruction::Load:
3313 case Instruction::Store: {
3314 // TODO: See getMemInstScalarizationCost for how to handle replicating and
3315 // predicated cases.
3316 const VPRegionBlock *ParentRegion = getRegion();
3317 if (ParentRegion && ParentRegion->isReplicator())
3318 break;
3319
3320 bool IsLoad = UI->getOpcode() == Instruction::Load;
3321 const VPValue *PtrOp = getOperand(!IsLoad);
3322 // TODO: Handle cases where we need to pass a SCEV to
3323 // getAddressComputationCost.
3324 if (shouldUseAddressAccessSCEV(PtrOp))
3325 break;
3326
3327 Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
3328 Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
3329 const Align Alignment = getLoadStoreAlignment(UI);
3330 unsigned AS = getLoadStoreAddressSpace(UI);
3332 InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
3333 UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
3334
3335 Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
3336 bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
3337 bool UsedByLoadStoreAddress =
3338 !PreferVectorizedAddressing && isUsedByLoadStoreAddress(this);
3339 InstructionCost ScalarCost =
3340 ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
3341 PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE,
3342 nullptr, Ctx.CostKind);
3343 if (isSingleScalar())
3344 return ScalarCost;
3345
3346 SmallVector<const VPValue *> OpsToScalarize;
3347 Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
3348 // Set ResultTy and OpsToScalarize if scalarization is needed. Currently we
3349 // don't assign scalarization overhead in general if the target prefers
3350 // vectorized addressing or if the loaded value is used as part of the
3351 // address of another load or store.
3352 if (!UsedByLoadStoreAddress) {
3353 bool EfficientVectorLoadStore =
3354 Ctx.TTI.supportsEfficientVectorElementLoadStore();
3355 if (!(IsLoad && !PreferVectorizedAddressing) &&
3356 !(!IsLoad && EfficientVectorLoadStore))
3357 append_range(OpsToScalarize, operands());
3358
3359 if (!EfficientVectorLoadStore)
3360 ResultTy = Ctx.Types.inferScalarType(this);
3361 }
3362
3363 return (ScalarCost * VF.getFixedValue()) +
3364 Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
3365 }
3366 }
3367
3368 return Ctx.getLegacyCost(UI, VF);
3369}
3370
3371#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3373 VPSlotTracker &SlotTracker) const {
3374 O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");
3375
3376 if (!getUnderlyingInstr()->getType()->isVoidTy()) {
3378 O << " = ";
3379 }
3380 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
3381 O << "call";
3382 printFlags(O);
3383 O << "@" << CB->getCalledFunction()->getName() << "(";
3385 O, [&O, &SlotTracker](VPValue *Op) {
3386 Op->printAsOperand(O, SlotTracker);
3387 });
3388 O << ")";
3389 } else {
3391 printFlags(O);
3393 }
3394
3395 if (shouldPack())
3396 O << " (S->V)";
3397}
3398#endif
3399
3401 assert(State.Lane && "Branch on Mask works only on single instance.");
3402
3403 VPValue *BlockInMask = getOperand(0);
3404 Value *ConditionBit = State.get(BlockInMask, *State.Lane);
3405
3406 // Replace the temporary unreachable terminator with a new conditional branch,
3407 // whose two destinations will be set later when they are created.
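// The branch is created with the current block as a placeholder successor and
// that successor is immediately cleared; both destinations are patched once the
// successor blocks have been created.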
3408 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
3409 assert(isa<UnreachableInst>(CurrentTerminator) &&
3410 "Expected to replace unreachable terminator with conditional branch.");
3411 auto CondBr =
3412 State.Builder.CreateCondBr(ConditionBit, State.CFG.PrevBB, nullptr);
3413 CondBr->setSuccessor(0, nullptr);
3414 CurrentTerminator->eraseFromParent();
3415}
3416
3418 VPCostContext &Ctx) const {
3419 // The legacy cost model doesn't assign costs to branches for individual
3420 // replicate regions. Match the current behavior in the VPlan cost model for
3421 // now.
3422 return 0;
3423}
3424
3426 assert(State.Lane && "Predicated instruction PHI works per instance.");
3427 Instruction *ScalarPredInst =
3428 cast<Instruction>(State.get(getOperand(0), *State.Lane));
3429 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
3430 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
3431 assert(PredicatingBB && "Predicated block has no single predecessor.");
3433 "operand must be VPReplicateRecipe");
3434
3435 // By current pack/unpack logic we need to generate only a single phi node: if
3436 // a vector value for the predicated instruction exists at this point it means
3437 // the instruction has vector users only, and a phi for the vector value is
3438 // needed. In this case the recipe of the predicated instruction is marked to
3439 // also do that packing, thereby "hoisting" the insert-element sequence.
3440 // Otherwise, a phi node for the scalar value is needed.
3441 if (State.hasVectorValue(getOperand(0))) {
3442 auto *VecI = cast<Instruction>(State.get(getOperand(0)));
3444 "Packed operands must generate an insertelement or insertvalue");
3445
3446 // If VectorI is a struct, it will be a sequence like:
3447 // %1 = insertvalue %unmodified, %x, 0
3448 // %2 = insertvalue %1, %y, 1
3449 // %VectorI = insertvalue %2, %z, 2
3450 // To get the unmodified vector we need to look through the chain.
3451 if (auto *StructTy = dyn_cast<StructType>(VecI->getType()))
3452 for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++)
3453 VecI = cast<InsertValueInst>(VecI->getOperand(0));
3454
3455 PHINode *VPhi = State.Builder.CreatePHI(VecI->getType(), 2);
3456 VPhi->addIncoming(VecI->getOperand(0), PredicatingBB); // Unmodified vector.
3457 VPhi->addIncoming(VecI, PredicatedBB); // New vector with inserted element.
3458 if (State.hasVectorValue(this))
3459 State.reset(this, VPhi);
3460 else
3461 State.set(this, VPhi);
3462 // NOTE: Currently we need to update the value of the operand, so the next
3463 // predicated iteration inserts its generated value in the correct vector.
3464 State.reset(getOperand(0), VPhi);
3465 } else {
3466 if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane())
3467 return;
3468
3469 Type *PredInstType = State.TypeAnalysis.inferScalarType(getOperand(0));
3470 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
3471 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
3472 PredicatingBB);
3473 Phi->addIncoming(ScalarPredInst, PredicatedBB);
3474 if (State.hasScalarValue(this, *State.Lane))
3475 State.reset(this, Phi, *State.Lane);
3476 else
3477 State.set(this, Phi, *State.Lane);
3478 // NOTE: Currently we need to update the value of the operand, so the next
3479 // predicated iteration inserts its generated value in the correct vector.
3480 State.reset(getOperand(0), Phi, *State.Lane);
3481 }
3482}
3483
3484#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3486 VPSlotTracker &SlotTracker) const {
3487 O << Indent << "PHI-PREDICATED-INSTRUCTION ";
3489 O << " = ";
3491}
3492#endif
3493
3495 VPCostContext &Ctx) const {
3497 const Align Alignment = getLoadStoreAlignment(&Ingredient);
3498 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
3499 ->getAddressSpace();
3500 unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
3501 ? Instruction::Load
3502 : Instruction::Store;
3503
3504 if (!Consecutive) {
3505 // TODO: Using the original IR may not be accurate.
3506 // Currently, ARM will use the underlying IR to calculate gather/scatter
3507 // instruction cost.
3508 assert(!Reverse &&
3509 "Inconsecutive memory access should not have the order.");
3510
3512 Type *PtrTy = Ptr->getType();
3513
3514 // If the address value is uniform across all lanes, then the address can be
3515 // calculated with scalar type and broadcast.
3517 PtrTy = toVectorTy(PtrTy, VF);
3518
3519 return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
3520 Ctx.CostKind) +
3521 Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment,
3522 Ctx.CostKind, &Ingredient);
3523 }
3524
3526 if (IsMasked) {
3527 Cost +=
3528 Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind);
3529 } else {
3530 TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
3532 : getOperand(1));
3533 Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
3534 OpInfo, &Ingredient);
3535 }
3536 if (!Reverse)
3537 return Cost;
3538
3539 return Cost += Ctx.TTI.getShuffleCost(
3541 cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
3542}
3543
3545 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
3546 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
3547 const Align Alignment = getLoadStoreAlignment(&Ingredient);
3548 bool CreateGather = !isConsecutive();
3549
3550 auto &Builder = State.Builder;
3551 Value *Mask = nullptr;
3552 if (auto *VPMask = getMask()) {
3553 // Mask reversal is only needed for masks that are not all-ones (a null mask
3554 // means all-ones), since the reverse of an all-ones mask is still all-ones.
3555 Mask = State.get(VPMask);
3556 if (isReverse())
3557 Mask = Builder.CreateVectorReverse(Mask, "reverse");
3558 }
3559
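// Depending on the access shape, this becomes roughly one of:
//   consecutive, unmasked: a plain wide load of <VF x Ty>
//   consecutive, masked:   @llvm.masked.load
//   non-consecutive:       @llvm.masked.gather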
3560 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
3561 Value *NewLI;
3562 if (CreateGather) {
3563 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
3564 "wide.masked.gather");
3565 } else if (Mask) {
3566 NewLI =
3567 Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
3568 PoisonValue::get(DataTy), "wide.masked.load");
3569 } else {
3570 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
3571 }
3573 if (Reverse)
3574 NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
3575 State.set(this, NewLI);
3576}
3577
3578#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3580 VPSlotTracker &SlotTracker) const {
3581 O << Indent << "WIDEN ";
3583 O << " = load ";
3585}
3586#endif
3587
3588 /// Use an all-true mask for the reverse rather than the actual mask, as it
3589 /// avoids a dependence without affecting the result.
3591 Value *EVL, const Twine &Name) {
3592 VectorType *ValTy = cast<VectorType>(Operand->getType());
3593 Value *AllTrueMask =
3594 Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
3595 return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
3596 {Operand, AllTrueMask, EVL}, nullptr, Name);
3597}
3598
3600 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
3601 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
3602 const Align Alignment = getLoadStoreAlignment(&Ingredient);
3603 bool CreateGather = !isConsecutive();
3604
3605 auto &Builder = State.Builder;
3606 CallInst *NewLI;
3607 Value *EVL = State.get(getEVL(), VPLane(0));
3608 Value *Addr = State.get(getAddr(), !CreateGather);
3609 Value *Mask = nullptr;
3610 if (VPValue *VPMask = getMask()) {
3611 Mask = State.get(VPMask);
3612 if (isReverse())
3613 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
3614 } else {
3615 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
3616 }
3617
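// With EVL tail folding this roughly becomes @llvm.vp.load(%addr, %mask, %evl)
// for consecutive accesses or @llvm.vp.gather(%addrs, %mask, %evl) otherwise,
// with EVL bounding the number of active lanes.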
3618 if (CreateGather) {
3619 NewLI =
3620 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
3621 nullptr, "wide.masked.gather");
3622 } else {
3623 NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
3624 {Addr, Mask, EVL}, nullptr, "vp.op.load");
3625 }
3626 NewLI->addParamAttr(
3627 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
3628 applyMetadata(*NewLI);
3629 Instruction *Res = NewLI;
3630 if (isReverse())
3631 Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
3632 State.set(this, Res);
3633}
3634
3636 VPCostContext &Ctx) const {
3637 if (!Consecutive || IsMasked)
3638 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
3639
3640 // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
3641 // because the EVL recipes use EVL to replace the tail mask, while the legacy
3642 // model always accounts for the cost of the mask.
3643 // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
3644 // longer need to compare to the legacy cost model.
3646 const Align Alignment = getLoadStoreAlignment(&Ingredient);
3647 unsigned AS = getLoadStoreAddressSpace(&Ingredient);
3648 InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
3649 Instruction::Load, Ty, Alignment, AS, Ctx.CostKind);
3650 if (!Reverse)
3651 return Cost;
3652
3653 return Cost + Ctx.TTI.getShuffleCost(
3654                   TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
3655                   cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
3656}
3657
3658#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3660 VPSlotTracker &SlotTracker) const {
3661 O << Indent << "WIDEN ";
3663 O << " = vp.load ";
3665}
3666#endif
3667
3668void VPWidenStoreRecipe::execute(VPTransformState &State) {
3669 VPValue *StoredVPValue = getStoredValue();
3670 bool CreateScatter = !isConsecutive();
3671 const Align Alignment = getLoadStoreAlignment(&Ingredient);
3672
3673 auto &Builder = State.Builder;
3674
3675 Value *Mask = nullptr;
3676 if (auto *VPMask = getMask()) {
3677 // Mask reversal is only needed for non-all-one masks; an all-one mask is
3678 // represented as null, and its reverse would be all ones again.
3679 Mask = State.get(VPMask);
3680 if (isReverse())
3681 Mask = Builder.CreateVectorReverse(Mask, "reverse");
3682 }
3683
3684 Value *StoredVal = State.get(StoredVPValue);
3685 if (isReverse()) {
3686 // If we store to reverse consecutive memory locations, then we need
3687 // to reverse the order of elements in the stored value.
3688 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
3689 // We don't want to update the value in the map as it might be used in
3690 // another expression. So don't call resetVectorValue(StoredVal).
3691 }
3692 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
3693 Instruction *NewSI = nullptr;
3694 if (CreateScatter)
3695 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
3696 else if (Mask)
3697 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
3698 else
3699 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
3700 applyMetadata(*NewSI);
3701}
3702
3703#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3705 VPSlotTracker &SlotTracker) const {
3706 O << Indent << "WIDEN store ";
3708}
3709#endif
3710
3711void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
3712 VPValue *StoredValue = getStoredValue();
3713 bool CreateScatter = !isConsecutive();
3714 const Align Alignment = getLoadStoreAlignment(&Ingredient);
3715
3716 auto &Builder = State.Builder;
3717
3718 CallInst *NewSI = nullptr;
3719 Value *StoredVal = State.get(StoredValue);
3720 Value *EVL = State.get(getEVL(), VPLane(0));
3721 if (isReverse())
3722 StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
3723 Value *Mask = nullptr;
3724 if (VPValue *VPMask = getMask()) {
3725 Mask = State.get(VPMask);
3726 if (isReverse())
3727 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
3728 } else {
3729 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
3730 }
3731 Value *Addr = State.get(getAddr(), !CreateScatter);
3732 if (CreateScatter) {
3733 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
3734 Intrinsic::vp_scatter,
3735 {StoredVal, Addr, Mask, EVL});
3736 } else {
3737 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
3738 Intrinsic::vp_store,
3739 {StoredVal, Addr, Mask, EVL});
3740 }
3741 NewSI->addParamAttr(
3742 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
3743 applyMetadata(*NewSI);
3744}
3745
3746InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
3747                                                   VPCostContext &Ctx) const {
3748 if (!Consecutive || IsMasked)
3749 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
3750
3751 // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost()
3752 // here because the EVL recipes use EVL to replace the tail mask, while the
3753 // legacy model always accounts for the cost of the mask.
3754 // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we
3755 // no longer need to compare against the legacy cost model.
3756 Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
3757 const Align Alignment = getLoadStoreAlignment(&Ingredient);
3758 unsigned AS = getLoadStoreAddressSpace(&Ingredient);
3759 InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
3760 Instruction::Store, Ty, Alignment, AS, Ctx.CostKind);
3761 if (!Reverse)
3762 return Cost;
3763
3764 return Cost + Ctx.TTI.getShuffleCost(
3765                   TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
3766                   cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
3767}
3768
3769#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3771 VPSlotTracker &SlotTracker) const {
3772 O << Indent << "WIDEN vp.store ";
3774}
3775#endif
3776
3777static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
3778                                     VectorType *DstVTy, const DataLayout &DL) {
3779 // Verify that V is a vector with the same number of elements as DstVTy.
3780 auto VF = DstVTy->getElementCount();
3781 auto *SrcVecTy = cast<VectorType>(V->getType());
3782 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
3783 Type *SrcElemTy = SrcVecTy->getElementType();
3784 Type *DstElemTy = DstVTy->getElementType();
3785 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3786 "Vector elements must have same size");
3787
3788 // Do a direct cast if element types are castable.
3789 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3790 return Builder.CreateBitOrPointerCast(V, DstVTy);
3791 }
3792 // V cannot be cast directly to the desired vector type. This may happen
3793 // when V is a floating-point vector but DstVTy is a vector of pointers, or
3794 // vice versa. Handle this with a two-step cast through an intermediate
3795 // integer vector type, i.e. Ptr <-> Int <-> Float.
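 // For example (illustrative, assuming 64-bit pointers):
 //   <4 x double> --bitcast--> <4 x i64> --inttoptr--> <4 x ptr>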
3796 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3797 "Only one type should be a pointer type");
3798 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3799 "Only one type should be a floating point type");
3800 Type *IntTy =
3801 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3802 auto *VecIntTy = VectorType::get(IntTy, VF);
3803 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3804 return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
3805}
3806
3807/// Return a vector containing interleaved elements from multiple
3808/// smaller input vectors.
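/// For example (illustrative, fixed-width, factor 2): interleaving
/// <a0, a1> and <b0, b1> produces <a0, b0, a1, b1>.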
3809static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
3810                                const Twine &Name) {
3811 unsigned Factor = Vals.size();
3812 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
3813
3814 VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
3815#ifndef NDEBUG
3816 for (Value *Val : Vals)
3817 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
3818#endif
3819
3820 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
3821 // must use intrinsics to interleave.
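 // For factor 2 this results in something like the following (illustrative):
 //   call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(
 //            <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)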
3822 if (VecTy->isScalableTy()) {
3823 assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
3824 return Builder.CreateVectorInterleave(Vals, Name);
3825 }
3826
3827 // Fixed length. Start by concatenating all vectors into a wide vector.
3828 Value *WideVec = concatenateVectors(Builder, Vals);
3829
3830 // Interleave the elements into the wide vector.
3831 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
3832 return Builder.CreateShuffleVector(
3833 WideVec, createInterleaveMask(NumElts, Factor), Name);
3834}
3835
3836// Try to vectorize the interleave group that \p Instr belongs to.
3837//
3838// E.g. Translate following interleaved load group (factor = 3):
3839// for (i = 0; i < N; i+=3) {
3840// R = Pic[i]; // Member of index 0
3841// G = Pic[i+1]; // Member of index 1
3842// B = Pic[i+2]; // Member of index 2
3843// ... // do something to R, G, B
3844// }
3845// To:
3846// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
3847// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
3848// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
3849// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
3850//
3851// Or translate following interleaved store group (factor = 3):
3852// for (i = 0; i < N; i+=3) {
3853// ... do something to R, G, B
3854// Pic[i] = R; // Member of index 0
3855// Pic[i+1] = G; // Member of index 1
3856// Pic[i+2] = B; // Member of index 2
3857// }
3858// To:
3859// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
3860// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
3861// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
3862// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
3863// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
3864void VPInterleaveRecipe::execute(VPTransformState &State) {
3865 assert(!State.Lane && "Interleave group being replicated.");
3866 assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
3867 "Masking gaps for scalable vectors is not yet supported.");
3869 Instruction *Instr = Group->getInsertPos();
3870
3871 // Prepare for the vector type of the interleaved load/store.
3872 Type *ScalarTy = getLoadStoreType(Instr);
3873 unsigned InterleaveFactor = Group->getFactor();
3874 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
3875
3876 VPValue *BlockInMask = getMask();
3877 VPValue *Addr = getAddr();
3878 Value *ResAddr = State.get(Addr, VPLane(0));
3879
3880 auto CreateGroupMask = [&BlockInMask, &State,
3881 &InterleaveFactor](Value *MaskForGaps) -> Value * {
3882 if (State.VF.isScalable()) {
3883 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
3884 assert(InterleaveFactor <= 8 &&
3885 "Unsupported deinterleave factor for scalable vectors");
3886 auto *ResBlockInMask = State.get(BlockInMask);
3887 SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
3888 return interleaveVectors(State.Builder, Ops, "interleaved.mask");
3889 }
3890
3891 if (!BlockInMask)
3892 return MaskForGaps;
3893
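 // For example (illustrative), with a fixed VF of 4 and factor 3 the block
 // mask <m0, m1, m2, m3> is replicated lane-wise to
 //   <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3>.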
3894 Value *ResBlockInMask = State.get(BlockInMask);
3895 Value *ShuffledMask = State.Builder.CreateShuffleVector(
3896 ResBlockInMask,
3897 createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
3898 "interleaved.mask");
3899 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
3900 ShuffledMask, MaskForGaps)
3901 : ShuffledMask;
3902 };
3903
3904 const DataLayout &DL = Instr->getDataLayout();
3905 // Vectorize the interleaved load group.
3906 if (isa<LoadInst>(Instr)) {
3907 Value *MaskForGaps = nullptr;
3908 if (needsMaskForGaps()) {
3909 MaskForGaps =
3910 createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
3911 assert(MaskForGaps && "Mask for Gaps is required but it is null");
3912 }
3913
3914 Instruction *NewLoad;
3915 if (BlockInMask || MaskForGaps) {
3916 Value *GroupMask = CreateGroupMask(MaskForGaps);
3917 Value *PoisonVec = PoisonValue::get(VecTy);
3918 NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
3919 Group->getAlign(), GroupMask,
3920 PoisonVec, "wide.masked.vec");
3921 } else
3922 NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
3923 Group->getAlign(), "wide.vec");
3924 applyMetadata(*NewLoad);
3925 // TODO: Also manage existing metadata using VPIRMetadata.
3926 Group->addMetadata(NewLoad);
3927
3929 if (VecTy->isScalableTy()) {
3930 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
3931 // so must use intrinsics to deinterleave.
3932 assert(InterleaveFactor <= 8 &&
3933 "Unsupported deinterleave factor for scalable vectors");
3934 NewLoad = State.Builder.CreateIntrinsic(
3935 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
3936 NewLoad->getType(), NewLoad,
3937 /*FMFSource=*/nullptr, "strided.vec");
3938 }
3939
3940 auto CreateStridedVector = [&InterleaveFactor, &State,
3941 &NewLoad](unsigned Index) -> Value * {
3942 assert(Index < InterleaveFactor && "Illegal group index");
3943 if (State.VF.isScalable())
3944 return State.Builder.CreateExtractValue(NewLoad, Index);
3945
3946 // For fixed length VF, use shuffle to extract the sub-vectors from the
3947 // wide load.
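 // For example (illustrative), member index 1 with factor 3 and a fixed
 // VF of 4 uses the shuffle mask <1, 4, 7, 10>.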
3948 auto StrideMask =
3949 createStrideMask(Index, InterleaveFactor, State.VF.getFixedValue());
3950 return State.Builder.CreateShuffleVector(NewLoad, StrideMask,
3951 "strided.vec");
3952 };
3953
3954 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
3955 Instruction *Member = Group->getMember(I);
3956
3957 // Skip the gaps in the group.
3958 if (!Member)
3959 continue;
3960
3961 Value *StridedVec = CreateStridedVector(I);
3962
3963 // If this member has a different type, cast the result to that type.
3964 if (Member->getType() != ScalarTy) {
3965 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
3966 StridedVec =
3967 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
3968 }
3969
3970 if (Group->isReverse())
3971 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
3972
3973 State.set(VPDefs[J], StridedVec);
3974 ++J;
3975 }
3976 return;
3977 }
3978
3979 // The sub-vector type for the current instruction.
3980 auto *SubVT = VectorType::get(ScalarTy, State.VF);
3981
3982 // Vectorize the interleaved store group.
3983 Value *MaskForGaps =
3984 createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
3985 assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
3986 "Mismatch between NeedsMaskForGaps and MaskForGaps");
3987 ArrayRef<VPValue *> StoredValues = getStoredValues();
3988 // Collect the stored vector from each member.
3989 SmallVector<Value *, 4> StoredVecs;
3990 unsigned StoredIdx = 0;
3991 for (unsigned i = 0; i < InterleaveFactor; i++) {
3992 assert((Group->getMember(i) || MaskForGaps) &&
3993 "Fail to get a member from an interleaved store group");
3994 Instruction *Member = Group->getMember(i);
3995
3996 // Skip the gaps in the group.
3997 if (!Member) {
3998 Value *Undef = PoisonValue::get(SubVT);
3999 StoredVecs.push_back(Undef);
4000 continue;
4001 }
4002
4003 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4004 ++StoredIdx;
4005
4006 if (Group->isReverse())
4007 StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
4008
4009 // If this member has a different type, cast it to the unified type.
4010
4011 if (StoredVec->getType() != SubVT)
4012 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4013
4014 StoredVecs.push_back(StoredVec);
4015 }
4016
4017 // Interleave all the smaller vectors into one wider vector.
4018 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4019 Instruction *NewStoreInstr;
4020 if (BlockInMask || MaskForGaps) {
4021 Value *GroupMask = CreateGroupMask(MaskForGaps);
4022 NewStoreInstr = State.Builder.CreateMaskedStore(
4023 IVec, ResAddr, Group->getAlign(), GroupMask);
4024 } else
4025 NewStoreInstr =
4026 State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
4027
4028 applyMetadata(*NewStoreInstr);
4029 // TODO: Also manage existing metadata using VPIRMetadata.
4030 Group->addMetadata(NewStoreInstr);
4031}
4032
4033#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4035 VPSlotTracker &SlotTracker) const {
4037 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4038 IG->getInsertPos()->printAsOperand(O, false);
4039 O << ", ";
4041 VPValue *Mask = getMask();
4042 if (Mask) {
4043 O << ", ";
4044 Mask->printAsOperand(O, SlotTracker);
4045 }
4046
4047 unsigned OpIdx = 0;
4048 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4049 if (!IG->getMember(i))
4050 continue;
4051 if (getNumStoreOperands() > 0) {
4052 O << "\n" << Indent << " store ";
4054 O << " to index " << i;
4055 } else {
4056 O << "\n" << Indent << " ";
4058 O << " = load from index " << i;
4059 }
4060 ++OpIdx;
4061 }
4062}
4063#endif
4064
4065void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
4066 assert(!State.Lane && "Interleave group being replicated.");
4067 assert(State.VF.isScalable() &&
4068 "Only support scalable VF for EVL tail-folding.");
4070 "Masking gaps for scalable vectors is not yet supported.");
4072 Instruction *Instr = Group->getInsertPos();
4073
4074 // Prepare for the vector type of the interleaved load/store.
4075 Type *ScalarTy = getLoadStoreType(Instr);
4076 unsigned InterleaveFactor = Group->getFactor();
4077 assert(InterleaveFactor <= 8 &&
4078 "Unsupported deinterleave/interleave factor for scalable vectors");
4079 ElementCount WideVF = State.VF * InterleaveFactor;
4080 auto *VecTy = VectorType::get(ScalarTy, WideVF);
4081
4082 VPValue *Addr = getAddr();
4083 Value *ResAddr = State.get(Addr, VPLane(0));
4084 Value *EVL = State.get(getEVL(), VPLane(0));
4085 Value *InterleaveEVL = State.Builder.CreateMul(
4086 EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl",
4087 /* NUW= */ true, /* NSW= */ true);
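 // For example (illustrative): an EVL of 5 with an interleave factor of 4
 // makes the first 20 lanes of the wide vector active.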
4088 LLVMContext &Ctx = State.Builder.getContext();
4089
4090 Value *GroupMask = nullptr;
4091 if (VPValue *BlockInMask = getMask()) {
4092 SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask));
4093 GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask");
4094 } else {
4095 GroupMask =
4096 State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
4097 }
4098
4099 // Vectorize the interleaved load group.
4100 if (isa<LoadInst>(Instr)) {
4101 CallInst *NewLoad = State.Builder.CreateIntrinsic(
4102 VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr,
4103 "wide.vp.load");
4104 NewLoad->addParamAttr(0,
4105 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4106
4107 applyMetadata(*NewLoad);
4108 // TODO: Also manage existing metadata using VPIRMetadata.
4109 Group->addMetadata(NewLoad);
4110
4111 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4112 // so must use intrinsics to deinterleave.
4113 NewLoad = State.Builder.CreateIntrinsic(
4114 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
4115 NewLoad->getType(), NewLoad,
4116 /*FMFSource=*/nullptr, "strided.vec");
4117
4118 const DataLayout &DL = Instr->getDataLayout();
4119 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4120 Instruction *Member = Group->getMember(I);
4121 // Skip the gaps in the group.
4122 if (!Member)
4123 continue;
4124
4125 Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
4126 // If this member has a different type, cast the result to that type.
4127 if (Member->getType() != ScalarTy) {
4128 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
4129 StridedVec =
4130 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
4131 }
4132
4133 State.set(getVPValue(J), StridedVec);
4134 ++J;
4135 }
4136 return;
4137 } // End for interleaved load.
4138
4139 // The sub-vector type for the current instruction.
4140 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4141 // Vectorize the interleaved store group.
4142 ArrayRef<VPValue *> StoredValues = getStoredValues();
4143 // Collect the stored vector from each member.
4144 SmallVector<Value *, 4> StoredVecs;
4145 const DataLayout &DL = Instr->getDataLayout();
4146 for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
4147 Instruction *Member = Group->getMember(I);
4148 // Skip the gaps in the group.
4149 if (!Member) {
4150 StoredVecs.push_back(PoisonValue::get(SubVT));
4151 continue;
4152 }
4153
4154 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4155 // If this member has a different type, cast it to the unified type.
4156 if (StoredVec->getType() != SubVT)
4157 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4158
4159 StoredVecs.push_back(StoredVec);
4160 ++StoredIdx;
4161 }
4162
4163 // Interleave all the smaller vectors into one wider vector.
4164 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4165 CallInst *NewStore =
4166 State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store,
4167 {IVec, ResAddr, GroupMask, InterleaveEVL});
4168 NewStore->addParamAttr(1,
4169 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4170
4171 applyMetadata(*NewStore);
4172 // TODO: Also manage existing metadata using VPIRMetadata.
4173 Group->addMetadata(NewStore);
4174}
4175
4176#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4178 VPSlotTracker &SlotTracker) const {
4180 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4181 IG->getInsertPos()->printAsOperand(O, false);
4182 O << ", ";
4184 O << ", ";
4186 if (VPValue *Mask = getMask()) {
4187 O << ", ";
4188 Mask->printAsOperand(O, SlotTracker);
4189 }
4190
4191 unsigned OpIdx = 0;
4192 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4193 if (!IG->getMember(i))
4194 continue;
4195 if (getNumStoreOperands() > 0) {
4196 O << "\n" << Indent << " vp.store ";
4198 O << " to index " << i;
4199 } else {
4200 O << "\n" << Indent << " ";
4202 O << " = vp.load from index " << i;
4203 }
4204 ++OpIdx;
4205 }
4206}
4207#endif
4208
4210 VPCostContext &Ctx) const {
4211 Instruction *InsertPos = getInsertPos();
4212 // Find the VPValue index of the insert position, skipping gaps in the group.
4213 unsigned InsertPosIdx = 0;
4214 for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
4215 if (auto *Member = IG->getMember(Idx)) {
4216 if (Member == InsertPos)
4217 break;
4218 InsertPosIdx++;
4219 }
4220 Type *ValTy = Ctx.Types.inferScalarType(
4221 getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
4222 : getStoredValues()[InsertPosIdx]);
4223 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
4224 unsigned AS = getLoadStoreAddressSpace(InsertPos);
4225
4226 unsigned InterleaveFactor = IG->getFactor();
4227 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
4228
4229 // Holds the indices of existing members in the interleaved group.
4230 SmallVector<unsigned, 4> Indices;
4231 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
4232 if (IG->getMember(IF))
4233 Indices.push_back(IF);
4234
4235 // Calculate the cost of the whole interleaved group.
4236 InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
4237 InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
4238 IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
4239
4240 if (!IG->isReverse())
4241 return Cost;
4242
4243 return Cost + IG->getNumMembers() *
4244 Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
4245 VectorTy, VectorTy, {}, Ctx.CostKind,
4246 0);
4247}
4248
4249#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4251 VPSlotTracker &SlotTracker) const {
4252 O << Indent << "EMIT ";
4254 O << " = CANONICAL-INDUCTION ";
4256}
4257#endif
4258
4260 return IsScalarAfterVectorization &&
4261 (!IsScalable || vputils::onlyFirstLaneUsed(this));
4262}
4263
4264#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4266 VPSlotTracker &SlotTracker) const {
4267 assert((getNumOperands() == 3 || getNumOperands() == 5) &&
4268 "unexpected number of operands");
4269 O << Indent << "EMIT ";
4271 O << " = WIDEN-POINTER-INDUCTION ";
4273 O << ", ";
4275 O << ", ";
4277 if (getNumOperands() == 5) {
4278 O << ", ";
4280 O << ", ";
4282 }
4283}
4284
4286 VPSlotTracker &SlotTracker) const {
4287 O << Indent << "EMIT ";
4289 O << " = EXPAND SCEV " << *Expr;
4290}
4291#endif
4292
4294 Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
4295 Type *STy = CanonicalIV->getType();
4296 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
4297 ElementCount VF = State.VF;
4298 Value *VStart = VF.isScalar()
4299 ? CanonicalIV
4300 : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
4301 Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this));
4302 if (VF.isVector()) {
4303 VStep = Builder.CreateVectorSplat(VF, VStep);
4304 VStep =
4305 Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
4306 }
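 // For example (illustrative): for VF = 4 and unroll part 0 this produces
 //   vec.iv = <civ, civ + 1, civ + 2, civ + 3>
 // where civ is the scalar canonical IV.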
4307 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
4308 State.set(this, CanonicalVectorIV);
4309}
4310
4311#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4313 VPSlotTracker &SlotTracker) const {
4314 O << Indent << "EMIT ";
4316 O << " = WIDEN-CANONICAL-INDUCTION ";
4318}
4319#endif
4320
4322 auto &Builder = State.Builder;
4323 // Create a vector from the initial value.
4324 auto *VectorInit = getStartValue()->getLiveInIRValue();
4325
4326 Type *VecTy = State.VF.isScalar()
4327 ? VectorInit->getType()
4328 : VectorType::get(VectorInit->getType(), State.VF);
4329
4330 BasicBlock *VectorPH =
4331 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4332 if (State.VF.isVector()) {
4333 auto *IdxTy = Builder.getInt32Ty();
4334 auto *One = ConstantInt::get(IdxTy, 1);
4335 IRBuilder<>::InsertPointGuard Guard(Builder);
4336 Builder.SetInsertPoint(VectorPH->getTerminator());
4337 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
4338 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4339 VectorInit = Builder.CreateInsertElement(
4340 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
4341 }
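 // For example (illustrative): for VF = 4 the start value is placed in the
 // last lane of the preheader value:
 //   vector.recur.init = <poison, poison, poison, init>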
4342
4343 // Create a phi node for the new recurrence.
4344 PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
4345 Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
4346 Phi->addIncoming(VectorInit, VectorPH);
4347 State.set(this, Phi);
4348}
4349
4352 VPCostContext &Ctx) const {
4353 if (VF.isScalar())
4354 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
4355
4356 return 0;
4357}
4358
4359#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4361 VPSlotTracker &SlotTracker) const {
4362 O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
4364 O << " = phi ";
4366}
4367#endif
4368
4370 // Reductions do not have to start at zero. They can start with
4371 // any loop invariant values.
4372 VPValue *StartVPV = getStartValue();
4373
4374 // In order to support recurrences we need to be able to vectorize Phi nodes.
4375 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4376 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4377 // this value when we vectorize all of the instructions that use the PHI.
4378 BasicBlock *VectorPH =
4379 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4380 bool ScalarPHI = State.VF.isScalar() || IsInLoop;
4381 Value *StartV = State.get(StartVPV, ScalarPHI);
4382 Type *VecTy = StartV->getType();
4383
4384 BasicBlock *HeaderBB = State.CFG.PrevBB;
4385 assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
4386 "recipe must be in the vector loop header");
4387 auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
4388 Phi->insertBefore(HeaderBB->getFirstInsertionPt());
4389 State.set(this, Phi, IsInLoop);
4390
4391 Phi->addIncoming(StartV, VectorPH);
4392}
4393
4394#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4396 VPSlotTracker &SlotTracker) const {
4397 O << Indent << "WIDEN-REDUCTION-PHI ";
4398
4400 O << " = phi ";
4402 if (VFScaleFactor != 1)
4403 O << " (VF scaled by 1/" << VFScaleFactor << ")";
4404}
4405#endif
4406
4408 Value *Op0 = State.get(getOperand(0));
4409 Type *VecTy = Op0->getType();
4410 Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
4411 State.set(this, VecPhi);
4412}
4413
4414#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4416 VPSlotTracker &SlotTracker) const {
4417 O << Indent << "WIDEN-PHI ";
4418
4420 O << " = phi ";
4422}
4423#endif
4424
4425// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
4426// remove VPActiveLaneMaskPHIRecipe.
4428 BasicBlock *VectorPH =
4429 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4430 Value *StartMask = State.get(getOperand(0));
4431 PHINode *Phi =
4432 State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
4433 Phi->addIncoming(StartMask, VectorPH);
4434 State.set(this, Phi);
4435}
4436
4437#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4439 VPSlotTracker &SlotTracker) const {
4440 O << Indent << "ACTIVE-LANE-MASK-PHI ";
4441
4443 O << " = phi ";
4445}
4446#endif
4447
4448#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4450 VPSlotTracker &SlotTracker) const {
4451 O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
4452
4454 O << " = phi ";
4456}
4457#endif
static SDValue Widen(SelectionDAG *CurDAG, SDValue N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static MCDisassembler::DecodeStatus addOperand(MCInst &Inst, const MCOperand &Opnd)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file provides a LoopVectorizationPlanner class.
#define I(x, y, z)
Definition MD5.cpp:58
static bool isOrdered(const Instruction *I)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
if(PassOpts->AAPipeline)
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file contains the declarations of different VPlan-related auxiliary helpers.
static Instruction * createReverseEVL(IRBuilderBase &Builder, Value *Operand, Value *EVL, const Twine &Name)
Use all-true mask for reverse rather than actual mask, as it avoids a dependence w/o affecting the re...
static Value * interleaveVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vals, const Twine &Name)
Return a vector containing interleaved elements from multiple smaller input vectors.
static InstructionCost getCostForIntrinsics(Intrinsic::ID ID, ArrayRef< const VPValue * > Operands, const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx)
Compute the cost for the intrinsic ID with Operands, produced by R.
static Value * createBitOrPointerCast(IRBuilderBase &Builder, Value *V, VectorType *DstVTy, const DataLayout &DL)
static Type * getGEPIndexTy(bool IsScalable, bool IsReverse, bool IsUnitStride, unsigned CurrentPart, IRBuilderBase &Builder)
SmallVector< Value *, 2 > VectorParts
static bool isUsedByLoadStoreAddress(const VPUser *V)
Returns true if V is used as part of the address of another load or store.
static void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPLane &Lane, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
static bool shouldUseAddressAccessSCEV(const VPValue *Ptr)
Returns true if Ptr is a pointer computation for which the legacy cost model computes a SCEV expressi...
static Constant * getSignedIntOrFpConstant(Type *Ty, int64_t C)
A helper function that returns an integer or floating-point constant with value C.
static BranchInst * createCondBranch(Value *Cond, VPBasicBlock *VPBB, VPTransformState &State)
Create a conditional branch using Cond branching to the successors of VPBB.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
This file contains the declarations of the Vectorization Plan base classes:
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
Conditional or Unconditional Branch instruction.
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
static LLVM_ABI StringRef getPredicateName(Predicate P)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:163
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
A debug info location.
Definition DebugLoc.h:124
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:325
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:313
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:310
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:321
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
void setAllowContract(bool B=true)
Definition FMF.h:90
bool noSignedZeros() const
Definition FMF.h:67
bool noInfs() const
Definition FMF.h:66
void setAllowReciprocal(bool B=true)
Definition FMF.h:87
bool allowReciprocal() const
Definition FMF.h:68
LLVM_ABI void print(raw_ostream &O) const
Print fast-math flags to O.
Definition Operator.cpp:271
void setNoSignedZeros(bool B=true)
Definition FMF.h:84
bool allowReassoc() const
Flag queries.
Definition FMF.h:64
bool approxFunc() const
Definition FMF.h:70
void setNoNaNs(bool B=true)
Definition FMF.h:78
void setAllowReassoc(bool B=true)
Flag setters.
Definition FMF.h:75
bool noNaNs() const
Definition FMF.h:65
void setApproxFunc(bool B=true)
Definition FMF.h:93
void setNoInfs(bool B=true)
Definition FMF.h:81
bool allowContract() const
Definition FMF.h:69
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
bool willReturn() const
Determine if the function will return.
Definition Function.h:661
bool doesNotThrow() const
Determine if the function cannot unwind.
Definition Function.h:594
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:214
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2579
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2633
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2567
LLVM_ABI Value * CreateVectorSplice(Value *V1, Value *V2, int64_t Imm, const Twine &Name="")
Return a vector splice intrinsic if using scalable vectors, otherwise return a shufflevector.
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2626
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2645
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:562
Value * CreatePtrAdd(Value *Ptr, Value *Offset, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:2039
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition IRBuilder.h:567
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2336
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
LLVM_ABI CallInst * CreateOrReduce(Value *Src)
Create a vector int OR reduction intrinsic of the source vector.
Value * CreateLogicalAnd(Value *Cond1, Value *Cond2, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.h:1725
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:522
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2466
Value * CreateNot(Value *V, const Twine &Name="")
Definition IRBuilder.h:1808
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2332
Value * CreateCountTrailingZeroElems(Type *ResTy, Value *Mask, bool ZeroIsPoison=true, const Twine &Name="")
Create a call to llvm.experimental_cttz_elts.
Definition IRBuilder.h:1134
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1420
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2085
LLVMContext & getContext() const
Definition IRBuilder.h:203
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1403
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition IRBuilder.h:507
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1708
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2442
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1573
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1437
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isUnaryOp() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
The group of interleaved loads/stores sharing the same stride and close to each other.
uint32_t getFactor() const
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
bool isReverse() const
InstTy * getInsertPos() const
void addMetadata(InstTy *NewInst) const
Add metadata (e.g.
Align getAlign() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
This class emits a version of the loop where run-time checks ensure that may-alias pointers can't ove...
std::pair< MDNode *, MDNode * > getNoAliasMetadataFor(const Instruction *OrigInst) const
Returns a pair containing the alias_scope and noalias metadata nodes for OrigInst,...
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
static bool isSignedRecurrenceKind(RecurKind Kind)
Returns true if recurrece kind is a signed redux kind.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isFindLastIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents the LLVM 'select' instruction.
This class provides computation of slot numbers for LLVM Assembly writing.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
@ TCC_Free
Expected to fold away in lowering.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Reverse
Reverse the order of the vector.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:281
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
value_op_iterator value_op_end()
Definition User.h:313
void setOperand(unsigned i, Value *Val)
Definition User.h:237
Value * getOperand(unsigned i) const
Definition User.h:232
value_op_iterator value_op_begin()
Definition User.h:310
void execute(VPTransformState &State) override
Generate the active lane mask phi of the vector loop.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3800
RecipeListTy & getRecipeList()
Returns a reference to the list of recipes.
Definition VPlan.h:3853
iterator end()
Definition VPlan.h:3837
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition VPlan.h:3866
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenMemoryRecipe.
VPValue * getIncomingValue(unsigned Idx) const
Return incoming value number Idx.
Definition VPlan.h:2435
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2430
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:80
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:203
VPlan * getPlan()
Definition VPlan.cpp:165
void printAsOperand(raw_ostream &OS, bool PrintType=false) const
Definition VPlan.h:348
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:197
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPBranchOnMaskRecipe.
void execute(VPTransformState &State) override
Generate the extraction of the appropriate bit from the block mask and the conditional branch.
VPlan-based builder utility analogous to IRBuilder.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
This class augments a recipe with a set of VPValues defined by the recipe.
Definition VPlanValue.h:302
void dump() const
Dump the VPDef to stderr (for debugging).
Definition VPlan.cpp:126
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:424
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:419
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:397
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:409
friend class VPValue
Definition VPlanValue.h:303
unsigned getVPDefID() const
Definition VPlanValue.h:429
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getStepValue() const
Definition VPlan.h:3677
VPValue * getStartValue() const
Definition VPlan.h:3676
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void decompose()
Insert the recipes of the expression back into the VPlan, directly before the current recipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool isSingleScalar() const
Returns true if the result of this VPExpressionRecipe is a single-scalar.
bool mayHaveSideEffects() const
Returns true if this expression contains recipes that may have side effects.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Compute the cost of this recipe either using a recipe's specialized implementation or using the legac...
bool mayReadOrWriteMemory() const
Returns true if this expression contains recipes that may read from or write to memory.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this header phi recipe.
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2018
void execute(VPTransformState &State) override
Produce a vectorized histogram operation.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPHistogramRecipe.
VPValue * getMask() const
Return the mask operand if one was provided, or a null pointer if all lanes should be executed uncond...
Definition VPlan.h:1714
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Class to record and manage LLVM IR flags.
Definition VPlan.h:596
FastMathFlagsTy FMFs
Definition VPlan.h:660
bool flagsValidForOpcode(unsigned Opcode) const
Returns true if the set flags are valid for Opcode.
WrapFlagsTy WrapFlags
Definition VPlan.h:654
CmpInst::Predicate CmpPredicate
Definition VPlan.h:653
void printFlags(raw_ostream &O) const
GEPNoWrapFlags GEPFlags
Definition VPlan.h:658
bool hasFastMathFlags() const
Returns true if the recipe has fast-math flags.
Definition VPlan.h:818
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
TruncFlagsTy TruncFlags
Definition VPlan.h:655
CmpInst::Predicate getPredicate() const
Definition VPlan.h:800
ExactFlagsTy ExactFlags
Definition VPlan.h:657
bool hasNoSignedWrap() const
Definition VPlan.h:842
void intersectFlags(const VPIRFlags &Other)
Only keep flags also present in Other.
GEPNoWrapFlags getGEPNoWrapFlags() const
Definition VPlan.h:812
bool hasPredicate() const
Returns true if the recipe has a comparison predicate.
Definition VPlan.h:815
DisjointFlagsTy DisjointFlags
Definition VPlan.h:656
unsigned AllFlags
Definition VPlan.h:661
bool hasNoUnsignedWrap() const
Definition VPlan.h:831
NonNegFlagsTy NonNegFlags
Definition VPlan.h:659
void applyFlags(Instruction &I) const
Apply the IR flags to I.
Definition VPlan.h:763
Instruction & getInstruction() const
Definition VPlan.h:1379
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void extractLastLaneOfFirstOperand(VPBuilder &Builder)
Update the recipes first operand to the last lane of the operand using Builder.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPIRInstruction.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPIRInstruction(Instruction &I)
VPIRInstruction::create() should be used to create VPIRInstructions, as subclasses may need to be cre...
Definition VPlan.h:1354
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetada object with MD, keeping only metadata nodes that are common to both.
void applyMetadata(Instruction &I) const
Add all metadata to I.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the instruction.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPInstruction.
VPInstruction(unsigned Opcode, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Definition VPlan.h:1104
bool doesGeneratePerAllLanes() const
Returns true if this VPInstruction generates scalar values for all lanes.
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1063
@ ComputeAnyOfResult
Compute the final result of a AnyOf reduction with select(cmp(),x,y), where one of (x,...
Definition VPlan.h:1017
@ WideIVStep
Scale the first operand (vector step) by the second operand (scalar-step).
Definition VPlan.h:1053
@ ResumeForEpilogue
Explicit user for the resume phi of the canonical induction in the main VPlan, used by the epilogue v...
Definition VPlan.h:1066
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1014
@ FirstOrderRecurrenceSplice
Definition VPlan.h:985
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1057
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1009
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1006
@ VScale
Returns the value for vscale.
Definition VPlan.h:1068
@ CanonicalIVIncrementForPart
Definition VPlan.h:999
@ CalculateTripCountMinusVF
Definition VPlan.h:997
bool hasResult() const
Definition VPlan.h:1143
bool opcodeMayReadOrWriteFromMemory() const
Returns true if the underlying opcode may read from or write to memory.
LLVM_DUMP_METHOD void dump() const
Print the VPInstruction to dbgs() (for debugging).
StringRef getName() const
Returns the symbolic name assigned to the VPInstruction.
Definition VPlan.h:1183
unsigned getOpcode() const
Definition VPlan.h:1123
bool onlyFirstPartUsed(const VPValue *Op) const override
Returns true if the recipe only uses the first part of operand Op.
bool isVectorToScalar() const
Returns true if this VPInstruction produces a scalar value from a vector, e.g.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the VPInstruction to O.
bool onlyFirstLaneUsed(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
bool isSingleScalar() const
Returns true if this VPInstruction's operands are single scalars and the result is also a single scal...
void execute(VPTransformState &State) override
Generate the instruction.
bool needsMaskForGaps() const
Return true if the access needs a mask because of the gaps.
Definition VPlan.h:2545
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this recipe.
Instruction * getInsertPos() const
Definition VPlan.h:2549
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2547
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2539
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2568
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:2533
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:2642
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:2661
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:2612
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
static VPLane getLastLaneForVF(const ElementCount &VF)
static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset)
static VPLane getFirstLane()
void execute(VPTransformState &State) override
Generate the reduction in the loop.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPPartialReductionRecipe.
unsigned getOpcode() const
Get the binary op's opcode.
Definition VPlan.h:2808
virtual const VPRecipeBase * getAsRecipe() const =0
Return a VPRecipeBase* to the current object.
virtual unsigned getNumIncoming() const
Returns the number of incoming values, also number of incoming blocks.
Definition VPlan.h:1269
void removeIncomingValueFor(VPBlockBase *IncomingBlock) const
Removes the incoming value for IncomingBlock, which must be a predecessor.
const VPBasicBlock * getIncomingBlock(unsigned Idx) const
Returns the incoming block with index Idx.
Definition VPlan.h:3944
detail::zippy< llvm::detail::zip_first, VPUser::const_operand_range, const_incoming_blocks_range > incoming_values_and_blocks() const
Returns an iterator range over pairs of incoming values and corresponding incoming blocks.
Definition VPlan.h:1294
VPValue * getIncomingValue(unsigned Idx) const
Returns the incoming VPValue with index Idx.
Definition VPlan.h:1261
void printPhiOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the recipe.
void execute(VPTransformState &State) override
Generates phi nodes for live-outs (from a replicate region) as needed to retain SSA form.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:386
bool mayReadFromMemory() const
Returns true if the recipe may read from memory.
bool mayHaveSideEffects() const
Returns true if the recipe may have side-effects.
VPRegionBlock * getRegion()
Definition VPlan.h:4099
bool isPhi() const
Returns true for PHI-like recipes.
bool mayWriteToMemory() const
Returns true if the recipe may write to memory.
virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const
Compute the cost of this recipe either using a recipe's specialized implementation or using the legacy cost model and the underlying instructions.
VPBasicBlock * getParent()
Definition VPlan.h:407
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:478
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this recipe, taking into account if the cost computation should be skipped and the ForceTargetInstructionCost flag.
bool isScalarCast() const
Return true if the recipe is a scalar cast.
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
void moveAfter(VPRecipeBase *MovePos)
Unlink this recipe from its current VPBasicBlock and insert it into the VPBasicBlock that MovePos lives in, right after MovePos.
VPRecipeBase(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:397
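As a hedged illustration of the insertion and movement helpers above, a minimal sketch (R, Anchor, and Dead are hypothetical VPRecipeBase pointers owned by a transform; Dead is assumed to have no remaining users):
// Detach R from its current VPBasicBlock and re-insert it right after Anchor,
// then unlink Dead from its block and delete it.
void reorderAndErase(VPRecipeBase *R, VPRecipeBase *Anchor, VPRecipeBase *Dead) {
  R->moveAfter(Anchor);
  Dead->eraseFromParent();
}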
void execute(VPTransformState &State) override
Generate the reduction in the loop.
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:2853
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool isConditional() const
Return true if the in-loop reduction is conditional.
Definition VPlan.h:2750
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of VPReductionRecipe.
VPValue * getVecOp() const
The VPValue of the vector value to be reduced.
Definition VPlan.h:2754
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getCondOp() const
The VPValue of the condition for the block.
Definition VPlan.h:2756
RecurKind getRecurrenceKind() const
Return the recurrence kind for the in-loop reduction.
Definition VPlan.h:2746
VPValue * getChainOp() const
The VPValue of the scalar Chain being accumulated.
Definition VPlan.h:2752
void execute(VPTransformState &State) override
Generate the reduction in the loop.
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-Single-Exiting subgraph of the output IR CFG.
Definition VPlan.h:3988
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR corresponding to its VPBlockBases.
Definition VPlan.h:4056
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original scalar type, one per lane, instead of its vector counterpart.
Definition VPlan.h:2868
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool isSingleScalar() const
Definition VPlan.h:2913
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPReplicateRecipe.
unsigned getOpcode() const
Definition VPlan.h:2942
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getStepValue() const
Definition VPlan.h:3742
void execute(VPTransformState &State) override
Generate the scalarized versions of the phi node as needed by its users.
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define a single result VPValue.
Definition VPlan.h:517
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:582
LLVM_DUMP_METHOD void dump() const
Print this VPSingleDefRecipe to dbgs() (for debugging).
VPSingleDefRecipe(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:519
This class can be used to assign names to VPValues.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
Helper to access the operand that contains the unroll part for this recipe after unrolling.
Definition VPlan.h:926
VPValue * getUnrollPartOperand(const VPUser &U) const
Return the VPValue operand containing the unroll part or null if there is no such operand.
unsigned getUnrollPart(const VPUser &U) const
Return the unroll part.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's users to its operands.
Definition VPlanValue.h:199
void printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the operands to O.
Definition VPlan.cpp:1437
operand_range operands()
Definition VPlanValue.h:267
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:243
unsigned getNumOperands() const
Definition VPlanValue.h:237
operand_iterator op_begin()
Definition VPlanValue.h:263
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:238
virtual bool onlyFirstLaneUsed(const VPValue *Op) const
Returns true if the VPUser only uses the first lane of operand Op.
Definition VPlanValue.h:282
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1391
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe, i.e. it is a live-in.
Definition VPlan.cpp:135
friend class VPExpressionRecipe
Definition VPlanValue.h:53
void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const
Definition VPlan.cpp:1433
bool hasMoreThanOneUniqueUser() const
Returns true if the value has more than one unique user.
Definition VPlanValue.h:140
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition VPlanValue.h:176
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:85
VPValue(const unsigned char SC, Value *UV=nullptr, VPDef *Def=nullptr)
Definition VPlan.cpp:98
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1394
user_iterator user_begin()
Definition VPlanValue.h:130
unsigned getNumUsers() const
Definition VPlanValue.h:113
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition VPlanValue.h:171
user_range users()
Definition VPlanValue.h:134
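A small sketch built only from the VPUser/VPValue operand and user APIs above; the helper itself and its first-lane policy are illustrative, not part of VPlan:
void replaceWideUsesOnly(VPValue *OldV, VPValue *NewV) {
  // Copy the user list first: setOperand re-registers users on OldV/NewV.
  for (VPUser *U : to_vector(OldV->users()))
    for (unsigned I = 0, E = U->getNumOperands(); I != E; ++I)
      if (U->getOperand(I) == OldV && !U->onlyFirstLaneUsed(OldV))
        U->setOperand(I, NewV);
}
When every use should be rewritten unconditionally, OldV->replaceAllUsesWith(NewV) does the same in one call.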
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe, thereby "executing" the VPlan.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getSourceElementType() const
Definition VPlan.h:1918
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe, thereby "executing" the VPlan.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
operand_range args()
Definition VPlan.h:1671
Function * getCalledScalarFunction() const
Definition VPlan.h:1667
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCallRecipe.
void execute(VPTransformState &State) override
Produce a widened version of the call instruction.
void execute(VPTransformState &State) override
Generate a canonical vector induction variable of the vector loop, with start = {<Part*VF,...
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getResultType() const
Returns the result type of the cast.
Definition VPlan.h:1540
void execute(VPTransformState &State) override
Produce widened copies of the cast.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCastRecipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the gep nodes.
Type * getSourceElementType() const
Definition VPlan.h:1815
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2074
TruncInst * getTruncInst()
Returns the first defined value as a TruncInst if it is one, or nullptr otherwise.
Definition VPlan.h:2185
Type * getScalarType() const
Returns the scalar type of the induction.
Definition VPlan.h:2194
bool isCanonical() const
Returns true if the induction is canonical, i.e.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool onlyFirstLaneUsed(const VPValue *Op) const override
Returns true if the VPUser only uses the first lane of operand Op.
Intrinsic::ID getVectorIntrinsicID() const
Return the ID of the intrinsic.
Definition VPlan.h:1605
StringRef getIntrinsicName() const
Return the name of the intrinsic as a string.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getResultType() const
Return the scalar return type of the intrinsic.
Definition VPlan.h:1608
void execute(VPTransformState &State) override
Produce a widened version of the vector intrinsic.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this vector intrinsic.
bool IsMasked
Whether the memory access is masked.
Definition VPlan.h:3180
bool Reverse
Whether the consecutive accessed addresses are in reverse order.
Definition VPlan.h:3177
bool isConsecutive() const
Return whether the loaded-from / stored-to addresses are consecutive.
Definition VPlan.h:3217
Instruction & Ingredient
Definition VPlan.h:3171
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenMemoryRecipe.
bool Consecutive
Whether the accessed addresses are consecutive.
Definition VPlan.h:3174
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3231
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:3224
bool isReverse() const
Return whether the consecutive loaded/stored addresses are in reverse order.
Definition VPlan.h:3221
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool onlyScalarsGenerated(bool IsScalable)
Returns true if only scalar values will be generated.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1443
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
void execute(VPTransformState &State) override
Produce a widened instruction using the opcode and operands of the recipe, processing State.VF elements.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getUF() const
Definition VPlan.h:4355
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1027
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:390
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1099
void mutateType(Type *Ty)
Mutate the type of this Value to be of the specified type.
Definition Value.h:838
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:201
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:169
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:253
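A brief sketch of the ElementCount queries listed above, with illustrative values:
ElementCount VF = ElementCount::getScalable(4); // <vscale x 4 x ...>
unsigned MinLanes = VF.getKnownMinValue();      // 4; scaled by vscale at runtime
bool Scalable = VF.isScalable();                // true
ElementCount Half = VF.divideCoefficientBy(2);  // <vscale x 2 x ...>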
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
iterator erase(iterator where)
Definition ilist.h:204
pointer remove(iterator &IT)
Definition ilist.h:188
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor)
Returns the corresponding llvm.vector.deinterleaveN intrinsic for factor N.
LLVM_ABI StringRef getBaseName(ID id)
Return the LLVM name for an intrinsic, without encoded types for overloading, such as "llvm.ssa.copy".
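A hedged sketch of the Intrinsic helpers above; M is an assumed Module*, WideTy an assumed vector type, and the overload type list passed to getOrInsertDeclaration is a guess for illustration:
Intrinsic::ID DeintID = Intrinsic::getDeinterleaveIntrinsicID(/*Factor=*/2);
StringRef Base = Intrinsic::getBaseName(DeintID); // e.g. "llvm.vector.deinterleave2"
Function *Decl = Intrinsic::getOrInsertDeclaration(M, DeintID, {WideTy});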
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
bool match(Val *V, const Pattern &P)
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
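A hedged sketch of the pattern matchers above; the helper is illustrative and assumes the matcher overload that binds a VPValue*&:
bool matchSelect(VPValue *V, VPValue *&Cond, VPValue *&TrueV, VPValue *&FalseV) {
  using namespace llvm::VPlanPatternMatch;
  return match(V, m_Select(m_VPValue(Cond), m_VPValue(TrueV), m_VPValue(FalseV)));
}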
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or only its first lane is used.
Definition VPlanUtils.h:44
bool onlyFirstPartUsed(const VPValue *Def)
Returns true if only the first part of Def is used.
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:829
LLVM_ABI Value * createFindLastIVReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind, Value *Start, Value *Sentinel)
Create a reduction of the given vector Src for a reduction of the kind RecurKind::FindLastIV.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2231
auto cast_or_null(const Y &Val)
Definition Casting.h:714
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
LLVM_ABI Value * createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, Value *Right)
Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
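A minimal sketch of the reduction helpers above; B is an IRBuilderBase and Src, LHS, RHS are assumed IR values:
Value *Sum = createSimpleReduction(B, Src, RecurKind::Add); // horizontal add of Src's lanes
Value *Max = createMinMaxOp(B, RecurKind::SMax, LHS, RHS);  // signed maximum of LHS and RHS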
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
LLVM_ABI Constant * createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Type * toVectorizedTy(Type *Ty, ElementCount EC)
A helper for converting to vectorized types.
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1735
cl::opt< unsigned > ForceTargetInstructionCost
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
@ Other
Any other memory.
Definition ModRef.h:68
bool canVectorizeTy(Type *Ty)
Returns true if Ty is a valid vector element type, void, or an unpacked literal struct where all elem...
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
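Illustrative results for the shuffle-mask helpers listed above, assuming their documented semantics:
SmallVector<int, 16> Stride = createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
// Stride == <0, 2, 4, 6>: every second element starting at 0
SmallVector<int, 16> Interleave = createInterleaveMask(/*VF=*/4, /*NumVecs=*/2);
// Interleave == <0, 4, 1, 5, 2, 6, 3, 7>: zips two 4-element vectors
SmallVector<int, 16> Replicated = createReplicatedMask(/*ReplicationFactor=*/3, /*VF=*/2);
// Replicated == <0, 0, 0, 1, 1, 1>: repeats each lane three times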
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Mul
Product of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
DWARFExpression::Operation Op
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
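A hedged sketch of the VF helpers above; B, an integer type Ty, and VF are assumed to be in scope:
Value *RuntimeVF = getRuntimeVF(B, Ty, VF);            // VF as an IR value (vscale-based when scalable)
Value *TwoVF = createStepForVF(B, Ty, VF, /*Step=*/2); // 2 * VF as an IR value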
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
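The load/store helpers listed above compose naturally; a short sketch for an Instruction *I known to be a load or store:
Type *AccessTy = getLoadStoreType(I);             // type of the loaded/stored value
const Value *Ptr = getLoadStorePointerOperand(I); // pointer operand
Align Alignment = getLoadStoreAlignment(I);       // alignment of the access
unsigned AddrSpace = getLoadStoreAddressSpace(I); // address space of the pointer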
LLVM_ABI Value * createOrderedReduction(IRBuilderBase &B, RecurKind RdxKind, Value *Src, Value *Start)
Create an ordered reduction intrinsic using the given recurrence kind RdxKind.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
unsigned getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind)
A helper function that returns how much we should divide the cost of a predicated block by.
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI Value * createAnyOfReduction(IRBuilderBase &B, Value *Src, Value *InitVal, PHINode *OrigPhi)
Create a reduction of the given vector Src for a reduction of kind RecurKind::AnyOf.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdIdx.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Struct to hold various analysis needed for cost computations.
void execute(VPTransformState &State) override
Generate the phi nodes.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this first-order recurrence phi recipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
An overlay for VPIRInstructions wrapping PHI nodes enabling convenient use of cast/dyn_cast/isa and exec...
Definition VPlan.h:1416
PHINode & getIRPhi()
Definition VPlan.h:1424
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe, thereby "executing" the VPlan.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the instruction.
A pure-virtual common base class for recipes defining a single VPValue and using IR flags.
Definition VPlan.h:871
InstructionCost getCostForRecipeWithOpcode(unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const
Compute the cost for this recipe for VF, using Opcode and Ctx.
VPRecipeWithIRFlags(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:872
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the output IR.
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
Value * get(const VPValue *Def, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def if IsScalar is false, otherwise return the generated scalar.
Definition VPlan.cpp:267
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
ElementCount VF
The chosen Vectorization Factor of the loop being vectorized.
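A minimal sketch of VPTransformState::get as it would be used inside a recipe's execute(); the operand index is illustrative:
Value *WideOp = State.get(getOperand(0));                      // generated vector value for the operand
Value *ScalarOp = State.get(getOperand(0), /*IsScalar=*/true); // generated scalar value for the operand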
void execute(VPTransformState &State) override
Generate the wide load or gather.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenLoadEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3304
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate a wide load or gather.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool isInvariantCond() const
Definition VPlan.h:1760
VPValue * getCond() const
Definition VPlan.h:1756
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenSelectRecipe.
void execute(VPTransformState &State) override
Produce a widened version of the select instruction.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition VPlan.h:3385
void execute(VPTransformState &State) override
Generate the wide store or scatter.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenStoreEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3388
void execute(VPTransformState &State) override
Generate a wide store or scatter.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition VPlan.h:3349
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.