1//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains implementations for different VPlan recipes.
11///
12//===----------------------------------------------------------------------===//
13
15#include "VPlan.h"
16#include "VPlanAnalysis.h"
17#include "VPlanHelpers.h"
18#include "VPlanPatternMatch.h"
19#include "VPlanUtils.h"
20#include "llvm/ADT/STLExtras.h"
22#include "llvm/ADT/Twine.h"
26#include "llvm/IR/BasicBlock.h"
27#include "llvm/IR/IRBuilder.h"
28#include "llvm/IR/Instruction.h"
30#include "llvm/IR/Intrinsics.h"
31#include "llvm/IR/Type.h"
32#include "llvm/IR/Value.h"
35#include "llvm/Support/Debug.h"
39#include <cassert>
40
41using namespace llvm;
42using namespace llvm::VPlanPatternMatch;
43
45
46#define LV_NAME "loop-vectorize"
47#define DEBUG_TYPE LV_NAME
48
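// Conservative queries over the recipe kind (VPDefID): the switches below
// report whether a recipe may write to memory, read from memory, or have
// other side effects. Recipe kinds not listed explicitly fall through to the
// safe default of returning true.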
49bool VPRecipeBase::mayWriteToMemory() const {
50 switch (getVPDefID()) {
51 case VPExpressionSC:
52 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
53 case VPInstructionSC:
54 return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
55 case VPInterleaveEVLSC:
56 case VPInterleaveSC:
57 return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
58 case VPWidenStoreEVLSC:
59 case VPWidenStoreSC:
60 return true;
61 case VPReplicateSC:
62 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
63 ->mayWriteToMemory();
64 case VPWidenCallSC:
65 return !cast<VPWidenCallRecipe>(this)
66 ->getCalledScalarFunction()
67 ->onlyReadsMemory();
68 case VPWidenIntrinsicSC:
69 return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
70 case VPCanonicalIVPHISC:
71 case VPBranchOnMaskSC:
72 case VPDerivedIVSC:
73 case VPFirstOrderRecurrencePHISC:
74 case VPReductionPHISC:
75 case VPScalarIVStepsSC:
76 case VPPredInstPHISC:
77 return false;
78 case VPBlendSC:
79 case VPReductionEVLSC:
80 case VPReductionSC:
81 case VPVectorPointerSC:
82 case VPWidenCanonicalIVSC:
83 case VPWidenCastSC:
84 case VPWidenGEPSC:
85 case VPWidenIntOrFpInductionSC:
86 case VPWidenLoadEVLSC:
87 case VPWidenLoadSC:
88 case VPWidenPHISC:
89 case VPWidenPointerInductionSC:
90 case VPWidenSC:
91 case VPWidenSelectSC: {
92 const Instruction *I =
93 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
94 (void)I;
95 assert((!I || !I->mayWriteToMemory()) &&
96 "underlying instruction may write to memory");
97 return false;
98 }
99 default:
100 return true;
101 }
102}
103
104bool VPRecipeBase::mayReadFromMemory() const {
105 switch (getVPDefID()) {
106 case VPExpressionSC:
107 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
108 case VPInstructionSC:
109 return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
110 case VPWidenLoadEVLSC:
111 case VPWidenLoadSC:
112 return true;
113 case VPReplicateSC:
114 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
115 ->mayReadFromMemory();
116 case VPWidenCallSC:
117 return !cast<VPWidenCallRecipe>(this)
118 ->getCalledScalarFunction()
119 ->onlyWritesMemory();
120 case VPWidenIntrinsicSC:
121 return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
122 case VPBranchOnMaskSC:
123 case VPDerivedIVSC:
124 case VPFirstOrderRecurrencePHISC:
125 case VPPredInstPHISC:
126 case VPScalarIVStepsSC:
127 case VPWidenStoreEVLSC:
128 case VPWidenStoreSC:
129 return false;
130 case VPBlendSC:
131 case VPReductionEVLSC:
132 case VPReductionSC:
133 case VPVectorPointerSC:
134 case VPWidenCanonicalIVSC:
135 case VPWidenCastSC:
136 case VPWidenGEPSC:
137 case VPWidenIntOrFpInductionSC:
138 case VPWidenPHISC:
139 case VPWidenPointerInductionSC:
140 case VPWidenSC:
141 case VPWidenSelectSC: {
142 const Instruction *I =
143 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
144 (void)I;
145 assert((!I || !I->mayReadFromMemory()) &&
146 "underlying instruction may read from memory");
147 return false;
148 }
149 default:
150 // FIXME: Return false if the recipe represents an interleaved store.
151 return true;
152 }
153}
154
155bool VPRecipeBase::mayHaveSideEffects() const {
156 switch (getVPDefID()) {
157 case VPExpressionSC:
158 return cast<VPExpressionRecipe>(this)->mayHaveSideEffects();
159 case VPDerivedIVSC:
160 case VPFirstOrderRecurrencePHISC:
161 case VPPredInstPHISC:
162 case VPVectorEndPointerSC:
163 return false;
164 case VPInstructionSC: {
165 auto *VPI = cast<VPInstruction>(this);
166 return mayWriteToMemory() ||
167 VPI->getOpcode() == VPInstruction::BranchOnCount ||
168 VPI->getOpcode() == VPInstruction::BranchOnCond;
169 }
170 case VPWidenCallSC: {
171 Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
172 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
173 }
174 case VPWidenIntrinsicSC:
175 return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
176 case VPBlendSC:
177 case VPReductionEVLSC:
178 case VPPartialReductionSC:
179 case VPReductionSC:
180 case VPScalarIVStepsSC:
181 case VPVectorPointerSC:
182 case VPWidenCanonicalIVSC:
183 case VPWidenCastSC:
184 case VPWidenGEPSC:
185 case VPWidenIntOrFpInductionSC:
186 case VPWidenPHISC:
187 case VPWidenPointerInductionSC:
188 case VPWidenSC:
189 case VPWidenSelectSC: {
190 const Instruction *I =
191 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
192 (void)I;
193 assert((!I || !I->mayHaveSideEffects()) &&
194 "underlying instruction has side-effects");
195 return false;
196 }
197 case VPInterleaveEVLSC:
198 case VPInterleaveSC:
199 return mayWriteToMemory();
200 case VPWidenLoadEVLSC:
201 case VPWidenLoadSC:
202 case VPWidenStoreEVLSC:
203 case VPWidenStoreSC:
204 assert(
205 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
207 "mayHaveSideffects result for ingredient differs from this "
208 "implementation");
209 return mayWriteToMemory();
210 case VPReplicateSC: {
211 auto *R = cast<VPReplicateRecipe>(this);
212 return R->getUnderlyingInstr()->mayHaveSideEffects();
213 }
214 default:
215 return true;
216 }
217}
218
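// Helpers for inserting, removing and moving recipes. They operate on the
// recipe list of the containing VPBasicBlock and keep the recipe's Parent
// pointer in sync with the block that currently holds it.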
219void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
220 assert(!Parent && "Recipe already in some VPBasicBlock");
221 assert(InsertPos->getParent() &&
222 "Insertion position not in any VPBasicBlock");
223 InsertPos->getParent()->insert(this, InsertPos->getIterator());
224}
225
226void VPRecipeBase::insertBefore(VPBasicBlock &BB,
227 iplist<VPRecipeBase>::iterator I) {
228 assert(!Parent && "Recipe already in some VPBasicBlock");
229 assert(I == BB.end() || I->getParent() == &BB);
230 BB.insert(this, I);
231}
232
233void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
234 assert(!Parent && "Recipe already in some VPBasicBlock");
235 assert(InsertPos->getParent() &&
236 "Insertion position not in any VPBasicBlock");
237 InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
238}
239
240void VPRecipeBase::removeFromParent() {
241 assert(getParent() && "Recipe not in any VPBasicBlock");
242 getParent()->getRecipeList().remove(getIterator());
243 Parent = nullptr;
244}
245
246iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
247 assert(getParent() && "Recipe not in any VPBasicBlock");
248 return getParent()->getRecipeList().erase(getIterator());
249}
250
251void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
252 removeFromParent();
253 insertAfter(InsertPos);
254}
255
256void VPRecipeBase::moveBefore(VPBasicBlock &BB,
257 iplist<VPRecipeBase>::iterator I) {
258 removeFromParent();
259 insertBefore(BB, I);
260}
261
262InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
263 // Get the underlying instruction for the recipe, if there is one. It is used
264 // to
265 // * decide if cost computation should be skipped for this recipe,
266 // * apply forced target instruction cost.
267 Instruction *UI = nullptr;
268 if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
269 UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
270 else if (auto *IG = dyn_cast<VPInterleaveBase>(this))
271 UI = IG->getInsertPos();
272 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
273 UI = &WidenMem->getIngredient();
274
275 InstructionCost RecipeCost;
276 if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
277 RecipeCost = 0;
278 } else {
279 RecipeCost = computeCost(VF, Ctx);
280 if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
281 RecipeCost.isValid())
282 RecipeCost = InstructionCost(ForceTargetInstructionCost);
283 }
284
285 LLVM_DEBUG({
286 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
287 dump();
288 });
289 return RecipeCost;
290}
291
292InstructionCost VPRecipeBase::computeCost(ElementCount VF,
293 VPCostContext &Ctx) const {
294 llvm_unreachable("subclasses should implement computeCost");
295}
296
298 return (getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC) ||
300}
301
302bool VPRecipeBase::isScalarCast() const {
303 auto *VPI = dyn_cast<VPInstruction>(this);
304 return VPI && Instruction::isCast(VPI->getOpcode());
305}
306
309 VPCostContext &Ctx) const {
310 std::optional<unsigned> Opcode;
311 VPValue *Op = getVecOp();
312 uint64_t MulConst;
313
314 InstructionCost CondCost = 0;
315 if (isConditional()) {
317 auto *VecTy = Ctx.Types.inferScalarType(Op);
318 auto *CondTy = Ctx.Types.inferScalarType(getCondOp());
319 CondCost = Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy,
320 Pred, Ctx.CostKind);
321 }
322
323 // If the partial reduction is predicated, a select will be operand 1.
324 // If it isn't predicated and the mul isn't operating on a constant, then it
325 // should have been turned into a VPExpressionRecipe.
326 // FIXME: Replace the entire function with this once all partial reduction
327 // variants are bundled into VPExpressionRecipe.
328 if (!match(Op, m_Mul(m_VPValue(), m_ConstantInt(MulConst)))) {
329 auto *PhiType = Ctx.Types.inferScalarType(getChainOp());
330 auto *InputType = Ctx.Types.inferScalarType(getVecOp());
331 return CondCost + Ctx.TTI.getPartialReductionCost(
332 getOpcode(), InputType, InputType, PhiType, VF,
333 TTI::PR_None, TTI::PR_None, {}, Ctx.CostKind);
334 }
335
336 VPRecipeBase *OpR = Op->getDefiningRecipe();
337 Type *InputTypeA = nullptr, *InputTypeB = nullptr;
339 ExtBType = TTI::PR_None;
340
341 auto GetExtendKind = [](VPRecipeBase *R) {
342 if (!R)
343 return TTI::PR_None;
344 auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(R);
345 if (!WidenCastR)
346 return TTI::PR_None;
347 if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt)
348 return TTI::PR_ZeroExtend;
349 if (WidenCastR->getOpcode() == Instruction::CastOps::SExt)
350 return TTI::PR_SignExtend;
351 return TTI::PR_None;
352 };
353
354 // Pick out opcode, type/ext information and use sub side effects from a widen
355 // recipe.
356 auto HandleWiden = [&](VPWidenRecipe *Widen) {
357 if (match(Widen, m_Sub(m_ZeroInt(), m_VPValue(Op)))) {
359 }
360 Opcode = Widen->getOpcode();
361 VPRecipeBase *ExtAR = Widen->getOperand(0)->getDefiningRecipe();
362 VPRecipeBase *ExtBR = Widen->getOperand(1)->getDefiningRecipe();
363 InputTypeA = Ctx.Types.inferScalarType(ExtAR ? ExtAR->getOperand(0)
364 : Widen->getOperand(0));
365 InputTypeB = Ctx.Types.inferScalarType(ExtBR ? ExtBR->getOperand(0)
366 : Widen->getOperand(1));
367 ExtAType = GetExtendKind(ExtAR);
368 ExtBType = GetExtendKind(ExtBR);
369
370 using namespace VPlanPatternMatch;
371 const APInt *C;
372 if (!ExtBR && match(Widen->getOperand(1), m_APInt(C)) &&
373 canConstantBeExtended(C, InputTypeA, ExtAType)) {
374 InputTypeB = InputTypeA;
375 ExtBType = ExtAType;
376 }
377 };
378
379 if (isa<VPWidenCastRecipe>(OpR)) {
380 InputTypeA = Ctx.Types.inferScalarType(OpR->getOperand(0));
381 ExtAType = GetExtendKind(OpR);
382 } else if (isa<VPReductionPHIRecipe>(OpR)) {
383 if (auto RedPhiOp1R = dyn_cast_or_null<VPWidenCastRecipe>(getOperand(1))) {
384 InputTypeA = Ctx.Types.inferScalarType(RedPhiOp1R->getOperand(0));
385 ExtAType = GetExtendKind(RedPhiOp1R);
387 HandleWiden(Widen);
388 } else if (auto Widen = dyn_cast<VPWidenRecipe>(OpR)) {
389 HandleWiden(Widen);
390 } else if (auto Reduction = dyn_cast<VPPartialReductionRecipe>(OpR)) {
391 return CondCost + Reduction->computeCost(VF, Ctx);
392 }
393 auto *PhiType = Ctx.Types.inferScalarType(getOperand(1));
394 return CondCost + Ctx.TTI.getPartialReductionCost(
395 getOpcode(), InputTypeA, InputTypeB, PhiType, VF,
396 ExtAType, ExtBType, Opcode, Ctx.CostKind);
397 ;
398}
399
400void VPPartialReductionRecipe::execute(VPTransformState &State) {
401 auto &Builder = State.Builder;
402
403 assert(getOpcode() == Instruction::Add &&
404 "Unhandled partial reduction opcode");
405
406 Value *BinOpVal = State.get(getVecOp());
407 Value *PhiVal = State.get(getChainOp());
408 assert(PhiVal && BinOpVal && "Phi and Mul must be set");
409
410 Type *RetTy = PhiVal->getType();
411
412 if (isConditional()) {
413 Value *Cond = State.get(getCondOp());
414 Value *Zero = ConstantInt::get(BinOpVal->getType(), 0);
415 BinOpVal = Builder.CreateSelect(Cond, BinOpVal, Zero);
416 }
417
418 CallInst *V =
419 Builder.CreateIntrinsic(RetTy, Intrinsic::vector_partial_reduce_add,
420 {PhiVal, BinOpVal}, nullptr, "partial.reduce");
421
422 State.set(this, V);
423}
424
425#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
426void VPPartialReductionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
427 VPSlotTracker &SlotTracker) const {
428 O << Indent << "PARTIAL-REDUCE ";
430 O << " = " << Instruction::getOpcodeName(getOpcode()) << " ";
432}
433#endif
434
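// Intersect the IR flags of this recipe with \p Other: wrapping, exactness,
// disjointness, GEP and fast-math flags are kept only if they are set on both
// recipes (e.g. the result has nuw only when both inputs have nuw), while
// compare predicates must already match and are asserted to be equal.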
435void VPIRFlags::intersectFlags(const VPIRFlags &Other) {
436 assert(OpType == Other.OpType && "OpType must match");
437 switch (OpType) {
438 case OperationType::OverflowingBinOp:
439 WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
440 WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
441 break;
442 case OperationType::Trunc:
443 TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
444 TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
445 break;
446 case OperationType::DisjointOp:
447 DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
448 break;
449 case OperationType::PossiblyExactOp:
450 ExactFlags.IsExact &= Other.ExactFlags.IsExact;
451 break;
452 case OperationType::GEPOp:
453 GEPFlags &= Other.GEPFlags;
454 break;
455 case OperationType::FPMathOp:
456 case OperationType::FCmp:
457 assert((OpType != OperationType::FCmp ||
458 FCmpFlags.Pred == Other.FCmpFlags.Pred) &&
459 "Cannot drop CmpPredicate");
460 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
461 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
462 break;
463 case OperationType::NonNegOp:
464 NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
465 break;
466 case OperationType::Cmp:
467 assert(CmpPredicate == Other.CmpPredicate && "Cannot drop CmpPredicate");
468 break;
469 case OperationType::Other:
470 assert(AllFlags == Other.AllFlags && "Cannot drop other flags");
471 break;
472 }
473}
474
475FastMathFlags VPIRFlags::getFastMathFlags() const {
476 assert((OpType == OperationType::FPMathOp || OpType == OperationType::FCmp) &&
477 "recipe doesn't have fast math flags");
478 const FastMathFlagsTy &F = getFMFsRef();
479 FastMathFlags Res;
480 Res.setAllowReassoc(F.AllowReassoc);
481 Res.setNoNaNs(F.NoNaNs);
482 Res.setNoInfs(F.NoInfs);
483 Res.setNoSignedZeros(F.NoSignedZeros);
484 Res.setAllowReciprocal(F.AllowReciprocal);
485 Res.setAllowContract(F.AllowContract);
486 Res.setApproxFunc(F.ApproxFunc);
487 return Res;
488}
489
490#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
492
493void VPRecipeBase::print(raw_ostream &O, const Twine &Indent,
494 VPSlotTracker &SlotTracker) const {
495 printRecipe(O, Indent, SlotTracker);
496 if (auto DL = getDebugLoc()) {
497 O << ", !dbg ";
498 DL.print(O);
499 }
500}
501#endif
502
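// Accessors for the unroll-part operand: after unrolling, the part number is
// encoded as an extra live-in constant operand at index PartOpIdx. If that
// operand is absent, the recipe belongs to part 0.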
503template <unsigned PartOpIdx>
504VPValue *
506 if (U.getNumOperands() == PartOpIdx + 1)
507 return U.getOperand(PartOpIdx);
508 return nullptr;
509}
510
511template <unsigned PartOpIdx>
513 if (auto *UnrollPartOp = getUnrollPartOperand(U))
514 return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue();
515 return 0;
516}
517
518namespace llvm {
519template class VPUnrollPartAccessor<1>;
520template class VPUnrollPartAccessor<2>;
521template class VPUnrollPartAccessor<3>;
522}
523
524VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
525 const VPIRFlags &Flags, const VPIRMetadata &MD,
526 DebugLoc DL, const Twine &Name)
527 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL),
528 VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) {
530 "Set flags not supported for the provided opcode");
531 assert((getNumOperandsForOpcode(Opcode) == -1u ||
532 getNumOperandsForOpcode(Opcode) == getNumOperands()) &&
533 "number of operands does not match opcode");
534}
535
536#ifndef NDEBUG
537unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
538 if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
539 return 1;
540
541 if (Instruction::isBinaryOp(Opcode))
542 return 2;
543
544 switch (Opcode) {
547 return 0;
548 case Instruction::Alloca:
549 case Instruction::ExtractValue:
550 case Instruction::Freeze:
551 case Instruction::Load:
567 return 1;
568 case Instruction::ICmp:
569 case Instruction::FCmp:
570 case Instruction::ExtractElement:
571 case Instruction::Store:
580 return 2;
581 case Instruction::Select:
585 return 3;
587 return 4;
588 case Instruction::Call:
589 case Instruction::GetElementPtr:
590 case Instruction::PHI:
591 case Instruction::Switch:
594 // Cannot determine the number of operands from the opcode.
595 return -1u;
596 }
597 llvm_unreachable("all cases should be handled above");
598}
599#endif
600
604
605bool VPInstruction::canGenerateScalarForFirstLane() const {
607 return true;
609 return true;
610 switch (Opcode) {
611 case Instruction::Freeze:
612 case Instruction::ICmp:
613 case Instruction::PHI:
614 case Instruction::Select:
623 return true;
624 default:
625 return false;
626 }
627}
628
629/// Create a conditional branch using \p Cond branching to the successors of \p
630/// VPBB. Note that the first successor is always forward (i.e. not created yet)
631/// while the second successor may already have been created (if it is a header
632/// block and VPBB is a latch).
633static BranchInst *createCondBranch(Value *Cond, VPBasicBlock *VPBB,
634 VPTransformState &State) {
635 // Replace the temporary unreachable terminator with a new conditional
636 // branch, hooking it up to backward destination (header) for latch blocks
637 // now, and to forward destination(s) later when they are created.
638 // Second successor may be backwards - iff it is already in VPBB2IRBB.
639 VPBasicBlock *SecondVPSucc = cast<VPBasicBlock>(VPBB->getSuccessors()[1]);
640 BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc);
641 BasicBlock *IRBB = State.CFG.VPBB2IRBB[VPBB];
642 BranchInst *CondBr = State.Builder.CreateCondBr(Cond, IRBB, SecondIRSucc);
643 // First successor is always forward, reset it to nullptr
644 CondBr->setSuccessor(0, nullptr);
646 return CondBr;
647}
648
649Value *VPInstruction::generate(VPTransformState &State) {
650 IRBuilderBase &Builder = State.Builder;
651
652 if (Instruction::isBinaryOp(getOpcode())) {
653 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
654 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
655 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
656 auto *Res =
657 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
658 if (auto *I = dyn_cast<Instruction>(Res))
659 applyFlags(*I);
660 return Res;
661 }
662
663 switch (getOpcode()) {
664 case VPInstruction::Not: {
665 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
666 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
667 return Builder.CreateNot(A, Name);
668 }
669 case Instruction::ExtractElement: {
670 assert(State.VF.isVector() && "Only extract elements from vectors");
671 if (getOperand(1)->isLiveIn()) {
672 unsigned IdxToExtract =
673 cast<ConstantInt>(getOperand(1)->getLiveInIRValue())->getZExtValue();
674 return State.get(getOperand(0), VPLane(IdxToExtract));
675 }
676 Value *Vec = State.get(getOperand(0));
677 Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
678 return Builder.CreateExtractElement(Vec, Idx, Name);
679 }
680 case Instruction::Freeze: {
682 return Builder.CreateFreeze(Op, Name);
683 }
684 case Instruction::FCmp:
685 case Instruction::ICmp: {
686 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
687 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
688 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
689 return Builder.CreateCmp(getPredicate(), A, B, Name);
690 }
691 case Instruction::PHI: {
692 llvm_unreachable("should be handled by VPPhi::execute");
693 }
694 case Instruction::Select: {
695 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
696 Value *Cond =
697 State.get(getOperand(0),
698 OnlyFirstLaneUsed || vputils::isSingleScalar(getOperand(0)));
699 Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
700 Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
701 return Builder.CreateSelect(Cond, Op1, Op2, Name);
702 }
704 // Get first lane of vector induction variable.
705 Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
706 // Get the original loop tripcount.
707 Value *ScalarTC = State.get(getOperand(1), VPLane(0));
708
709 // If this part of the active lane mask is scalar, generate the CMP directly
710 // to avoid unnecessary extracts.
711 if (State.VF.isScalar())
712 return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
713 Name);
714
715 auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
716 auto PredTy = VectorType::get(
717 Int1Ty, State.VF * cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
718 ->getZExtValue());
719 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
720 {PredTy, ScalarTC->getType()},
721 {VIVElem0, ScalarTC}, nullptr, Name);
722 }
724 // Generate code to combine the previous and current values in vector v3.
725 //
726 // vector.ph:
727 // v_init = vector(..., ..., ..., a[-1])
728 // br vector.body
729 //
730 // vector.body
731 // i = phi [0, vector.ph], [i+4, vector.body]
732 // v1 = phi [v_init, vector.ph], [v2, vector.body]
733 // v2 = a[i, i+1, i+2, i+3];
734 // v3 = vector(v1(3), v2(0, 1, 2))
735
736 auto *V1 = State.get(getOperand(0));
737 if (!V1->getType()->isVectorTy())
738 return V1;
739 Value *V2 = State.get(getOperand(1));
740 return Builder.CreateVectorSplice(V1, V2, -1, Name);
741 }
743 unsigned UF = getParent()->getPlan()->getUF();
744 Value *ScalarTC = State.get(getOperand(0), VPLane(0));
745 Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
746 Value *Sub = Builder.CreateSub(ScalarTC, Step);
747 Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
748 Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
749 return Builder.CreateSelect(Cmp, Sub, Zero);
750 }
752 // TODO: Restructure this code with an explicit remainder loop, vsetvli can
753 // be outside of the main loop.
754 Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
755 // Compute EVL
756 assert(AVL->getType()->isIntegerTy() &&
757 "Requested vector length should be an integer.");
758
759 assert(State.VF.isScalable() && "Expected scalable vector factor.");
760 Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
761
762 Value *EVL = State.Builder.CreateIntrinsic(
763 State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
764 {AVL, VFArg, State.Builder.getTrue()});
765 return EVL;
766 }
768 unsigned Part = getUnrollPart(*this);
769 auto *IV = State.get(getOperand(0), VPLane(0));
770 assert(Part != 0 && "Must have a positive part");
771 // The canonical IV is incremented by the vectorization factor (num of
772 // SIMD elements) times the unroll part.
773 Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
774 return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
776 }
778 Value *Cond = State.get(getOperand(0), VPLane(0));
779 auto *Br = createCondBranch(Cond, getParent(), State);
780 applyMetadata(*Br);
781 return Br;
782 }
784 // First create the compare.
785 Value *IV = State.get(getOperand(0), /*IsScalar*/ true);
786 Value *TC = State.get(getOperand(1), /*IsScalar*/ true);
787 Value *Cond = Builder.CreateICmpEQ(IV, TC);
788 return createCondBranch(Cond, getParent(), State);
789 }
791 return Builder.CreateVectorSplat(
792 State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
793 }
795 // For struct types, we need to build a new 'wide' struct type, where each
796 // element is widened, i.e., we create a struct of vectors.
797 auto *StructTy =
799 Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF));
800 for (const auto &[LaneIndex, Op] : enumerate(operands())) {
801 for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
802 FieldIndex++) {
803 Value *ScalarValue =
804 Builder.CreateExtractValue(State.get(Op, true), FieldIndex);
805 Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex);
806 VectorValue =
807 Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex);
808 Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex);
809 }
810 }
811 return Res;
812 }
814 auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0));
815 auto NumOfElements = ElementCount::getFixed(getNumOperands());
816 Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements));
817 for (const auto &[Idx, Op] : enumerate(operands()))
818 Res = State.Builder.CreateInsertElement(Res, State.get(Op, true),
819 State.Builder.getInt32(Idx));
820 return Res;
821 }
823 if (State.VF.isScalar())
824 return State.get(getOperand(0), true);
825 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
827 // If this start vector is scaled then it should produce a vector with fewer
828 // elements than the VF.
829 ElementCount VF = State.VF.divideCoefficientBy(
830 cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue());
831 auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true));
832 Constant *Zero = Builder.getInt32(0);
833 return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true),
834 Zero);
835 }
837 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
838 // and will be removed by breaking up the recipe further.
839 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
840 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
841 Value *ReducedPartRdx = State.get(getOperand(2));
842 for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx)
843 ReducedPartRdx =
844 Builder.CreateBinOp(Instruction::Or, State.get(getOperand(Idx)),
845 ReducedPartRdx, "bin.rdx");
846 return createAnyOfReduction(Builder, ReducedPartRdx,
847 State.get(getOperand(1), VPLane(0)), OrigPhi);
848 }
850 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
851 // and will be removed by breaking up the recipe further.
852 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
853 // Get its reduction variable descriptor.
854 RecurKind RK = PhiR->getRecurrenceKind();
856 "Unexpected reduction kind");
857 assert(!PhiR->isInLoop() &&
858 "In-loop FindLastIV reduction is not supported yet");
859
860 // The recipe's operands are the reduction phi, the start value, the
861 // sentinel value, followed by one operand for each part of the reduction.
862 unsigned UF = getNumOperands() - 3;
863 Value *ReducedPartRdx = State.get(getOperand(3));
864 RecurKind MinMaxKind;
867 MinMaxKind = IsSigned ? RecurKind::SMax : RecurKind::UMax;
868 else
869 MinMaxKind = IsSigned ? RecurKind::SMin : RecurKind::UMin;
870 for (unsigned Part = 1; Part < UF; ++Part)
871 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
872 State.get(getOperand(3 + Part)));
873
874 Value *Start = State.get(getOperand(1), true);
876 return createFindLastIVReduction(Builder, ReducedPartRdx, RK, Start,
877 Sentinel);
878 }
880 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
881 // and will be removed by breaking up the recipe further.
882 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
883 // Get its reduction variable descriptor.
884
885 RecurKind RK = PhiR->getRecurrenceKind();
887 "should be handled by ComputeFindIVResult");
888
889 // The recipe's operands are the reduction phi, followed by one operand for
890 // each part of the reduction.
891 unsigned UF = getNumOperands() - 1;
892 VectorParts RdxParts(UF);
893 for (unsigned Part = 0; Part < UF; ++Part)
894 RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop());
895
896 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
897 if (hasFastMathFlags())
899
900 // Reduce all of the unrolled parts into a single vector.
901 Value *ReducedPartRdx = RdxParts[0];
902 if (PhiR->isOrdered()) {
903 ReducedPartRdx = RdxParts[UF - 1];
904 } else {
905 // Floating-point operations should have some FMF to enable the reduction.
906 for (unsigned Part = 1; Part < UF; ++Part) {
907 Value *RdxPart = RdxParts[Part];
909 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
910 else {
912 // For sub-recurrences, each UF's reduction variable is already
913 // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1)
914 if (RK == RecurKind::Sub)
915 Opcode = Instruction::Add;
916 else
917 Opcode =
919 ReducedPartRdx =
920 Builder.CreateBinOp(Opcode, RdxPart, ReducedPartRdx, "bin.rdx");
921 }
922 }
923 }
924
925 // Create the reduction after the loop. Note that inloop reductions create
926 // the target reduction in the loop using a Reduction recipe.
927 if (State.VF.isVector() && !PhiR->isInLoop()) {
928 // TODO: Support in-order reductions based on the recurrence descriptor.
929 // All ops in the reduction inherit fast-math-flags from the recurrence
930 // descriptor.
931 ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK);
932 }
933
934 return ReducedPartRdx;
935 }
939 unsigned Offset =
941 Value *Res;
942 if (State.VF.isVector()) {
943 assert(Offset <= State.VF.getKnownMinValue() &&
944 "invalid offset to extract from");
945 // Extract lane VF - Offset from the operand.
946 Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
947 } else {
948 assert(Offset <= 1 && "invalid offset to extract from");
949 Res = State.get(getOperand(0));
950 }
952 Res->setName(Name);
953 return Res;
954 }
956 Value *A = State.get(getOperand(0));
957 Value *B = State.get(getOperand(1));
958 return Builder.CreateLogicalAnd(A, B, Name);
959 }
962 "can only generate first lane for PtrAdd");
963 Value *Ptr = State.get(getOperand(0), VPLane(0));
964 Value *Addend = State.get(getOperand(1), VPLane(0));
965 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
966 }
968 Value *Ptr =
970 Value *Addend = State.get(getOperand(1));
971 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
972 }
974 Value *Res = Builder.CreateFreeze(State.get(getOperand(0)));
975 for (VPValue *Op : drop_begin(operands()))
976 Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op)));
977 return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
978 }
980 Value *LaneToExtract = State.get(getOperand(0), true);
981 Type *IdxTy = State.TypeAnalysis.inferScalarType(getOperand(0));
982 Value *Res = nullptr;
983 Value *RuntimeVF = getRuntimeVF(State.Builder, IdxTy, State.VF);
984
985 for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
986 Value *VectorStart =
987 Builder.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Idx - 1));
988 Value *VectorIdx = Idx == 1
989 ? LaneToExtract
990 : Builder.CreateSub(LaneToExtract, VectorStart);
991 Value *Ext = State.VF.isScalar()
992 ? State.get(getOperand(Idx))
993 : Builder.CreateExtractElement(
994 State.get(getOperand(Idx)), VectorIdx);
995 if (Res) {
996 Value *Cmp = Builder.CreateICmpUGE(LaneToExtract, VectorStart);
997 Res = Builder.CreateSelect(Cmp, Ext, Res);
998 } else {
999 Res = Ext;
1000 }
1001 }
1002 return Res;
1003 }
1005 if (getNumOperands() == 1) {
1006 Value *Mask = State.get(getOperand(0));
1007 return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
1008 true, Name);
1009 }
1010 // If there are multiple operands, create a chain of selects to pick the
1011 // first operand with an active lane and add the number of lanes of the
1012 // preceding operands.
1013 Value *RuntimeVF =
1014 getRuntimeVF(State.Builder, State.Builder.getInt64Ty(), State.VF);
1015 unsigned LastOpIdx = getNumOperands() - 1;
1016 Value *Res = nullptr;
1017 for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
1018 Value *TrailingZeros =
1019 State.VF.isScalar()
1020 ? Builder.CreateZExt(
1021 Builder.CreateICmpEQ(State.get(getOperand(Idx)),
1022 Builder.getFalse()),
1023 Builder.getInt64Ty())
1024 : Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(),
1025 State.get(getOperand(Idx)),
1026 true, Name);
1027 Value *Current = Builder.CreateAdd(
1028 Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros);
1029 if (Res) {
1030 Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF);
1031 Res = Builder.CreateSelect(Cmp, Current, Res);
1032 } else {
1033 Res = Current;
1034 }
1035 }
1036
1037 return Res;
1038 }
1040 return State.get(getOperand(0), true);
1041 default:
1042 llvm_unreachable("Unsupported opcode for instruction");
1043 }
1044}
1045
1047 unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
1048 Type *ScalarTy = Ctx.Types.inferScalarType(this);
1049 Type *ResultTy = VF.isVector() ? toVectorTy(ScalarTy, VF) : ScalarTy;
1050 switch (Opcode) {
1051 case Instruction::FNeg:
1052 return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, Ctx.CostKind);
1053 case Instruction::UDiv:
1054 case Instruction::SDiv:
1055 case Instruction::SRem:
1056 case Instruction::URem:
1057 case Instruction::Add:
1058 case Instruction::FAdd:
1059 case Instruction::Sub:
1060 case Instruction::FSub:
1061 case Instruction::Mul:
1062 case Instruction::FMul:
1063 case Instruction::FDiv:
1064 case Instruction::FRem:
1065 case Instruction::Shl:
1066 case Instruction::LShr:
1067 case Instruction::AShr:
1068 case Instruction::And:
1069 case Instruction::Or:
1070 case Instruction::Xor: {
1073
1074 if (VF.isVector()) {
1075 // Certain instructions can be cheaper to vectorize if they have a
1076 // constant second vector operand. One example of this are shifts on x86.
1077 VPValue *RHS = getOperand(1);
1078 RHSInfo = Ctx.getOperandInfo(RHS);
1079
1080 if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
1083 }
1084
1087 if (CtxI)
1088 Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
1089 return Ctx.TTI.getArithmeticInstrCost(
1090 Opcode, ResultTy, Ctx.CostKind,
1091 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1092 RHSInfo, Operands, CtxI, &Ctx.TLI);
1093 }
1094 case Instruction::Freeze:
1095 // This opcode is unknown. Assume that it is the same as 'mul'.
1096 return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, ResultTy,
1097 Ctx.CostKind);
1098 case Instruction::ExtractValue:
1099 return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
1100 Ctx.CostKind);
1101 case Instruction::ICmp:
1102 case Instruction::FCmp: {
1103 Type *ScalarOpTy = Ctx.Types.inferScalarType(getOperand(0));
1104 Type *OpTy = VF.isVector() ? toVectorTy(ScalarOpTy, VF) : ScalarOpTy;
1106 return Ctx.TTI.getCmpSelInstrCost(
1107 Opcode, OpTy, CmpInst::makeCmpResultType(OpTy), getPredicate(),
1108 Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None},
1109 {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
1110 }
1111 }
1112 llvm_unreachable("called for unsupported opcode");
1113}
1114
1116 VPCostContext &Ctx) const {
1118 if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) {
1119 // TODO: Compute cost for VPInstructions without underlying values once
1120 // the legacy cost model has been retired.
1121 return 0;
1122 }
1123
1125 "Should only generate a vector value or single scalar, not scalars "
1126 "for all lanes.");
1128 getOpcode(),
1130 }
1131
1132 switch (getOpcode()) {
1133 case Instruction::Select: {
1134 // TODO: It may be possible to improve this by analyzing where the
1135 // condition operand comes from.
1137 auto *CondTy = Ctx.Types.inferScalarType(getOperand(0));
1138 auto *VecTy = Ctx.Types.inferScalarType(getOperand(1));
1139 if (!vputils::onlyFirstLaneUsed(this)) {
1140 CondTy = toVectorTy(CondTy, VF);
1141 VecTy = toVectorTy(VecTy, VF);
1142 }
1143 return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred,
1144 Ctx.CostKind);
1145 }
1146 case Instruction::ExtractElement:
1148 if (VF.isScalar()) {
1149 // ExtractLane with VF=1 takes care of handling extracting across multiple
1150 // parts.
1151 return 0;
1152 }
1153
1154 // Add on the cost of extracting the element.
1155 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1156 return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
1157 Ctx.CostKind);
1158 }
1159 case VPInstruction::AnyOf: {
1160 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1161 return Ctx.TTI.getArithmeticReductionCost(
1162 Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
1163 }
1165 Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
1166 if (VF.isScalar())
1167 return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
1169 CmpInst::ICMP_EQ, Ctx.CostKind);
1170 // Calculate the cost of determining the lane index.
1171 auto *PredTy = toVectorTy(ScalarTy, VF);
1172 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
1173 Type::getInt64Ty(Ctx.LLVMCtx),
1174 {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
1175 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1176 }
1178 assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
1180 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
1181 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1182
1183 return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
1184 cast<VectorType>(VectorTy),
1185 cast<VectorType>(VectorTy), Mask,
1186 Ctx.CostKind, VF.getKnownMinValue() - 1);
1187 }
1189 Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
1190 unsigned Multiplier =
1191 cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue();
1192 Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
1193 IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
1194 {ArgTy, ArgTy});
1195 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1196 }
1198 Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0));
1199 Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
1200 Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
1201 IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
1202 I32Ty, {Arg0Ty, I32Ty, I1Ty});
1203 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1204 }
1206 // Add on the cost of extracting the element.
1207 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1208 return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
1209 VecTy, Ctx.CostKind, 0);
1210 }
1212 if (VF == ElementCount::getScalable(1))
1214 [[fallthrough]];
1215 default:
1216 // TODO: Compute cost other VPInstructions once the legacy cost model has
1217 // been retired.
1219 "unexpected VPInstruction witht underlying value");
1220 return 0;
1221 }
1222}
1223
1236
1238 switch (getOpcode()) {
1239 case Instruction::PHI:
1243 return true;
1244 default:
1245 return isScalarCast();
1246 }
1247}
1248
1249void VPInstruction::execute(VPTransformState &State) {
1250 assert(!State.Lane && "VPInstruction executing a Lane");
1251 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1252 assert(flagsValidForOpcode(getOpcode()) &&
1253 "Set flags not supported for the provided opcode");
1254 if (hasFastMathFlags())
1255 State.Builder.setFastMathFlags(getFastMathFlags());
1256 Value *GeneratedValue = generate(State);
1257 if (!hasResult())
1258 return;
1259 assert(GeneratedValue && "generate must produce a value");
1260 bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
1263 assert((((GeneratedValue->getType()->isVectorTy() ||
1264 GeneratedValue->getType()->isStructTy()) ==
1265 !GeneratesPerFirstLaneOnly) ||
1266 State.VF.isScalar()) &&
1267 "scalar value but not only first lane defined");
1268 State.set(this, GeneratedValue,
1269 /*IsScalar*/ GeneratesPerFirstLaneOnly);
1270}
1271
1274 return false;
1275 switch (getOpcode()) {
1276 case Instruction::ExtractElement:
1277 case Instruction::Freeze:
1278 case Instruction::FCmp:
1279 case Instruction::ICmp:
1280 case Instruction::Select:
1281 case Instruction::PHI:
1299 case VPInstruction::Not:
1307 return false;
1308 default:
1309 return true;
1310 }
1311}
1312
1314 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1316 return vputils::onlyFirstLaneUsed(this);
1317
1318 switch (getOpcode()) {
1319 default:
1320 return false;
1321 case Instruction::ExtractElement:
1322 return Op == getOperand(1);
1323 case Instruction::PHI:
1324 return true;
1325 case Instruction::FCmp:
1326 case Instruction::ICmp:
1327 case Instruction::Select:
1328 case Instruction::Or:
1329 case Instruction::Freeze:
1330 case VPInstruction::Not:
1331 // TODO: Cover additional opcodes.
1332 return vputils::onlyFirstLaneUsed(this);
1341 return true;
1344 // Before replicating by VF, Build(Struct)Vector uses all lanes of the
1345 // operand, after replicating its operands only the first lane is used.
1346 // Before replicating, it will have only a single operand.
1347 return getNumOperands() > 1;
1349 return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
1351 // WidePtrAdd supports scalar and vector base addresses.
1352 return false;
1355 return Op == getOperand(1);
1357 return Op == getOperand(0);
1358 };
1359 llvm_unreachable("switch should return");
1360}
1361
1363 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1365 return vputils::onlyFirstPartUsed(this);
1366
1367 switch (getOpcode()) {
1368 default:
1369 return false;
1370 case Instruction::FCmp:
1371 case Instruction::ICmp:
1372 case Instruction::Select:
1373 return vputils::onlyFirstPartUsed(this);
1377 return true;
1378 };
1379 llvm_unreachable("switch should return");
1380}
1381
1382#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1384 VPSlotTracker SlotTracker(getParent()->getPlan());
1386}
1387
1389 VPSlotTracker &SlotTracker) const {
1390 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1391
1392 if (hasResult()) {
1394 O << " = ";
1395 }
1396
1397 switch (getOpcode()) {
1398 case VPInstruction::Not:
1399 O << "not";
1400 break;
1402 O << "combined load";
1403 break;
1405 O << "combined store";
1406 break;
1408 O << "active lane mask";
1409 break;
1411 O << "EXPLICIT-VECTOR-LENGTH";
1412 break;
1414 O << "first-order splice";
1415 break;
1417 O << "branch-on-cond";
1418 break;
1420 O << "TC > VF ? TC - VF : 0";
1421 break;
1423 O << "VF * Part +";
1424 break;
1426 O << "branch-on-count";
1427 break;
1429 O << "broadcast";
1430 break;
1432 O << "buildstructvector";
1433 break;
1435 O << "buildvector";
1436 break;
1438 O << "extract-lane";
1439 break;
1441 O << "extract-last-element";
1442 break;
1444 O << "extract-last-lane-per-part";
1445 break;
1447 O << "extract-penultimate-element";
1448 break;
1450 O << "compute-anyof-result";
1451 break;
1453 O << "compute-find-iv-result";
1454 break;
1456 O << "compute-reduction-result";
1457 break;
1459 O << "logical-and";
1460 break;
1462 O << "ptradd";
1463 break;
1465 O << "wide-ptradd";
1466 break;
1468 O << "any-of";
1469 break;
1471 O << "first-active-lane";
1472 break;
1474 O << "reduction-start-vector";
1475 break;
1477 O << "resume-for-epilogue";
1478 break;
1480 O << "unpack";
1481 break;
1482 default:
1484 }
1485
1486 printFlags(O);
1488}
1489#endif
1490
1492 State.setDebugLocFrom(getDebugLoc());
1493 if (isScalarCast()) {
1494 Value *Op = State.get(getOperand(0), VPLane(0));
1495 Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
1496 Op, ResultTy);
1497 State.set(this, Cast, VPLane(0));
1498 return;
1499 }
1500 switch (getOpcode()) {
1502 Value *StepVector =
1503 State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF));
1504 State.set(this, StepVector);
1505 break;
1506 }
1507 case VPInstruction::VScale: {
1508 Value *VScale = State.Builder.CreateVScale(ResultTy);
1509 State.set(this, VScale, true);
1510 break;
1511 }
1512
1513 default:
1514 llvm_unreachable("opcode not implemented yet");
1515 }
1516}
1517
1518#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1520 VPSlotTracker &SlotTracker) const {
1521 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1523 O << " = ";
1524
1525 switch (getOpcode()) {
1527 O << "wide-iv-step ";
1529 break;
1531 O << "step-vector " << *ResultTy;
1532 break;
1534 O << "vscale " << *ResultTy;
1535 break;
1536 default:
1537 assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
1540 O << " to " << *ResultTy;
1541 }
1542}
1543#endif
1544
1546 State.setDebugLocFrom(getDebugLoc());
1547 PHINode *NewPhi = State.Builder.CreatePHI(
1548 State.TypeAnalysis.inferScalarType(this), 2, getName());
1549 unsigned NumIncoming = getNumIncoming();
1550 if (getParent() != getParent()->getPlan()->getScalarPreheader()) {
1551 // TODO: Fixup all incoming values of header phis once recipes defining them
1552 // are introduced.
1553 NumIncoming = 1;
1554 }
1555 for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
1556 Value *IncV = State.get(getIncomingValue(Idx), VPLane(0));
1557 BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(getIncomingBlock(Idx));
1558 NewPhi->addIncoming(IncV, PredBB);
1559 }
1560 State.set(this, NewPhi, VPLane(0));
1561}
1562
1563#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1564void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent,
1565 VPSlotTracker &SlotTracker) const {
1566 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1568 O << " = phi ";
1570}
1571#endif
1572
1573VPIRInstruction *VPIRInstruction::create(Instruction &I) {
1574 if (auto *Phi = dyn_cast<PHINode>(&I))
1575 return new VPIRPhi(*Phi);
1576 return new VPIRInstruction(I);
1577}
1578
1578
1579void VPIRInstruction::execute(VPTransformState &State) {
1580 assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
1581 "PHINodes must be handled by VPIRPhi");
1582 // Advance the insert point after the wrapped IR instruction. This allows
1583 // interleaving VPIRInstructions and other recipes.
1584 State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
1585}
1586
1587InstructionCost VPIRInstruction::computeCost(ElementCount VF,
1588 VPCostContext &Ctx) const {
1589 // The recipe wraps an existing IR instruction on the border of VPlan's scope,
1590 // hence it does not contribute to the cost-modeling for the VPlan.
1591 return 0;
1592}
1593
1596 "can only update exiting operands to phi nodes");
1597 assert(getNumOperands() > 0 && "must have at least one operand");
1598 VPValue *Exiting = getOperand(0);
1599 if (Exiting->isLiveIn())
1600 return;
1601
1602 Exiting = Builder.createNaryOp(VPInstruction::ExtractLastElement, {Exiting});
1603 setOperand(0, Exiting);
1604}
1605
1606#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1608 VPSlotTracker &SlotTracker) const {
1609 O << Indent << "IR " << I;
1610}
1611#endif
1612
1614 PHINode *Phi = &getIRPhi();
1615 for (const auto &[Idx, Op] : enumerate(operands())) {
1616 VPValue *ExitValue = Op;
1617 auto Lane = vputils::isSingleScalar(ExitValue)
1619 : VPLane::getLastLaneForVF(State.VF);
1620 VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
1621 auto *PredVPBB = Pred->getExitingBasicBlock();
1622 BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
1623 // Set insertion point in PredBB in case an extract needs to be generated.
1624 // TODO: Model extracts explicitly.
1625 State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
1626 Value *V = State.get(ExitValue, VPLane(Lane));
1627 // If there is no existing block for PredBB in the phi, add a new incoming
1628 // value. Otherwise update the existing incoming value for PredBB.
1629 if (Phi->getBasicBlockIndex(PredBB) == -1)
1630 Phi->addIncoming(V, PredBB);
1631 else
1632 Phi->setIncomingValueForBlock(PredBB, V);
1633 }
1634
1635 // Advance the insert point after the wrapped IR instruction. This allows
1636 // interleaving VPIRInstructions and other recipes.
1637 State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator()));
1638}
1639
1641 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1642 assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
1643 "Number of phi operands must match number of predecessors");
1644 unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock);
1645 R->removeOperand(Position);
1646}
1647
1648#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1650 VPSlotTracker &SlotTracker) const {
1651 interleaveComma(enumerate(getAsRecipe()->operands()), O,
1652 [this, &O, &SlotTracker](auto Op) {
1653 O << "[ ";
1654 Op.value()->printAsOperand(O, SlotTracker);
1655 O << ", ";
1656 getIncomingBlock(Op.index())->printAsOperand(O);
1657 O << " ]";
1658 });
1659}
1660#endif
1661
1662#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1664 VPSlotTracker &SlotTracker) const {
1666
1667 if (getNumOperands() != 0) {
1668 O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
1670 [&O, &SlotTracker](auto Op) {
1671 std::get<0>(Op)->printAsOperand(O, SlotTracker);
1672 O << " from ";
1673 std::get<1>(Op)->printAsOperand(O);
1674 });
1675 O << ")";
1676 }
1677}
1678#endif
1679
1680void VPIRMetadata::applyMetadata(Instruction &I) const {
1681 for (const auto &[Kind, Node] : Metadata)
1682 I.setMetadata(Kind, Node);
1683}
1684
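// Intersect this recipe's metadata with \p Other: only metadata kinds that
// appear with identical nodes on both recipes are kept; all other metadata is
// dropped.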
1685void VPIRMetadata::intersect(const VPIRMetadata &Other) {
1686 SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection;
1687 for (const auto &[KindA, MDA] : Metadata) {
1688 for (const auto &[KindB, MDB] : Other.Metadata) {
1689 if (KindA == KindB && MDA == MDB) {
1690 MetadataIntersection.emplace_back(KindA, MDA);
1691 break;
1692 }
1693 }
1694 }
1695 Metadata = std::move(MetadataIntersection);
1696}
1697
1698void VPWidenCallRecipe::execute(VPTransformState &State) {
1699 assert(State.VF.isVector() && "not widening");
1700 assert(Variant != nullptr && "Can't create vector function.");
1701
1702 FunctionType *VFTy = Variant->getFunctionType();
1703 // Add return type if intrinsic is overloaded on it.
1705 for (const auto &I : enumerate(args())) {
1706 Value *Arg;
1707 // Some vectorized function variants may also take a scalar argument,
1708 // e.g. linear parameters for pointers. This needs to be the scalar value
1709 // from the start of the respective part when interleaving.
1710 if (!VFTy->getParamType(I.index())->isVectorTy())
1711 Arg = State.get(I.value(), VPLane(0));
1712 else
1713 Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
1714 Args.push_back(Arg);
1715 }
1716
1719 if (CI)
1720 CI->getOperandBundlesAsDefs(OpBundles);
1721
1722 CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
1723 applyFlags(*V);
1724 applyMetadata(*V);
1725 V->setCallingConv(Variant->getCallingConv());
1726
1727 if (!V->getType()->isVoidTy())
1728 State.set(this, V);
1729}
1730
1731InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
1732 VPCostContext &Ctx) const {
1733 return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
1734 Variant->getFunctionType()->params(),
1735 Ctx.CostKind);
1736}
1737
1738#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1740 VPSlotTracker &SlotTracker) const {
1741 O << Indent << "WIDEN-CALL ";
1742
1743 Function *CalledFn = getCalledScalarFunction();
1744 if (CalledFn->getReturnType()->isVoidTy())
1745 O << "void ";
1746 else {
1748 O << " = ";
1749 }
1750
1751 O << "call";
1752 printFlags(O);
1753 O << " @" << CalledFn->getName() << "(";
1754 interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
1755 Op->printAsOperand(O, SlotTracker);
1756 });
1757 O << ")";
1758
1759 O << " (using library function";
1760 if (Variant->hasName())
1761 O << ": " << Variant->getName();
1762 O << ")";
1763}
1764#endif
1765
1767 assert(State.VF.isVector() && "not widening");
1768
1769 SmallVector<Type *, 2> TysForDecl;
1770 // Add return type if intrinsic is overloaded on it.
1771 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI))
1772 TysForDecl.push_back(VectorType::get(getResultType(), State.VF));
1774 for (const auto &I : enumerate(operands())) {
1775 // Some intrinsics have a scalar argument - don't replace it with a
1776 // vector.
1777 Value *Arg;
1778 if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
1779 State.TTI))
1780 Arg = State.get(I.value(), VPLane(0));
1781 else
1782 Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
1783 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
1784 State.TTI))
1785 TysForDecl.push_back(Arg->getType());
1786 Args.push_back(Arg);
1787 }
1788
1789 // Use vector version of the intrinsic.
1790 Module *M = State.Builder.GetInsertBlock()->getModule();
1791 Function *VectorF =
1792 Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
1793 assert(VectorF &&
1794 "Can't retrieve vector intrinsic or vector-predication intrinsics.");
1795
1798 if (CI)
1799 CI->getOperandBundlesAsDefs(OpBundles);
1800
1801 CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
1802
1803 applyFlags(*V);
1804 applyMetadata(*V);
1805
1806 if (!V->getType()->isVoidTy())
1807 State.set(this, V);
1808}
1809
1810/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R.
1811static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
1812 ArrayRef<const VPValue *> Operands,
1813 const VPRecipeWithIRFlags &R,
1814 ElementCount VF,
1815 VPCostContext &Ctx) {
1816 // Some backends analyze intrinsic arguments to determine cost. Use the
1817 // underlying value for the operand if it has one. Otherwise try to use the
1818 // operand of the underlying call instruction, if there is one. Otherwise
1819 // clear Arguments.
1820 // TODO: Rework TTI interface to be independent of concrete IR values.
1822 for (const auto &[Idx, Op] : enumerate(Operands)) {
1823 auto *V = Op->getUnderlyingValue();
1824 if (!V) {
1825 if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) {
1826 Arguments.push_back(UI->getArgOperand(Idx));
1827 continue;
1828 }
1829 Arguments.clear();
1830 break;
1831 }
1832 Arguments.push_back(V);
1833 }
1834
1835 Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
1836 Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
1837 SmallVector<Type *> ParamTys;
1838 for (const VPValue *Op : Operands) {
1839 ParamTys.push_back(VF.isVector()
1840 ? toVectorTy(Ctx.Types.inferScalarType(Op), VF)
1841 : Ctx.Types.inferScalarType(Op));
1842 }
1843
1844 // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
1845 FastMathFlags FMF =
1846 R.hasFastMathFlags() ? R.getFastMathFlags() : FastMathFlags();
1847 IntrinsicCostAttributes CostAttrs(
1848 ID, RetTy, Arguments, ParamTys, FMF,
1849 dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()),
1850 InstructionCost::getInvalid(), &Ctx.TLI);
1851 return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
1852}
1853
1855 VPCostContext &Ctx) const {
1857 return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx);
1858}
1859
1861 return Intrinsic::getBaseName(VectorIntrinsicID);
1862}
1863
1865 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1866 return all_of(enumerate(operands()), [this, &Op](const auto &X) {
1867 auto [Idx, V] = X;
1869 Idx, nullptr);
1870 });
1871}
1872
1873#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1875 VPSlotTracker &SlotTracker) const {
1876 O << Indent << "WIDEN-INTRINSIC ";
1877 if (ResultTy->isVoidTy()) {
1878 O << "void ";
1879 } else {
1881 O << " = ";
1882 }
1883
1884 O << "call";
1885 printFlags(O);
1886 O << getIntrinsicName() << "(";
1887
1889 Op->printAsOperand(O, SlotTracker);
1890 });
1891 O << ")";
1892}
1893#endif
1894
1896 IRBuilderBase &Builder = State.Builder;
1897
1898 Value *Address = State.get(getOperand(0));
1899 Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
1900 VectorType *VTy = cast<VectorType>(Address->getType());
1901
1902 // The histogram intrinsic requires a mask even if the recipe doesn't;
1903 // if the mask operand was omitted then all lanes should be executed and
1904 // we just need to synthesize an all-true mask.
1905 Value *Mask = nullptr;
1906 if (VPValue *VPMask = getMask())
1907 Mask = State.get(VPMask);
1908 else
1909 Mask =
1910 Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));
1911
1912 // If this is a subtract, we want to invert the increment amount. We may
1913 // add a separate intrinsic in future, but for now we'll try this.
1914 if (Opcode == Instruction::Sub)
1915 IncAmt = Builder.CreateNeg(IncAmt);
1916 else
1917 assert(Opcode == Instruction::Add && "only add or sub supported for now");
1918
1919 State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
1920 {VTy, IncAmt->getType()},
1921 {Address, IncAmt, Mask});
1922}
1923
1925 VPCostContext &Ctx) const {
1926 // FIXME: Take the gather and scatter into account as well. For now we're
1927 // generating the same cost as the fallback path, but we'll likely
1928 // need to create a new TTI method for determining the cost, including
1929 // whether we can use base + vec-of-smaller-indices or just
1930 // vec-of-pointers.
1931 assert(VF.isVector() && "Invalid VF for histogram cost");
1932 Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
1933 VPValue *IncAmt = getOperand(1);
1934 Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
1935 VectorType *VTy = VectorType::get(IncTy, VF);
1936
1937 // Assume that a non-constant update value (or a constant != 1) requires
1938 // a multiply, and add that into the cost.
1939 InstructionCost MulCost =
1940 Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
1941 if (IncAmt->isLiveIn()) {
1943
1944 if (CI && CI->getZExtValue() == 1)
1945 MulCost = TTI::TCC_Free;
1946 }
1947
1948 // Find the cost of the histogram operation itself.
1949 Type *PtrTy = VectorType::get(AddressTy, VF);
1950 Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
1951 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
1952 Type::getVoidTy(Ctx.LLVMCtx),
1953 {PtrTy, IncTy, MaskTy});
1954
1955 // Add the costs together with the add/sub operation.
1956 return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
1957 Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
1958}
1959
1960#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1962 VPSlotTracker &SlotTracker) const {
1963 O << Indent << "WIDEN-HISTOGRAM buckets: ";
1965
1966 if (Opcode == Instruction::Sub)
1967 O << ", dec: ";
1968 else {
1969 assert(Opcode == Instruction::Add);
1970 O << ", inc: ";
1971 }
1973
1974 if (VPValue *Mask = getMask()) {
1975 O << ", mask: ";
1976 Mask->printAsOperand(O, SlotTracker);
1977 }
1978}
1979
1981 VPSlotTracker &SlotTracker) const {
1982 O << Indent << "WIDEN-SELECT ";
1984 O << " = select ";
1985 printFlags(O);
1987 O << ", ";
1989 O << ", ";
1991 O << (vputils::isSingleScalar(getCond()) ? " (condition is single-scalar)"
1992 : "");
1993}
1994#endif
1995
1997 Value *Cond = State.get(getCond(), vputils::isSingleScalar(getCond()));
1998
1999 Value *Op0 = State.get(getOperand(1));
2000 Value *Op1 = State.get(getOperand(2));
2001 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
2002 State.set(this, Sel);
2003 if (auto *I = dyn_cast<Instruction>(Sel)) {
2005 applyFlags(*I);
2006 applyMetadata(*I);
2007 }
2008}
2009
2011 VPCostContext &Ctx) const {
2013 bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
2014 Type *ScalarTy = Ctx.Types.inferScalarType(this);
2015 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
2016
2017 VPValue *Op0, *Op1;
2018 if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
2019 (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||
2020 match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {
2021 // select x, y, false --> x & y
2022 // select x, true, y --> x | y
2023 const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0);
2024 const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1);
2025
2027 if (all_of(operands(),
2028 [](VPValue *Op) { return Op->getUnderlyingValue(); }))
2029 Operands.append(SI->op_begin(), SI->op_end());
2030 bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
2031 return Ctx.TTI.getArithmeticInstrCost(
2032 IsLogicalOr ? Instruction::Or : Instruction::And, VectorTy,
2033 Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
2034 }
2035
2036 Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
2037 if (!ScalarCond)
2038 CondTy = VectorType::get(CondTy, VF);
2039
2041 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
2042 Pred = Cmp->getPredicate();
2043 return Ctx.TTI.getCmpSelInstrCost(
2044 Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
2045 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
2046}
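// Standalone illustration (not LLVM code) of the i1 folds the cost code above
// relies on: select(x, y, false) == x & y and select(x, true, y) == x | y.
// The helper name is invented for the example.
#include <cassert>

static void checkSelectLogicalFolds() {
  const bool Vals[] = {false, true};
  for (bool X : Vals)
    for (bool Y : Vals) {
      assert((X ? Y : false) == (X && Y)); // select x, y, false --> x & y
      assert((X ? true : Y) == (X || Y));  // select x, true, y --> x | y
    }
}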
2047
2048VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
2049 AllowReassoc = FMF.allowReassoc();
2050 NoNaNs = FMF.noNaNs();
2051 NoInfs = FMF.noInfs();
2052 NoSignedZeros = FMF.noSignedZeros();
2053 AllowReciprocal = FMF.allowReciprocal();
2054 AllowContract = FMF.allowContract();
2055 ApproxFunc = FMF.approxFunc();
2056}
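// Minimal sketch (illustration only) of building an llvm::FastMathFlags value
// whose bits correspond one-to-one to the fields copied in the constructor
// above; the helper name is invented for the example.
#include "llvm/IR/FMF.h"

static llvm::FastMathFlags makeContractableNoNaNs() {
  llvm::FastMathFlags FMF;
  FMF.setNoNaNs();        // -> NoNaNs
  FMF.setAllowContract(); // -> AllowContract
  return FMF;
}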
2057
2058#if !defined(NDEBUG)
2059bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
2060 switch (OpType) {
2061 case OperationType::OverflowingBinOp:
2062 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
2063 Opcode == Instruction::Mul || Opcode == Instruction::Shl ||
2064 Opcode == VPInstruction::CanonicalIVIncrementForPart;
2065 case OperationType::Trunc:
2066 return Opcode == Instruction::Trunc;
2067 case OperationType::DisjointOp:
2068 return Opcode == Instruction::Or;
2069 case OperationType::PossiblyExactOp:
2070 return Opcode == Instruction::AShr || Opcode == Instruction::LShr ||
2071 Opcode == Instruction::UDiv || Opcode == Instruction::SDiv;
2072 case OperationType::GEPOp:
2073 return Opcode == Instruction::GetElementPtr ||
2074 Opcode == VPInstruction::PtrAdd ||
2075 Opcode == VPInstruction::WidePtrAdd;
2076 case OperationType::FPMathOp:
2077 return Opcode == Instruction::Call || Opcode == Instruction::FAdd ||
2078 Opcode == Instruction::FMul || Opcode == Instruction::FSub ||
2079 Opcode == Instruction::FNeg || Opcode == Instruction::FDiv ||
2080 Opcode == Instruction::FRem || Opcode == Instruction::FPExt ||
2081 Opcode == Instruction::FPTrunc || Opcode == Instruction::Select ||
2082 Opcode == VPInstruction::WideIVStep ||
2085 case OperationType::FCmp:
2086 return Opcode == Instruction::FCmp;
2087 case OperationType::NonNegOp:
2088 return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP;
2089 case OperationType::Cmp:
2090 return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
2091 case OperationType::Other:
2092 return true;
2093 }
2094 llvm_unreachable("Unknown OperationType enum");
2095}
2096#endif
2097
2098#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2100 switch (OpType) {
2101 case OperationType::Cmp:
2103 break;
2104 case OperationType::FCmp:
2107 break;
2108 case OperationType::DisjointOp:
2109 if (DisjointFlags.IsDisjoint)
2110 O << " disjoint";
2111 break;
2112 case OperationType::PossiblyExactOp:
2113 if (ExactFlags.IsExact)
2114 O << " exact";
2115 break;
2116 case OperationType::OverflowingBinOp:
2117 if (WrapFlags.HasNUW)
2118 O << " nuw";
2119 if (WrapFlags.HasNSW)
2120 O << " nsw";
2121 break;
2122 case OperationType::Trunc:
2123 if (TruncFlags.HasNUW)
2124 O << " nuw";
2125 if (TruncFlags.HasNSW)
2126 O << " nsw";
2127 break;
2128 case OperationType::FPMathOp:
2130 break;
2131 case OperationType::GEPOp:
2132 if (GEPFlags.isInBounds())
2133 O << " inbounds";
2134 else if (GEPFlags.hasNoUnsignedSignedWrap())
2135 O << " nusw";
2136 if (GEPFlags.hasNoUnsignedWrap())
2137 O << " nuw";
2138 break;
2139 case OperationType::NonNegOp:
2140 if (NonNegFlags.NonNeg)
2141 O << " nneg";
2142 break;
2143 case OperationType::Other:
2144 break;
2145 }
2146 O << " ";
2147}
2148#endif
2149
2151 auto &Builder = State.Builder;
2152 switch (Opcode) {
2153 case Instruction::Call:
2154 case Instruction::Br:
2155 case Instruction::PHI:
2156 case Instruction::GetElementPtr:
2157 case Instruction::Select:
2158 llvm_unreachable("This instruction is handled by a different recipe.");
2159 case Instruction::UDiv:
2160 case Instruction::SDiv:
2161 case Instruction::SRem:
2162 case Instruction::URem:
2163 case Instruction::Add:
2164 case Instruction::FAdd:
2165 case Instruction::Sub:
2166 case Instruction::FSub:
2167 case Instruction::FNeg:
2168 case Instruction::Mul:
2169 case Instruction::FMul:
2170 case Instruction::FDiv:
2171 case Instruction::FRem:
2172 case Instruction::Shl:
2173 case Instruction::LShr:
2174 case Instruction::AShr:
2175 case Instruction::And:
2176 case Instruction::Or:
2177 case Instruction::Xor: {
2178 // Just widen unops and binops.
2180 for (VPValue *VPOp : operands())
2181 Ops.push_back(State.get(VPOp));
2182
2183 Value *V = Builder.CreateNAryOp(Opcode, Ops);
2184
2185 if (auto *VecOp = dyn_cast<Instruction>(V)) {
2186 applyFlags(*VecOp);
2187 applyMetadata(*VecOp);
2188 }
2189
2190 // Use this vector value for all users of the original instruction.
2191 State.set(this, V);
2192 break;
2193 }
2194 case Instruction::ExtractValue: {
2195 assert(getNumOperands() == 2 && "expected single level extractvalue");
2196 Value *Op = State.get(getOperand(0));
2198 Value *Extract = Builder.CreateExtractValue(Op, CI->getZExtValue());
2199 State.set(this, Extract);
2200 break;
2201 }
2202 case Instruction::Freeze: {
2203 Value *Op = State.get(getOperand(0));
2204 Value *Freeze = Builder.CreateFreeze(Op);
2205 State.set(this, Freeze);
2206 break;
2207 }
2208 case Instruction::ICmp:
2209 case Instruction::FCmp: {
2210 // Widen compares by generating vector compares.
2211 bool FCmp = Opcode == Instruction::FCmp;
2212 Value *A = State.get(getOperand(0));
2213 Value *B = State.get(getOperand(1));
2214 Value *C = nullptr;
2215 if (FCmp) {
2216 C = Builder.CreateFCmp(getPredicate(), A, B);
2217 } else {
2218 C = Builder.CreateICmp(getPredicate(), A, B);
2219 }
2220 if (auto *I = dyn_cast<Instruction>(C)) {
2221 applyFlags(*I);
2222 applyMetadata(*I);
2223 }
2224 State.set(this, C);
2225 break;
2226 }
2227 default:
2228 // This instruction is not vectorized by simple widening.
2229 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
2230 << Instruction::getOpcodeName(Opcode));
2231 llvm_unreachable("Unhandled instruction!");
2232 } // end of switch.
2233
2234#if !defined(NDEBUG)
2235 // Verify that VPlan type inference results agree with the type of the
2236 // generated values.
2237 assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
2238 State.get(this)->getType() &&
2239 "inferred type and type from generated instructions do not match");
2240#endif
2241}
2242
2244 VPCostContext &Ctx) const {
2245 switch (Opcode) {
2246 case Instruction::UDiv:
2247 case Instruction::SDiv:
2248 case Instruction::SRem:
2249 case Instruction::URem:
2250 // If the div/rem operation isn't safe to speculate and requires
2251 // predication, then the only way we can even create a vplan is to insert
2252 // a select on the second input operand to ensure we use the value of 1
2253 // for the inactive lanes. The select will be costed separately.
2254 case Instruction::FNeg:
2255 case Instruction::Add:
2256 case Instruction::FAdd:
2257 case Instruction::Sub:
2258 case Instruction::FSub:
2259 case Instruction::Mul:
2260 case Instruction::FMul:
2261 case Instruction::FDiv:
2262 case Instruction::FRem:
2263 case Instruction::Shl:
2264 case Instruction::LShr:
2265 case Instruction::AShr:
2266 case Instruction::And:
2267 case Instruction::Or:
2268 case Instruction::Xor:
2269 case Instruction::Freeze:
2270 case Instruction::ExtractValue:
2271 case Instruction::ICmp:
2272 case Instruction::FCmp:
2273 return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
2274 default:
2275 llvm_unreachable("Unsupported opcode for instruction");
2276 }
2277}
2278
2279#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2281 VPSlotTracker &SlotTracker) const {
2282 O << Indent << "WIDEN ";
2284 O << " = " << Instruction::getOpcodeName(Opcode);
2285 printFlags(O);
2287}
2288#endif
2289
2291 auto &Builder = State.Builder;
2292 // Vectorize casts.
2293 assert(State.VF.isVector() && "Not vectorizing?");
2294 Type *DestTy = VectorType::get(getResultType(), State.VF);
2295 VPValue *Op = getOperand(0);
2296 Value *A = State.get(Op);
2297 Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
2298 State.set(this, Cast);
2299 if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
2300 applyFlags(*CastOp);
2301 applyMetadata(*CastOp);
2302 }
2303}
2304
2306 VPCostContext &Ctx) const {
2307 // TODO: In some cases, VPWidenCastRecipes are created but not considered in
2308 // the legacy cost model, including truncates/extends when evaluating a
2309 // reduction in a smaller type.
2310 if (!getUnderlyingValue())
2311 return 0;
2312 // Computes the CastContextHint from a recipe that may access memory.
2313 auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
2314 if (VF.isScalar())
2316 if (isa<VPInterleaveBase>(R))
2318 if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R))
2319 return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
2321 const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
2322 if (WidenMemoryRecipe == nullptr)
2324 if (!WidenMemoryRecipe->isConsecutive())
2326 if (WidenMemoryRecipe->isReverse())
2328 if (WidenMemoryRecipe->isMasked())
2331 };
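  // For example (illustrative): a zext fed by a consecutive, masked widened
  // load is costed with CastContextHint::Masked below, which lets targets
  // price the extend as part of an extending masked load.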
2332
2333 VPValue *Operand = getOperand(0);
2335 // For Trunc/FPTrunc, get the context from the only user.
2336 if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
2338 if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(*user_begin()))
2339 CCH = ComputeCCH(StoreRecipe);
2340 }
2341 // For Z/Sext, get the context from the operand.
2342 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
2343 Opcode == Instruction::FPExt) {
2344 if (Operand->isLiveIn())
2346 else if (Operand->getDefiningRecipe())
2347 CCH = ComputeCCH(Operand->getDefiningRecipe());
2348 }
2349
2350 auto *SrcTy =
2351 cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(Operand), VF));
2352 auto *DestTy = cast<VectorType>(toVectorTy(getResultType(), VF));
2353 // Arm TTI will use the underlying instruction to determine the cost.
2354 return Ctx.TTI.getCastInstrCost(
2355 Opcode, DestTy, SrcTy, CCH, Ctx.CostKind,
2357}
2358
2359#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2361 VPSlotTracker &SlotTracker) const {
2362 O << Indent << "WIDEN-CAST ";
2364 O << " = " << Instruction::getOpcodeName(Opcode);
2365 printFlags(O);
2367 O << " to " << *getResultType();
2368}
2369#endif
2370
2372 VPCostContext &Ctx) const {
2373 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
2374}
2375
2376/// A helper function that returns an integer or floating-point constant with
2377/// value C.
2379 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
2380 : ConstantFP::get(Ty, C);
2381}
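// Usage note (illustration only): getSignedIntOrFpConstant(Ty, Lane) yields
// the ConstantInt `Lane` for integer types and the ConstantFP `Lane` for FP
// types, so the per-lane offset code in VPScalarIVStepsRecipe::execute below
// works for integer and floating-point inductions alike.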
2382
2383#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2385 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
2386 O << Indent;
2388 O << " = WIDEN-INDUCTION";
2389 printFlags(O);
2390 O << " ";
2392
2393 if (auto *TI = getTruncInst())
2394 O << " (truncated to " << *TI->getType() << ")";
2395}
2396#endif
2397
2399 // The step may be defined by a recipe in the preheader (e.g. if it requires
2400 // SCEV expansion), but for the canonical induction the step is required to be
2401 // 1, which is represented as a live-in.
2403 return false;
2406 return StartC && StartC->isZero() && StepC && StepC->isOne() &&
2407 getScalarType() == getRegion()->getCanonicalIVType();
2408}
2409
2410#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2412 VPSlotTracker &SlotTracker) const {
2413 O << Indent;
2415 O << " = DERIVED-IV ";
2416 getStartValue()->printAsOperand(O, SlotTracker);
2417 O << " + ";
2418 getOperand(1)->printAsOperand(O, SlotTracker);
2419 O << " * ";
2420 getStepValue()->printAsOperand(O, SlotTracker);
2421}
2422#endif
2423
2425 // Fast-math-flags propagate from the original induction instruction.
2426 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
2427 if (hasFastMathFlags())
2428 State.Builder.setFastMathFlags(getFastMathFlags());
2429
2430 // Compute scalar induction steps: the base induction value (operand 0) is
2431 // the value on which to base the steps, and the step operand is the step size.
2432
2433 Value *BaseIV = State.get(getOperand(0), VPLane(0));
2434 Value *Step = State.get(getStepValue(), VPLane(0));
2435 IRBuilderBase &Builder = State.Builder;
2436
2437 // Ensure step has the same type as that of scalar IV.
2438 Type *BaseIVTy = BaseIV->getType()->getScalarType();
2439 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
2440
2441 // We build scalar steps for both integer and floating-point induction
2442 // variables. Here, we determine the kind of arithmetic we will perform.
2445 if (BaseIVTy->isIntegerTy()) {
2446 AddOp = Instruction::Add;
2447 MulOp = Instruction::Mul;
2448 } else {
2449 AddOp = InductionOpcode;
2450 MulOp = Instruction::FMul;
2451 }
2452
2453 // Determine the number of scalars we need to generate for each unroll
2454 // iteration.
2455 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
2456 // Compute the scalar steps and save the results in State.
2457 Type *IntStepTy =
2458 IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
2459 Type *VecIVTy = nullptr;
2460 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2461 if (!FirstLaneOnly && State.VF.isScalable()) {
2462 VecIVTy = VectorType::get(BaseIVTy, State.VF);
2463 UnitStepVec =
2464 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2465 SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2466 SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
2467 }
2468
2469 unsigned StartLane = 0;
2470 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2471 if (State.Lane) {
2472 StartLane = State.Lane->getKnownLane();
2473 EndLane = StartLane + 1;
2474 }
2475 Value *StartIdx0;
2476 if (getUnrollPart(*this) == 0)
2477 StartIdx0 = ConstantInt::get(IntStepTy, 0);
2478 else {
2479 StartIdx0 = State.get(getOperand(2), true);
2480 if (getUnrollPart(*this) != 1) {
2481 StartIdx0 =
2482 Builder.CreateMul(StartIdx0, ConstantInt::get(StartIdx0->getType(),
2483 getUnrollPart(*this)));
2484 }
2485 StartIdx0 = Builder.CreateSExtOrTrunc(StartIdx0, IntStepTy);
2486 }
2487
2488 if (!FirstLaneOnly && State.VF.isScalable()) {
2489 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2490 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2491 if (BaseIVTy->isFloatingPointTy())
2492 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2493 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2494 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2495 State.set(this, Add);
2496 // It's also useful to record the per-lane values for the known minimum
2497 // number of elements, so we emit those below as well. This improves code
2498 // quality when extracting the first element, for example.
2499 }
2500
2501 if (BaseIVTy->isFloatingPointTy())
2502 StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
2503
2504 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
2505 Value *StartIdx = Builder.CreateBinOp(
2506 AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
2507 // The step returned by `createStepForVF` is a runtime-evaluated value
2508 // when VF is scalable. Otherwise, it should be folded into a Constant.
2509 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2510 "Expected StartIdx to be folded to a constant when VF is not "
2511 "scalable");
2512 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2513 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
2514 State.set(this, Add, VPLane(Lane));
2515 }
2516}
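// Plain-integer model (illustration only; names invented) of the per-lane
// values the loop above materializes for an integer IV:
//   lane L gets BaseIV + (StartIdx0 + L) * Step.
#include <cstdint>
#include <vector>

static std::vector<int64_t> scalarStepsModel(int64_t BaseIV, int64_t Step,
                                             int64_t StartIdx0,
                                             unsigned NumLanes) {
  std::vector<int64_t> Lanes;
  for (unsigned L = 0; L != NumLanes; ++L)
    Lanes.push_back(BaseIV + (StartIdx0 + L) * Step);
  return Lanes;
}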
2517
2518#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2520 VPSlotTracker &SlotTracker) const {
2521 O << Indent;
2523 O << " = SCALAR-STEPS ";
2525}
2526#endif
2527
2529 assert(State.VF.isVector() && "not widening");
2530 // Construct a vector GEP by widening the operands of the scalar GEP as
2531 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
2532 // results in a vector of pointers when at least one operand of the GEP
2533 // is vector-typed. Thus, to keep the representation compact, we only use
2534 // vector-typed operands for loop-varying values.
2535
2536 if (areAllOperandsInvariant()) {
2537 // If we are vectorizing, but the GEP has only loop-invariant operands,
2538 // the GEP we build (by only using vector-typed operands for
2539 // loop-varying values) would be a scalar pointer. Thus, to ensure we
2540 // produce a vector of pointers, we need to either arbitrarily pick an
2541 // operand to broadcast, or broadcast a clone of the original GEP.
2542 // Here, we broadcast a clone of the original.
2543 //
2544 // TODO: If at some point we decide to scalarize instructions having
2545 // loop-invariant operands, this special case will no longer be
2546 // required. We would add the scalarization decision to
2547 // collectLoopScalars() and teach getVectorValue() to broadcast
2548 // the lane-zero scalar value.
2550 for (unsigned I = 0, E = getNumOperands(); I != E; I++)
2551 Ops.push_back(State.get(getOperand(I), VPLane(0)));
2552
2553 auto *NewGEP =
2554 State.Builder.CreateGEP(getSourceElementType(), Ops[0], drop_begin(Ops),
2555 "", getGEPNoWrapFlags());
2556 Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
2557 State.set(this, Splat);
2558 } else {
2559 // If the GEP has at least one loop-varying operand, we are sure to
2560 // produce a vector of pointers unless VF is scalar.
2561 // The pointer operand of the new GEP. If it's loop-invariant, we
2562 // won't broadcast it.
2563 auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant());
2564
2565 // Collect all the indices for the new GEP. If any index is
2566 // loop-invariant, we won't broadcast it.
2568 for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
2569 VPValue *Operand = getOperand(I);
2570 Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1)));
2571 }
2572
2573 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
2574 // but it should be a vector, otherwise.
2575 auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices,
2576 "", getGEPNoWrapFlags());
2577 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
2578 "NewGEP is not a pointer vector");
2579 State.set(this, NewGEP);
2580 }
2581}
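// IRBuilder sketch (illustration only; assumes an existing builder, source
// element type, and loop-invariant scalar operands) of the all-invariant case
// above: build one scalar GEP and broadcast it to a vector of identical
// pointers. The helper name is invented for the example.
#include "llvm/IR/IRBuilder.h"

static llvm::Value *splatInvariantGEP(llvm::IRBuilderBase &B, llvm::Type *EltTy,
                                      llvm::Value *Base, llvm::Value *Idx,
                                      llvm::ElementCount VF) {
  llvm::Value *ScalarGEP = B.CreateGEP(EltTy, Base, Idx);
  return B.CreateVectorSplat(VF, ScalarGEP);
}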
2582
2583#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2585 VPSlotTracker &SlotTracker) const {
2586 O << Indent << "WIDEN-GEP ";
2587 O << (isPointerLoopInvariant() ? "Inv" : "Var");
2588 for (size_t I = 0; I < getNumOperands() - 1; ++I)
2589 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
2590
2591 O << " ";
2593 O << " = getelementptr";
2594 printFlags(O);
2596}
2597#endif
2598
2599static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, bool IsUnitStride,
2600 unsigned CurrentPart, IRBuilderBase &Builder) {
2601 // Use i32 for the GEP index type when the value is constant,
2602 // or query DataLayout for a more suitable index type otherwise.
2603 const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
2604 return !IsUnitStride || (IsScalable && (IsReverse || CurrentPart > 0))
2605 ? DL.getIndexType(Builder.getPtrTy(0))
2606 : Builder.getInt32Ty();
2607}
2608
2610 auto &Builder = State.Builder;
2611 unsigned CurrentPart = getUnrollPart(*this);
2612 bool IsUnitStride = Stride == 1 || Stride == -1;
2613 Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true,
2614 IsUnitStride, CurrentPart, Builder);
2615
2616 // The wide store needs to start at the last vector element.
2617 Value *RunTimeVF = State.get(getVFValue(), VPLane(0));
2618 if (IndexTy != RunTimeVF->getType())
2619 RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy);
2620 // NumElt = Stride * CurrentPart * RunTimeVF
2621 Value *NumElt = Builder.CreateMul(
2622 ConstantInt::get(IndexTy, Stride * (int64_t)CurrentPart), RunTimeVF);
2623 // LastLane = Stride * (RunTimeVF - 1)
2624 Value *LastLane = Builder.CreateSub(RunTimeVF, ConstantInt::get(IndexTy, 1));
2625 if (Stride != 1)
2626 LastLane = Builder.CreateMul(ConstantInt::get(IndexTy, Stride), LastLane);
2627 Value *Ptr = State.get(getOperand(0), VPLane(0));
2628 Value *ResultPtr =
2629 Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags());
2630 ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "",
2632
2633 State.set(this, ResultPtr, /*IsScalar*/ true);
2634}
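// Plain-integer model (illustration only) of the address arithmetic above, in
// units of IndexedTy elements: step to the start of this unrolled part, then
// to its last lane.
#include <cstdint>

static int64_t vectorEndOffsetModel(int64_t Stride, int64_t CurrentPart,
                                    int64_t RunTimeVF) {
  int64_t NumElt = Stride * CurrentPart * RunTimeVF; // start of this part
  int64_t LastLane = Stride * (RunTimeVF - 1);       // last lane within it
  return NumElt + LastLane;
}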
2635
2636#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2638 VPSlotTracker &SlotTracker) const {
2639 O << Indent;
2641 O << " = vector-end-pointer";
2642 printFlags(O);
2644}
2645#endif
2646
2648 auto &Builder = State.Builder;
2649 unsigned CurrentPart = getUnrollPart(*this);
2650 Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false,
2651 /*IsUnitStride*/ true, CurrentPart, Builder);
2652 Value *Ptr = State.get(getOperand(0), VPLane(0));
2653
2654 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
2655 Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Increment,
2656 "", getGEPNoWrapFlags());
2657
2658 State.set(this, ResultPtr, /*IsScalar*/ true);
2659}
2660
2661#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2663 VPSlotTracker &SlotTracker) const {
2664 O << Indent;
2666 O << " = vector-pointer ";
2667
2669}
2670#endif
2671
2673 VPCostContext &Ctx) const {
2674 // Handle cases where only the first lane is used in the same way as the
2675 // legacy cost model does.
2677 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
2678
2679 Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
2680 Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
2681 return (getNumIncomingValues() - 1) *
2682 Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
2683 CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
2684}
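// Scalar model (illustration only; one possible lowering) of a blend with N
// incoming values: incoming value 0 is the starting value and every further
// incoming value is folded in with one select, i.e. N - 1 selects per lane,
// which is what the cost above counts. Names are invented for the example.
#include <cstddef>
#include <vector>

static int blendModel(const std::vector<int> &Incoming,
                      const std::vector<bool> &Masks) {
  int Result = Incoming[0]; // Masks[0] is unused after mask normalization.
  for (std::size_t I = 1; I < Incoming.size(); ++I)
    Result = Masks[I] ? Incoming[I] : Result; // one select per extra incoming
  return Result;
}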
2685
2686#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2688 VPSlotTracker &SlotTracker) const {
2689 O << Indent << "BLEND ";
2691 O << " =";
2692 if (getNumIncomingValues() == 1) {
2693 // Not a User of any mask: not really blending, this is a
2694 // single-predecessor phi.
2695 O << " ";
2696 getIncomingValue(0)->printAsOperand(O, SlotTracker);
2697 } else {
2698 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
2699 O << " ";
2700 getIncomingValue(I)->printAsOperand(O, SlotTracker);
2701 if (I == 0)
2702 continue;
2703 O << "/";
2704 getMask(I)->printAsOperand(O, SlotTracker);
2705 }
2706 }
2707}
2708#endif
2709
2711 assert(!State.Lane && "Reduction being replicated.");
2712 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2715 "In-loop AnyOf reductions aren't currently supported");
2716 // Propagate the fast-math flags carried by the underlying instruction.
2717 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
2718 State.Builder.setFastMathFlags(getFastMathFlags());
2719 Value *NewVecOp = State.get(getVecOp());
2720 if (VPValue *Cond = getCondOp()) {
2721 Value *NewCond = State.get(Cond, State.VF.isScalar());
2722 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
2723 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
2724
2725 Value *Start = getRecurrenceIdentity(Kind, ElementTy, getFastMathFlags());
2726 if (State.VF.isVector())
2727 Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
2728
2729 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
2730 NewVecOp = Select;
2731 }
2732 Value *NewRed;
2733 Value *NextInChain;
2734 if (IsOrdered) {
2735 if (State.VF.isVector())
2736 NewRed =
2737 createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain);
2738 else
2739 NewRed = State.Builder.CreateBinOp(
2741 PrevInChain, NewVecOp);
2742 PrevInChain = NewRed;
2743 NextInChain = NewRed;
2744 } else {
2745 PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2746 NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind);
2748 NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain);
2749 else
2750 NextInChain = State.Builder.CreateBinOp(
2752 PrevInChain, NewRed);
2753 }
2754 State.set(this, NextInChain, /*IsScalar*/ true);
2755}
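// Scalar model (illustration only) of the conditional in-loop reduction above
// for an unordered integer add: masked-off lanes are replaced by the identity
// (0 for add) before the vector is reduced and folded into the chain value.
// Names are invented for the example.
#include <cstddef>
#include <cstdint>
#include <vector>

static int64_t maskedAddReductionModel(int64_t PrevInChain,
                                       const std::vector<int64_t> &VecOp,
                                       const std::vector<bool> &Cond) {
  int64_t Red = 0;
  for (std::size_t I = 0; I < VecOp.size(); ++I)
    Red += Cond[I] ? VecOp[I] : 0; // identity for inactive lanes
  return PrevInChain + Red;        // fold the reduced value into the chain
}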
2756
2758 assert(!State.Lane && "Reduction being replicated.");
2759
2760 auto &Builder = State.Builder;
2761 // Propagate the fast-math flags carried by the underlying instruction.
2762 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
2763 Builder.setFastMathFlags(getFastMathFlags());
2764
2766 Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
2767 Value *VecOp = State.get(getVecOp());
2768 Value *EVL = State.get(getEVL(), VPLane(0));
2769
2770 Value *Mask;
2771 if (VPValue *CondOp = getCondOp())
2772 Mask = State.get(CondOp);
2773 else
2774 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2775
2776 Value *NewRed;
2777 if (isOrdered()) {
2778 NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL);
2779 } else {
2780 NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL);
2782 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
2783 else
2784 NewRed = Builder.CreateBinOp(
2786 Prev);
2787 }
2788 State.set(this, NewRed, /*IsScalar*/ true);
2789}
2790
2792 VPCostContext &Ctx) const {
2793 RecurKind RdxKind = getRecurrenceKind();
2794 Type *ElementTy = Ctx.Types.inferScalarType(this);
2795 auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
2796 unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
2798 std::optional<FastMathFlags> OptionalFMF =
2799 ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
2800
2801 // TODO: Support any-of reductions.
2802 assert(
2804 ForceTargetInstructionCost.getNumOccurrences() > 0) &&
2805 "Any-of reduction not implemented in VPlan-based cost model currently.");
2806
2807 // Note that TTI should model the cost of moving the result to a scalar
2808 // register and the BinOp cost in getMinMaxReductionCost().
2811 return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
2812 }
2813
2814 // Note that TTI should model the cost of moving the result to a scalar
2815 // register and the BinOp cost in getArithmeticReductionCost().
2816 return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
2817 Ctx.CostKind);
2818}
2819
2821 ExpressionTypes ExpressionType,
2822 ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
2823 : VPSingleDefRecipe(VPDef::VPExpressionSC, {}, {}),
2824 ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) {
2825 assert(!ExpressionRecipes.empty() && "Nothing to combine?");
2826 assert(
2827 none_of(ExpressionRecipes,
2828 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
2829 "expression cannot contain recipes with side-effects");
2830
2831 // Maintain a copy of the expression recipes as a set of users.
2832 SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
2833 for (auto *R : ExpressionRecipes)
2834 ExpressionRecipesAsSetOfUsers.insert(R);
2835
2836 // Recipes in the expression, except the last one, must only be used by
2837 // (other) recipes inside the expression. If there are other users, external
2838 // to the expression, use a clone of the recipe for external users.
2839 for (VPSingleDefRecipe *R : reverse(ExpressionRecipes)) {
2840 if (R != ExpressionRecipes.back() &&
2841 any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
2842 return !ExpressionRecipesAsSetOfUsers.contains(U);
2843 })) {
2844 // There are users outside of the expression. Clone the recipe and use
2845 // the clone for those external users.
2846 VPSingleDefRecipe *CopyForExtUsers = R->clone();
2847 R->replaceUsesWithIf(CopyForExtUsers, [&ExpressionRecipesAsSetOfUsers](
2848 VPUser &U, unsigned) {
2849 return !ExpressionRecipesAsSetOfUsers.contains(&U);
2850 });
2851 CopyForExtUsers->insertBefore(R);
2852 }
2853 if (R->getParent())
2854 R->removeFromParent();
2855 }
2856
2857 // Internalize all external operands to the expression recipes. To do so,
2858 // create new temporary VPValues for all operands defined by a recipe outside
2859 // the expression. The original operands are added as operands of the
2860 // VPExpressionRecipe itself.
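  // For example (illustrative only): bundling {ext, mul, reduce} where ext
  // reads a value %a defined outside the bundle adds %a as an operand of the
  // VPExpressionRecipe, and ext temporarily uses a placeholder VPValue until
  // decompose() reconnects the original operand.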
2861 for (auto *R : ExpressionRecipes) {
2862 for (const auto &[Idx, Op] : enumerate(R->operands())) {
2863 auto *Def = Op->getDefiningRecipe();
2864 if (Def && ExpressionRecipesAsSetOfUsers.contains(Def))
2865 continue;
2866 addOperand(Op);
2867 LiveInPlaceholders.push_back(new VPValue());
2868 }
2869 }
2870
2871 // Replace each external operand with the first one created for it in
2872 // LiveInPlaceholders.
2873 for (auto *R : ExpressionRecipes)
2874 for (auto const &[LiveIn, Tmp] : zip(operands(), LiveInPlaceholders))
2875 R->replaceUsesOfWith(LiveIn, Tmp);
2876}
2877
2879 for (auto *R : ExpressionRecipes)
2880 // Since the list could contain duplicates, make sure the recipe hasn't
2881 // already been inserted.
2882 if (!R->getParent())
2883 R->insertBefore(this);
2884
2885 for (const auto &[Idx, Op] : enumerate(operands()))
2886 LiveInPlaceholders[Idx]->replaceAllUsesWith(Op);
2887
2888 replaceAllUsesWith(ExpressionRecipes.back());
2889 ExpressionRecipes.clear();
2890}
2891
2893 VPCostContext &Ctx) const {
2894 Type *RedTy = Ctx.Types.inferScalarType(this);
2895 auto *SrcVecTy = cast<VectorType>(
2896 toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
2897 assert(RedTy->isIntegerTy() &&
2898 "VPExpressionRecipe only supports integer types currently.");
2899 unsigned Opcode = RecurrenceDescriptor::getOpcode(
2900 cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
2901 switch (ExpressionType) {
2902 case ExpressionTypes::ExtendedReduction: {
2903 unsigned Opcode = RecurrenceDescriptor::getOpcode(
2904 cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
2905 auto *ExtR = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
2906 return isa<VPPartialReductionRecipe>(ExpressionRecipes.back())
2907 ? Ctx.TTI.getPartialReductionCost(
2908 Opcode, Ctx.Types.inferScalarType(getOperand(0)), nullptr,
2909 RedTy, VF,
2911 ExtR->getOpcode()),
2912 TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind)
2913 : Ctx.TTI.getExtendedReductionCost(
2914 Opcode, ExtR->getOpcode() == Instruction::ZExt, RedTy,
2915 SrcVecTy, std::nullopt, Ctx.CostKind);
2916 }
2917 case ExpressionTypes::MulAccReduction:
2918 return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
2919 Ctx.CostKind);
2920
2921 case ExpressionTypes::ExtNegatedMulAccReduction:
2922 assert(Opcode == Instruction::Add && "Unexpected opcode");
2923 Opcode = Instruction::Sub;
2924 [[fallthrough]];
2925 case ExpressionTypes::ExtMulAccReduction: {
2926 if (isa<VPPartialReductionRecipe>(ExpressionRecipes.back())) {
2927 auto *Ext0R = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
2928 auto *Ext1R = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
2929 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
2930 return Ctx.TTI.getPartialReductionCost(
2931 Opcode, Ctx.Types.inferScalarType(getOperand(0)),
2932 Ctx.Types.inferScalarType(getOperand(1)), RedTy, VF,
2934 Ext0R->getOpcode()),
2936 Ext1R->getOpcode()),
2937 Mul->getOpcode(), Ctx.CostKind);
2938 }
2939 return Ctx.TTI.getMulAccReductionCost(
2940 cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
2941 Instruction::ZExt,
2942 Opcode, RedTy, SrcVecTy, Ctx.CostKind);
2943 }
2944 }
2945 llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
2946}
2947
2949 return any_of(ExpressionRecipes, [](VPSingleDefRecipe *R) {
2950 return R->mayReadFromMemory() || R->mayWriteToMemory();
2951 });
2952}
2953
2955 assert(
2956 none_of(ExpressionRecipes,
2957 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
2958 "expression cannot contain recipes with side-effects");
2959 return false;
2960}
2961
2963 // Cannot use vputils::isSingleScalar(), because all external operands
2964 // of the expression will be live-ins while bundled.
2965 return isa<VPReductionRecipe>(ExpressionRecipes.back()) &&
2966 !isa<VPPartialReductionRecipe>(ExpressionRecipes.back());
2967}
2968
2969#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2970
2972 VPSlotTracker &SlotTracker) const {
2973 O << Indent << "EXPRESSION ";
2975 O << " = ";
2976 auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
2977 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
2978 bool IsPartialReduction = isa<VPPartialReductionRecipe>(Red);
2979
2980 switch (ExpressionType) {
2981 case ExpressionTypes::ExtendedReduction: {
2983 O << " + " << (IsPartialReduction ? "partial." : "") << "reduce.";
2984 O << Instruction::getOpcodeName(Opcode) << " (";
2986 Red->printFlags(O);
2987
2988 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
2989 O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
2990 << *Ext0->getResultType();
2991 if (Red->isConditional()) {
2992 O << ", ";
2993 Red->getCondOp()->printAsOperand(O, SlotTracker);
2994 }
2995 O << ")";
2996 break;
2997 }
2998 case ExpressionTypes::ExtNegatedMulAccReduction: {
3000 O << " + " << (IsPartialReduction ? "partial." : "") << "reduce.";
3002 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
3003 << " (sub (0, mul";
3004 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
3005 Mul->printFlags(O);
3006 O << "(";
3008 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3009 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3010 << *Ext0->getResultType() << "), (";
3012 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3013 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3014 << *Ext1->getResultType() << ")";
3015 if (Red->isConditional()) {
3016 O << ", ";
3017 Red->getCondOp()->printAsOperand(O, SlotTracker);
3018 }
3019 O << "))";
3020 break;
3021 }
3022 case ExpressionTypes::MulAccReduction:
3023 case ExpressionTypes::ExtMulAccReduction: {
3025 O << " + " << (IsPartialReduction ? "partial." : "") << "reduce.";
3027 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
3028 << " (";
3029 O << "mul";
3030 bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
3031 auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
3032 : ExpressionRecipes[0]);
3033 Mul->printFlags(O);
3034 if (IsExtended)
3035 O << "(";
3037 if (IsExtended) {
3038 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3039 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3040 << *Ext0->getResultType() << "), (";
3041 } else {
3042 O << ", ";
3043 }
3045 if (IsExtended) {
3046 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3047 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3048 << *Ext1->getResultType() << ")";
3049 }
3050 if (Red->isConditional()) {
3051 O << ", ";
3052 Red->getCondOp()->printAsOperand(O, SlotTracker);
3053 }
3054 O << ")";
3055 break;
3056 }
3057 }
3058}
3059
3061 VPSlotTracker &SlotTracker) const {
3062 O << Indent << "REDUCE ";
3064 O << " = ";
3066 O << " +";
3067 printFlags(O);
3068 O << " reduce."
3071 << " (";
3073 if (isConditional()) {
3074 O << ", ";
3076 }
3077 O << ")";
3078}
3079
3081 VPSlotTracker &SlotTracker) const {
3082 O << Indent << "REDUCE ";
3084 O << " = ";
3086 O << " +";
3087 printFlags(O);
3088 O << " vp.reduce."
3091 << " (";
3093 O << ", ";
3095 if (isConditional()) {
3096 O << ", ";
3098 }
3099 O << ")";
3100}
3101
3102#endif
3103
3104/// A helper function to scalarize a single Instruction in the innermost loop.
3105/// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue
3106/// operands from \p RepRecipe instead of \p Instr's operands.
3107static void scalarizeInstruction(const Instruction *Instr,
3108 VPReplicateRecipe *RepRecipe,
3109 const VPLane &Lane, VPTransformState &State) {
3110 assert((!Instr->getType()->isAggregateType() ||
3111 canVectorizeTy(Instr->getType())) &&
3112 "Expected vectorizable or non-aggregate type.");
3113
3114 // Does this instruction return a value ?
3115 bool IsVoidRetTy = Instr->getType()->isVoidTy();
3116
3117 Instruction *Cloned = Instr->clone();
3118 if (!IsVoidRetTy) {
3119 Cloned->setName(Instr->getName() + ".cloned");
3120 Type *ResultTy = State.TypeAnalysis.inferScalarType(RepRecipe);
3121 // The operands of the replicate recipe may have been narrowed, resulting in
3122 // a narrower result type. Update the type of the cloned instruction to the
3123 // correct type.
3124 if (ResultTy != Cloned->getType())
3125 Cloned->mutateType(ResultTy);
3126 }
3127
3128 RepRecipe->applyFlags(*Cloned);
3129 RepRecipe->applyMetadata(*Cloned);
3130
3131 if (RepRecipe->hasPredicate())
3132 cast<CmpInst>(Cloned)->setPredicate(RepRecipe->getPredicate());
3133
3134 if (auto DL = RepRecipe->getDebugLoc())
3135 State.setDebugLocFrom(DL);
3136
3137 // Replace the operands of the cloned instruction with their scalar
3138 // equivalents in the new loop.
3139 for (const auto &I : enumerate(RepRecipe->operands())) {
3140 auto InputLane = Lane;
3141 VPValue *Operand = I.value();
3142 if (vputils::isSingleScalar(Operand))
3143 InputLane = VPLane::getFirstLane();
3144 Cloned->setOperand(I.index(), State.get(Operand, InputLane));
3145 }
3146
3147 // Place the cloned scalar in the new loop.
3148 State.Builder.Insert(Cloned);
3149
3150 State.set(RepRecipe, Cloned, Lane);
3151
3152 // If we just cloned a new assumption, add it to the assumption cache.
3153 if (auto *II = dyn_cast<AssumeInst>(Cloned))
3154 State.AC->registerAssumption(II);
3155
3156 assert(
3157 (RepRecipe->getRegion() ||
3158 !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
3159 all_of(RepRecipe->operands(),
3160 [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
3161 "Expected a recipe is either within a region or all of its operands "
3162 "are defined outside the vectorized region.");
3163}
3164
3167
3168 if (!State.Lane) {
3169 assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
3170 "must have already been unrolled");
3171 scalarizeInstruction(UI, this, VPLane(0), State);
3172 return;
3173 }
3174
3175 assert((State.VF.isScalar() || !isSingleScalar()) &&
3176 "uniform recipe shouldn't be predicated");
3177 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
3178 scalarizeInstruction(UI, this, *State.Lane, State);
3179 // Insert scalar instance packing it into a vector.
3180 if (State.VF.isVector() && shouldPack()) {
3181 Value *WideValue =
3182 State.Lane->isFirstLane()
3183 ? PoisonValue::get(toVectorizedTy(UI->getType(), State.VF))
3184 : State.get(this);
3185 State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
3186 *State.Lane));
3187 }
3188}
3189
3191 // Find if the recipe is used by a widened recipe via an intervening
3192 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
3193 return any_of(users(), [](const VPUser *U) {
3194 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
3195 return !vputils::onlyScalarValuesUsed(PredR);
3196 return false;
3197 });
3198}
3199
3200/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
3201/// which the legacy cost model computes a SCEV expression when computing the
3202/// address cost. Computing SCEVs for VPValues is incomplete and returns
3203/// SCEVCouldNotCompute in cases where the legacy cost model can compute them.
3204/// In those cases we fall back to the legacy cost model; otherwise return nullptr.
3205static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, ScalarEvolution &SE,
3206 const Loop *L) {
3207 auto *PtrR = Ptr->getDefiningRecipe();
3208 if (!PtrR || !((isa<VPReplicateRecipe>(Ptr) &&
3210 Instruction::GetElementPtr) ||
3211 isa<VPWidenGEPRecipe>(Ptr) ||
3213 return nullptr;
3214
3215 // We are looking for a GEP where all indices are either loop invariant or
3216 // inductions.
3217 for (VPValue *Opd : drop_begin(PtrR->operands())) {
3218 if (!Opd->isDefinedOutsideLoopRegions() &&
3220 return nullptr;
3221 }
3222
3223 return vputils::getSCEVExprForVPValue(Ptr, SE, L);
3224}
3225
3226/// Returns true if \p V is used as part of the address of another load or
3227/// store.
3228static bool isUsedByLoadStoreAddress(const VPUser *V) {
3230 SmallVector<const VPUser *> WorkList = {V};
3231
3232 while (!WorkList.empty()) {
3233 auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
3234 if (!Cur || !Seen.insert(Cur).second)
3235 continue;
3236
3237 auto *Blend = dyn_cast<VPBlendRecipe>(Cur);
3238 // Skip blends that use V only through a compare (i.e. via their mask); we
3239 // detect this by checking whether any incoming value has already been visited.
3240 if (Blend && none_of(seq<unsigned>(0, Blend->getNumIncomingValues()),
3241 [&](unsigned I) {
3242 return Seen.contains(
3243 Blend->getIncomingValue(I)->getDefiningRecipe());
3244 }))
3245 continue;
3246
3247 for (VPUser *U : Cur->users()) {
3248 if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
3249 if (InterleaveR->getAddr() == Cur)
3250 return true;
3251 if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
3252 if (RepR->getOpcode() == Instruction::Load &&
3253 RepR->getOperand(0) == Cur)
3254 return true;
3255 if (RepR->getOpcode() == Instruction::Store &&
3256 RepR->getOperand(1) == Cur)
3257 return true;
3258 }
3259 if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
3260 if (MemR->getAddr() == Cur && MemR->isConsecutive())
3261 return true;
3262 }
3263 }
3264
3265 // The legacy cost model only supports scalarized loads/stores with phi
3266 // addresses if the phi is directly used as the load/store address. Don't
3267 // traverse further for blends.
3268 if (Blend)
3269 continue;
3270
3271 append_range(WorkList, Cur->users());
3272 }
3273 return false;
3274}
3275
3277 VPCostContext &Ctx) const {
3279 // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
3280 // transform; avoid computing its cost multiple times for now.
3281 Ctx.SkipCostComputation.insert(UI);
3282
3283 if (VF.isScalable() && !isSingleScalar())
3285
3286 switch (UI->getOpcode()) {
3287 case Instruction::GetElementPtr:
3288 // We mark this instruction as zero-cost because the cost of GEPs in
3289 // vectorized code depends on whether the corresponding memory instruction
3290 // is scalarized or not. Therefore, we handle GEPs with the memory
3291 // instruction cost.
3292 return 0;
3293 case Instruction::Call: {
3294 auto *CalledFn =
3296
3299 for (const VPValue *ArgOp : ArgOps)
3300 Tys.push_back(Ctx.Types.inferScalarType(ArgOp));
3301
3302 if (CalledFn->isIntrinsic())
3303 // Various pseudo-intrinsics with costs of 0 are scalarized instead of
3304 // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early.
3305 switch (CalledFn->getIntrinsicID()) {
3306 case Intrinsic::assume:
3307 case Intrinsic::lifetime_end:
3308 case Intrinsic::lifetime_start:
3309 case Intrinsic::sideeffect:
3310 case Intrinsic::pseudoprobe:
3311 case Intrinsic::experimental_noalias_scope_decl: {
3312 assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
3313 ElementCount::getFixed(1), Ctx) == 0 &&
3314 "scalarizing intrinsic should be free");
3315 return InstructionCost(0);
3316 }
3317 default:
3318 break;
3319 }
3320
3321 Type *ResultTy = Ctx.Types.inferScalarType(this);
3322 InstructionCost ScalarCallCost =
3323 Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
3324 if (isSingleScalar()) {
3325 if (CalledFn->isIntrinsic())
3326 ScalarCallCost = std::min(
3327 ScalarCallCost,
3328 getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
3329 ElementCount::getFixed(1), Ctx));
3330 return ScalarCallCost;
3331 }
3332
3333 return ScalarCallCost * VF.getFixedValue() +
3334 Ctx.getScalarizationOverhead(ResultTy, ArgOps, VF);
3335 }
3336 case Instruction::Add:
3337 case Instruction::Sub:
3338 case Instruction::FAdd:
3339 case Instruction::FSub:
3340 case Instruction::Mul:
3341 case Instruction::FMul:
3342 case Instruction::FDiv:
3343 case Instruction::FRem:
3344 case Instruction::Shl:
3345 case Instruction::LShr:
3346 case Instruction::AShr:
3347 case Instruction::And:
3348 case Instruction::Or:
3349 case Instruction::Xor:
3350 case Instruction::ICmp:
3351 case Instruction::FCmp:
3353 Ctx) *
3354 (isSingleScalar() ? 1 : VF.getFixedValue());
3355 case Instruction::SDiv:
3356 case Instruction::UDiv:
3357 case Instruction::SRem:
3358 case Instruction::URem: {
3359 InstructionCost ScalarCost =
3361 if (isSingleScalar())
3362 return ScalarCost;
3363
3364 ScalarCost = ScalarCost * VF.getFixedValue() +
3365 Ctx.getScalarizationOverhead(Ctx.Types.inferScalarType(this),
3366 to_vector(operands()), VF);
3367 // If the recipe is not predicated (i.e. not in a replicate region), return
3368 // the scalar cost. Otherwise handle predicated cost.
3369 if (!getRegion()->isReplicator())
3370 return ScalarCost;
3371
3372 // Account for the phi nodes that we will create.
3373 ScalarCost += VF.getFixedValue() *
3374 Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
3375 // Scale the cost by the probability of executing the predicated blocks.
3376 // This assumes the predicated block for each vector lane is equally
3377 // likely.
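    // Worked example (illustrative): with VF = 4, scalar div cost C, phi cost
    // P and scalarization overhead S, the estimate below is roughly
    // (4 * C + S + 4 * P) / getPredBlockCostDivisor(...).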
3378 ScalarCost /= Ctx.getPredBlockCostDivisor(UI->getParent());
3379 return ScalarCost;
3380 }
3381 case Instruction::Load:
3382 case Instruction::Store: {
3383 // TODO: See getMemInstScalarizationCost for how to handle replicating and
3384 // predicated cases.
3385 const VPRegionBlock *ParentRegion = getRegion();
3386 if (ParentRegion && ParentRegion->isReplicator())
3387 break;
3388
3389 bool IsLoad = UI->getOpcode() == Instruction::Load;
3390 const VPValue *PtrOp = getOperand(!IsLoad);
3391 const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.SE, Ctx.L);
3393 break;
3394
3395 Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
3396 Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
3397 const Align Alignment = getLoadStoreAlignment(UI);
3398 unsigned AS = cast<PointerType>(ScalarPtrTy)->getAddressSpace();
3400 InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
3401 UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
3402
3403 Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
3404 bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
3405 bool UsedByLoadStoreAddress =
3406 !PreferVectorizedAddressing && isUsedByLoadStoreAddress(this);
3407 InstructionCost ScalarCost =
3408 ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
3409 PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE,
3410 PtrSCEV, Ctx.CostKind);
3411 if (isSingleScalar())
3412 return ScalarCost;
3413
3414 SmallVector<const VPValue *> OpsToScalarize;
3415 Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
3416 // Set ResultTy and OpsToScalarize if scalarization is needed. Currently we
3417 // don't assign scalarization overhead in general if the target prefers
3418 // vectorized addressing or the loaded value is used as part of the address
3419 // of another load or store.
3420 if (!UsedByLoadStoreAddress) {
3421 bool EfficientVectorLoadStore =
3422 Ctx.TTI.supportsEfficientVectorElementLoadStore();
3423 if (!(IsLoad && !PreferVectorizedAddressing) &&
3424 !(!IsLoad && EfficientVectorLoadStore))
3425 append_range(OpsToScalarize, operands());
3426
3427 if (!EfficientVectorLoadStore)
3428 ResultTy = Ctx.Types.inferScalarType(this);
3429 }
3430
3431 return (ScalarCost * VF.getFixedValue()) +
3432 Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
3433 }
3434 }
3435
3436 return Ctx.getLegacyCost(UI, VF);
3437}
3438
3439#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3441 VPSlotTracker &SlotTracker) const {
3442 O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");
3443
3444 if (!getUnderlyingInstr()->getType()->isVoidTy()) {
3446 O << " = ";
3447 }
3448 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
3449 O << "call";
3450 printFlags(O);
3451 O << "@" << CB->getCalledFunction()->getName() << "(";
3453 O, [&O, &SlotTracker](VPValue *Op) {
3454 Op->printAsOperand(O, SlotTracker);
3455 });
3456 O << ")";
3457 } else {
3459 printFlags(O);
3461 }
3462
3463 if (shouldPack())
3464 O << " (S->V)";
3465}
3466#endif
3467
3469 assert(State.Lane && "Branch on Mask works only on single instance.");
3470
3471 VPValue *BlockInMask = getOperand(0);
3472 Value *ConditionBit = State.get(BlockInMask, *State.Lane);
3473
3474 // Replace the temporary unreachable terminator with a new conditional branch,
3475 // whose two destinations will be set later when they are created.
3476 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
3477 assert(isa<UnreachableInst>(CurrentTerminator) &&
3478 "Expected to replace unreachable terminator with conditional branch.");
3479 auto CondBr =
3480 State.Builder.CreateCondBr(ConditionBit, State.CFG.PrevBB, nullptr);
3481 CondBr->setSuccessor(0, nullptr);
3482 CurrentTerminator->eraseFromParent();
3483}
3484
3486 VPCostContext &Ctx) const {
3487 // The legacy cost model doesn't assign costs to branches for individual
3488 // replicate regions. Match the current behavior in the VPlan cost model for
3489 // now.
3490 return 0;
3491}
3492
3494 assert(State.Lane && "Predicated instruction PHI works per instance.");
3495 Instruction *ScalarPredInst =
3496 cast<Instruction>(State.get(getOperand(0), *State.Lane));
3497 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
3498 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
3499 assert(PredicatingBB && "Predicated block has no single predecessor.");
3501 "operand must be VPReplicateRecipe");
3502
3503 // By current pack/unpack logic we need to generate only a single phi node: if
3504 // a vector value for the predicated instruction exists at this point it means
3505 // the instruction has vector users only, and a phi for the vector value is
3506 // needed. In this case the recipe of the predicated instruction is marked to
3507 // also do that packing, thereby "hoisting" the insert-element sequence.
3508 // Otherwise, a phi node for the scalar value is needed.
3509 if (State.hasVectorValue(getOperand(0))) {
3510 auto *VecI = cast<Instruction>(State.get(getOperand(0)));
3512 "Packed operands must generate an insertelement or insertvalue");
3513
3514 // If VectorI is a struct, it will be a sequence like:
3515 // %1 = insertvalue %unmodified, %x, 0
3516 // %2 = insertvalue %1, %y, 1
3517 // %VectorI = insertvalue %2, %z, 2
3518 // To get the unmodified vector we need to look through the chain.
3519 if (auto *StructTy = dyn_cast<StructType>(VecI->getType()))
3520 for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++)
3521 VecI = cast<InsertValueInst>(VecI->getOperand(0));
3522
3523 PHINode *VPhi = State.Builder.CreatePHI(VecI->getType(), 2);
3524 VPhi->addIncoming(VecI->getOperand(0), PredicatingBB); // Unmodified vector.
3525 VPhi->addIncoming(VecI, PredicatedBB); // New vector with inserted element.
3526 if (State.hasVectorValue(this))
3527 State.reset(this, VPhi);
3528 else
3529 State.set(this, VPhi);
3530 // NOTE: Currently we need to update the value of the operand, so the next
3531 // predicated iteration inserts its generated value in the correct vector.
3532 State.reset(getOperand(0), VPhi);
3533 } else {
3534 if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane())
3535 return;
3536
3537 Type *PredInstType = State.TypeAnalysis.inferScalarType(getOperand(0));
3538 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
3539 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
3540 PredicatingBB);
3541 Phi->addIncoming(ScalarPredInst, PredicatedBB);
3542 if (State.hasScalarValue(this, *State.Lane))
3543 State.reset(this, Phi, *State.Lane);
3544 else
3545 State.set(this, Phi, *State.Lane);
3546 // NOTE: Currently we need to update the value of the operand, so the next
3547 // predicated iteration inserts its generated value in the correct vector.
3548 State.reset(getOperand(0), Phi, *State.Lane);
3549 }
3550}
3551
3552#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3554 VPSlotTracker &SlotTracker) const {
3555 O << Indent << "PHI-PREDICATED-INSTRUCTION ";
3557 O << " = ";
3559}
3560#endif
3561
3563 VPCostContext &Ctx) const {
3565 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
3566 ->getAddressSpace();
3567 unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
3568 ? Instruction::Load
3569 : Instruction::Store;
3570
3571 if (!Consecutive) {
3572 // TODO: Using the original IR may not be accurate.
3573 // Currently, ARM will use the underlying IR to calculate gather/scatter
3574 // instruction cost.
3575 assert(!Reverse &&
3576 "Non-consecutive memory accesses should not be reversed.");
3577
3579 Type *PtrTy = Ptr->getType();
3580
3581 // If the address value is uniform across all lanes, then the address can be
3582 // calculated with scalar type and broadcast.
3584 PtrTy = toVectorTy(PtrTy, VF);
3585
3586 return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
3587 Ctx.CostKind) +
3588 Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment,
3589 Ctx.CostKind, &Ingredient);
3590 }
3591
3593 if (IsMasked) {
3594 unsigned IID = isa<VPWidenLoadRecipe>(this) ? Intrinsic::masked_load
3595 : Intrinsic::masked_store;
3596 Cost +=
3597 Ctx.TTI.getMaskedMemoryOpCost({IID, Ty, Alignment, AS}, Ctx.CostKind);
3598 } else {
3599 TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
3601 : getOperand(1));
3602 Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
3603 OpInfo, &Ingredient);
3604 }
3605 if (!Reverse)
3606 return Cost;
3607
3608 return Cost += Ctx.TTI.getShuffleCost(
3610 cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
3611}
3612
3614 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
3615 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
3616 bool CreateGather = !isConsecutive();
3617
3618 auto &Builder = State.Builder;
3619 Value *Mask = nullptr;
3620 if (auto *VPMask = getMask()) {
3621 // Mask reversal is only needed when an explicit mask is present; an absent
3622 // (null) mask means all-ones, and the reverse of an all-ones mask is unchanged.
3623 Mask = State.get(VPMask);
3624 if (isReverse())
3625 Mask = Builder.CreateVectorReverse(Mask, "reverse");
3626 }
3627
3628 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
3629 Value *NewLI;
3630 if (CreateGather) {
3631 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
3632 "wide.masked.gather");
3633 } else if (Mask) {
3634 NewLI =
3635 Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
3636 PoisonValue::get(DataTy), "wide.masked.load");
3637 } else {
3638 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
3639 }
3640  applyMetadata(*cast<Instruction>(NewLI));
3641  if (Reverse)
3642 NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
3643 State.set(this, NewLI);
3644}
3645
3646#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3647void VPWidenLoadRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3648                                    VPSlotTracker &SlotTracker) const {
3649  O << Indent << "WIDEN ";
3650  printAsOperand(O, SlotTracker);
3651  O << " = load ";
3652  printOperands(O, SlotTracker);
3653}
3654#endif
3655
3656/// Use an all-true mask for the reverse rather than the actual mask, as it
3657/// avoids a dependence without affecting the result.
3658static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
3659                                     Value *EVL, const Twine &Name) {
3660 VectorType *ValTy = cast<VectorType>(Operand->getType());
3661 Value *AllTrueMask =
3662 Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
3663 return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
3664 {Operand, AllTrueMask, EVL}, nullptr, Name);
3665}
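// For example, for a <vscale x 4 x i32> operand this emits roughly the
// following (a sketch; value names are illustrative):
//   %rev = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(
//              <vscale x 4 x i32> %v, <vscale x 4 x i1> splat (i1 true),
//              i32 %evl)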
3666
3667void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
3668  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
3669 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
3670 bool CreateGather = !isConsecutive();
3671
3672 auto &Builder = State.Builder;
3673 CallInst *NewLI;
3674 Value *EVL = State.get(getEVL(), VPLane(0));
3675 Value *Addr = State.get(getAddr(), !CreateGather);
3676 Value *Mask = nullptr;
3677 if (VPValue *VPMask = getMask()) {
3678 Mask = State.get(VPMask);
3679 if (isReverse())
3680 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
3681 } else {
3682 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
3683 }
3684
3685 if (CreateGather) {
3686 NewLI =
3687 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
3688 nullptr, "wide.masked.gather");
3689 } else {
3690 NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
3691 {Addr, Mask, EVL}, nullptr, "vp.op.load");
3692 }
3693  NewLI->addParamAttr(
3694      0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
3695  applyMetadata(*NewLI);
3696 Instruction *Res = NewLI;
3697 if (isReverse())
3698 Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
3699 State.set(this, Res);
3700}
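// For example, a consecutive, unmasked, non-reversed load of <vscale x 4 x i32>
// emits roughly (a sketch; value names are illustrative):
//   %vp.op.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(
//                     ptr align 4 %addr, <vscale x 4 x i1> splat (i1 true),
//                     i32 %evl)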
3701
3702InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
3703                                                  VPCostContext &Ctx) const {
3704 if (!Consecutive || IsMasked)
3705 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
3706
3707  // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
3708  // because the EVL recipes use EVL to replace the tail mask, whereas the
3709  // legacy model always accounts for the cost of the mask.
3710  // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
3711  // longer need to compare against the legacy cost model.
3712  Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
3713  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
3714 ->getAddressSpace();
3715 // FIXME: getMaskedMemoryOpCost assumes masked_* intrinsics.
3716 // After migrating to getMemIntrinsicInstrCost, switch this to vp_load.
3717 InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
3718 {Intrinsic::masked_load, Ty, Alignment, AS}, Ctx.CostKind);
3719 if (!Reverse)
3720 return Cost;
3721
3722  return Cost + Ctx.TTI.getShuffleCost(
3723                    TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
3724                    cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
3725}
3726
3727#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3728void VPWidenLoadEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3729                                       VPSlotTracker &SlotTracker) const {
3730  O << Indent << "WIDEN ";
3731  printAsOperand(O, SlotTracker);
3732  O << " = vp.load ";
3733  printOperands(O, SlotTracker);
3734}
3735#endif
3736
3737void VPWidenStoreRecipe::execute(VPTransformState &State) {
3738  VPValue *StoredVPValue = getStoredValue();
3739 bool CreateScatter = !isConsecutive();
3740
3741 auto &Builder = State.Builder;
3742
3743 Value *Mask = nullptr;
3744 if (auto *VPMask = getMask()) {
3745    // Mask reversal is only needed for masks that are not all-ones (a null
3746    // mask means all-ones); the reverse of an all-ones mask is the same mask.
3747 Mask = State.get(VPMask);
3748 if (isReverse())
3749 Mask = Builder.CreateVectorReverse(Mask, "reverse");
3750 }
3751
3752 Value *StoredVal = State.get(StoredVPValue);
3753 if (isReverse()) {
3754 // If we store to reverse consecutive memory locations, then we need
3755 // to reverse the order of elements in the stored value.
3756 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
3757 // We don't want to update the value in the map as it might be used in
3758 // another expression. So don't call resetVectorValue(StoredVal).
3759 }
3760 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
3761 Instruction *NewSI = nullptr;
3762 if (CreateScatter)
3763 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
3764 else if (Mask)
3765 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
3766 else
3767 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
3768 applyMetadata(*NewSI);
3769}
3770
3771#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3772void VPWidenStoreRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3773                                     VPSlotTracker &SlotTracker) const {
3774  O << Indent << "WIDEN store ";
3775  printOperands(O, SlotTracker);
3776}
3777#endif
3778
3779void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
3780  VPValue *StoredValue = getStoredValue();
3781 bool CreateScatter = !isConsecutive();
3782
3783 auto &Builder = State.Builder;
3784
3785 CallInst *NewSI = nullptr;
3786 Value *StoredVal = State.get(StoredValue);
3787 Value *EVL = State.get(getEVL(), VPLane(0));
3788 if (isReverse())
3789 StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
3790 Value *Mask = nullptr;
3791 if (VPValue *VPMask = getMask()) {
3792 Mask = State.get(VPMask);
3793 if (isReverse())
3794 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
3795 } else {
3796 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
3797 }
3798 Value *Addr = State.get(getAddr(), !CreateScatter);
3799 if (CreateScatter) {
3800 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
3801 Intrinsic::vp_scatter,
3802 {StoredVal, Addr, Mask, EVL});
3803 } else {
3804 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
3805 Intrinsic::vp_store,
3806 {StoredVal, Addr, Mask, EVL});
3807 }
3808  NewSI->addParamAttr(
3809      1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
3810  applyMetadata(*NewSI);
3811}
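// For example, a consecutive, unmasked, non-reversed store of
// <vscale x 4 x i32> emits roughly (a sketch; value names are illustrative):
//   call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %val,
//            ptr align 4 %addr, <vscale x 4 x i1> splat (i1 true), i32 %evl)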
3812
3813InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
3814                                                   VPCostContext &Ctx) const {
3815 if (!Consecutive || IsMasked)
3816 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
3817
3818  // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
3819  // because the EVL recipes use EVL to replace the tail mask, whereas the
3820  // legacy model always accounts for the cost of the mask.
3821  // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
3822  // longer need to compare against the legacy cost model.
3823  Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
3824  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
3825 ->getAddressSpace();
3826 // FIXME: getMaskedMemoryOpCost assumes masked_* intrinsics.
3827 // After migrating to getMemIntrinsicInstrCost, switch this to vp_store.
3828 InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
3829 {Intrinsic::masked_store, Ty, Alignment, AS}, Ctx.CostKind);
3830 if (!Reverse)
3831 return Cost;
3832
3833  return Cost + Ctx.TTI.getShuffleCost(
3834                    TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
3835                    cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
3836}
3837
3838#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3839void VPWidenStoreEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3840                                        VPSlotTracker &SlotTracker) const {
3841  O << Indent << "WIDEN vp.store ";
3842  printOperands(O, SlotTracker);
3843}
3844#endif
3845
3846static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
3847                                     VectorType *DstVTy, const DataLayout &DL) {
3848 // Verify that V is a vector type with same number of elements as DstVTy.
3849 auto VF = DstVTy->getElementCount();
3850 auto *SrcVecTy = cast<VectorType>(V->getType());
3851 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
3852 Type *SrcElemTy = SrcVecTy->getElementType();
3853 Type *DstElemTy = DstVTy->getElementType();
3854 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3855 "Vector elements must have same size");
3856
3857 // Do a direct cast if element types are castable.
3858 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3859 return Builder.CreateBitOrPointerCast(V, DstVTy);
3860 }
3861 // V cannot be directly casted to desired vector type.
3862 // May happen when V is a floating point vector but DstVTy is a vector of
3863 // pointers or vice-versa. Handle this using a two-step bitcast using an
3864 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
3865 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3866 "Only one type should be a pointer type");
3867 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3868 "Only one type should be a floating point type");
3869 Type *IntTy =
3870 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3871 auto *VecIntTy = VectorType::get(IntTy, VF);
3872 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3873 return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
3874}
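// For example, casting <4 x double> to <4 x ptr> on a target with 64-bit
// pointers goes through an integer vector of matching element width (a sketch
// of the two-step path above):
//   <4 x double> --bitcast--> <4 x i64> --inttoptr--> <4 x ptr>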
3875
3876/// Return a vector containing interleaved elements from multiple
3877/// smaller input vectors.
3878static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
3879                                const Twine &Name) {
3880 unsigned Factor = Vals.size();
3881 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
3882
3883 VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
3884#ifndef NDEBUG
3885 for (Value *Val : Vals)
3886 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
3887#endif
3888
3889 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
3890 // must use intrinsics to interleave.
3891 if (VecTy->isScalableTy()) {
3892 assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
3893 return Builder.CreateVectorInterleave(Vals, Name);
3894 }
3895
3896 // Fixed length. Start by concatenating all vectors into a wide vector.
3897 Value *WideVec = concatenateVectors(Builder, Vals);
3898
3899 // Interleave the elements into the wide vector.
3900 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
3901 return Builder.CreateShuffleVector(
3902 WideVec, createInterleaveMask(NumElts, Factor), Name);
3903}
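// For example, interleaving two fixed-width <4 x i32> vectors {a0,a1,a2,a3}
// and {b0,b1,b2,b3} (a sketch) concatenates them into an <8 x i32> wide vector
// and shuffles it with mask <0, 4, 1, 5, 2, 6, 3, 7>, yielding
// {a0,b0,a1,b1,a2,b2,a3,b3}.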
3904
3905// Try to vectorize the interleave group that \p Instr belongs to.
3906//
3907// E.g. Translate following interleaved load group (factor = 3):
3908// for (i = 0; i < N; i+=3) {
3909// R = Pic[i]; // Member of index 0
3910// G = Pic[i+1]; // Member of index 1
3911// B = Pic[i+2]; // Member of index 2
3912// ... // do something to R, G, B
3913// }
3914// To:
3915// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
3916// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
3917// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
3918// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
3919//
3920// Or translate following interleaved store group (factor = 3):
3921// for (i = 0; i < N; i+=3) {
3922// ... do something to R, G, B
3923// Pic[i] = R; // Member of index 0
3924// Pic[i+1] = G; // Member of index 1
3925// Pic[i+2] = B; // Member of index 2
3926// }
3927// To:
3928// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
3929// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
3930// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
3931// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
3932// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
3933void VPInterleaveRecipe::execute(VPTransformState &State) {
3934  assert(!State.Lane && "Interleave group being replicated.");
3935 assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
3936 "Masking gaps for scalable vectors is not yet supported.");
3938 Instruction *Instr = Group->getInsertPos();
3939
3940 // Prepare for the vector type of the interleaved load/store.
3941 Type *ScalarTy = getLoadStoreType(Instr);
3942 unsigned InterleaveFactor = Group->getFactor();
3943 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
3944
3945 VPValue *BlockInMask = getMask();
3946 VPValue *Addr = getAddr();
3947 Value *ResAddr = State.get(Addr, VPLane(0));
3948
3949 auto CreateGroupMask = [&BlockInMask, &State,
3950 &InterleaveFactor](Value *MaskForGaps) -> Value * {
3951 if (State.VF.isScalable()) {
3952 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
3953 assert(InterleaveFactor <= 8 &&
3954 "Unsupported deinterleave factor for scalable vectors");
3955 auto *ResBlockInMask = State.get(BlockInMask);
3956 SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
3957 return interleaveVectors(State.Builder, Ops, "interleaved.mask");
3958 }
3959
3960 if (!BlockInMask)
3961 return MaskForGaps;
3962
3963 Value *ResBlockInMask = State.get(BlockInMask);
3964 Value *ShuffledMask = State.Builder.CreateShuffleVector(
3965 ResBlockInMask,
3966 createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
3967 "interleaved.mask");
3968 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
3969 ShuffledMask, MaskForGaps)
3970 : ShuffledMask;
3971 };
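  // For example, with a fixed VF of 4 and interleave factor 3 (a sketch), the
  // block mask <m0,m1,m2,m3> is shuffled with the replicated mask
  // <0,0,0,1,1,1,2,2,2,3,3,3> into <m0,m0,m0,m1,m1,m1,m2,m2,m2,m3,m3,m3>, so
  // the predicate of each original lane covers every member of its tuple.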
3972
3973 const DataLayout &DL = Instr->getDataLayout();
3974 // Vectorize the interleaved load group.
3975 if (isa<LoadInst>(Instr)) {
3976 Value *MaskForGaps = nullptr;
3977 if (needsMaskForGaps()) {
3978 MaskForGaps =
3979 createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
3980 assert(MaskForGaps && "Mask for Gaps is required but it is null");
3981 }
3982
3983 Instruction *NewLoad;
3984 if (BlockInMask || MaskForGaps) {
3985 Value *GroupMask = CreateGroupMask(MaskForGaps);
3986 Value *PoisonVec = PoisonValue::get(VecTy);
3987 NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
3988 Group->getAlign(), GroupMask,
3989 PoisonVec, "wide.masked.vec");
3990 } else
3991 NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
3992 Group->getAlign(), "wide.vec");
3993 applyMetadata(*NewLoad);
3994 // TODO: Also manage existing metadata using VPIRMetadata.
3995 Group->addMetadata(NewLoad);
3996
3997    ArrayRef<VPValue *> VPDefs = definedValues();
3998    if (VecTy->isScalableTy()) {
3999 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4000 // so must use intrinsics to deinterleave.
4001 assert(InterleaveFactor <= 8 &&
4002 "Unsupported deinterleave factor for scalable vectors");
4003 NewLoad = State.Builder.CreateIntrinsic(
4004 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
4005 NewLoad->getType(), NewLoad,
4006 /*FMFSource=*/nullptr, "strided.vec");
4007 }
4008
4009 auto CreateStridedVector = [&InterleaveFactor, &State,
4010 &NewLoad](unsigned Index) -> Value * {
4011 assert(Index < InterleaveFactor && "Illegal group index");
4012 if (State.VF.isScalable())
4013 return State.Builder.CreateExtractValue(NewLoad, Index);
4014
4015 // For fixed length VF, use shuffle to extract the sub-vectors from the
4016 // wide load.
4017 auto StrideMask =
4018 createStrideMask(Index, InterleaveFactor, State.VF.getFixedValue());
4019 return State.Builder.CreateShuffleVector(NewLoad, StrideMask,
4020 "strided.vec");
4021 };
4022
4023 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4024 Instruction *Member = Group->getMember(I);
4025
4026 // Skip the gaps in the group.
4027 if (!Member)
4028 continue;
4029
4030 Value *StridedVec = CreateStridedVector(I);
4031
4032 // If this member has different type, cast the result type.
4033 if (Member->getType() != ScalarTy) {
4034 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
4035 StridedVec =
4036 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
4037 }
4038
4039 if (Group->isReverse())
4040 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
4041
4042 State.set(VPDefs[J], StridedVec);
4043 ++J;
4044 }
4045 return;
4046 }
4047
4048 // The sub vector type for current instruction.
4049 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4050
4051 // Vectorize the interleaved store group.
4052 Value *MaskForGaps =
4053 createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
4054 assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
4055 "Mismatch between NeedsMaskForGaps and MaskForGaps");
4056 ArrayRef<VPValue *> StoredValues = getStoredValues();
4057 // Collect the stored vector from each member.
4058 SmallVector<Value *, 4> StoredVecs;
4059 unsigned StoredIdx = 0;
4060 for (unsigned i = 0; i < InterleaveFactor; i++) {
4061 assert((Group->getMember(i) || MaskForGaps) &&
4062 "Fail to get a member from an interleaved store group");
4063 Instruction *Member = Group->getMember(i);
4064
4065 // Skip the gaps in the group.
4066 if (!Member) {
4067 Value *Undef = PoisonValue::get(SubVT);
4068 StoredVecs.push_back(Undef);
4069 continue;
4070 }
4071
4072 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4073 ++StoredIdx;
4074
4075 if (Group->isReverse())
4076 StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
4077
4078 // If this member has different type, cast it to a unified type.
4079
4080 if (StoredVec->getType() != SubVT)
4081 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4082
4083 StoredVecs.push_back(StoredVec);
4084 }
4085
4086 // Interleave all the smaller vectors into one wider vector.
4087 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4088 Instruction *NewStoreInstr;
4089 if (BlockInMask || MaskForGaps) {
4090 Value *GroupMask = CreateGroupMask(MaskForGaps);
4091 NewStoreInstr = State.Builder.CreateMaskedStore(
4092 IVec, ResAddr, Group->getAlign(), GroupMask);
4093 } else
4094 NewStoreInstr =
4095 State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
4096
4097 applyMetadata(*NewStoreInstr);
4098 // TODO: Also manage existing metadata using VPIRMetadata.
4099 Group->addMetadata(NewStoreInstr);
4100}
4101
4102#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4103void VPInterleaveRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4104                                     VPSlotTracker &SlotTracker) const {
4106 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4107 IG->getInsertPos()->printAsOperand(O, false);
4108  O << ", ";
4109  getAddr()->printAsOperand(O, SlotTracker);
4110 VPValue *Mask = getMask();
4111 if (Mask) {
4112 O << ", ";
4113 Mask->printAsOperand(O, SlotTracker);
4114 }
4115
4116 unsigned OpIdx = 0;
4117 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4118 if (!IG->getMember(i))
4119 continue;
4120 if (getNumStoreOperands() > 0) {
4121 O << "\n" << Indent << " store ";
4123 O << " to index " << i;
4124 } else {
4125 O << "\n" << Indent << " ";
4127 O << " = load from index " << i;
4128 }
4129 ++OpIdx;
4130 }
4131}
4132#endif
4133
4134void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
4135  assert(!State.Lane && "Interleave group being replicated.");
4136 assert(State.VF.isScalable() &&
4137 "Only support scalable VF for EVL tail-folding.");
4138  assert(!needsMaskForGaps() &&
4139         "Masking gaps for scalable vectors is not yet supported.");
4141 Instruction *Instr = Group->getInsertPos();
4142
4143 // Prepare for the vector type of the interleaved load/store.
4144 Type *ScalarTy = getLoadStoreType(Instr);
4145 unsigned InterleaveFactor = Group->getFactor();
4146 assert(InterleaveFactor <= 8 &&
4147 "Unsupported deinterleave/interleave factor for scalable vectors");
4148 ElementCount WideVF = State.VF * InterleaveFactor;
4149 auto *VecTy = VectorType::get(ScalarTy, WideVF);
4150
4151 VPValue *Addr = getAddr();
4152 Value *ResAddr = State.get(Addr, VPLane(0));
4153 Value *EVL = State.get(getEVL(), VPLane(0));
4154 Value *InterleaveEVL = State.Builder.CreateMul(
4155 EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl",
4156 /* NUW= */ true, /* NSW= */ true);
4157 LLVMContext &Ctx = State.Builder.getContext();
4158
4159 Value *GroupMask = nullptr;
4160 if (VPValue *BlockInMask = getMask()) {
4161 SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask));
4162 GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask");
4163 } else {
4164 GroupMask =
4165 State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
4166 }
4167
4168 // Vectorize the interleaved load group.
4169 if (isa<LoadInst>(Instr)) {
4170 CallInst *NewLoad = State.Builder.CreateIntrinsic(
4171 VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr,
4172 "wide.vp.load");
4173 NewLoad->addParamAttr(0,
4174 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4175
4176 applyMetadata(*NewLoad);
4177 // TODO: Also manage existing metadata using VPIRMetadata.
4178 Group->addMetadata(NewLoad);
4179
4180 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4181 // so must use intrinsics to deinterleave.
4182 NewLoad = State.Builder.CreateIntrinsic(
4183 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
4184 NewLoad->getType(), NewLoad,
4185 /*FMFSource=*/nullptr, "strided.vec");
4186
4187 const DataLayout &DL = Instr->getDataLayout();
4188 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4189 Instruction *Member = Group->getMember(I);
4190 // Skip the gaps in the group.
4191 if (!Member)
4192 continue;
4193
4194 Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
4195 // If this member has different type, cast the result type.
4196 if (Member->getType() != ScalarTy) {
4197 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
4198 StridedVec =
4199 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
4200 }
4201
4202 State.set(getVPValue(J), StridedVec);
4203 ++J;
4204 }
4205 return;
4206 } // End for interleaved load.
4207
4208 // The sub vector type for current instruction.
4209 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4210 // Vectorize the interleaved store group.
4211 ArrayRef<VPValue *> StoredValues = getStoredValues();
4212 // Collect the stored vector from each member.
4213 SmallVector<Value *, 4> StoredVecs;
4214 const DataLayout &DL = Instr->getDataLayout();
4215 for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
4216 Instruction *Member = Group->getMember(I);
4217 // Skip the gaps in the group.
4218 if (!Member) {
4219 StoredVecs.push_back(PoisonValue::get(SubVT));
4220 continue;
4221 }
4222
4223 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4224 // If this member has different type, cast it to a unified type.
4225 if (StoredVec->getType() != SubVT)
4226 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4227
4228 StoredVecs.push_back(StoredVec);
4229 ++StoredIdx;
4230 }
4231
4232 // Interleave all the smaller vectors into one wider vector.
4233 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4234 CallInst *NewStore =
4235 State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store,
4236 {IVec, ResAddr, GroupMask, InterleaveEVL});
4237 NewStore->addParamAttr(1,
4238 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4239
4240 applyMetadata(*NewStore);
4241 // TODO: Also manage existing metadata using VPIRMetadata.
4242 Group->addMetadata(NewStore);
4243}
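// For example, a factor-3 load group under EVL tail-folding (a sketch) is read
// with a single vp.load whose effective length is 3 * %evl and whose mask is
// the block mask interleaved three times; the wide result is then split into
// the three member vectors via the matching vector.deinterleave intrinsic.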
4244
4245#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4246void VPInterleaveEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4247                                        VPSlotTracker &SlotTracker) const {
4249 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4250 IG->getInsertPos()->printAsOperand(O, false);
4251  O << ", ";
4252  getAddr()->printAsOperand(O, SlotTracker);
4253  O << ", ";
4254  getEVL()->printAsOperand(O, SlotTracker);
4255 if (VPValue *Mask = getMask()) {
4256 O << ", ";
4257 Mask->printAsOperand(O, SlotTracker);
4258 }
4259
4260 unsigned OpIdx = 0;
4261 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4262 if (!IG->getMember(i))
4263 continue;
4264 if (getNumStoreOperands() > 0) {
4265 O << "\n" << Indent << " vp.store ";
4267 O << " to index " << i;
4268 } else {
4269 O << "\n" << Indent << " ";
4271 O << " = vp.load from index " << i;
4272 }
4273 ++OpIdx;
4274 }
4275}
4276#endif
4277
4278InstructionCost VPInterleaveBase::computeCost(ElementCount VF,
4279                                              VPCostContext &Ctx) const {
4280 Instruction *InsertPos = getInsertPos();
4281 // Find the VPValue index of the interleave group. We need to skip gaps.
4282 unsigned InsertPosIdx = 0;
4283  for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
4284 if (auto *Member = IG->getMember(Idx)) {
4285 if (Member == InsertPos)
4286 break;
4287 InsertPosIdx++;
4288 }
4289 Type *ValTy = Ctx.Types.inferScalarType(
4290 getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
4291 : getStoredValues()[InsertPosIdx]);
4292 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
4293 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
4294 ->getAddressSpace();
4295
4296 unsigned InterleaveFactor = IG->getFactor();
4297 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
4298
4299 // Holds the indices of existing members in the interleaved group.
4300  SmallVector<unsigned, 4> Indices;
4301  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
4302 if (IG->getMember(IF))
4303 Indices.push_back(IF);
4304
4305 // Calculate the cost of the whole interleaved group.
4306 InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
4307 InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
4308 IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
4309
4310 if (!IG->isReverse())
4311 return Cost;
4312
4313 return Cost + IG->getNumMembers() *
4314 Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
4315 VectorTy, VectorTy, {}, Ctx.CostKind,
4316 0);
4317}
4318
4319#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4320void VPCanonicalIVPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4321                                         VPSlotTracker &SlotTracker) const {
4322  O << Indent << "EMIT ";
4323  printAsOperand(O, SlotTracker);
4324  O << " = CANONICAL-INDUCTION ";
4325  printOperands(O, SlotTracker);
4326}
4327#endif
4328
4329bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
4330  return vputils::onlyScalarValuesUsed(this) &&
4331 (!IsScalable || vputils::onlyFirstLaneUsed(this));
4332}
4333
4334#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4335void VPWidenPointerInductionRecipe::printRecipe(
4336    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4337  assert((getNumOperands() == 3 || getNumOperands() == 5) &&
4338         "unexpected number of operands");
4339  O << Indent << "EMIT ";
4340  printAsOperand(O, SlotTracker);
4341 O << " = WIDEN-POINTER-INDUCTION ";
4343 O << ", ";
4345 O << ", ";
4347 if (getNumOperands() == 5) {
4348 O << ", ";
4350 O << ", ";
4352 }
4353}
4354
4355void VPExpandSCEVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4356                                     VPSlotTracker &SlotTracker) const {
4357  O << Indent << "EMIT ";
4358  printAsOperand(O, SlotTracker);
4359 O << " = EXPAND SCEV " << *Expr;
4360}
4361#endif
4362
4363void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
4364  Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
4365 Type *STy = CanonicalIV->getType();
4366 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
4367 ElementCount VF = State.VF;
4368 Value *VStart = VF.isScalar()
4369 ? CanonicalIV
4370 : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
4371 Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this));
4372 if (VF.isVector()) {
4373 VStep = Builder.CreateVectorSplat(VF, VStep);
4374 VStep =
4375 Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
4376 }
4377 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
4378 State.set(this, CanonicalVectorIV);
4379}
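// For example, with a fixed VF of 4 and unroll part 0 (a sketch), this emits
//   %broadcast = splat of the scalar canonical IV
//   %vec.iv    = add %broadcast, <0, 1, 2, 3>
// so lane L of the widened canonical IV holds IV + L.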
4380
4381#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4382void VPWidenCanonicalIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4383                                           VPSlotTracker &SlotTracker) const {
4384  O << Indent << "EMIT ";
4385  printAsOperand(O, SlotTracker);
4386  O << " = WIDEN-CANONICAL-INDUCTION ";
4387  printOperands(O, SlotTracker);
4388}
4389#endif
4390
4391void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
4392  auto &Builder = State.Builder;
4393 // Create a vector from the initial value.
4394 auto *VectorInit = getStartValue()->getLiveInIRValue();
4395
4396 Type *VecTy = State.VF.isScalar()
4397 ? VectorInit->getType()
4398 : VectorType::get(VectorInit->getType(), State.VF);
4399
4400 BasicBlock *VectorPH =
4401 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4402 if (State.VF.isVector()) {
4403 auto *IdxTy = Builder.getInt32Ty();
4404 auto *One = ConstantInt::get(IdxTy, 1);
4405 IRBuilder<>::InsertPointGuard Guard(Builder);
4406 Builder.SetInsertPoint(VectorPH->getTerminator());
4407 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
4408 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4409 VectorInit = Builder.CreateInsertElement(
4410 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
4411 }
4412
4413 // Create a phi node for the new recurrence.
4414 PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
4415 Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
4416 Phi->addIncoming(VectorInit, VectorPH);
4417 State.set(this, Phi);
4418}
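// For example, with a fixed VF of 4 (a sketch), the recurrence phi is seeded
// in the preheader with
//   %vector.recur.init = insertelement <4 x T> poison, T %init, i32 3
// i.e. the scalar start value is placed in the last lane, ready to be spliced
// with the values of the first vector iteration.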
4419
4420InstructionCost
4421VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
4422                                             VPCostContext &Ctx) const {
4423 if (VF.isScalar())
4424 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
4425
4426 return 0;
4427}
4428
4429#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4430void VPFirstOrderRecurrencePHIRecipe::printRecipe(
4431    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4432  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
4433  printAsOperand(O, SlotTracker);
4434  O << " = phi ";
4435  printOperands(O, SlotTracker);
4436}
4437#endif
4438
4439void VPReductionPHIRecipe::execute(VPTransformState &State) {
4440  // Reductions do not have to start at zero. They can start with
4441 // any loop invariant values.
4442 VPValue *StartVPV = getStartValue();
4443
4444 // In order to support recurrences we need to be able to vectorize Phi nodes.
4445 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4446 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4447 // this value when we vectorize all of the instructions that use the PHI.
4448 BasicBlock *VectorPH =
4449 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4450 bool ScalarPHI = State.VF.isScalar() || IsInLoop;
4451 Value *StartV = State.get(StartVPV, ScalarPHI);
4452 Type *VecTy = StartV->getType();
4453
4454 BasicBlock *HeaderBB = State.CFG.PrevBB;
4455 assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
4456 "recipe must be in the vector loop header");
4457 auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
4458 Phi->insertBefore(HeaderBB->getFirstInsertionPt());
4459 State.set(this, Phi, IsInLoop);
4460
4461 Phi->addIncoming(StartV, VectorPH);
4462}
4463
4464#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4465void VPReductionPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4466                                       VPSlotTracker &SlotTracker) const {
4467 O << Indent << "WIDEN-REDUCTION-PHI ";
4468
4469  printAsOperand(O, SlotTracker);
4470  O << " = phi ";
4471  printOperands(O, SlotTracker);
4472 if (VFScaleFactor != 1)
4473 O << " (VF scaled by 1/" << VFScaleFactor << ")";
4474}
4475#endif
4476
4477void VPWidenPHIRecipe::execute(VPTransformState &State) {
4478  Value *Op0 = State.get(getOperand(0));
4479 Type *VecTy = Op0->getType();
4480 Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
4481 State.set(this, VecPhi);
4482}
4483
4484#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4485void VPWidenPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4486                                   VPSlotTracker &SlotTracker) const {
4487 O << Indent << "WIDEN-PHI ";
4488
4489  printAsOperand(O, SlotTracker);
4490  O << " = phi ";
4491  printPhiOperands(O, SlotTracker);
4492}
4493#endif
4494
4495// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
4496// remove VPActiveLaneMaskPHIRecipe.
4497void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
4498  BasicBlock *VectorPH =
4499 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4500 Value *StartMask = State.get(getOperand(0));
4501 PHINode *Phi =
4502 State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
4503 Phi->addIncoming(StartMask, VectorPH);
4504 State.set(this, Phi);
4505}
4506
4507#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4508void VPActiveLaneMaskPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4509                                            VPSlotTracker &SlotTracker) const {
4510 O << Indent << "ACTIVE-LANE-MASK-PHI ";
4511
4512  printAsOperand(O, SlotTracker);
4513  O << " = phi ";
4514  printOperands(O, SlotTracker);
4515}
4516#endif
4517
4518#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4519void VPEVLBasedIVPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4520                                        VPSlotTracker &SlotTracker) const {
4521 O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
4522
4523  printAsOperand(O, SlotTracker);
4524  O << " = phi ";
4525  printOperands(O, SlotTracker);
4526}
4527#endif
static SDValue Widen(SelectionDAG *CurDAG, SDValue N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static MCDisassembler::DecodeStatus addOperand(MCInst &Inst, const MCOperand &Opnd)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file provides a LoopVectorizationPlanner class.
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static bool isOrdered(const Instruction *I)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file contains the declarations of different VPlan-related auxiliary helpers.
static Instruction * createReverseEVL(IRBuilderBase &Builder, Value *Operand, Value *EVL, const Twine &Name)
Use all-true mask for reverse rather than actual mask, as it avoids a dependence w/o affecting the re...
static Value * interleaveVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vals, const Twine &Name)
Return a vector containing interleaved elements from multiple smaller input vectors.
static InstructionCost getCostForIntrinsics(Intrinsic::ID ID, ArrayRef< const VPValue * > Operands, const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx)
Compute the cost for the intrinsic ID with Operands, produced by R.
static Value * createBitOrPointerCast(IRBuilderBase &Builder, Value *V, VectorType *DstVTy, const DataLayout &DL)
static Type * getGEPIndexTy(bool IsScalable, bool IsReverse, bool IsUnitStride, unsigned CurrentPart, IRBuilderBase &Builder)
SmallVector< Value *, 2 > VectorParts
static bool isUsedByLoadStoreAddress(const VPUser *V)
Returns true if V is used as part of the address of another load or store.
static void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPLane &Lane, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
static Constant * getSignedIntOrFpConstant(Type *Ty, int64_t C)
A helper function that returns an integer or floating-point constant with value C.
static BranchInst * createCondBranch(Value *Cond, VPBasicBlock *VPBB, VPTransformState &State)
Create a conditional branch using Cond branching to the successors of VPBB.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
This file contains the declarations of the Vectorization Plan base classes:
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
Conditional or Unconditional Branch instruction.
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
static LLVM_ABI StringRef getPredicateName(Predicate P)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:163
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
A debug info location.
Definition DebugLoc.h:124
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
LLVM_ABI void print(raw_ostream &O) const
Print fast-math flags to O.
Definition Operator.cpp:272
void setAllowContract(bool B=true)
Definition FMF.h:90
bool noSignedZeros() const
Definition FMF.h:67
bool noInfs() const
Definition FMF.h:66
void setAllowReciprocal(bool B=true)
Definition FMF.h:87
bool allowReciprocal() const
Definition FMF.h:68
void setNoSignedZeros(bool B=true)
Definition FMF.h:84
bool allowReassoc() const
Flag queries.
Definition FMF.h:64
bool approxFunc() const
Definition FMF.h:70
void setNoNaNs(bool B=true)
Definition FMF.h:78
void setAllowReassoc(bool B=true)
Flag setters.
Definition FMF.h:75
bool noNaNs() const
Definition FMF.h:65
void setApproxFunc(bool B=true)
Definition FMF.h:93
void setNoInfs(bool B=true)
Definition FMF.h:81
bool allowContract() const
Definition FMF.h:69
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
bool willReturn() const
Determine if the function will return.
Definition Function.h:661
bool doesNotThrow() const
Determine if the function cannot unwind.
Definition Function.h:594
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:214
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2579
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2633
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2567
LLVM_ABI Value * CreateVectorSplice(Value *V1, Value *V2, int64_t Imm, const Twine &Name="")
Return a vector splice intrinsic if using scalable vectors, otherwise return a shufflevector.
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2626
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2645
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:562
Value * CreatePtrAdd(Value *Ptr, Value *Offset, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:2039
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition IRBuilder.h:567
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2336
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
Value * CreateLogicalAnd(Value *Cond1, Value *Cond2, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.h:1725
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:522
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2466
Value * CreateNot(Value *V, const Twine &Name="")
Definition IRBuilder.h:1808
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2332
Value * CreateCountTrailingZeroElems(Type *ResTy, Value *Mask, bool ZeroIsPoison=true, const Twine &Name="")
Create a call to llvm.experimental_cttz_elts.
Definition IRBuilder.h:1134
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1420
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2085
LLVMContext & getContext() const
Definition IRBuilder.h:203
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1403
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition IRBuilder.h:507
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1708
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2442
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1437
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isUnaryOp() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
The group of interleaved loads/stores sharing the same stride and close to each other.
uint32_t getFactor() const
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
bool isReverse() const
InstTy * getInsertPos() const
void addMetadata(InstTy *NewInst) const
Add metadata (e.g.
Align getAlign() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
static bool isSignedRecurrenceKind(RecurKind Kind)
Returns true if recurrece kind is a signed redux kind.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isFindLastIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
This class represents the LLVM 'select' instruction.
This class provides computation of slot numbers for LLVM Assembly writing.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
@ TCC_Free
Expected to fold away in lowering.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Reverse
Reverse the order of the vector.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:280
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
value_op_iterator value_op_end()
Definition User.h:313
void setOperand(unsigned i, Value *Val)
Definition User.h:237
Value * getOperand(unsigned i) const
Definition User.h:232
value_op_iterator value_op_begin()
Definition User.h:310
void execute(VPTransformState &State) override
Generate the active lane mask phi of the vector loop.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3983
RecipeListTy & getRecipeList()
Returns a reference to the list of recipes.
Definition VPlan.h:4036
iterator end()
Definition VPlan.h:4020
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition VPlan.h:4049
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenMemoryRecipe.
VPValue * getIncomingValue(unsigned Idx) const
Return incoming value number Idx.
Definition VPlan.h:2509
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2504
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:80
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:203
VPlan * getPlan()
Definition VPlan.cpp:161
void printAsOperand(raw_ostream &OS, bool PrintType=false) const
Definition VPlan.h:348
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:197
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPBranchOnMaskRecipe.
void execute(VPTransformState &State) override
Generate the extraction of the appropriate bit from the block mask and the conditional branch.
VPlan-based builder utility analogous to IRBuilder.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
This class augments a recipe with a set of VPValues defined by the recipe.
Definition VPlanValue.h:310
LLVM_ABI_FOR_TEST void dump() const
Dump the VPDef to stderr (for debugging).
Definition VPlan.cpp:122
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:432
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:427
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:405
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:417
unsigned getVPDefID() const
Definition VPlanValue.h:437
VPValue * getStepValue() const
Definition VPlan.h:3783
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getStartValue() const
Definition VPlan.h:3782
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void decompose()
Insert the recipes of the expression back into the VPlan, directly before the current recipe.
bool isSingleScalar() const
Returns true if the result of this VPExpressionRecipe is a single-scalar.
bool mayHaveSideEffects() const
Returns true if this expression contains recipes that may have side effects.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Compute the cost of this recipe either using a recipe's specialized implementation or using the legac...
bool mayReadOrWriteMemory() const
Returns true if this expression contains recipes that may read from or write to memory.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this header phi recipe.
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2084
void execute(VPTransformState &State) override
Produce a vectorized histogram operation.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPHistogramRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getMask() const
Return the mask operand if one was provided, or a null pointer if all lanes should be executed uncond...
Definition VPlan.h:1778
Class to record and manage LLVM IR flags.
Definition VPlan.h:609
FastMathFlagsTy FMFs
Definition VPlan.h:680
bool flagsValidForOpcode(unsigned Opcode) const
Returns true if the set flags are valid for Opcode.
WrapFlagsTy WrapFlags
Definition VPlan.h:674
CmpInst::Predicate CmpPredicate
Definition VPlan.h:673
void printFlags(raw_ostream &O) const
GEPNoWrapFlags GEPFlags
Definition VPlan.h:678
bool hasFastMathFlags() const
Returns true if the recipe has fast-math flags.
Definition VPlan.h:858
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
TruncFlagsTy TruncFlags
Definition VPlan.h:675
CmpInst::Predicate getPredicate() const
Definition VPlan.h:835
ExactFlagsTy ExactFlags
Definition VPlan.h:677
bool hasNoSignedWrap() const
Definition VPlan.h:884
void intersectFlags(const VPIRFlags &Other)
Only keep flags also present in Other.
GEPNoWrapFlags getGEPNoWrapFlags() const
Definition VPlan.h:850
bool hasPredicate() const
Returns true if the recipe has a comparison predicate.
Definition VPlan.h:853
DisjointFlagsTy DisjointFlags
Definition VPlan.h:676
unsigned AllFlags
Definition VPlan.h:682
bool hasNoUnsignedWrap() const
Definition VPlan.h:873
FCmpFlagsTy FCmpFlags
Definition VPlan.h:681
NonNegFlagsTy NonNegFlags
Definition VPlan.h:679
void applyFlags(Instruction &I) const
Apply the IR flags to I.
Definition VPlan.h:795
Instruction & getInstruction() const
Definition VPlan.h:1432
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void extractLastLaneOfFirstOperand(VPBuilder &Builder)
Update the recipes first operand to the last lane of the operand using Builder.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPIRInstruction.
VPIRInstruction(Instruction &I)
VPIRInstruction::create() should be used to create VPIRInstructions, as subclasses may need to be cre...
Definition VPlan.h:1407
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetada object with MD, keeping only metadata nodes that are common to both.
VPIRMetadata()=default
void applyMetadata(Instruction &I) const
Add all metadata to I.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the instruction.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPInstruction.
bool doesGeneratePerAllLanes() const
Returns true if this VPInstruction generates scalar values for all lanes.
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1115
@ ComputeAnyOfResult
Compute the final result of a AnyOf reduction with select(cmp(),x,y), where one of (x,...
Definition VPlan.h:1069
@ WideIVStep
Scale the first operand (vector step) by the second operand (scalar-step).
Definition VPlan.h:1105
@ ResumeForEpilogue
Explicit user for the resume phi of the canonical induction in the main VPlan, used by the epilogue v...
Definition VPlan.h:1118
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1066
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1109
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1061
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1058
@ VScale
Returns the value for vscale.
Definition VPlan.h:1120
@ CanonicalIVIncrementForPart
Definition VPlan.h:1051
bool hasResult() const
Definition VPlan.h:1186
bool opcodeMayReadOrWriteFromMemory() const
Returns true if the underlying opcode may read from or write to memory.
LLVM_DUMP_METHOD void dump() const
Print the VPInstruction to dbgs() (for debugging).
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the VPInstruction to O.
StringRef getName() const
Returns the symbolic name assigned to the VPInstruction.
Definition VPlan.h:1226
unsigned getOpcode() const
Definition VPlan.h:1170
VPInstruction(unsigned Opcode, ArrayRef< VPValue * > Operands, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
bool isVectorToScalar() const
Returns true if this VPInstruction produces a scalar value from a vector, e.g.
bool isSingleScalar() const
Returns true if this VPInstruction's operands are single scalars and the result is also a single scal...
void execute(VPTransformState &State) override
Generate the instruction.
bool usesFirstPartOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first part of operand Op.
bool needsMaskForGaps() const
Return true if the access needs a mask because of the gaps.
Definition VPlan.h:2620
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this recipe.
Instruction * getInsertPos() const
Definition VPlan.h:2624
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2622
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2614
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2643
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:2608
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:2718
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:2731
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:2681
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
In what follows, the term "input IR" refers to code that is fed into the vectorizer, whereas the term ...
static VPLane getLastLaneForVF(const ElementCount &VF)
static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset)
static VPLane getFirstLane()
void execute(VPTransformState &State) override
Generate the reduction in the loop.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPPartialReductionRecipe.
unsigned getOpcode() const
Get the binary op's opcode.
Definition VPlan.h:2890
virtual const VPRecipeBase * getAsRecipe() const =0
Return a VPRecipeBase* to the current object.
virtual unsigned getNumIncoming() const
Returns the number of incoming values, which is also the number of incoming blocks.
Definition VPlan.h:1322
void removeIncomingValueFor(VPBlockBase *IncomingBlock) const
Removes the incoming value for IncomingBlock, which must be a predecessor.
const VPBasicBlock * getIncomingBlock(unsigned Idx) const
Returns the incoming block with index Idx.
Definition VPlan.h:4127
detail::zippy< llvm::detail::zip_first, VPUser::const_operand_range, const_incoming_blocks_range > incoming_values_and_blocks() const
Returns an iterator range over pairs of incoming values and corresponding incoming blocks.
Definition VPlan.h:1347
VPValue * getIncomingValue(unsigned Idx) const
Returns the incoming VPValue with index Idx.
Definition VPlan.h:1314
void printPhiOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the recipe.
void execute(VPTransformState &State) override
Generates phi nodes for live-outs (from a replicate region) as needed to retain SSA form.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:386
bool mayReadFromMemory() const
Returns true if the recipe may read from memory.
bool mayHaveSideEffects() const
Returns true if the recipe may have side-effects.
virtual void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const =0
Each concrete VPRecipe prints itself, without printing common information, like debug info or metadat...
VPRegionBlock * getRegion()
Definition VPlan.h:4288
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override final
Print the recipe, delegating to printRecipe().
bool isPhi() const
Returns true for PHI-like recipes.
bool mayWriteToMemory() const
Returns true if the recipe may write to memory.
virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const
Compute the cost of this recipe either using a recipe's specialized implementation or using the legac...
VPBasicBlock * getParent()
Definition VPlan.h:407
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:478
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this recipe, taking into account if the cost computation should be skipped and the...
bool isScalarCast() const
Return true if the recipe is a scalar cast.
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
void moveAfter(VPRecipeBase *MovePos)
Unlink this recipe from its current VPBasicBlock and insert it into the VPBasicBlock that MovePos liv...
VPRecipeBase(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:397
void execute(VPTransformState &State) override
Generate the reduction in the loop.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:2930
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool isConditional() const
Return true if the in-loop reduction is conditional.
Definition VPlan.h:2822
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of VPReductionRecipe.
VPValue * getVecOp() const
The VPValue of the vector value to be reduced.
Definition VPlan.h:2826
VPValue * getCondOp() const
The VPValue of the condition for the block.
Definition VPlan.h:2828
RecurKind getRecurrenceKind() const
Return the recurrence kind for the in-loop reduction.
Definition VPlan.h:2818
VPValue * getChainOp() const
The VPValue of the scalar Chain being accumulated.
Definition VPlan.h:2824
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the reduction in the loop.
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4171
bool isReplicator() const
An indicator of whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4239
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2952
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool isSingleScalar() const
Definition VPlan.h:2993
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPReplicateRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getOpcode() const
Definition VPlan.h:3022
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
VPValue * getStepValue() const
Definition VPlan.h:3849
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the scalarized versions of the phi node as needed by their users.
VPSingleDefRecipe is a base class for recipes modeling a sequence of one or more output IR instructions that define ...
Definition VPlan.h:530
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:595
LLVM_ABI_FOR_TEST LLVM_DUMP_METHOD void dump() const
Print this VPSingleDefRecipe to dbgs() (for debugging).
VPSingleDefRecipe(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:532
This class can be used to assign names to VPValues.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
Helper to access the operand that contains the unroll part for this recipe after unrolling.
Definition VPlan.h:970
VPValue * getUnrollPartOperand(const VPUser &U) const
Return the VPValue operand containing the unroll part or null if there is no such operand.
unsigned getUnrollPart(const VPUser &U) const
Return the unroll part.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:207
void printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the operands to O.
Definition VPlan.cpp:1420
operand_range operands()
Definition VPlanValue.h:275
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:251
unsigned getNumOperands() const
Definition VPlanValue.h:245
operand_iterator op_begin()
Definition VPlanValue.h:271
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:246
virtual bool usesFirstLaneOnly(const VPValue *Op) const
Returns true if the VPUser only uses the first lane of operand Op.
Definition VPlanValue.h:290
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:48
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1374
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:131
friend class VPExpressionRecipe
Definition VPlanValue.h:53
void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const
Definition VPlan.cpp:1416
bool hasMoreThanOneUniqueUser() const
Returns true if the value has more than one unique user.
Definition VPlanValue.h:140
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition VPlanValue.h:183
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:85
VPValue(const unsigned char SC, Value *UV=nullptr, VPDef *Def=nullptr)
Definition VPlan.cpp:94
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1377
user_iterator user_begin()
Definition VPlanValue.h:130
unsigned getNumUsers() const
Definition VPlanValue.h:113
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition VPlanValue.h:178
user_range users()
Definition VPlanValue.h:134
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getSourceElementType() const
Definition VPlan.h:1991
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
operand_range args()
Definition VPlan.h:1734
Function * getCalledScalarFunction() const
Definition VPlan.h:1730
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCallRecipe.
void execute(VPTransformState &State) override
Produce a widened version of the call instruction.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate a canonical vector induction variable of the vector loop, with start = {<Part*VF,...
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getResultType() const
Returns the result type of the cast.
Definition VPlan.h:1584
void execute(VPTransformState &State) override
Produce widened copies of the cast.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCastRecipe.
void execute(VPTransformState &State) override
Generate the gep nodes.
Type * getSourceElementType() const
Definition VPlan.h:1886
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2147
TruncInst * getTruncInst()
Returns the first defined value as a TruncInst if it is one, or nullptr otherwise.
Definition VPlan.h:2254
Type * getScalarType() const
Returns the scalar type of the induction.
Definition VPlan.h:2263
bool isCanonical() const
Returns true if the induction is canonical, i.e.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Intrinsic::ID getVectorIntrinsicID() const
Return the ID of the intrinsic.
Definition VPlan.h:1666
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
StringRef getIntrinsicName() const
Return the name of the intrinsic as a string.
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the VPUser only uses the first lane of operand Op.
Type * getResultType() const
Return the scalar return type of the intrinsic.
Definition VPlan.h:1669
void execute(VPTransformState &State) override
Produce a widened version of the vector intrinsic.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this vector intrinsic.
bool IsMasked
Whether the memory access is masked.
Definition VPlan.h:3277
bool Reverse
Whether the consecutive accessed addresses are in reverse order.
Definition VPlan.h:3274
bool isConsecutive() const
Return whether the loaded-from / stored-to addresses are consecutive.
Definition VPlan.h:3317
Instruction & Ingredient
Definition VPlan.h:3265
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenMemoryRecipe.
bool Consecutive
Whether the accessed addresses are consecutive.
Definition VPlan.h:3271
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3331
Align Alignment
Alignment information for this memory access.
Definition VPlan.h:3268
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:3324
bool isReverse() const
Return whether the consecutive loaded/stored addresses are in reverse order.
Definition VPlan.h:3321
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool onlyScalarsGenerated(bool IsScalable)
Returns true if only scalar values will be generated.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1497
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
void execute(VPTransformState &State) override
Produce a widened instruction using the opcode and operands of the recipe, processing State....
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getUF() const
Definition VPlan.h:4526
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1011
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:390
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1099
void mutateType(Type *Ty)
Mutate the type of this Value to be of the specified type.
Definition Value.h:838
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
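As a rough sketch of how VectorType::get is used with an ElementCount (the function name vectorTypeSketch and the chosen counts are illustrative, not taken from this file):

  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/Support/TypeSize.h"
  using namespace llvm;

  void vectorTypeSketch(LLVMContext &Ctx) {
    Type *I32 = Type::getInt32Ty(Ctx);
    // The same factory handles fixed and scalable element counts.
    auto *V4I32 = VectorType::get(I32, ElementCount::getFixed(4));       // <4 x i32>
    auto *NxV2I32 = VectorType::get(I32, ElementCount::getScalable(2));  // <vscale x 2 x i32>
    (void)V4I32;
    (void)NxV2I32;
  }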
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
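A minimal sketch of the ElementCount queries listed above (getFixedValue, isScalable, getKnownMinValue, divideCoefficientBy); the function name and concrete counts are illustrative assumptions:

  #include "llvm/Support/TypeSize.h"
  using namespace llvm;

  void elementCountSketch() {
    ElementCount Fixed = ElementCount::getFixed(8);
    ElementCount Scalable = ElementCount::getScalable(4);
    (void)Fixed.getFixedValue();        // 8; only valid when !isScalable()
    (void)Scalable.isScalable();        // true
    (void)Scalable.getKnownMinValue();  // 4; the runtime count is 4 * vscale
    // Halve the coefficient: <vscale x 4 x ...> becomes <vscale x 2 x ...>.
    ElementCount Half = Scalable.divideCoefficientBy(2);
    (void)Half;
  }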
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
iterator erase(iterator where)
Definition ilist.h:204
pointer remove(iterator &IT)
Definition ilist.h:188
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor)
Returns the corresponding llvm.vector.deinterleaveN intrinsic for factor N.
LLVM_ABI StringRef getBaseName(ID id)
Return the LLVM name for an intrinsic, without encoded types for overloading, such as "llvm....
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
bool match(Val *V, const Pattern &P)
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool onlyFirstPartUsed(const VPValue *Def)
Returns true if only the first part of Def is used.
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
const SCEV * getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
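A short usage sketch for drop_begin (the helper name sumTail and the data are illustrative):

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  using namespace llvm;

  int sumTail(const SmallVector<int, 8> &Vals) {
    int Sum = 0;
    // Skip the first element; drop_begin(Vals, N) would skip the first N.
    for (int V : drop_begin(Vals))
      Sum += V;
    return Sum;
  }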
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:829
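A small usage sketch for zip, which stops at the end of the shorter range (names and data are illustrative):

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  void printPairs(const SmallVector<int, 4> &Keys,
                  const SmallVector<int, 4> &Vals) {
    // Iterates over corresponding elements of both containers.
    for (auto [K, V] : zip(Keys, Vals))
      errs() << K << " -> " << V << "\n";
  }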
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI Value * createFindLastIVReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind, Value *Start, Value *Sentinel)
Create a reduction of the given vector Src for a reduction of the kind RecurKind::FindLastIV.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
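A usage sketch for the range-based all_of wrapper (the predicate and function name are illustrative):

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  using namespace llvm;

  bool allStrictlyPositive(const SmallVector<int, 8> &Vals) {
    // Equivalent to std::all_of(Vals.begin(), Vals.end(), ...).
    return all_of(Vals, [](int V) { return V > 0; });
  }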
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
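A usage sketch for enumerate over a single range (names are illustrative):

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  void printIndexed(const SmallVector<int, 4> &Vals) {
    // Each element is paired with its 0-based index.
    for (auto [Idx, V] : enumerate(Vals))
      errs() << Idx << ": " << V << "\n";
  }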
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2231
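A usage sketch for interleaveComma, which prints the elements separated by ", " with no trailing separator (names are illustrative):

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  void printCommaSeparated(const SmallVector<StringRef, 4> &Names,
                           raw_ostream &OS) {
    interleaveComma(Names, OS, [&](StringRef N) { OS << N; });
  }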
auto cast_or_null(const Y &Val)
Definition Casting.h:714
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI Value * createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, Value *Right)
Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
LLVM_ABI Constant * createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
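A sketch of createStrideMask; the resulting mask is suitable for IRBuilder::CreateShuffleVector, and the concrete parameters below are illustrative:

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Analysis/VectorUtils.h"
  using namespace llvm;

  void strideMaskSketch() {
    // Start = 0, Stride = 2, VF = 4 yields <0, 2, 4, 6>, i.e. it selects
    // member 0 of an interleave group with factor 2 across 4 lanes.
    SmallVector<int, 16> Mask = createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
    (void)Mask;
  }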
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
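A sketch of createReplicatedMask with illustrative parameters:

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Analysis/VectorUtils.h"
  using namespace llvm;

  void replicatedMaskSketch() {
    // ReplicationFactor = 3, VF = 4 yields <0,0,0,1,1,1,2,2,2,3,3,3>:
    // each source lane is repeated three times.
    SmallVector<int, 16> Mask =
        createReplicatedMask(/*ReplicationFactor=*/3, /*VF=*/4);
    (void)Mask;
  }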
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Type * toVectorizedTy(Type *Ty, ElementCount EC)
A helper for converting to vectorized types.
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant C can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1718
cl::opt< unsigned > ForceTargetInstructionCost
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
@ Other
Any other memory.
Definition ModRef.h:68
bool canVectorizeTy(Type *Ty)
Returns true if Ty is a valid vector element type, void, or an unpacked literal struct where all elem...
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
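A sketch of createInterleaveMask with illustrative parameters:

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Analysis/VectorUtils.h"
  using namespace llvm;

  void interleaveMaskSketch() {
    // VF = 4, NumVecs = 2 yields <0, 4, 1, 5, 2, 6, 3, 7>, which interleaves
    // the lanes of two concatenated 4-element vectors.
    SmallVector<int, 16> Mask = createInterleaveMask(/*VF=*/4, /*NumVecs=*/2);
    (void)Mask;
  }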
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Mul
Product of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
DWARFExpression::Operation Op
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
LLVM_ABI Value * createOrderedReduction(IRBuilderBase &B, RecurKind RdxKind, Value *Src, Value *Start)
Create an ordered reduction intrinsic using the given recurrence kind RdxKind.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
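A usage sketch for seq; the end value is excluded, as in a half-open range (the function name is illustrative):

  #include "llvm/ADT/Sequence.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  void seqSketch() {
    // Prints: 0 1 2 3
    for (unsigned I : seq<unsigned>(0, 4))
      errs() << I << " ";
  }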
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
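A small sketch of toVectorTy; the header below (llvm/IR/VectorTypeUtils.h) and the function name are assumptions made for illustration:

  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Type.h"
  #include "llvm/IR/VectorTypeUtils.h"
  using namespace llvm;

  void toVectorTySketch(LLVMContext &Ctx) {
    Type *I32 = Type::getInt32Ty(Ctx);
    // Widens the scalar type to a vector type with the given element count.
    Type *V8I32 = toVectorTy(I32, ElementCount::getFixed(8)); // <8 x i32>
    (void)V8I32;
  }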
LLVM_ABI Value * createAnyOfReduction(IRBuilderBase &B, Value *Src, Value *InitVal, PHINode *OrigPhi)
Create a reduction of the given vector Src for a reduction of kind RecurKind::AnyOf.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Struct to hold various analysis needed for cost computations.
void execute(VPTransformState &State) override
Generate the phi nodes.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this first-order recurrence phi recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
An overlay for VPIRInstructions wrapping PHI nodes, enabling convenient use of cast/dyn_cast/isa and exec...
Definition VPlan.h:1470
PHINode & getIRPhi()
Definition VPlan.h:1478
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void execute(VPTransformState &State) override
Generate the instruction.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
A pure-virtual common base class for recipes defining a single VPValue and using IR flags.
Definition VPlan.h:923
InstructionCost getCostForRecipeWithOpcode(unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const
Compute the cost for this recipe for VF, using Opcode and Ctx.
VPRecipeWithIRFlags(const unsigned char SC, ArrayRef< VPValue * > Operands, const VPIRFlags &Flags, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:924
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
Value * get(const VPValue *Def, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def if IsScalar is false, otherwise return the gen...
Definition VPlan.cpp:263
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
ElementCount VF
The chosen Vectorization Factor of the loop being vectorized.
void execute(VPTransformState &State) override
Generate the wide load or gather.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenLoadEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3408
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate a wide load or gather.
VPValue * getCond() const
Definition VPlan.h:1821
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenSelectRecipe.
void execute(VPTransformState &State) override
Produce a widened version of the select instruction.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition VPlan.h:3491
void execute(VPTransformState &State) override
Generate the wide store or scatter.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenStoreEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3494
void execute(VPTransformState &State) override
Generate a wide store or scatter.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition VPlan.h:3454