LLVM 23.0.0git
VPlanRecipes.cpp
Go to the documentation of this file.
1//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains implementations for different VPlan recipes.
11///
12//===----------------------------------------------------------------------===//
13
15#include "VPlan.h"
16#include "VPlanAnalysis.h"
17#include "VPlanHelpers.h"
18#include "VPlanPatternMatch.h"
19#include "VPlanUtils.h"
20#include "llvm/ADT/STLExtras.h"
22#include "llvm/ADT/Twine.h"
27#include "llvm/IR/BasicBlock.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/Instruction.h"
31#include "llvm/IR/Intrinsics.h"
32#include "llvm/IR/Type.h"
33#include "llvm/IR/Value.h"
36#include "llvm/Support/Debug.h"
40#include <cassert>
41
42using namespace llvm;
43using namespace llvm::VPlanPatternMatch;
44
46
47#define LV_NAME "loop-vectorize"
48#define DEBUG_TYPE LV_NAME
49
51 switch (getVPRecipeID()) {
52 case VPExpressionSC:
53 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
54 case VPInstructionSC: {
55 auto *VPI = cast<VPInstruction>(this);
56 // Loads read from memory but don't write to memory.
57 if (VPI->getOpcode() == Instruction::Load)
58 return false;
59 return VPI->opcodeMayReadOrWriteFromMemory();
60 }
61 case VPInterleaveEVLSC:
62 case VPInterleaveSC:
63 return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
64 case VPWidenStoreEVLSC:
65 case VPWidenStoreSC:
66 return true;
67 case VPReplicateSC:
68 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
69 ->mayWriteToMemory();
70 case VPWidenCallSC:
71 return !cast<VPWidenCallRecipe>(this)
72 ->getCalledScalarFunction()
73 ->onlyReadsMemory();
74 case VPWidenIntrinsicSC:
75 return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
76 case VPActiveLaneMaskPHISC:
77 case VPCurrentIterationPHISC:
78 case VPBranchOnMaskSC:
79 case VPDerivedIVSC:
80 case VPFirstOrderRecurrencePHISC:
81 case VPReductionPHISC:
82 case VPScalarIVStepsSC:
83 case VPPredInstPHISC:
84 return false;
85 case VPBlendSC:
86 case VPReductionEVLSC:
87 case VPReductionSC:
88 case VPVectorPointerSC:
89 case VPWidenCanonicalIVSC:
90 case VPWidenCastSC:
91 case VPWidenGEPSC:
92 case VPWidenIntOrFpInductionSC:
93 case VPWidenLoadEVLSC:
94 case VPWidenLoadSC:
95 case VPWidenPHISC:
96 case VPWidenPointerInductionSC:
97 case VPWidenSC: {
98 const Instruction *I =
99 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
100 (void)I;
101 assert((!I || !I->mayWriteToMemory()) &&
102 "underlying instruction may write to memory");
103 return false;
104 }
105 default:
106 return true;
107 }
108}
109
111 switch (getVPRecipeID()) {
112 case VPExpressionSC:
113 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
114 case VPInstructionSC:
115 return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
116 case VPWidenLoadEVLSC:
117 case VPWidenLoadSC:
118 return true;
119 case VPReplicateSC:
120 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
121 ->mayReadFromMemory();
122 case VPWidenCallSC:
123 return !cast<VPWidenCallRecipe>(this)
124 ->getCalledScalarFunction()
125 ->onlyWritesMemory();
126 case VPWidenIntrinsicSC:
127 return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
128 case VPBranchOnMaskSC:
129 case VPDerivedIVSC:
130 case VPCurrentIterationPHISC:
131 case VPFirstOrderRecurrencePHISC:
132 case VPReductionPHISC:
133 case VPPredInstPHISC:
134 case VPScalarIVStepsSC:
135 case VPWidenStoreEVLSC:
136 case VPWidenStoreSC:
137 return false;
138 case VPBlendSC:
139 case VPReductionEVLSC:
140 case VPReductionSC:
141 case VPVectorPointerSC:
142 case VPWidenCanonicalIVSC:
143 case VPWidenCastSC:
144 case VPWidenGEPSC:
145 case VPWidenIntOrFpInductionSC:
146 case VPWidenPHISC:
147 case VPWidenPointerInductionSC:
148 case VPWidenSC: {
149 const Instruction *I =
150 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
151 (void)I;
152 assert((!I || !I->mayReadFromMemory()) &&
153 "underlying instruction may read from memory");
154 return false;
155 }
156 default:
157 // FIXME: Return false if the recipe represents an interleaved store.
158 return true;
159 }
160}
161
163 switch (getVPRecipeID()) {
164 case VPExpressionSC:
165 return cast<VPExpressionRecipe>(this)->mayHaveSideEffects();
166 case VPActiveLaneMaskPHISC:
167 case VPDerivedIVSC:
168 case VPCurrentIterationPHISC:
169 case VPFirstOrderRecurrencePHISC:
170 case VPReductionPHISC:
171 case VPPredInstPHISC:
172 case VPVectorEndPointerSC:
173 return false;
174 case VPInstructionSC: {
175 auto *VPI = cast<VPInstruction>(this);
176 return mayWriteToMemory() ||
177 VPI->getOpcode() == VPInstruction::BranchOnCount ||
178 VPI->getOpcode() == VPInstruction::BranchOnCond ||
179 VPI->getOpcode() == VPInstruction::BranchOnTwoConds;
180 }
181 case VPWidenCallSC: {
182 Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
183 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
184 }
185 case VPWidenIntrinsicSC:
186 return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
187 case VPBlendSC:
188 case VPReductionEVLSC:
189 case VPReductionSC:
190 case VPScalarIVStepsSC:
191 case VPVectorPointerSC:
192 case VPWidenCanonicalIVSC:
193 case VPWidenCastSC:
194 case VPWidenGEPSC:
195 case VPWidenIntOrFpInductionSC:
196 case VPWidenPHISC:
197 case VPWidenPointerInductionSC:
198 case VPWidenSC: {
199 const Instruction *I =
200 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
201 (void)I;
202 assert((!I || !I->mayHaveSideEffects()) &&
203 "underlying instruction has side-effects");
204 return false;
205 }
206 case VPInterleaveEVLSC:
207 case VPInterleaveSC:
208 return mayWriteToMemory();
209 case VPWidenLoadEVLSC:
210 case VPWidenLoadSC:
211 case VPWidenStoreEVLSC:
212 case VPWidenStoreSC:
213 assert(
214 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
216 "mayHaveSideffects result for ingredient differs from this "
217 "implementation");
218 return mayWriteToMemory();
219 case VPReplicateSC: {
220 auto *R = cast<VPReplicateRecipe>(this);
221 return R->getUnderlyingInstr()->mayHaveSideEffects();
222 }
223 default:
224 return true;
225 }
226}
227
229 assert(!Parent && "Recipe already in some VPBasicBlock");
230 assert(InsertPos->getParent() &&
231 "Insertion position not in any VPBasicBlock");
232 InsertPos->getParent()->insert(this, InsertPos->getIterator());
233}
234
235void VPRecipeBase::insertBefore(VPBasicBlock &BB,
237 assert(!Parent && "Recipe already in some VPBasicBlock");
238 assert(I == BB.end() || I->getParent() == &BB);
239 BB.insert(this, I);
240}
241
243 assert(!Parent && "Recipe already in some VPBasicBlock");
244 assert(InsertPos->getParent() &&
245 "Insertion position not in any VPBasicBlock");
246 InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
247}
248
250 assert(getParent() && "Recipe not in any VPBasicBlock");
252 Parent = nullptr;
253}
254
256 assert(getParent() && "Recipe not in any VPBasicBlock");
258}
259
262 insertAfter(InsertPos);
263}
264
270
272 // Get the underlying instruction for the recipe, if there is one. It is used
273 // to
274 // * decide if cost computation should be skipped for this recipe,
275 // * apply forced target instruction cost.
276 Instruction *UI = nullptr;
277 if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
278 UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
279 else if (auto *IG = dyn_cast<VPInterleaveBase>(this))
280 UI = IG->getInsertPos();
281 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
282 UI = &WidenMem->getIngredient();
283
284 InstructionCost RecipeCost;
285 if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
286 RecipeCost = 0;
287 } else {
288 RecipeCost = computeCost(VF, Ctx);
289 if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
290 RecipeCost.isValid()) {
291 if (UI)
293 else
294 RecipeCost = InstructionCost(0);
295 }
296 }
297
298 LLVM_DEBUG({
299 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
300 dump();
301 });
302 return RecipeCost;
303}
304
306 VPCostContext &Ctx) const {
307 llvm_unreachable("subclasses should implement computeCost");
308}
309
311 return (getVPRecipeID() >= VPFirstPHISC && getVPRecipeID() <= VPLastPHISC) ||
313}
314
316 auto *VPI = dyn_cast<VPInstruction>(this);
317 return VPI && Instruction::isCast(VPI->getOpcode());
318}
319
321 assert(OpType == Other.OpType && "OpType must match");
322 switch (OpType) {
323 case OperationType::OverflowingBinOp:
324 WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
325 WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
326 break;
327 case OperationType::Trunc:
328 TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
329 TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
330 break;
331 case OperationType::DisjointOp:
332 DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
333 break;
334 case OperationType::PossiblyExactOp:
335 ExactFlags.IsExact &= Other.ExactFlags.IsExact;
336 break;
337 case OperationType::GEPOp:
338 GEPFlagsStorage &= Other.GEPFlagsStorage;
339 break;
340 case OperationType::FPMathOp:
341 case OperationType::FCmp:
342 assert((OpType != OperationType::FCmp ||
343 FCmpFlags.CmpPredStorage == Other.FCmpFlags.CmpPredStorage) &&
344 "Cannot drop CmpPredicate");
345 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
346 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
347 break;
348 case OperationType::NonNegOp:
349 NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
350 break;
351 case OperationType::Cmp:
352 assert(CmpPredStorage == Other.CmpPredStorage &&
353 "Cannot drop CmpPredicate");
354 break;
355 case OperationType::ReductionOp:
356 assert(ReductionFlags.Kind == Other.ReductionFlags.Kind &&
357 "Cannot change RecurKind");
358 assert(ReductionFlags.IsOrdered == Other.ReductionFlags.IsOrdered &&
359 "Cannot change IsOrdered");
360 assert(ReductionFlags.IsInLoop == Other.ReductionFlags.IsInLoop &&
361 "Cannot change IsInLoop");
362 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
363 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
364 break;
365 case OperationType::Other:
366 break;
367 }
368}
369
371 assert((OpType == OperationType::FPMathOp || OpType == OperationType::FCmp ||
372 OpType == OperationType::ReductionOp ||
373 OpType == OperationType::Other) &&
374 "recipe doesn't have fast math flags");
375 if (OpType == OperationType::Other)
376 return FastMathFlags();
377 const FastMathFlagsTy &F = getFMFsRef();
378 FastMathFlags Res;
379 Res.setAllowReassoc(F.AllowReassoc);
380 Res.setNoNaNs(F.NoNaNs);
381 Res.setNoInfs(F.NoInfs);
382 Res.setNoSignedZeros(F.NoSignedZeros);
383 Res.setAllowReciprocal(F.AllowReciprocal);
384 Res.setAllowContract(F.AllowContract);
385 Res.setApproxFunc(F.ApproxFunc);
386 return Res;
387}
388
389#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
391
392void VPRecipeBase::print(raw_ostream &O, const Twine &Indent,
393 VPSlotTracker &SlotTracker) const {
394 printRecipe(O, Indent, SlotTracker);
395 if (auto DL = getDebugLoc()) {
396 O << ", !dbg ";
397 DL.print(O);
398 }
399
400 if (auto *Metadata = dyn_cast<VPIRMetadata>(this))
402}
403#endif
404
405template <unsigned PartOpIdx>
406VPValue *
408 if (U.getNumOperands() == PartOpIdx + 1)
409 return U.getOperand(PartOpIdx);
410 return nullptr;
411}
412
413template <unsigned PartOpIdx>
415 if (auto *UnrollPartOp = getUnrollPartOperand(U))
416 return cast<VPConstantInt>(UnrollPartOp)->getZExtValue();
417 return 0;
418}
419
420namespace llvm {
421template class VPUnrollPartAccessor<1>;
422template class VPUnrollPartAccessor<2>;
423template class VPUnrollPartAccessor<3>;
424}
425
427 const VPIRFlags &Flags, const VPIRMetadata &MD,
428 DebugLoc DL, const Twine &Name)
429 : VPRecipeWithIRFlags(VPRecipeBase::VPInstructionSC, Operands, Flags, DL),
430 VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) {
432 "Set flags not supported for the provided opcode");
434 "Opcode requires specific flags to be set");
438 "number of operands does not match opcode");
439}
440
441/// For call VPInstructions, return the operand index of the called function.
442/// The function is either the last operand (for unmasked calls) or the
443/// second-to-last operand (for masked calls).
444static unsigned getCalledFnOperandIndex(const VPInstruction &VPI) {
445 assert(VPI.getOpcode() == Instruction::Call && "must be a call");
446 unsigned NumOps = VPI.getNumOperands();
447 auto *LastOp = dyn_cast<VPIRValue>(VPI.getOperand(NumOps - 1));
448 if (LastOp && isa<Function>(LastOp->getValue()))
449 return NumOps - 1;
450 assert(
451 isa<Function>(cast<VPIRValue>(VPI.getOperand(NumOps - 2))->getValue()) &&
452 "expected function operand");
453 return NumOps - 2;
454}
455
456/// For call VPInstructions, return the called function.
458 unsigned Idx = getCalledFnOperandIndex(VPI);
459 return cast<Function>(cast<VPIRValue>(VPI.getOperand(Idx))->getValue());
460}
461
463 if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
464 return 1;
465
466 if (Instruction::isBinaryOp(Opcode))
467 return 2;
468
469 switch (Opcode) {
472 return 0;
473 case Instruction::Alloca:
474 case Instruction::ExtractValue:
475 case Instruction::Freeze:
476 case Instruction::Load:
489 return 1;
490 case Instruction::ICmp:
491 case Instruction::FCmp:
492 case Instruction::ExtractElement:
493 case Instruction::Store:
503 return 2;
504 case Instruction::Select:
507 return 3;
508 case Instruction::Call:
509 return getCalledFnOperandIndex(*this) + 1;
510 case Instruction::GetElementPtr:
511 case Instruction::PHI:
512 case Instruction::Switch:
524 // Cannot determine the number of operands from the opcode.
525 return -1u;
526 }
527 llvm_unreachable("all cases should be handled above");
528}
529
533
534bool VPInstruction::canGenerateScalarForFirstLane() const {
536 return true;
538 return true;
539 switch (Opcode) {
540 case Instruction::Freeze:
541 case Instruction::ICmp:
542 case Instruction::PHI:
543 case Instruction::Select:
553 return true;
554 default:
555 return false;
556 }
557}
558
559Value *VPInstruction::generate(VPTransformState &State) {
560 IRBuilderBase &Builder = State.Builder;
561
563 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
564 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
565 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
566 auto *Res =
567 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
568 if (auto *I = dyn_cast<Instruction>(Res))
569 applyFlags(*I);
570 return Res;
571 }
572
573 switch (getOpcode()) {
574 case VPInstruction::Not: {
575 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
576 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
577 return Builder.CreateNot(A, Name);
578 }
579 case Instruction::ExtractElement: {
580 assert(State.VF.isVector() && "Only extract elements from vectors");
581 if (auto *Idx = dyn_cast<VPConstantInt>(getOperand(1)))
582 return State.get(getOperand(0), VPLane(Idx->getZExtValue()));
583 Value *Vec = State.get(getOperand(0));
584 Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
585 return Builder.CreateExtractElement(Vec, Idx, Name);
586 }
587 case Instruction::Freeze: {
589 return Builder.CreateFreeze(Op, Name);
590 }
591 case Instruction::FCmp:
592 case Instruction::ICmp: {
593 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
594 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
595 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
596 return Builder.CreateCmp(getPredicate(), A, B, Name);
597 }
598 case Instruction::PHI: {
599 llvm_unreachable("should be handled by VPPhi::execute");
600 }
601 case Instruction::Select: {
602 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
603 Value *Cond =
604 State.get(getOperand(0),
605 OnlyFirstLaneUsed || vputils::isSingleScalar(getOperand(0)));
606 Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
607 Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
608 return Builder.CreateSelectFMF(Cond, Op1, Op2, getFastMathFlags(), Name);
609 }
611 // Get first lane of vector induction variable.
612 Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
613 // Get the original loop tripcount.
614 Value *ScalarTC = State.get(getOperand(1), VPLane(0));
615
616 // If this part of the active lane mask is scalar, generate the CMP directly
617 // to avoid unnecessary extracts.
618 if (State.VF.isScalar())
619 return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
620 Name);
621
622 ElementCount EC = State.VF.multiplyCoefficientBy(
623 cast<VPConstantInt>(getOperand(2))->getZExtValue());
624 auto *PredTy = VectorType::get(Builder.getInt1Ty(), EC);
625 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
626 {PredTy, ScalarTC->getType()},
627 {VIVElem0, ScalarTC}, nullptr, Name);
628 }
630 // Generate code to combine the previous and current values in vector v3.
631 //
632 // vector.ph:
633 // v_init = vector(..., ..., ..., a[-1])
634 // br vector.body
635 //
636 // vector.body
637 // i = phi [0, vector.ph], [i+4, vector.body]
638 // v1 = phi [v_init, vector.ph], [v2, vector.body]
639 // v2 = a[i, i+1, i+2, i+3];
640 // v3 = vector(v1(3), v2(0, 1, 2))
641
642 auto *V1 = State.get(getOperand(0));
643 if (!V1->getType()->isVectorTy())
644 return V1;
645 Value *V2 = State.get(getOperand(1));
646 return Builder.CreateVectorSpliceRight(V1, V2, 1, Name);
647 }
649 Value *ScalarTC = State.get(getOperand(0), VPLane(0));
650 Value *VFxUF = State.get(getOperand(1), VPLane(0));
651 Value *Sub = Builder.CreateSub(ScalarTC, VFxUF);
652 Value *Cmp =
653 Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, VFxUF);
655 return Builder.CreateSelect(Cmp, Sub, Zero);
656 }
658 // TODO: Restructure this code with an explicit remainder loop, vsetvli can
659 // be outside of the main loop.
660 Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
661 // Compute EVL
662 assert(AVL->getType()->isIntegerTy() &&
663 "Requested vector length should be an integer.");
664
665 assert(State.VF.isScalable() && "Expected scalable vector factor.");
666 Value *VFArg = Builder.getInt32(State.VF.getKnownMinValue());
667
668 Value *EVL = Builder.CreateIntrinsic(
669 Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
670 {AVL, VFArg, Builder.getTrue()});
671 return EVL;
672 }
674 Value *Cond = State.get(getOperand(0), VPLane(0));
675 // Replace the temporary unreachable terminator with a new conditional
676 // branch, hooking it up to backward destination for latch blocks now, and
677 // to forward destination(s) later when they are created.
678 // Second successor may be backwards - iff it is already in VPBB2IRBB.
679 VPBasicBlock *SecondVPSucc =
680 cast<VPBasicBlock>(getParent()->getSuccessors()[1]);
681 BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc);
682 BasicBlock *IRBB = State.CFG.VPBB2IRBB[getParent()];
683 auto *Br = Builder.CreateCondBr(Cond, IRBB, SecondIRSucc);
684 // First successor is always forward, reset it to nullptr.
685 Br->setSuccessor(0, nullptr);
687 applyMetadata(*Br);
688 return Br;
689 }
691 return Builder.CreateVectorSplat(
692 State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
693 }
695 // For struct types, we need to build a new 'wide' struct type, where each
696 // element is widened, i.e., we create a struct of vectors.
697 auto *StructTy =
699 Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF));
700 for (const auto &[LaneIndex, Op] : enumerate(operands())) {
701 for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
702 FieldIndex++) {
703 Value *ScalarValue =
704 Builder.CreateExtractValue(State.get(Op, true), FieldIndex);
705 Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex);
706 VectorValue =
707 Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex);
708 Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex);
709 }
710 }
711 return Res;
712 }
714 auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0));
715 auto NumOfElements = ElementCount::getFixed(getNumOperands());
716 Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements));
717 for (const auto &[Idx, Op] : enumerate(operands()))
718 Res = Builder.CreateInsertElement(Res, State.get(Op, true),
719 Builder.getInt32(Idx));
720 return Res;
721 }
723 if (State.VF.isScalar())
724 return State.get(getOperand(0), true);
725 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
727 // If this start vector is scaled then it should produce a vector with fewer
728 // elements than the VF.
729 ElementCount VF = State.VF.divideCoefficientBy(
730 cast<VPConstantInt>(getOperand(2))->getZExtValue());
731 auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true));
732 return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true),
733 Builder.getInt32(0));
734 }
736 RecurKind RK = getRecurKind();
737 bool IsOrdered = isReductionOrdered();
738 bool IsInLoop = isReductionInLoop();
740 "FindIV should use min/max reduction kinds");
741
742 // The recipe may have multiple operands to be reduced together.
743 unsigned NumOperandsToReduce = getNumOperands();
744 VectorParts RdxParts(NumOperandsToReduce);
745 for (unsigned Part = 0; Part < NumOperandsToReduce; ++Part)
746 RdxParts[Part] = State.get(getOperand(Part), IsInLoop);
747
748 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
750
751 // Reduce multiple operands into one.
752 Value *ReducedPartRdx = RdxParts[0];
753 if (IsOrdered) {
754 ReducedPartRdx = RdxParts[NumOperandsToReduce - 1];
755 } else {
756 // Floating-point operations should have some FMF to enable the reduction.
757 for (unsigned Part = 1; Part < NumOperandsToReduce; ++Part) {
758 Value *RdxPart = RdxParts[Part];
760 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
761 else {
762 // For sub-recurrences, each part's reduction variable is already
763 // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1)
765 RK == RecurKind::Sub
766 ? Instruction::Add
768 ReducedPartRdx =
769 Builder.CreateBinOp(Opcode, RdxPart, ReducedPartRdx, "bin.rdx");
770 }
771 }
772 }
773
774 // Create the reduction after the loop. Note that inloop reductions create
775 // the target reduction in the loop using a Reduction recipe.
776 if (State.VF.isVector() && !IsInLoop) {
777 // TODO: Support in-order reductions based on the recurrence descriptor.
778 // All ops in the reduction inherit fast-math-flags from the recurrence
779 // descriptor.
780 ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK);
781 }
782
783 return ReducedPartRdx;
784 }
787 unsigned Offset =
789 Value *Res;
790 if (State.VF.isVector()) {
791 assert(Offset <= State.VF.getKnownMinValue() &&
792 "invalid offset to extract from");
793 // Extract lane VF - Offset from the operand.
794 Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
795 } else {
796 // TODO: Remove ExtractLastLane for scalar VFs.
797 assert(Offset <= 1 && "invalid offset to extract from");
798 Res = State.get(getOperand(0));
799 }
801 Res->setName(Name);
802 return Res;
803 }
805 Value *A = State.get(getOperand(0));
806 Value *B = State.get(getOperand(1));
807 return Builder.CreateLogicalAnd(A, B, Name);
808 }
810 Value *A = State.get(getOperand(0));
811 Value *B = State.get(getOperand(1));
812 return Builder.CreateLogicalOr(A, B, Name);
813 }
815 assert((State.VF.isScalar() || vputils::onlyFirstLaneUsed(this)) &&
816 "can only generate first lane for PtrAdd");
817 Value *Ptr = State.get(getOperand(0), VPLane(0));
818 Value *Addend = State.get(getOperand(1), VPLane(0));
819 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
820 }
822 Value *Ptr =
824 Value *Addend = State.get(getOperand(1));
825 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
826 }
828 Value *Res = Builder.CreateFreeze(State.get(getOperand(0)));
829 for (VPValue *Op : drop_begin(operands()))
830 Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op)));
831 return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
832 }
834 assert(getNumOperands() != 2 && "ExtractLane from single source should be "
835 "simplified to ExtractElement.");
836 Value *LaneToExtract = State.get(getOperand(0), true);
837 Type *IdxTy = State.TypeAnalysis.inferScalarType(getOperand(0));
838 Value *Res = nullptr;
839 Value *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
840
841 for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
842 Value *VectorStart =
843 Builder.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Idx - 1));
844 Value *VectorIdx = Idx == 1
845 ? LaneToExtract
846 : Builder.CreateSub(LaneToExtract, VectorStart);
847 Value *Ext = State.VF.isScalar()
848 ? State.get(getOperand(Idx))
849 : Builder.CreateExtractElement(
850 State.get(getOperand(Idx)), VectorIdx);
851 if (Res) {
852 Value *Cmp = Builder.CreateICmpUGE(LaneToExtract, VectorStart);
853 Res = Builder.CreateSelect(Cmp, Ext, Res);
854 } else {
855 Res = Ext;
856 }
857 }
858 return Res;
859 }
861 Type *Ty = State.TypeAnalysis.inferScalarType(this);
862 if (getNumOperands() == 1) {
863 Value *Mask = State.get(getOperand(0));
864 return Builder.CreateCountTrailingZeroElems(Ty, Mask,
865 /*ZeroIsPoison=*/false, Name);
866 }
867 // If there are multiple operands, create a chain of selects to pick the
868 // first operand with an active lane and add the number of lanes of the
869 // preceding operands.
870 Value *RuntimeVF = getRuntimeVF(Builder, Ty, State.VF);
871 unsigned LastOpIdx = getNumOperands() - 1;
872 Value *Res = nullptr;
873 for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
874 Value *TrailingZeros =
875 State.VF.isScalar()
876 ? Builder.CreateZExt(
877 Builder.CreateICmpEQ(State.get(getOperand(Idx)),
878 Builder.getFalse()),
879 Ty)
881 Ty, State.get(getOperand(Idx)),
882 /*ZeroIsPoison=*/false, Name);
883 Value *Current = Builder.CreateAdd(
884 Builder.CreateMul(RuntimeVF, ConstantInt::get(Ty, Idx)),
885 TrailingZeros);
886 if (Res) {
887 Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF);
888 Res = Builder.CreateSelect(Cmp, Current, Res);
889 } else {
890 Res = Current;
891 }
892 }
893
894 return Res;
895 }
897 return State.get(getOperand(0), true);
899 return Builder.CreateVectorReverse(State.get(getOperand(0)), "reverse");
901 Value *Result = State.get(getOperand(0), /*IsScalar=*/true);
902 for (unsigned Idx = 1; Idx < getNumOperands(); Idx += 2) {
903 Value *Data = State.get(getOperand(Idx));
904 Value *Mask = State.get(getOperand(Idx + 1));
905 Type *VTy = Data->getType();
906
907 if (State.VF.isScalar())
908 Result = Builder.CreateSelect(Mask, Data, Result);
909 else
910 Result = Builder.CreateIntrinsic(
911 Intrinsic::experimental_vector_extract_last_active, {VTy},
912 {Data, Mask, Result});
913 }
914
915 return Result;
916 }
917 default:
918 llvm_unreachable("Unsupported opcode for instruction");
919 }
920}
921
923 unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
924 Type *ScalarTy = Ctx.Types.inferScalarType(this);
925 Type *ResultTy = VF.isVector() ? toVectorTy(ScalarTy, VF) : ScalarTy;
926 switch (Opcode) {
927 case Instruction::FNeg:
928 return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, Ctx.CostKind);
929 case Instruction::UDiv:
930 case Instruction::SDiv:
931 case Instruction::SRem:
932 case Instruction::URem:
933 case Instruction::Add:
934 case Instruction::FAdd:
935 case Instruction::Sub:
936 case Instruction::FSub:
937 case Instruction::Mul:
938 case Instruction::FMul:
939 case Instruction::FDiv:
940 case Instruction::FRem:
941 case Instruction::Shl:
942 case Instruction::LShr:
943 case Instruction::AShr:
944 case Instruction::And:
945 case Instruction::Or:
946 case Instruction::Xor: {
947 // Certain instructions can be cheaper if they have a constant second
948 // operand. One example of this are shifts on x86.
949 VPValue *RHS = getOperand(1);
950 TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(RHS);
951
952 if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
955
958 if (CtxI)
959 Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
960 return Ctx.TTI.getArithmeticInstrCost(
961 Opcode, ResultTy, Ctx.CostKind,
962 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
963 RHSInfo, Operands, CtxI, &Ctx.TLI);
964 }
965 case Instruction::Freeze:
966 // This opcode is unknown. Assume that it is the same as 'mul'.
967 return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, ResultTy,
968 Ctx.CostKind);
969 case Instruction::ExtractValue:
970 return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
971 Ctx.CostKind);
972 case Instruction::ICmp:
973 case Instruction::FCmp: {
974 Type *ScalarOpTy = Ctx.Types.inferScalarType(getOperand(0));
975 Type *OpTy = VF.isVector() ? toVectorTy(ScalarOpTy, VF) : ScalarOpTy;
977 return Ctx.TTI.getCmpSelInstrCost(
978 Opcode, OpTy, CmpInst::makeCmpResultType(OpTy), getPredicate(),
979 Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None},
980 {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
981 }
982 case Instruction::BitCast: {
983 Type *ScalarTy = Ctx.Types.inferScalarType(this);
984 if (ScalarTy->isPointerTy())
985 return 0;
986 [[fallthrough]];
987 }
988 case Instruction::SExt:
989 case Instruction::ZExt:
990 case Instruction::FPToUI:
991 case Instruction::FPToSI:
992 case Instruction::FPExt:
993 case Instruction::PtrToInt:
994 case Instruction::PtrToAddr:
995 case Instruction::IntToPtr:
996 case Instruction::SIToFP:
997 case Instruction::UIToFP:
998 case Instruction::Trunc:
999 case Instruction::FPTrunc:
1000 case Instruction::AddrSpaceCast: {
1001 // Computes the CastContextHint from a recipe that may access memory.
1002 auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
1003 if (isa<VPInterleaveBase>(R))
1005 if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R)) {
1006 // Only compute CCH for memory operations, matching the legacy model
1007 // which only considers loads/stores for cast context hints.
1008 auto *UI = cast<Instruction>(ReplicateRecipe->getUnderlyingValue());
1009 if (!isa<LoadInst, StoreInst>(UI))
1011 return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
1013 }
1014 const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
1015 if (WidenMemoryRecipe == nullptr)
1017 if (VF.isScalar())
1019 if (!WidenMemoryRecipe->isConsecutive())
1021 if (WidenMemoryRecipe->isMasked())
1024 };
1025
1026 VPValue *Operand = getOperand(0);
1028 bool IsReverse = false;
1029 // For Trunc/FPTrunc, get the context from the only user.
1030 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
1031 auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * {
1032 if (R->getNumUsers() == 0 || R->hasMoreThanOneUniqueUser())
1033 return nullptr;
1034 return dyn_cast<VPRecipeBase>(*R->user_begin());
1035 };
1036 if (VPRecipeBase *Recipe = GetOnlyUser(this)) {
1037 if (match(Recipe,
1041 Recipe = GetOnlyUser(cast<VPSingleDefRecipe>(Recipe));
1042 IsReverse = true;
1043 }
1044 if (Recipe)
1045 CCH = ComputeCCH(Recipe);
1046 }
1047 }
1048 // For Z/Sext, get the context from the operand.
1049 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
1050 Opcode == Instruction::FPExt) {
1051 if (auto *Recipe = Operand->getDefiningRecipe()) {
1052 VPValue *ReverseOp;
1053 if (match(Recipe,
1054 m_CombineOr(m_Reverse(m_VPValue(ReverseOp)),
1056 m_VPValue(ReverseOp))))) {
1057 Recipe = ReverseOp->getDefiningRecipe();
1058 IsReverse = true;
1059 }
1060 if (Recipe)
1061 CCH = ComputeCCH(Recipe);
1062 }
1063 }
1064 if (IsReverse && CCH != TTI::CastContextHint::None)
1066
1067 auto *ScalarSrcTy = Ctx.Types.inferScalarType(Operand);
1068 Type *SrcTy = VF.isVector() ? toVectorTy(ScalarSrcTy, VF) : ScalarSrcTy;
1069 // Arm TTI will use the underlying instruction to determine the cost.
1070 return Ctx.TTI.getCastInstrCost(
1071 Opcode, ResultTy, SrcTy, CCH, Ctx.CostKind,
1073 }
1074 case Instruction::Select: {
1076 bool IsScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
1077 Type *ScalarTy = Ctx.Types.inferScalarType(this);
1078
1079 VPValue *Op0, *Op1;
1080 bool IsLogicalAnd =
1081 match(this, m_c_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1)));
1082 bool IsLogicalOr =
1083 match(this, m_c_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
1084 // Also match the inverted forms:
1085 // select x, false, y --> !x & y (still AND)
1086 // select x, y, true --> !x | y (still OR)
1087 IsLogicalAnd |=
1088 match(this, m_Select(m_VPValue(Op0), m_False(), m_VPValue(Op1)));
1089 IsLogicalOr |=
1090 match(this, m_Select(m_VPValue(Op0), m_VPValue(Op1), m_True()));
1091
1092 if (!IsScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
1093 (IsLogicalAnd || IsLogicalOr)) {
1094 // select x, y, false --> x & y
1095 // select x, true, y --> x | y
1096 const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0);
1097 const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1);
1098
1100 if (SI && all_of(operands(),
1101 [](VPValue *Op) { return Op->getUnderlyingValue(); }))
1102 append_range(Operands, SI->operands());
1103 return Ctx.TTI.getArithmeticInstrCost(
1104 IsLogicalOr ? Instruction::Or : Instruction::And, ResultTy,
1105 Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
1106 }
1107
1108 Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
1109 if (!IsScalarCond && VF.isVector())
1110 CondTy = VectorType::get(CondTy, VF);
1111
1112 llvm::CmpPredicate Pred;
1113 if (!match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue())))
1114 if (auto *CondIRV = dyn_cast<VPIRValue>(getOperand(0)))
1115 if (auto *Cmp = dyn_cast<CmpInst>(CondIRV->getValue()))
1116 Pred = Cmp->getPredicate();
1117 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1118 return Ctx.TTI.getCmpSelInstrCost(
1119 Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
1120 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
1121 }
1122 }
1123 llvm_unreachable("called for unsupported opcode");
1124}
1125
1127                                        VPCostContext &Ctx) const {
  // NOTE(review): this is a lossy extraction — the embedded original line
  // numbering jumps (1126, 1128, 1135, 1138, 1140, ... are missing), so
  // several statements and case labels of this function are not visible here.
  // Verify against upstream before acting on this block.
  // Presumably VPInstruction::computeCost: returns the TTI-based cost of this
  // VPInstruction for vectorization factor VF — TODO confirm (signature line
  // was dropped by the extraction).
1129   if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) {
1130     // TODO: Compute cost for VPInstructions without underlying values once
1131     // the legacy cost model has been retired.
1132     return 0;
1133   }
1134
1136          "Should only generate a vector value or single scalar, not scalars "
1137          "for all lanes.");
1139       getOpcode(),
1141   }
1142
1143   switch (getOpcode()) {
1144   case Instruction::Select: {
  // Cost the select as cmp+sel; operands stay scalar when only the first
  // lane of this recipe is used, otherwise they are widened to VF.
1146     match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue()));
1147     auto *CondTy = Ctx.Types.inferScalarType(getOperand(0));
1148     auto *VecTy = Ctx.Types.inferScalarType(getOperand(1));
1149     if (!vputils::onlyFirstLaneUsed(this)) {
1150       CondTy = toVectorTy(CondTy, VF);
1151       VecTy = toVectorTy(VecTy, VF);
1152     }
1153     return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred,
1154                                       Ctx.CostKind);
1155   }
1156   case Instruction::ExtractElement:
1158     if (VF.isScalar()) {
1159       // ExtractLane with VF=1 takes care of handling extracting across multiple
1160       // parts.
1161       return 0;
1162     }
1164     // Add on the cost of extracting the element.
1165     auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1166     return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
1167                                       Ctx.CostKind);
1168   }
1169   case VPInstruction::AnyOf: {
  // AnyOf is modeled as an Or-reduction over an i1 vector.
1170     auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1171     return Ctx.TTI.getArithmeticReductionCost(
1172         Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
1173   }
1175     Type *Ty = Ctx.Types.inferScalarType(this);
1176     Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
1177     if (VF.isScalar())
1178       return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
1180                                         CmpInst::ICMP_EQ, Ctx.CostKind);
1181     // Calculate the cost of determining the lane index.
1182     auto *PredTy = toVectorTy(ScalarTy, VF);
1183     IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
1184                                   {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
1185     return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1186   }
1188     Type *Ty = Ctx.Types.inferScalarType(this);
1189     Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
1190     if (VF.isScalar())
1191       return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
1194     // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB.
1195     auto *PredTy = toVectorTy(ScalarTy, VF);
1196     IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
1197                                   {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
1199     // Add cost of NOT operation on the predicate.
1201         Instruction::Xor, PredTy, Ctx.CostKind,
1202         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1203         {TargetTransformInfo::OK_UniformConstantValue,
1204          TargetTransformInfo::OP_None});
1205     // Add cost of SUB operation on the index.
1206     Cost += Ctx.TTI.getArithmeticInstrCost(Instruction::Sub, Ty, Ctx.CostKind);
1207     return Cost;
1208   }
  // Cost of extracting the last active element via the dedicated intrinsic.
1210     Type *ScalarTy = Ctx.Types.inferScalarType(this);
1211     Type *VecTy = toVectorTy(ScalarTy, VF);
1212     Type *MaskTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
1213     IntrinsicCostAttributes ICA(
1214         Intrinsic::experimental_vector_extract_last_active, ScalarTy,
1215         {VecTy, MaskTy, ScalarTy});
1216     return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind);
1217   }
1219     assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
1220     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1221     return Ctx.TTI.getShuffleCost(
1223         cast<VectorType>(VectorTy), {}, Ctx.CostKind, -1);
1224   }
  // ActiveLaneMask: lane-mask width is VF scaled by the constant multiplier
  // held in operand 2.
1226     Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
1227     unsigned Multiplier = cast<VPConstantInt>(getOperand(2))->getZExtValue();
1228     Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
1229     IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
1230                                   {ArgTy, ArgTy});
1231     return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1232   }
1234     Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0));
1235     Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
1236     Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
1237     IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
1238                                   I32Ty, {Arg0Ty, I32Ty, I1Ty});
1239     return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1240   }
1242     assert(VF.isVector() && "Reverse operation must be vector type");
1243     Type *EltTy = Ctx.Types.inferScalarType(this);
1244     // Skip the reverse operation cost for the mask.
1245     // FIXME: Remove this once redundant mask reverse operations can be
1246     // eliminated by VPlanTransforms::cse before cost computation.
1247     if (EltTy->isIntegerTy(1))
1248       return 0;
1249     auto *VectorTy = cast<VectorType>(toVectorTy(EltTy, VF));
1251         VectorTy, /*Mask=*/{}, Ctx.CostKind,
1252         /*Index=*/0);
1253   }
1255     // Add on the cost of extracting the element.
1256     auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1257     return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
1258                                                     VecTy, Ctx.CostKind, 0);
1259   }
1261     if (VF == ElementCount::getScalable(1))
1263     [[fallthrough]];
1264   default:
1265     // TODO: Compute cost other VPInstructions once the legacy cost model has
1266     // been retired.
  // NOTE(review): "witht" in the assert text below is a typo ("with") in the
  // original message string; fixing it requires the dropped assert line 1267.
1268            "unexpected VPInstruction witht underlying value");
1269     return 0;
1270   }
1271}
1272
1284
  // Opcode predicate — presumably VPInstruction::isSingleScalar (the signature
  // line 1285 was dropped by the extraction; TODO confirm). Returns true for
  // Load/PHI (and further case labels hidden by the numbering gap 1288->1292);
  // otherwise the answer is delegated to isScalarCast().
1286   switch (getOpcode()) {
1287   case Instruction::Load:
1288   case Instruction::PHI:
1292     return true;
1293   default:
1294     return isScalarCast();
1295   }
1296}
1297
  // Presumably VPInstruction::execute (signature line 1298 dropped by the
  // extraction — TODO confirm). Generates IR for this VPInstruction, applies
  // fast-math flags around generation, and records the produced value.
  // NOTE(review): embedded numbering jumps (1302, 1304, 1313-1314, 1322) —
  // some assert conditions are not visible here; verify upstream.
1299   assert(!isMasked() && "cannot execute masked VPInstruction");
1300   assert(!State.Lane && "VPInstruction executing an Lane");
  // Guard restores the builder's fast-math flags on scope exit.
1301   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1303          "Set flags not supported for the provided opcode");
1305          "Opcode requires specific flags to be set");
1306   if (hasFastMathFlags())
1307     State.Builder.setFastMathFlags(getFastMathFlags());
1308   Value *GeneratedValue = generate(State);
1309   if (!hasResult())
1310     return;
1311   assert(GeneratedValue && "generate must produce a value");
1312   bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
1315   assert((((GeneratedValue->getType()->isVectorTy() ||
1316             GeneratedValue->getType()->isStructTy()) ==
1317            !GeneratesPerFirstLaneOnly) ||
1318           State.VF.isScalar()) &&
1319          "scalar value but not only first lane defined");
1320   State.set(this, GeneratedValue,
1321             /*IsScalar*/ GeneratesPerFirstLaneOnly);
1323     // FIXME: This is a workaround to enable reliable updates of the scalar loop
1324     // resume phis, when vectorizing the epilogue. Must be removed once epilogue
1325     // vectorization explicitly connects VPlans.
1326     setUnderlyingValue(GeneratedValue);
1327   }
1328}
1329
  // Presumably VPInstruction::opcodeMayReadOrWriteFromMemory (the signature
  // and leading early-exit condition, original lines 1330-1332, were dropped
  // by the extraction — TODO confirm). Pure/data-movement opcodes return
  // false; calls defer to the callee's memory attributes; everything else is
  // conservatively assumed to touch memory.
1333     return false;
1334   switch (getOpcode()) {
1335   case Instruction::ExtractValue:
1336   case Instruction::InsertValue:
1337   case Instruction::GetElementPtr:
1338   case Instruction::ExtractElement:
1339   case Instruction::Freeze:
1340   case Instruction::FCmp:
1341   case Instruction::ICmp:
1342   case Instruction::Select:
1343   case Instruction::PHI:
  // NOTE(review): numbering jumps 1343->1367 and 1367->1376 — many VPInstruction
  // case labels are missing from this extraction; verify upstream.
1367   case VPInstruction::Not:
1376     return false;
1377   case Instruction::Call:
1378     return !getCalledFunction(*this)->doesNotAccessMemory();
1379   default:
1380     return true;
1381   }
1382}
1383
  // Presumably VPInstruction::onlyFirstLaneUsed(const VPValue *Op) — signature
  // line 1384 dropped by the extraction, TODO confirm. Answers whether only
  // the first lane of operand Op is demanded by this recipe.
  // NOTE(review): numbering gaps (1386, 1405-1412, 1414-1415, 1420, 1422,
  // 1425-1426) hide several case labels; verify upstream.
1385   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1387     return vputils::onlyFirstLaneUsed(this);
1388
1389   switch (getOpcode()) {
1390   default:
1391     return false;
1392   case Instruction::ExtractElement:
  // Only the extracted index (operand 1) is used as a scalar.
1393     return Op == getOperand(1);
1394   case Instruction::PHI:
1395     return true;
1396   case Instruction::FCmp:
1397   case Instruction::ICmp:
1398   case Instruction::Select:
1399   case Instruction::Or:
1400   case Instruction::Freeze:
1401   case VPInstruction::Not:
1402     // TODO: Cover additional opcodes.
1403     return vputils::onlyFirstLaneUsed(this);
1404   case Instruction::Load:
1413     return true;
1416     // Before replicating by VF, Build(Struct)Vector uses all lanes of the
1417     // operand, after replicating its operands only the first lane is used.
1418     // Before replicating, it will have only a single operand.
1419     return getNumOperands() > 1;
1421     return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
1423     // WidePtrAdd supports scalar and vector base addresses.
1424     return false;
1427     return Op == getOperand(0);
1428   };
1429   llvm_unreachable("switch should return");
1430}
1431
  // Presumably VPInstruction::onlyFirstPartUsed(const VPValue *Op) — signature
  // line 1432 dropped by the extraction, TODO confirm. Answers whether only
  // the first unroll part of operand Op is demanded by this recipe.
  // NOTE(review): numbering gap 1443->1448 hides case labels; verify upstream.
1433   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1435     return vputils::onlyFirstPartUsed(this);
1436
1437   switch (getOpcode()) {
1438   default:
1439     return false;
1440   case Instruction::FCmp:
1441   case Instruction::ICmp:
1442   case Instruction::Select:
1443     return vputils::onlyFirstPartUsed(this);
1448     return true;
1449   };
1450   llvm_unreachable("switch should return");
1451}
1452
1453#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  // Debug-only dump helper: builds a slot tracker for the enclosing plan and
  // (on the dropped line 1456, presumably) prints this recipe with it —
  // TODO confirm against upstream; the signature line 1454 is missing too.
1455   VPSlotTracker SlotTracker(getParent()->getPlan());
1457}
1458
1460                               VPSlotTracker &SlotTracker) const {
  // Presumably VPInstruction::printRecipe (first signature line dropped).
  // Prints "EMIT[-SCALAR] <result> = <mnemonic> <flags> <operands>".
  // NOTE(review): the extraction dropped the case labels preceding most of the
  // string mnemonics below (numbering jumps 1471->1473, 1474->1476, ...);
  // each O << "..." line belongs to a VPInstruction opcode case lost here.
1461   O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1462
1463   if (hasResult()) {
1465     O << " = ";
1466   }
1467
1468   switch (getOpcode()) {
1469   case VPInstruction::Not:
1470     O << "not";
1471     break;
1473     O << "combined load";
1474     break;
1476     O << "combined store";
1477     break;
1479     O << "active lane mask";
1480     break;
1482     O << "EXPLICIT-VECTOR-LENGTH";
1483     break;
1485     O << "first-order splice";
1486     break;
1488     O << "branch-on-cond";
1489     break;
1491     O << "branch-on-two-conds";
1492     break;
1494     O << "TC > VF ? TC - VF : 0";
1495     break;
1497     O << "VF * Part +";
1498     break;
1500     O << "branch-on-count";
1501     break;
1503     O << "broadcast";
1504     break;
1506     O << "buildstructvector";
1507     break;
1509     O << "buildvector";
1510     break;
1512     O << "exiting-iv-value";
1513     break;
1515     O << "masked-cond";
1516     break;
1518     O << "extract-lane";
1519     break;
1521     O << "extract-last-lane";
1522     break;
1524     O << "extract-last-part";
1525     break;
1527     O << "extract-penultimate-element";
1528     break;
1530     O << "compute-reduction-result";
1531     break;
1533     O << "logical-and";
1534     break;
1536     O << "logical-or";
1537     break;
1539     O << "ptradd";
1540     break;
1542     O << "wide-ptradd";
1543     break;
1545     O << "any-of";
1546     break;
1548     O << "first-active-lane";
1549     break;
1551     O << "last-active-lane";
1552     break;
1554     O << "reduction-start-vector";
1555     break;
1557     O << "resume-for-epilogue";
1558     break;
1560     O << "reverse";
1561     break;
1563     O << "unpack";
1564     break;
1566     O << "extract-last-active";
1567     break;
1568   default:
1570   }
1571
1572   printFlags(O);
1574}
1575#endif
1576
  // Presumably VPInstructionWithType::execute (signature line 1577 dropped —
  // TODO confirm). Handles typed VPInstructions: scalar casts are emitted for
  // lane 0 only; StepVector/VScale produce whole-vector / scalar values.
1578   State.setDebugLocFrom(getDebugLoc());
1579   if (isScalarCast()) {
1580     Value *Op = State.get(getOperand(0), VPLane(0));
1581     Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
1582                                            Op, ResultTy);
  // Scalar casts define only the first lane.
1583     State.set(this, Cast, VPLane(0));
1584     return;
1585   }
1586   switch (getOpcode()) {
  // NOTE(review): the case label on the dropped line 1587 (presumably
  // VPInstruction::StepVector) is missing from this extraction.
1588     Value *StepVector =
1589         State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF));
1590     State.set(this, StepVector);
1591     break;
1592   }
1593   case VPInstruction::VScale: {
1594     Value *VScale = State.Builder.CreateVScale(ResultTy);
  // VScale is a single scalar value (IsScalar = true).
1595     State.set(this, VScale, true);
1596     break;
1597   }
1598
1599   default:
1600     llvm_unreachable("opcode not implemented yet");
1601   }
1602}
1603
1604#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1606                                      VPSlotTracker &SlotTracker) const {
  // Presumably VPInstructionWithType::printRecipe (first signature line
  // dropped). Prints the typed-VPInstruction mnemonic and its result type.
  // NOTE(review): several case labels are hidden by numbering gaps
  // (1608, 1612, 1614, 1616, 1619, 1624, 1628-1629); verify upstream.
1607   O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1609   O << " = ";
1610
1611   switch (getOpcode()) {
1613     O << "wide-iv-step ";
1615     break;
1617     O << "step-vector " << *ResultTy;
1618     break;
1620     O << "vscale " << *ResultTy;
1621     break;
1622   case Instruction::Load:
1623     O << "load ";
1625     break;
1626   default:
  // Remaining opcodes must be casts; printed as "<cast> ... to <ResultTy>".
1627     assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
1630     O << " to " << *ResultTy;
1631   }
1632}
1633#endif
1634
  // Presumably VPPhi::execute (signature line 1635 dropped by the extraction —
  // TODO confirm; body lines 1636-1653 are contiguous). Materializes an IR
  // PHINode for this VPPhi, adding the incoming values that are already
  // generated; latch incomings of header phis are deferred.
1636   State.setDebugLocFrom(getDebugLoc());
1637   PHINode *NewPhi = State.Builder.CreatePHI(
1638       State.TypeAnalysis.inferScalarType(this), 2, getName());
1639   unsigned NumIncoming = getNumIncoming();
1640   // Detect header phis: the parent block dominates its second incoming block
1641   // (the latch). Those IR incoming values have not been generated yet and need
1642   // to be added after they have been executed.
1643   if (NumIncoming == 2 &&
1644       State.VPDT.dominates(getParent(), getIncomingBlock(1))) {
1645     NumIncoming = 1;
1646   }
1647   for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
  // Phi incomings are single scalars: take lane 0 of each incoming value.
1648     Value *IncV = State.get(getIncomingValue(Idx), VPLane(0));
1649     BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(getIncomingBlock(Idx));
1650     NewPhi->addIncoming(IncV, PredBB);
1651   }
1652   State.set(this, NewPhi, VPLane(0));
1653}
1654
1655#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Prints this VPPhi as "EMIT[-SCALAR] <result> = phi <flags> <incomings>".
// NOTE(review): lines 1659 and 1662 (result printing / phi-operand printing,
// presumably) were dropped by the extraction; verify against upstream.
void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent,
1657                         VPSlotTracker &SlotTracker) const {
1658   O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1660   O << " = phi";
1661   printFlags(O);
1663}
1664#endif
1665
1666VPIRInstruction *VPIRInstruction ::create(Instruction &I) {
1667 if (auto *Phi = dyn_cast<PHINode>(&I))
1668 return new VPIRPhi(*Phi);
1669 return new VPIRInstruction(I);
1670}
1671
  // Presumably VPIRInstruction::execute (signature line 1672 dropped — TODO
  // confirm). The wrapped instruction I already exists in the IR; execution
  // only repositions the builder after it.
1673   assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
1674          "PHINodes must be handled by VPIRPhi");
1675   // Advance the insert point after the wrapped IR instruction. This allows
1676   // interleaving VPIRInstructions and other recipes.
1677   State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
1678}
1679
1681                                          VPCostContext &Ctx) const {
  // Presumably VPIRInstruction::computeCost (first signature line dropped).
  // Wrapped IR instructions live on the boundary of the VPlan and are
  // deliberately free in the VPlan cost model.
1682   // The recipe wraps an existing IR instruction on the border of VPlan's scope,
1683   // hence it does not contribute to the cost-modeling for the VPlan.
1684   return 0;
1685}
1686
1687#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1689                                VPSlotTracker &SlotTracker) const {
  // Presumably VPIRInstruction::printRecipe (first signature line dropped).
  // Prints the wrapped instruction verbatim, prefixed with "IR ".
1690   O << Indent << "IR " << I;
1691}
1692#endif
1693
  // Presumably VPIRPhi::execute (signature line 1694 dropped — TODO confirm).
  // Fills in / updates the incoming values of the wrapped IR PHI from this
  // recipe's operands, one per VPlan predecessor.
1695   PHINode *Phi = &getIRPhi();
1696   for (const auto &[Idx, Op] : enumerate(operands())) {
1697     VPValue *ExitValue = Op;
  // Single scalars come from the first lane (on the dropped line 1699,
  // presumably), otherwise the last lane for VF is extracted.
1698     auto Lane = vputils::isSingleScalar(ExitValue)
1700                     : VPLane::getLastLaneForVF(State.VF);
1701     VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
1702     auto *PredVPBB = Pred->getExitingBasicBlock();
1703     BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
1704     // Set insertion point in PredBB in case an extract needs to be generated.
1705     // TODO: Model extracts explicitly.
1706     State.Builder.SetInsertPoint(PredBB->getTerminator());
1707     Value *V = State.get(ExitValue, VPLane(Lane));
1708     // If there is no existing block for PredBB in the phi, add a new incoming
1709     // value. Otherwise update the existing incoming value for PredBB.
1710     if (Phi->getBasicBlockIndex(PredBB) == -1)
1711       Phi->addIncoming(V, PredBB);
1712     else
1713       Phi->setIncomingValueForBlock(PredBB, V);
1714   }
1715
1716   // Advance the insert point after the wrapped IR instruction. This allows
1717   // interleaving VPIRInstructions and other recipes.
1718   State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator()));
1719}
1720
  // Presumably VPPhiAccessors::removeIncomingValueFor(VPBlockBase *) — the
  // signature line 1721 was dropped; TODO confirm. Drops the phi operand that
  // corresponds to the given incoming predecessor block.
  // const_cast: accessor mixin exposes a const recipe view but must mutate
  // the underlying operand list here.
1722   VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1723   assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
1724          "Number of phi operands must match number of predecessors");
1725   unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock);
1726   R->removeOperand(Position);
1727}
1728
1729VPValue *
  // Presumably VPPhiAccessors::getIncomingValueForBlock (rest of the signature
  // on the dropped line 1730 — TODO confirm): maps a predecessor VPBB to its
  // position and returns the matching incoming value.
1731   VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1732   return getIncomingValue(R->getParent()->getIndexForPredecessor(VPBB));
1733}
1734
1736                                          VPValue *V) const {
  // Presumably VPPhiAccessors::setIncomingValueForBlock (first signature line
  // dropped — TODO confirm): replaces the incoming value for predecessor VPBB
  // with V, using the predecessor's operand index.
1737   VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1738   R->setOperand(R->getParent()->getIndexForPredecessor(VPBB), V);
1739}
1740
1741#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1743 VPSlotTracker &SlotTracker) const {
1744 interleaveComma(enumerate(getAsRecipe()->operands()), O,
1745 [this, &O, &SlotTracker](auto Op) {
1746 O << "[ ";
1747 Op.value()->printAsOperand(O, SlotTracker);
1748 O << ", ";
1749 getIncomingBlock(Op.index())->printAsOperand(O);
1750 O << " ]";
1751 });
1752}
1753#endif
1754
1755#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1757                          VPSlotTracker &SlotTracker) const {
  // Presumably VPIRPhi::printRecipe (first signature line dropped; line 1758,
  // likely the wrapped-phi printing, is also missing — verify upstream).
  // Appends any extra operands together with their source blocks.
1759
1760   if (getNumOperands() != 0) {
1761     O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
1763                     [&O, &SlotTracker](auto Op) {
1764                       std::get<0>(Op)->printAsOperand(O, SlotTracker);
1765                       O << " from ";
1766                       std::get<1>(Op)->printAsOperand(O);
1767                     });
1768     O << ")";
1769   }
1770}
1771#endif
1772
  // Presumably VPIRMetadata::applyMetadata (signature line 1773 dropped —
  // TODO confirm): copies every recorded (kind, node) metadata pair onto the
  // generated instruction I.
1774   for (const auto &[Kind, Node] : Metadata)
1775     I.setMetadata(Kind, Node);
1776}
1777
  // Presumably VPIRMetadata::intersect (signature line 1778 dropped — TODO
  // confirm): keeps only the (kind, node) pairs present in both this set and
  // Other. Quadratic pairwise scan; metadata lists are expected to be tiny.
1779   SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection;
1780   for (const auto &[KindA, MDA] : Metadata) {
1781     for (const auto &[KindB, MDB] : Other.Metadata) {
1782       if (KindA == KindB && MDA == MDB) {
1783         MetadataIntersection.emplace_back(KindA, MDA);
1784         break;
1785       }
1786     }
1787   }
1788   Metadata = std::move(MetadataIntersection);
1789}
1790
1791#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  // Presumably VPIRMetadata::print (signature line 1792 dropped — TODO
  // confirm): renders the attached metadata as "(!name node, ...)"; silently
  // prints nothing without a module (needed to resolve metadata kind names).
1793   const Module *M = SlotTracker.getModule();
1794   if (Metadata.empty() || !M)
1795     return;
1796
1797   ArrayRef<StringRef> MDNames = SlotTracker.getMDNames();
1798   O << " (";
1799   interleaveComma(Metadata, O, [&](const auto &KindNodePair) {
1800     auto [Kind, Node] = KindNodePair;
1801     assert(Kind < MDNames.size() && !MDNames[Kind].empty() &&
1802            "Unexpected unnamed metadata kind");
1803     O << "!" << MDNames[Kind] << " ";
1804     Node->printAsOperand(O, M);
1805   });
1806   O << ")";
1807}
1808#endif
1809
  // Presumably VPWidenCallRecipe::execute (signature line 1810 dropped — TODO
  // confirm). Emits a call to the pre-selected vector function variant,
  // passing scalar values for non-vector parameters.
  // NOTE(review): lines 1816 and 1829-1830 (Args declaration and CI setup,
  // presumably) are missing from this extraction; verify upstream.
1811   assert(State.VF.isVector() && "not widening");
1812   assert(Variant != nullptr && "Can't create vector function.");
1813
1814   FunctionType *VFTy = Variant->getFunctionType();
1815   // Add return type if intrinsic is overloaded on it.
1817   for (const auto &I : enumerate(args())) {
1818     Value *Arg;
1819     // Some vectorized function variants may also take a scalar argument,
1820     // e.g. linear parameters for pointers. This needs to be the scalar value
1821     // from the start of the respective part when interleaving.
1822     if (!VFTy->getParamType(I.index())->isVectorTy())
1823       Arg = State.get(I.value(), VPLane(0));
1824     else
1825       Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
1826     Args.push_back(Arg);
1827   }
1828
  // Preserve operand bundles from the original scalar call, if any.
1831   if (CI)
1832     CI->getOperandBundlesAsDefs(OpBundles);
1833
1834   CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
1835   applyFlags(*V);
1836   applyMetadata(*V);
1837   V->setCallingConv(Variant->getCallingConv());
1838
1839   if (!V->getType()->isVoidTy())
1840     State.set(this, V);
1841}
1842
1844                                            VPCostContext &Ctx) const {
  // Presumably VPWidenCallRecipe::computeCost (first signature line dropped).
  // Cost of calling the chosen vector variant with its own parameter types.
1845   return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
1846                                   Variant->getFunctionType()->params(),
1847                                   Ctx.CostKind);
1848}
1849
1850#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1852                                    VPSlotTracker &SlotTracker) const {
  // Presumably VPWidenCallRecipe::printRecipe (first signature line dropped).
  // Prints "WIDEN-CALL [void|<result> =] call @fn(args) (using library
  // function[: variant])".
1853   O << Indent << "WIDEN-CALL ";
1854
1855   Function *CalledFn = getCalledScalarFunction();
1856   if (CalledFn->getReturnType()->isVoidTy())
1857     O << "void ";
1858   else {
1860     O << " = ";
1861   }
1862
1863   O << "call";
1864   printFlags(O);
1865   O << " @" << CalledFn->getName() << "(";
1866   interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
1867     Op->printAsOperand(O, SlotTracker);
1868   });
1869   O << ")";
1870
1871   O << " (using library function";
1872   if (Variant->hasName())
1873     O << ": " << Variant->getName();
1874   O << ")";
1875}
1876#endif
1877
  // Presumably VPWidenIntrinsicRecipe::execute (signature line 1878 dropped —
  // TODO confirm). Builds the overload-type list and argument list for the
  // vector intrinsic, declares it in the module, and emits the call.
  // NOTE(review): lines 1888, 1893 and 1916-1917 are missing from this
  // extraction; verify upstream.
1879   assert(State.VF.isVector() && "not widening");
1880
1881   SmallVector<Type *, 2> TysForDecl;
1882   // Add return type if intrinsic is overloaded on it.
1883   if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1,
1884                                              State.TTI)) {
1885     Type *RetTy = toVectorizedTy(getResultType(), State.VF);
1886     ArrayRef<Type *> ContainedTys = getContainedTypes(RetTy);
1887     for (auto [Idx, Ty] : enumerate(ContainedTys)) {
1889                                                  Idx, State.TTI))
1890         TysForDecl.push_back(Ty);
1891     }
1892   }
1894   for (const auto &I : enumerate(operands())) {
1895     // Some intrinsics have a scalar argument - don't replace it with a
1896     // vector.
1897     Value *Arg;
1898     if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
1899                                            State.TTI))
1900       Arg = State.get(I.value(), VPLane(0));
1901     else
1902       Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
1903     if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
1904                                                State.TTI))
1905       TysForDecl.push_back(Arg->getType());
1906     Args.push_back(Arg);
1907   }
1908
1909   // Use vector version of the intrinsic.
1910   Module *M = State.Builder.GetInsertBlock()->getModule();
1911   Function *VectorF =
1912       Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
1913   assert(VectorF &&
1914          "Can't retrieve vector intrinsic or vector-predication intrinsics.");
1915
  // Preserve operand bundles from the original scalar call, if any.
1918   if (CI)
1919     CI->getOperandBundlesAsDefs(OpBundles);
1920
1921   CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
1922
1923   applyFlags(*V);
1924   applyMetadata(*V);
1925
1926   if (!V->getType()->isVoidTy())
1927     State.set(this, V);
1928}
1929
1930/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R.
/// NOTE(review): the first signature line(s) (original 1931-1932) were dropped
/// by the extraction — presumably
/// `static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
///  ArrayRef<const VPValue *> Operands, ...)`; verify against upstream.
1933                                              const VPRecipeWithIRFlags &R,
1934                                              ElementCount VF,
1935                                              VPCostContext &Ctx) {
1936   Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
1937   // Skip the reverse operation cost for the mask.
1938   // FIXME: Remove this once redundant mask reverse operations can be eliminated
1939   // by VPlanTransforms::cse before cost computation.
1940   if (ID == Intrinsic::experimental_vp_reverse && ScalarRetTy->isIntegerTy(1))
1941     return InstructionCost(0);
1942
1943   // Some backends analyze intrinsic arguments to determine cost. Use the
1944   // underlying value for the operand if it has one. Otherwise try to use the
1945   // operand of the underlying call instruction, if there is one. Otherwise
1946   // clear Arguments.
1947   // TODO: Rework TTI interface to be independent of concrete IR values.
1949   for (const auto &[Idx, Op] : enumerate(Operands)) {
1950     auto *V = Op->getUnderlyingValue();
1951     if (!V) {
1952       if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) {
1953         Arguments.push_back(UI->getArgOperand(Idx));
1954         continue;
1955       }
  // One operand without a usable IR value invalidates the whole list.
1956       Arguments.clear();
1957       break;
1958     }
1959     Arguments.push_back(V);
1960   }
1961
1962   Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
1963   SmallVector<Type *> ParamTys;
1964   for (const VPValue *Op : Operands) {
1965     ParamTys.push_back(VF.isVector()
1966                            ? toVectorTy(Ctx.Types.inferScalarType(Op), VF)
1967                            : Ctx.Types.inferScalarType(Op));
1968   }
1969
1970   // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
1971   IntrinsicCostAttributes CostAttrs(
1972       ID, RetTy, Arguments, ParamTys, R.getFastMathFlags(),
1973       dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()),
1975   return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
1976}
1977
1979                                                 VPCostContext &Ctx) const {
  // Presumably VPWidenIntrinsicRecipe::computeCost (first signature line and
  // ArgOps setup line 1980 dropped — TODO confirm). Delegates to the shared
  // getCostForIntrinsics helper above.
1981   return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx);
1982}
1983
  // Presumably VPWidenIntrinsicRecipe::getIntrinsicName (signature line 1984
  // dropped — TODO confirm): base intrinsic name without overload suffixes.
1985   return Intrinsic::getBaseName(VectorIntrinsicID);
1986}
1987
  // Presumably VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op)
  // (signature line 1988 dropped — TODO confirm). Op is first-lane-only iff
  // every position where it appears is a scalar operand of the intrinsic (the
  // predicate on the dropped line 1992 — verify upstream).
1989   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1990   return all_of(enumerate(operands()), [this, &Op](const auto &X) {
1991     auto [Idx, V] = X;
1993                                               Idx, nullptr);
1994   });
1995}
1996
1997#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1999                                         VPSlotTracker &SlotTracker) const {
  // Presumably VPWidenIntrinsicRecipe::printRecipe (first signature line
  // dropped). Prints "WIDEN-INTRINSIC [void|<result> =] call <name>(ops)".
2000   O << Indent << "WIDEN-INTRINSIC ";
2001   if (ResultTy->isVoidTy()) {
2002     O << "void ";
2003   } else {
2005     O << " = ";
2006   }
2007
2008   O << "call";
2009   printFlags(O);
2010   O << getIntrinsicName() << "(";
2011
2013     Op->printAsOperand(O, SlotTracker);
2014   });
2015   O << ")";
2016}
2017#endif
2018
  // Presumably VPHistogramRecipe::execute (signature line 2019 dropped — TODO
  // confirm). Emits llvm.experimental.vector.histogram.add over the bucket
  // addresses (operand 0) with increment (operand 1) and an optional mask.
2020   IRBuilderBase &Builder = State.Builder;
2021
2022   Value *Address = State.get(getOperand(0));
2023   Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
2024   VectorType *VTy = cast<VectorType>(Address->getType());
2025
2026   // The histogram intrinsic requires a mask even if the recipe doesn't;
2027   // if the mask operand was omitted then all lanes should be executed and
2028   // we just need to synthesize an all-true mask.
2029   Value *Mask = nullptr;
2030   if (VPValue *VPMask = getMask())
2031     Mask = State.get(VPMask);
2032   else
2033     Mask =
2034         Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));
2035
2036   // If this is a subtract, we want to invert the increment amount. We may
2037   // add a separate intrinsic in future, but for now we'll try this.
2038   if (Opcode == Instruction::Sub)
2039     IncAmt = Builder.CreateNeg(IncAmt);
2040   else
2041     assert(Opcode == Instruction::Add && "only add or sub supported for now");
2042
2043   State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
2044                                 {VTy, IncAmt->getType()},
2045                                 {Address, IncAmt, Mask});
2046}
2047
2049                                            VPCostContext &Ctx) const {
  // Presumably VPHistogramRecipe::computeCost (first signature line dropped).
  // Cost = histogram intrinsic + optional multiply for a non-unit increment +
  // the add/sub update itself.
2050   // FIXME: Take the gather and scatter into account as well. For now we're
2051   //        generating the same cost as the fallback path, but we'll likely
2052   //        need to create a new TTI method for determining the cost, including
2053   //        whether we can use base + vec-of-smaller-indices or just
2054   //        vec-of-pointers.
2055   assert(VF.isVector() && "Invalid VF for histogram cost");
2056   Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
2057   VPValue *IncAmt = getOperand(1);
2058   Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
2059   VectorType *VTy = VectorType::get(IncTy, VF);
2060
2061   // Assume that a non-constant update value (or a constant != 1) requires
2062   // a multiply, and add that into the cost.
2063   InstructionCost MulCost =
2064       Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
2065   if (match(IncAmt, m_One()))
2066     MulCost = TTI::TCC_Free;
2067
2068   // Find the cost of the histogram operation itself.
2069   Type *PtrTy = VectorType::get(AddressTy, VF);
2070   Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
2071   IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
2072                               Type::getVoidTy(Ctx.LLVMCtx),
2073                               {PtrTy, IncTy, MaskTy});
2074
2075   // Add the costs together with the add/sub operation.
2076   return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
2077          Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
2078}
2079
2080#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2082                                    VPSlotTracker &SlotTracker) const {
  // Presumably VPHistogramRecipe::printRecipe (first signature line dropped).
  // Prints "WIDEN-HISTOGRAM buckets: <addr>, inc|dec: <amt>[, mask: <m>]".
2083   O << Indent << "WIDEN-HISTOGRAM buckets: ";
2085
2086   if (Opcode == Instruction::Sub)
2087     O << ", dec: ";
2088   else {
2089     assert(Opcode == Instruction::Add);
2090     O << ", inc: ";
2091   }
2093
2094   if (VPValue *Mask = getMask()) {
2095     O << ", mask: ";
2096     Mask->printAsOperand(O, SlotTracker);
2097   }
2098}
2099#endif
2100
2101VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
2102 AllowReassoc = FMF.allowReassoc();
2103 NoNaNs = FMF.noNaNs();
2104 NoInfs = FMF.noInfs();
2105 NoSignedZeros = FMF.noSignedZeros();
2106 AllowReciprocal = FMF.allowReciprocal();
2107 AllowContract = FMF.allowContract();
2108 ApproxFunc = FMF.approxFunc();
2109}
2110
  // Presumably VPIRFlags::getDefaultFlags(unsigned Opcode) (signature line
  // 2111 dropped — TODO confirm): returns the conservatively-cleared flag
  // bundle appropriate for each opcode family.
  // NOTE(review): numbering gaps (2117, 2129-2130, 2146) hide additional case
  // labels (e.g. VPInstruction opcodes); verify upstream.
2112   switch (Opcode) {
2113   case Instruction::Add:
2114   case Instruction::Sub:
2115   case Instruction::Mul:
2116   case Instruction::Shl:
2118     return WrapFlagsTy(false, false);
2119   case Instruction::Trunc:
2120     return TruncFlagsTy(false, false);
2121   case Instruction::Or:
2122     return DisjointFlagsTy(false);
2123   case Instruction::AShr:
2124   case Instruction::LShr:
2125   case Instruction::UDiv:
2126   case Instruction::SDiv:
2127     return ExactFlagsTy(false);
2128   case Instruction::GetElementPtr:
2131     return GEPNoWrapFlags::none();
2132   case Instruction::ZExt:
2133   case Instruction::UIToFP:
2134     return NonNegFlagsTy(false);
2135   case Instruction::FAdd:
2136   case Instruction::FSub:
2137   case Instruction::FMul:
2138   case Instruction::FDiv:
2139   case Instruction::FRem:
2140   case Instruction::FNeg:
2141   case Instruction::FPExt:
2142   case Instruction::FPTrunc:
2143     return FastMathFlags();
2144   case Instruction::ICmp:
2145   case Instruction::FCmp:
  // Compares have no meaningful default — callers must supply flags.
2147     llvm_unreachable("opcode requires explicit flags");
2148   default:
2149     return VPIRFlags();
2150   }
2151}
2152
2153#if !defined(NDEBUG)
// Debug-only validation: checks that the stored flag category (OpType) is
// legal for the given opcode.
// NOTE(review): numbering gaps (2179, 2187) indicate dropped lines in the
// FPMathOp and ReductionOp cases; verify against upstream.
2154bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
2155   switch (OpType) {
2156   case OperationType::OverflowingBinOp:
2157     return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
2158            Opcode == Instruction::Mul || Opcode == Instruction::Shl ||
  // NOTE(review): "VPInstruction::VPInstruction::" below is a redundant
  // injected-class-name qualification — legal C++, but one level would do.
2159            Opcode == VPInstruction::VPInstruction::CanonicalIVIncrementForPart;
2160   case OperationType::Trunc:
2161     return Opcode == Instruction::Trunc;
2162   case OperationType::DisjointOp:
2163     return Opcode == Instruction::Or;
2164   case OperationType::PossiblyExactOp:
2165     return Opcode == Instruction::AShr || Opcode == Instruction::LShr ||
2166            Opcode == Instruction::UDiv || Opcode == Instruction::SDiv;
2167   case OperationType::GEPOp:
2168     return Opcode == Instruction::GetElementPtr ||
2169            Opcode == VPInstruction::PtrAdd ||
2170            Opcode == VPInstruction::WidePtrAdd;
2171   case OperationType::FPMathOp:
2172     return Opcode == Instruction::Call || Opcode == Instruction::FAdd ||
2173            Opcode == Instruction::FMul || Opcode == Instruction::FSub ||
2174            Opcode == Instruction::FNeg || Opcode == Instruction::FDiv ||
2175            Opcode == Instruction::FRem || Opcode == Instruction::FPExt ||
2176            Opcode == Instruction::FPTrunc || Opcode == Instruction::PHI ||
2177            Opcode == Instruction::Select ||
2178            Opcode == VPInstruction::WideIVStep ||
2180   case OperationType::FCmp:
2181     return Opcode == Instruction::FCmp;
2182   case OperationType::NonNegOp:
2183     return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP;
2184   case OperationType::Cmp:
2185     return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
2186   case OperationType::ReductionOp:
2188   case OperationType::Other:
2189     return true;
2190   }
2191   llvm_unreachable("Unknown OperationType enum");
2192}
2193
// Debug-only validation: checks that opcodes which have no safe default flag
// category actually carry the matching category.
// NOTE(review): the dropped line 2200 presumably held the condition guarding
// the ReductionOp return below — as extracted, that return dangles; verify
// against upstream.
2194bool VPIRFlags::hasRequiredFlagsForOpcode(unsigned Opcode) const {
2195   // Handle opcodes without default flags.
2196   if (Opcode == Instruction::ICmp)
2197     return OpType == OperationType::Cmp;
2198   if (Opcode == Instruction::FCmp)
2199     return OpType == OperationType::FCmp;
2201     return OpType == OperationType::ReductionOp;
2202
  // Everything else only needs its default category (or none at all).
2203   OperationType Required = getDefaultFlags(Opcode).OpType;
2204   return Required == OperationType::Other || Required == OpType;
2205}
2206#endif
2207
2208#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2210 switch (OpType) {
2211 case OperationType::Cmp:
2213 break;
2214 case OperationType::FCmp:
2217 break;
2218 case OperationType::DisjointOp:
2219 if (DisjointFlags.IsDisjoint)
2220 O << " disjoint";
2221 break;
2222 case OperationType::PossiblyExactOp:
2223 if (ExactFlags.IsExact)
2224 O << " exact";
2225 break;
2226 case OperationType::OverflowingBinOp:
2227 if (WrapFlags.HasNUW)
2228 O << " nuw";
2229 if (WrapFlags.HasNSW)
2230 O << " nsw";
2231 break;
2232 case OperationType::Trunc:
2233 if (TruncFlags.HasNUW)
2234 O << " nuw";
2235 if (TruncFlags.HasNSW)
2236 O << " nsw";
2237 break;
2238 case OperationType::FPMathOp:
2240 break;
2241 case OperationType::GEPOp: {
2243 if (Flags.isInBounds())
2244 O << " inbounds";
2245 else if (Flags.hasNoUnsignedSignedWrap())
2246 O << " nusw";
2247 if (Flags.hasNoUnsignedWrap())
2248 O << " nuw";
2249 break;
2250 }
2251 case OperationType::NonNegOp:
2252 if (NonNegFlags.NonNeg)
2253 O << " nneg";
2254 break;
2255 case OperationType::ReductionOp: {
2256 RecurKind RK = getRecurKind();
2257 O << " (";
2258 switch (RK) {
2259 case RecurKind::AnyOf:
2260 O << "any-of";
2261 break;
2263 O << "find-last";
2264 break;
2265 case RecurKind::SMax:
2266 O << "smax";
2267 break;
2268 case RecurKind::SMin:
2269 O << "smin";
2270 break;
2271 case RecurKind::UMax:
2272 O << "umax";
2273 break;
2274 case RecurKind::UMin:
2275 O << "umin";
2276 break;
2277 case RecurKind::FMinNum:
2278 O << "fminnum";
2279 break;
2280 case RecurKind::FMaxNum:
2281 O << "fmaxnum";
2282 break;
2284 O << "fminimum";
2285 break;
2287 O << "fmaximum";
2288 break;
2290 O << "fminimumnum";
2291 break;
2293 O << "fmaximumnum";
2294 break;
2295 default:
2297 break;
2298 }
2299 if (isReductionInLoop())
2300 O << ", in-loop";
2301 if (isReductionOrdered())
2302 O << ", ordered";
2303 O << ")";
2305 break;
2306 }
2307 case OperationType::Other:
2308 break;
2309 }
2310 O << " ";
2311}
2312#endif
2313
2315 auto &Builder = State.Builder;
2316 switch (Opcode) {
2317 case Instruction::Call:
2318 case Instruction::UncondBr:
2319 case Instruction::CondBr:
2320 case Instruction::PHI:
2321 case Instruction::GetElementPtr:
2322 llvm_unreachable("This instruction is handled by a different recipe.");
2323 case Instruction::UDiv:
2324 case Instruction::SDiv:
2325 case Instruction::SRem:
2326 case Instruction::URem:
2327 case Instruction::Add:
2328 case Instruction::FAdd:
2329 case Instruction::Sub:
2330 case Instruction::FSub:
2331 case Instruction::FNeg:
2332 case Instruction::Mul:
2333 case Instruction::FMul:
2334 case Instruction::FDiv:
2335 case Instruction::FRem:
2336 case Instruction::Shl:
2337 case Instruction::LShr:
2338 case Instruction::AShr:
2339 case Instruction::And:
2340 case Instruction::Or:
2341 case Instruction::Xor: {
2342 // Just widen unops and binops.
2344 for (VPValue *VPOp : operands())
2345 Ops.push_back(State.get(VPOp));
2346
2347 Value *V = Builder.CreateNAryOp(Opcode, Ops);
2348
2349 if (auto *VecOp = dyn_cast<Instruction>(V)) {
2350 applyFlags(*VecOp);
2351 applyMetadata(*VecOp);
2352 }
2353
2354 // Use this vector value for all users of the original instruction.
2355 State.set(this, V);
2356 break;
2357 }
2358 case Instruction::ExtractValue: {
2359 assert(getNumOperands() == 2 && "expected single level extractvalue");
2360 Value *Op = State.get(getOperand(0));
2361 Value *Extract = Builder.CreateExtractValue(
2362 Op, cast<VPConstantInt>(getOperand(1))->getZExtValue());
2363 State.set(this, Extract);
2364 break;
2365 }
2366 case Instruction::Freeze: {
2367 Value *Op = State.get(getOperand(0));
2368 Value *Freeze = Builder.CreateFreeze(Op);
2369 State.set(this, Freeze);
2370 break;
2371 }
2372 case Instruction::ICmp:
2373 case Instruction::FCmp: {
2374 // Widen compares. Generate vector compares.
2375 bool FCmp = Opcode == Instruction::FCmp;
2376 Value *A = State.get(getOperand(0));
2377 Value *B = State.get(getOperand(1));
2378 Value *C = nullptr;
2379 if (FCmp) {
2380 C = Builder.CreateFCmp(getPredicate(), A, B);
2381 } else {
2382 C = Builder.CreateICmp(getPredicate(), A, B);
2383 }
2384 if (auto *I = dyn_cast<Instruction>(C)) {
2385 applyFlags(*I);
2386 applyMetadata(*I);
2387 }
2388 State.set(this, C);
2389 break;
2390 }
2391 case Instruction::Select: {
2392 VPValue *CondOp = getOperand(0);
2393 Value *Cond = State.get(CondOp, vputils::isSingleScalar(CondOp));
2394 Value *Op0 = State.get(getOperand(1));
2395 Value *Op1 = State.get(getOperand(2));
2396 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
2397 State.set(this, Sel);
2398 if (auto *I = dyn_cast<Instruction>(Sel)) {
2400 applyFlags(*I);
2401 applyMetadata(*I);
2402 }
2403 break;
2404 }
2405 default:
2406 // This instruction is not vectorized by simple widening.
2407 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
2408 << Instruction::getOpcodeName(Opcode));
2409 llvm_unreachable("Unhandled instruction!");
2410 } // end of switch.
2411
2412#if !defined(NDEBUG)
2413 // Verify that VPlan type inference results agree with the type of the
2414 // generated values.
2415 assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
2416 State.get(this)->getType() &&
2417 "inferred type and type from generated instructions do not match");
2418#endif
2419}
2420
2422 VPCostContext &Ctx) const {
2423 switch (Opcode) {
2424 case Instruction::UDiv:
2425 case Instruction::SDiv:
2426 case Instruction::SRem:
2427 case Instruction::URem:
2428 // If the div/rem operation isn't safe to speculate and requires
2429 // predication, then the only way we can even create a vplan is to insert
2430 // a select on the second input operand to ensure we use the value of 1
2431 // for the inactive lanes. The select will be costed separately.
2432 case Instruction::FNeg:
2433 case Instruction::Add:
2434 case Instruction::FAdd:
2435 case Instruction::Sub:
2436 case Instruction::FSub:
2437 case Instruction::Mul:
2438 case Instruction::FMul:
2439 case Instruction::FDiv:
2440 case Instruction::FRem:
2441 case Instruction::Shl:
2442 case Instruction::LShr:
2443 case Instruction::AShr:
2444 case Instruction::And:
2445 case Instruction::Or:
2446 case Instruction::Xor:
2447 case Instruction::Freeze:
2448 case Instruction::ExtractValue:
2449 case Instruction::ICmp:
2450 case Instruction::FCmp:
2451 case Instruction::Select:
2452 return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
2453 default:
2454 llvm_unreachable("Unsupported opcode for instruction");
2455 }
2456}
2457
2458#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2460 VPSlotTracker &SlotTracker) const {
2461 O << Indent << "WIDEN ";
2463 O << " = " << Instruction::getOpcodeName(Opcode);
2464 printFlags(O);
2466}
2467#endif
2468
2470 auto &Builder = State.Builder;
2471 /// Vectorize casts.
2472 assert(State.VF.isVector() && "Not vectorizing?");
2473 Type *DestTy = VectorType::get(getResultType(), State.VF);
2474 VPValue *Op = getOperand(0);
2475 Value *A = State.get(Op);
2476 Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
2477 State.set(this, Cast);
2478 if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
2479 applyFlags(*CastOp);
2480 applyMetadata(*CastOp);
2481 }
2482}
2483
2485 VPCostContext &Ctx) const {
2486 // TODO: In some cases, VPWidenCastRecipes are created but not considered in
2487 // the legacy cost model, including truncates/extends when evaluating a
2488 // reduction in a smaller type.
2489 if (!getUnderlyingValue())
2490 return 0;
2491 return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
2492}
2493
2494#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2496 VPSlotTracker &SlotTracker) const {
2497 O << Indent << "WIDEN-CAST ";
2499 O << " = " << Instruction::getOpcodeName(Opcode);
2500 printFlags(O);
2502 O << " to " << *getResultType();
2503}
2504#endif
2505
2507 VPCostContext &Ctx) const {
2508 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
2509}
2510
2511#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2513 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
2514 O << Indent;
2516 O << " = WIDEN-INDUCTION";
2517 printFlags(O);
2519
2520 if (auto *TI = getTruncInst())
2521 O << " (truncated to " << *TI->getType() << ")";
2522}
2523#endif
2524
2526 // The step may be defined by a recipe in the preheader (e.g. if it requires
2527 // SCEV expansion), but for the canonical induction the step is required to be
2528 // 1, which is represented as live-in.
2529 return match(getStartValue(), m_ZeroInt()) &&
2530 match(getStepValue(), m_One()) &&
2531 getScalarType() == getRegion()->getCanonicalIVType();
2532}
2533
2535 assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
2536
2537 // Fast-math-flags propagate from the original induction instruction.
2538 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
2539 if (FPBinOp)
2540 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
2541
2542 Value *Step = State.get(getStepValue(), VPLane(0));
2543 Value *Index = State.get(getOperand(1), VPLane(0));
2544 Value *DerivedIV = emitTransformedIndex(
2545 State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
2547 DerivedIV->setName(Name);
2548 State.set(this, DerivedIV, VPLane(0));
2549}
2550
2551#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2553 VPSlotTracker &SlotTracker) const {
2554 O << Indent;
2556 O << " = DERIVED-IV ";
2557 getStartValue()->printAsOperand(O, SlotTracker);
2558 O << " + ";
2559 getOperand(1)->printAsOperand(O, SlotTracker);
2560 O << " * ";
2561 getStepValue()->printAsOperand(O, SlotTracker);
2562}
2563#endif
2564
2566 // Fast-math-flags propagate from the original induction instruction.
2567 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
2568 State.Builder.setFastMathFlags(getFastMathFlags());
2569
2570 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
2571 /// variable on which to base the steps, \p Step is the size of the step.
2572
2573 Value *BaseIV = State.get(getOperand(0), VPLane(0));
2574 Value *Step = State.get(getStepValue(), VPLane(0));
2575 IRBuilderBase &Builder = State.Builder;
2576
2577 // Ensure step has the same type as that of scalar IV.
2578 Type *BaseIVTy = BaseIV->getType()->getScalarType();
2579 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
2580
2581 // We build scalar steps for both integer and floating-point induction
2582 // variables. Here, we determine the kind of arithmetic we will perform.
2585 if (BaseIVTy->isIntegerTy()) {
2586 AddOp = Instruction::Add;
2587 MulOp = Instruction::Mul;
2588 } else {
2589 AddOp = InductionOpcode;
2590 MulOp = Instruction::FMul;
2591 }
2592
2593 // Determine the number of scalars we need to generate.
2594 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
2595 // Compute the scalar steps and save the results in State.
2596
2597 unsigned StartLane = 0;
2598 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2599 if (State.Lane) {
2600 StartLane = State.Lane->getKnownLane();
2601 EndLane = StartLane + 1;
2602 }
2603 Value *StartIdx0 = getStartIndex() ? State.get(getStartIndex(), true)
2604 : Constant::getNullValue(BaseIVTy);
2605
2606 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
2607 // It is okay if the induction variable type cannot hold the lane number,
2608 // we expect truncation in this case.
2609 Constant *LaneValue =
2610 BaseIVTy->isIntegerTy()
2611 ? ConstantInt::get(BaseIVTy, Lane, /*IsSigned=*/false,
2612 /*ImplicitTrunc=*/true)
2613 : ConstantFP::get(BaseIVTy, Lane);
2614 Value *StartIdx = Builder.CreateBinOp(AddOp, StartIdx0, LaneValue);
2615 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2616 "Expected StartIdx to be folded to a constant when VF is not "
2617 "scalable");
2618 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2619 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
2620 State.set(this, Add, VPLane(Lane));
2621 }
2622}
2623
2624#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2626 VPSlotTracker &SlotTracker) const {
2627 O << Indent;
2629 O << " = SCALAR-STEPS ";
2631}
2632#endif
2633
2635 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
2637}
2638
2640 assert(State.VF.isVector() && "not widening");
2641 // Construct a vector GEP by widening the operands of the scalar GEP as
2642 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
2643 // results in a vector of pointers when at least one operand of the GEP
2644 // is vector-typed. Thus, to keep the representation compact, we only use
2645 // vector-typed operands for loop-varying values.
2646
2647 bool AllOperandsAreInvariant = all_of(operands(), [](VPValue *Op) {
2648 return Op->isDefinedOutsideLoopRegions();
2649 });
2650 if (AllOperandsAreInvariant) {
2651 // If we are vectorizing, but the GEP has only loop-invariant operands,
2652 // the GEP we build (by only using vector-typed operands for
2653 // loop-varying values) would be a scalar pointer. Thus, to ensure we
2654 // produce a vector of pointers, we need to either arbitrarily pick an
2655 // operand to broadcast, or broadcast a clone of the original GEP.
2656 // Here, we broadcast a clone of the original.
2657
2659 for (unsigned I = 0, E = getNumOperands(); I != E; I++)
2660 Ops.push_back(State.get(getOperand(I), VPLane(0)));
2661
2662 auto *NewGEP =
2663 State.Builder.CreateGEP(getSourceElementType(), Ops[0], drop_begin(Ops),
2664 "", getGEPNoWrapFlags());
2665 Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
2666 State.set(this, Splat);
2667 return;
2668 }
2669
2670 // If the GEP has at least one loop-varying operand, we are sure to
2671 // produce a vector of pointers unless VF is scalar.
2672 // The pointer operand of the new GEP. If it's loop-invariant, we
2673 // won't broadcast it.
2674 auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant());
2675
2676 // Collect all the indices for the new GEP. If any index is
2677 // loop-invariant, we won't broadcast it.
2679 for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
2680 VPValue *Operand = getOperand(I);
2681 Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1)));
2682 }
2683
2684 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
2685 // but it should be a vector, otherwise.
2686 auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices,
2687 "", getGEPNoWrapFlags());
2688 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
2689 "NewGEP is not a pointer vector");
2690 State.set(this, NewGEP);
2691}
2692
2693#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2695 VPSlotTracker &SlotTracker) const {
2696 O << Indent << "WIDEN-GEP ";
2697 O << (isPointerLoopInvariant() ? "Inv" : "Var");
2698 for (size_t I = 0; I < getNumOperands() - 1; ++I)
2699 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
2700
2701 O << " ";
2703 O << " = getelementptr";
2704 printFlags(O);
2706}
2707#endif
2708
2710 assert(!getOffset() && "Unexpected offset operand");
2711 VPBuilder Builder(this);
2712 VPlan &Plan = *getParent()->getPlan();
2713 VPValue *VFVal = getVFValue();
2714 VPTypeAnalysis TypeInfo(Plan);
2715 const DataLayout &DL = Plan.getDataLayout();
2716 Type *IndexTy = DL.getIndexType(TypeInfo.inferScalarType(this));
2717 VPValue *Stride =
2718 Plan.getConstantInt(IndexTy, getStride(), /*IsSigned=*/true);
2719 Type *VFTy = TypeInfo.inferScalarType(VFVal);
2720 VPValue *VF = Builder.createScalarZExtOrTrunc(VFVal, IndexTy, VFTy,
2722
2723 // Offset for Part0 = Offset0 = Stride * (VF - 1).
2724 VPInstruction *VFMinusOne =
2725 Builder.createSub(VF, Plan.getConstantInt(IndexTy, 1u),
2726 DebugLoc::getUnknown(), "", {true, true});
2727 VPInstruction *Offset0 =
2728 Builder.createOverflowingOp(Instruction::Mul, {VFMinusOne, Stride});
2729
2730 // Offset for PartN = Offset0 + Part * Stride * VF.
2731 VPValue *PartxStride =
2732 Plan.getConstantInt(IndexTy, Part * getStride(), /*IsSigned=*/true);
2733 VPValue *Offset = Builder.createAdd(
2734 Offset0,
2735 Builder.createOverflowingOp(Instruction::Mul, {PartxStride, VF}));
2737}
2738
2740 auto &Builder = State.Builder;
2741 assert(getOffset() && "Expected prior materialization of offset");
2742 Value *Ptr = State.get(getPointer(), true);
2743 Value *Offset = State.get(getOffset(), true);
2744 Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Offset, "",
2746 State.set(this, ResultPtr, /*IsScalar*/ true);
2747}
2748
2749#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2751 VPSlotTracker &SlotTracker) const {
2752 O << Indent;
2754 O << " = vector-end-pointer";
2755 printFlags(O);
2757}
2758#endif
2759
2761 auto &Builder = State.Builder;
2762 assert(getOffset() &&
2763 "Expected prior simplification of recipe without offset");
2764 Value *Ptr = State.get(getOperand(0), VPLane(0));
2765 Value *Offset = State.get(getOffset(), true);
2766 Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Offset, "",
2768 State.set(this, ResultPtr, /*IsScalar*/ true);
2769}
2770
2771#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2773 VPSlotTracker &SlotTracker) const {
2774 O << Indent;
2776 O << " = vector-pointer";
2777 printFlags(O);
2779}
2780#endif
2781
2783 VPCostContext &Ctx) const {
2784 // A blend will be expanded to a select VPInstruction, which will generate a
2785 // scalar select if only the first lane is used.
2787 VF = ElementCount::getFixed(1);
2788
2789 Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
2790 Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
2791 return (getNumIncomingValues() - 1) *
2792 Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
2793 CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
2794}
2795
2796#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2798 VPSlotTracker &SlotTracker) const {
2799 O << Indent << "BLEND ";
2801 O << " =";
2802 printFlags(O);
2803 if (getNumIncomingValues() == 1) {
2804 // Not a User of any mask: not really blending, this is a
2805 // single-predecessor phi.
2806 getIncomingValue(0)->printAsOperand(O, SlotTracker);
2807 } else {
2808 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
2809 if (I != 0)
2810 O << " ";
2811 getIncomingValue(I)->printAsOperand(O, SlotTracker);
2812 if (I == 0 && isNormalized())
2813 continue;
2814 O << "/";
2815 getMask(I)->printAsOperand(O, SlotTracker);
2816 }
2817 }
2818}
2819#endif
2820
2822 assert(!State.Lane && "Reduction being replicated.");
2825 "In-loop AnyOf reductions aren't currently supported");
2826 // Propagate the fast-math flags carried by the underlying instruction.
2827 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
2828 State.Builder.setFastMathFlags(getFastMathFlags());
2829 Value *NewVecOp = State.get(getVecOp());
2830 if (VPValue *Cond = getCondOp()) {
2831 Value *NewCond = State.get(Cond, State.VF.isScalar());
2832 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
2833 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
2834
2835 Value *Start = getRecurrenceIdentity(Kind, ElementTy, getFastMathFlags());
2836 if (State.VF.isVector())
2837 Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
2838
2839 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
2840 NewVecOp = Select;
2841 }
2842 Value *NewRed;
2843 Value *NextInChain;
2844 if (isOrdered()) {
2845 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2846 if (State.VF.isVector())
2847 NewRed =
2848 createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain);
2849 else
2850 NewRed = State.Builder.CreateBinOp(
2852 PrevInChain, NewVecOp);
2853 PrevInChain = NewRed;
2854 NextInChain = NewRed;
2855 } else if (isPartialReduction()) {
2856 assert((Kind == RecurKind::Add || Kind == RecurKind::FAdd) &&
2857 "Unexpected partial reduction kind");
2858 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ false);
2859 NewRed = State.Builder.CreateIntrinsic(
2860 PrevInChain->getType(),
2861 Kind == RecurKind::Add ? Intrinsic::vector_partial_reduce_add
2862 : Intrinsic::vector_partial_reduce_fadd,
2863 {PrevInChain, NewVecOp}, State.Builder.getFastMathFlags(),
2864 "partial.reduce");
2865 PrevInChain = NewRed;
2866 NextInChain = NewRed;
2867 } else {
2868 assert(isInLoop() &&
2869 "The reduction must either be ordered, partial or in-loop");
2870 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2871 NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind);
2873 NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain);
2874 else
2875 NextInChain = State.Builder.CreateBinOp(
2877 PrevInChain, NewRed);
2878 }
2879 State.set(this, NextInChain, /*IsScalar*/ !isPartialReduction());
2880}
2881
2883 assert(!State.Lane && "Reduction being replicated.");
2884
2885 auto &Builder = State.Builder;
2886 // Propagate the fast-math flags carried by the underlying instruction.
2887 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
2888 Builder.setFastMathFlags(getFastMathFlags());
2889
2891 Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
2892 Value *VecOp = State.get(getVecOp());
2893 Value *EVL = State.get(getEVL(), VPLane(0));
2894
2895 Value *Mask;
2896 if (VPValue *CondOp = getCondOp())
2897 Mask = State.get(CondOp);
2898 else
2899 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2900
2901 Value *NewRed;
2902 if (isOrdered()) {
2903 NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL);
2904 } else {
2905 NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL);
2907 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
2908 else
2909 NewRed = Builder.CreateBinOp(
2911 Prev);
2912 }
2913 State.set(this, NewRed, /*IsScalar*/ true);
2914}
2915
2917 VPCostContext &Ctx) const {
2918 RecurKind RdxKind = getRecurrenceKind();
2919 Type *ElementTy = Ctx.Types.inferScalarType(this);
2920 auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
2921 unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
2923 std::optional<FastMathFlags> OptionalFMF =
2924 ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
2925
2926 if (isPartialReduction()) {
2927 InstructionCost CondCost = 0;
2928 if (isConditional()) {
2930 auto *CondTy = cast<VectorType>(
2931 toVectorTy(Ctx.Types.inferScalarType(getCondOp()), VF));
2932 CondCost = Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VectorTy,
2933 CondTy, Pred, Ctx.CostKind);
2934 }
2935 return CondCost + Ctx.TTI.getPartialReductionCost(
2936 Opcode, ElementTy, ElementTy, ElementTy, VF,
2937 TTI::PR_None, TTI::PR_None, {}, Ctx.CostKind,
2938 OptionalFMF);
2939 }
2940
2941 // TODO: Support any-of reductions.
2942 assert(
2944 ForceTargetInstructionCost.getNumOccurrences() > 0) &&
2945 "Any-of reduction not implemented in VPlan-based cost model currently.");
2946
2947 // Note that TTI should model the cost of moving result to the scalar register
2948 // and the BinOp cost in the getMinMaxReductionCost().
2951 return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
2952 }
2953
2954 // Note that TTI should model the cost of moving result to the scalar register
2955 // and the BinOp cost in the getArithmeticReductionCost().
2956 return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
2957 Ctx.CostKind);
2958}
2959
// Bundle a chain of side-effect-free single-def recipes into one
// VPExpressionRecipe so they can be costed and handled as a single composed
// operation (e.g. an extended or mul-accumulate reduction). The bundled
// recipes are detached from their parent block, and their operands that are
// defined outside the bundle are rerouted through freshly created placeholder
// VPValues, while the original external operands become operands of the
// VPExpressionRecipe itself. The statement order below matters: cloning for
// external users must happen before recipes are detached, and operand
// internalization must happen before the placeholder substitution.
2960VPExpressionRecipe::VPExpressionRecipe(
 2961 ExpressionTypes ExpressionType,
 2962 ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
 2963 : VPSingleDefRecipe(VPRecipeBase::VPExpressionSC, {}, {}),
 2964 ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) {
 2965 assert(!ExpressionRecipes.empty() && "Nothing to combine?");
 2966 assert(
 2967 none_of(ExpressionRecipes,
 2968 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
 2969 "expression cannot contain recipes with side-effects");
 2970
 2971 // Maintain a copy of the expression recipes as a set of users.
 2972 SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
 2973 for (auto *R : ExpressionRecipes)
 2974 ExpressionRecipesAsSetOfUsers.insert(R);
 2975
 2976 // Recipes in the expression, except the last one, must only be used by
 2977 // (other) recipes inside the expression. If there are other users, external
 2978 // to the expression, use a clone of the recipe for external users.
 2979 for (VPSingleDefRecipe *R : reverse(ExpressionRecipes)) {
 2980 if (R != ExpressionRecipes.back() &&
 2981 any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
 2982 return !ExpressionRecipesAsSetOfUsers.contains(U);
 2983 })) {
 2984 // There are users outside of the expression. Clone the recipe and use the
 2985 // clone those external users.
 2986 VPSingleDefRecipe *CopyForExtUsers = R->clone();
 2987 R->replaceUsesWithIf(CopyForExtUsers, [&ExpressionRecipesAsSetOfUsers](
 2988 VPUser &U, unsigned) {
 2989 return !ExpressionRecipesAsSetOfUsers.contains(&U);
 2990 });
 2991 CopyForExtUsers->insertBefore(R);
 2992 }
 // Detach the bundled recipe from its block; presumably it is re-inserted
 // later (e.g. when the expression is decomposed) -- confirm with the
 // decompose() implementation.
 2993 if (R->getParent())
 2994 R->removeFromParent();
 2995 }
 2996
 2997 // Internalize all external operands to the expression recipes. To do so,
 2998 // create new temporary VPValues for all operands defined by a recipe outside
 2999 // the expression. The original operands are added as operands of the
 3000 // VPExpressionRecipe itself.
 3001 for (auto *R : ExpressionRecipes) {
 3002 for (const auto &[Idx, Op] : enumerate(R->operands())) {
 3003 auto *Def = Op->getDefiningRecipe();
 3004 if (Def && ExpressionRecipesAsSetOfUsers.contains(Def))
 3005 continue;
 3006 addOperand(Op);
 3007 LiveInPlaceholders.push_back(new VPSymbolicValue());
 3008 }
 3009 }
 3010
 3011 // Replace each external operand with the first one created for it in
 3012 // LiveInPlaceholders.
 // Note: operands() and LiveInPlaceholders are parallel by construction of
 // the loop above, so zip pairs each external operand with its placeholder.
 3013 for (auto *R : ExpressionRecipes)
 3014 for (auto const &[LiveIn, Tmp] : zip(operands(), LiveInPlaceholders))
 3015 R->replaceUsesOfWith(LiveIn, Tmp);
 3016}
3017
3019 for (auto *R : ExpressionRecipes)
3020 // Since the list could contain duplicates, make sure the recipe hasn't
3021 // already been inserted.
3022 if (!R->getParent())
3023 R->insertBefore(this);
3024
3025 for (const auto &[Idx, Op] : enumerate(operands()))
3026 LiveInPlaceholders[Idx]->replaceAllUsesWith(Op);
3027
3028 replaceAllUsesWith(ExpressionRecipes.back());
3029 ExpressionRecipes.clear();
3030}
3031
3033 VPCostContext &Ctx) const {
3034 Type *RedTy = Ctx.Types.inferScalarType(this);
3035 auto *SrcVecTy = cast<VectorType>(
3036 toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
3037 unsigned Opcode = RecurrenceDescriptor::getOpcode(
3038 cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
3039 switch (ExpressionType) {
3040 case ExpressionTypes::ExtendedReduction: {
3041 unsigned Opcode = RecurrenceDescriptor::getOpcode(
3042 cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
3043 auto *ExtR = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3044 auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());
3045
3046 if (RedR->isPartialReduction())
3047 return Ctx.TTI.getPartialReductionCost(
3048 Opcode, Ctx.Types.inferScalarType(getOperand(0)), nullptr, RedTy, VF,
3050 TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind,
3051 RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
3052 : std::nullopt);
3053 else if (!RedTy->isFloatingPointTy())
3054 // TTI::getExtendedReductionCost only supports integer types.
3055 return Ctx.TTI.getExtendedReductionCost(
3056 Opcode, ExtR->getOpcode() == Instruction::ZExt, RedTy, SrcVecTy,
3057 std::nullopt, Ctx.CostKind);
3058 else
3060 }
3061 case ExpressionTypes::MulAccReduction:
3062 return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
3063 Ctx.CostKind);
3064
3065 case ExpressionTypes::ExtNegatedMulAccReduction:
3066 assert(Opcode == Instruction::Add && "Unexpected opcode");
3067 Opcode = Instruction::Sub;
3068 [[fallthrough]];
3069 case ExpressionTypes::ExtMulAccReduction: {
3070 auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());
3071 if (RedR->isPartialReduction()) {
3072 auto *Ext0R = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3073 auto *Ext1R = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3074 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
3075 return Ctx.TTI.getPartialReductionCost(
3076 Opcode, Ctx.Types.inferScalarType(getOperand(0)),
3077 Ctx.Types.inferScalarType(getOperand(1)), RedTy, VF,
3079 Ext0R->getOpcode()),
3081 Ext1R->getOpcode()),
3082 Mul->getOpcode(), Ctx.CostKind,
3083 RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
3084 : std::nullopt);
3085 }
3086 return Ctx.TTI.getMulAccReductionCost(
3087 cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
3088 Instruction::ZExt,
3089 Opcode, RedTy, SrcVecTy, Ctx.CostKind);
3090 }
3091 }
3092 llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
3093}
3094
3096 return any_of(ExpressionRecipes, [](VPSingleDefRecipe *R) {
3097 return R->mayReadFromMemory() || R->mayWriteToMemory();
3098 });
3099}
3100
3102 assert(
3103 none_of(ExpressionRecipes,
3104 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
3105 "expression cannot contain recipes with side-effects");
3106 return false;
3107}
3108
3110 // Cannot use vputils::isSingleScalar(), because all external operands
3111 // of the expression will be live-ins while bundled.
3112 auto *RR = dyn_cast<VPReductionRecipe>(ExpressionRecipes.back());
3113 return RR && !RR->isPartialReduction();
3114}
3115
3116#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3117
3119 VPSlotTracker &SlotTracker) const {
3120 O << Indent << "EXPRESSION ";
3122 O << " = ";
3123 auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
3124 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3125
3126 switch (ExpressionType) {
3127 case ExpressionTypes::ExtendedReduction: {
3129 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3130 O << Instruction::getOpcodeName(Opcode) << " (";
3132 Red->printFlags(O);
3133
3134 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3135 O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3136 << *Ext0->getResultType();
3137 if (Red->isConditional()) {
3138 O << ", ";
3139 Red->getCondOp()->printAsOperand(O, SlotTracker);
3140 }
3141 O << ")";
3142 break;
3143 }
3144 case ExpressionTypes::ExtNegatedMulAccReduction: {
3146 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3148 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
3149 << " (sub (0, mul";
3150 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
3151 Mul->printFlags(O);
3152 O << "(";
3154 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3155 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3156 << *Ext0->getResultType() << "), (";
3158 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3159 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3160 << *Ext1->getResultType() << ")";
3161 if (Red->isConditional()) {
3162 O << ", ";
3163 Red->getCondOp()->printAsOperand(O, SlotTracker);
3164 }
3165 O << "))";
3166 break;
3167 }
3168 case ExpressionTypes::MulAccReduction:
3169 case ExpressionTypes::ExtMulAccReduction: {
3171 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3173 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
3174 << " (";
3175 O << "mul";
3176 bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
3177 auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
3178 : ExpressionRecipes[0]);
3179 Mul->printFlags(O);
3180 if (IsExtended)
3181 O << "(";
3183 if (IsExtended) {
3184 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3185 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3186 << *Ext0->getResultType() << "), (";
3187 } else {
3188 O << ", ";
3189 }
3191 if (IsExtended) {
3192 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3193 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3194 << *Ext1->getResultType() << ")";
3195 }
3196 if (Red->isConditional()) {
3197 O << ", ";
3198 Red->getCondOp()->printAsOperand(O, SlotTracker);
3199 }
3200 O << ")";
3201 break;
3202 }
3203 }
3204}
3205
                               VPSlotTracker &SlotTracker) const {
  // Debug dump: partial reductions print a distinct header so they can be
  // told apart from full reductions in VPlan dumps.
  if (isPartialReduction())
    O << Indent << "PARTIAL-REDUCE ";
  else
    O << Indent << "REDUCE ";
  O << " = ";
  O << " +";
  printFlags(O);
  // Emits " reduce." followed by the operand list in parentheses.
  O << " reduce."
    << " (";
  // Conditional reductions additionally print their condition operand.
  if (isConditional()) {
    O << ", ";
  }
  O << ")";
}
3228
                                  VPSlotTracker &SlotTracker) const {
  // Debug dump for the EVL (explicit-vector-length) variant; uses the
  // "vp.reduce." prefix to distinguish it from the plain reduction recipe.
  O << Indent << "REDUCE ";
  O << " = ";
  O << " +";
  printFlags(O);
  O << " vp.reduce."
    << " (";
  O << ", ";
  // Conditional reductions additionally print their condition operand.
  if (isConditional()) {
    O << ", ";
  }
  O << ")";
}
3250
3251#endif
3252
3253/// A helper function to scalarize a single Instruction in the innermost loop.
3254/// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue
3255/// operands from \p RepRecipe instead of \p Instr's operands.
3256static void scalarizeInstruction(const Instruction *Instr,
3257 VPReplicateRecipe *RepRecipe,
3258 const VPLane &Lane, VPTransformState &State) {
3259 assert((!Instr->getType()->isAggregateType() ||
3260 canVectorizeTy(Instr->getType())) &&
3261 "Expected vectorizable or non-aggregate type.");
3262
3263 // Does this instruction return a value ?
3264 bool IsVoidRetTy = Instr->getType()->isVoidTy();
3265
3266 Instruction *Cloned = Instr->clone();
3267 if (!IsVoidRetTy) {
3268 Cloned->setName(Instr->getName() + ".cloned");
3269 Type *ResultTy = State.TypeAnalysis.inferScalarType(RepRecipe);
3270 // The operands of the replicate recipe may have been narrowed, resulting in
3271 // a narrower result type. Update the type of the cloned instruction to the
3272 // correct type.
3273 if (ResultTy != Cloned->getType())
3274 Cloned->mutateType(ResultTy);
3275 }
3276
3277 RepRecipe->applyFlags(*Cloned);
3278 RepRecipe->applyMetadata(*Cloned);
3279
3280 if (RepRecipe->hasPredicate())
3281 cast<CmpInst>(Cloned)->setPredicate(RepRecipe->getPredicate());
3282
3283 if (auto DL = RepRecipe->getDebugLoc())
3284 State.setDebugLocFrom(DL);
3285
3286 // Replace the operands of the cloned instructions with their scalar
3287 // equivalents in the new loop.
3288 for (const auto &I : enumerate(RepRecipe->operands())) {
3289 auto InputLane = Lane;
3290 VPValue *Operand = I.value();
3291 if (vputils::isSingleScalar(Operand))
3292 InputLane = VPLane::getFirstLane();
3293 Cloned->setOperand(I.index(), State.get(Operand, InputLane));
3294 }
3295
3296 // Place the cloned scalar in the new loop.
3297 State.Builder.Insert(Cloned);
3298
3299 State.set(RepRecipe, Cloned, Lane);
3300
3301 // If we just cloned a new assumption, add it the assumption cache.
3302 if (auto *II = dyn_cast<AssumeInst>(Cloned))
3303 State.AC->registerAssumption(II);
3304
3305 assert(
3306 (RepRecipe->getRegion() ||
3307 !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
3308 all_of(RepRecipe->operands(),
3309 [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
3310 "Expected a recipe is either within a region or all of its operands "
3311 "are defined outside the vectorized region.");
3312}
3313
3316
  if (!State.Lane) {
    // No specific lane requested: only legal for single-scalar recipes, since
    // per-lane copies must already have been created by unrolling otherwise.
    assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
                             "must have already been unrolled");
    scalarizeInstruction(UI, this, VPLane(0), State);
    return;
  }

  assert((State.VF.isScalar() || !isSingleScalar()) &&
         "uniform recipe shouldn't be predicated");
  assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
  scalarizeInstruction(UI, this, *State.Lane, State);
  // Insert scalar instance packing it into a vector. On the first lane start
  // from a poison vector, otherwise continue filling the accumulated one.
  if (State.VF.isVector() && shouldPack()) {
    Value *WideValue =
        State.Lane->isFirstLane()
            ? PoisonValue::get(toVectorizedTy(UI->getType(), State.VF))
            : State.get(this);
    State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
                                                        *State.Lane));
  }
}
3338
  // Find if the recipe is used by a widened recipe via an intervening
  // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
  // Returns true if any user is such a phi whose values are not used purely
  // as scalars.
  return any_of(users(), [](const VPUser *U) {
    if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
      return !vputils::onlyScalarValuesUsed(PredR);
    return false;
  });
}
3348
3349/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
3350/// which the legacy cost model computes a SCEV expression when computing the
3351/// address cost. Computing SCEVs for VPValues is incomplete and returns
3352/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In
3353/// those cases we fall back to the legacy cost model. Otherwise return nullptr.
static const SCEV *getAddressAccessSCEV(const VPValue *Ptr,
                                        const Loop *L) {
  const SCEV *Addr = vputils::getSCEVExprForVPValue(Ptr, PSE, L);
  // Propagate SCEVCouldNotCompute so callers can fall back to the legacy
  // cost model.
  if (isa<SCEVCouldNotCompute>(Addr))
    return Addr;

  // Return the SCEV only if it is one the legacy model would use for address
  // costing; otherwise signal "no SCEV" with nullptr.
  return vputils::isAddressSCEVForCost(Addr, *PSE.getSE(), L) ? Addr : nullptr;
}
3363
3364/// Returns true if \p V is used as part of the address of another load or
3365/// store.
static bool isUsedByLoadStoreAddress(const VPUser *V) {
  // Worklist-based walk over the transitive users of V, looking for a memory
  // recipe that consumes a visited value as its address.
  SmallVector<const VPUser *> WorkList = {V};

  while (!WorkList.empty()) {
    auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
    if (!Cur || !Seen.insert(Cur).second)
      continue;

    auto *Blend = dyn_cast<VPBlendRecipe>(Cur);
    // Skip blends that use V only through a compare by checking if any incoming
    // value was already visited.
    if (Blend && none_of(seq<unsigned>(0, Blend->getNumIncomingValues()),
                         [&](unsigned I) {
                           return Seen.contains(
                               Blend->getIncomingValue(I)->getDefiningRecipe());
                         }))
      continue;

    for (VPUser *U : Cur->users()) {
      // Interleave groups: Cur is the group's address operand.
      if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
        if (InterleaveR->getAddr() == Cur)
          return true;
      // Replicated loads take the address as operand 0, stores as operand 1.
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
        if (RepR->getOpcode() == Instruction::Load &&
            RepR->getOperand(0) == Cur)
          return true;
        if (RepR->getOpcode() == Instruction::Store &&
            RepR->getOperand(1) == Cur)
          return true;
      }
      // Widened consecutive memory accesses using Cur as their address.
      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
        if (MemR->getAddr() == Cur && MemR->isConsecutive())
          return true;
      }
    }

    // The legacy cost model only supports scalarization loads/stores with phi
    // addresses, if the phi is directly used as load/store address. Don't
    // traverse further for Blends.
    if (Blend)
      continue;

    append_range(WorkList, Cur->users());
  }
  return false;
}
3413
3414/// Return true if \p R is a predicated load/store with a loop-invariant address
3415/// only masked by the header mask.
                                                  const SCEV *PtrSCEV,
                                                  VPCostContext &Ctx) {
  // Must be inside a replicate region, with a known, loop-invariant pointer
  // SCEV; otherwise it cannot be a uniform mem op.
  const VPRegionBlock *ParentRegion = R.getRegion();
  if (!ParentRegion || !ParentRegion->isReplicator() || !PtrSCEV ||
      !Ctx.PSE.getSE()->isLoopInvariant(PtrSCEV, Ctx.L))
    return false;
  auto *BOM =
  // Only masked by the header mask, i.e. predication solely due to tail
  // folding.
  return vputils::isHeaderMask(BOM->getOperand(0), *ParentRegion->getPlan());
}
3427
                                               VPCostContext &Ctx) const {
  // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
  // transform, avoid computing their cost multiple times for now.
  Ctx.SkipCostComputation.insert(UI);

  if (VF.isScalable() && !isSingleScalar())

  // Cost the replicated instruction by underlying opcode; anything not
  // handled below falls through to the legacy cost model at the bottom.
  switch (UI->getOpcode()) {
  case Instruction::Alloca:
    if (VF.isScalable())
    return Ctx.TTI.getArithmeticInstrCost(
        Instruction::Mul, Ctx.Types.inferScalarType(this), Ctx.CostKind);
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Call: {
    auto *CalledFn =

    for (const VPValue *ArgOp : ArgOps)
      Tys.push_back(Ctx.Types.inferScalarType(ArgOp));

    if (CalledFn->isIntrinsic())
      // Various pseudo-intrinsics with costs of 0 are scalarized instead of
      // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early.
      switch (CalledFn->getIntrinsicID()) {
      case Intrinsic::assume:
      case Intrinsic::lifetime_end:
      case Intrinsic::lifetime_start:
      case Intrinsic::sideeffect:
      case Intrinsic::pseudoprobe:
      case Intrinsic::experimental_noalias_scope_decl: {
        assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
                                    ElementCount::getFixed(1), Ctx) == 0 &&
               "scalarizing intrinsic should be free");
        return InstructionCost(0);
      }
      default:
        break;
      }

    Type *ResultTy = Ctx.Types.inferScalarType(this);
    InstructionCost ScalarCallCost =
        Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
    if (isSingleScalar()) {
      // A single scalar call may still be cheaper as an intrinsic; take the
      // minimum of the two costs.
      if (CalledFn->isIntrinsic())
        ScalarCallCost = std::min(
            ScalarCallCost,
            getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
                                 ElementCount::getFixed(1), Ctx));
      return ScalarCallCost;
    }

    // Replicated call: one scalar call per lane plus the cost of moving the
    // arguments/result between scalar and vector form.
    return ScalarCallCost * VF.getFixedValue() +
           Ctx.getScalarizationOverhead(ResultTy, ArgOps, VF);
  }
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::FAdd:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
  case Instruction::FCmp:
               Ctx) *
           (isSingleScalar() ? 1 : VF.getFixedValue());
  case Instruction::SDiv:
  case Instruction::UDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    InstructionCost ScalarCost =
    if (isSingleScalar())
      return ScalarCost;

    // If any of the operands is from a different replicate region and has its
    // cost skipped, it may have been forced to scalar. Fall back to legacy cost
    // model to avoid cost mis-match.
    if (any_of(operands(), [&Ctx, VF](VPValue *Op) {
          auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Op);
          if (!PredR)
            return false;
          return Ctx.skipCostComputation(
                  PredR->getOperand(0)->getUnderlyingValue()),
              VF.isVector());
        }))
      break;

    ScalarCost = ScalarCost * VF.getFixedValue() +
                 Ctx.getScalarizationOverhead(Ctx.Types.inferScalarType(this),
                                              to_vector(operands()), VF);
    // If the recipe is not predicated (i.e. not in a replicate region), return
    // the scalar cost. Otherwise handle predicated cost.
    if (!getRegion()->isReplicator())
      return ScalarCost;

    // Account for the phi nodes that we will create.
    ScalarCost += VF.getFixedValue() *
                  Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
    // Scale the cost by the probability of executing the predicated blocks.
    // This assumes the predicated block for each vector lane is equally
    // likely.
    ScalarCost /= Ctx.getPredBlockCostDivisor(UI->getParent());
    return ScalarCost;
  }
  case Instruction::Load:
  case Instruction::Store: {
    // For loads the pointer is operand 0, for stores operand 1.
    bool IsLoad = UI->getOpcode() == Instruction::Load;
    const VPValue *PtrOp = getOperand(!IsLoad);
    const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.PSE, Ctx.L);
      break;

    Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
    Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
    const Align Alignment = getLoadStoreAlignment(UI);
    unsigned AS = cast<PointerType>(ScalarPtrTy)->getAddressSpace();
    bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
    bool UsedByLoadStoreAddress =
        !PreferVectorizedAddressing && isUsedByLoadStoreAddress(this);
    InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
        UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo,
        UsedByLoadStoreAddress ? UI : nullptr);

    // Check if this is a predicated load/store with a loop-invariant address
    // only masked by the header mask. If so, return the uniform mem op cost.
    if (isPredicatedUniformMemOpAfterTailFolding(*this, PtrSCEV, Ctx)) {
      InstructionCost UniformCost =
          ScalarMemOpCost +
          Ctx.TTI.getAddressComputationCost(ScalarPtrTy, /*SE=*/nullptr,
                                            /*Ptr=*/nullptr, Ctx.CostKind);
      auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
      // A uniform load is a scalar load plus a broadcast of the result.
      if (IsLoad) {
        return UniformCost +
               Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast,
                                      VectorTy, VectorTy, {}, Ctx.CostKind);
      }

      // A uniform store of a value defined inside the loop needs an extract
      // of the last lane.
      VPValue *StoredVal = getOperand(0);
      if (!StoredVal->isDefinedOutsideLoopRegions())
        UniformCost += Ctx.TTI.getIndexedVectorInstrCostFromEnd(
            Instruction::ExtractElement, VectorTy, Ctx.CostKind, 0);
      return UniformCost;
    }

    Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
    InstructionCost ScalarCost =
        ScalarMemOpCost +
        Ctx.TTI.getAddressComputationCost(
            PtrTy, UsedByLoadStoreAddress ? nullptr : Ctx.PSE.getSE(), PtrSCEV,
            Ctx.CostKind);
    if (isSingleScalar())
      return ScalarCost;

    SmallVector<const VPValue *> OpsToScalarize;
    Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
    // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
    // don't assign scalarization overhead in general, if the target prefers
    // vectorized addressing or the loaded value is used as part of an address
    // of another load or store.
    if (!UsedByLoadStoreAddress) {
      bool EfficientVectorLoadStore =
          Ctx.TTI.supportsEfficientVectorElementLoadStore();
      if (!(IsLoad && !PreferVectorizedAddressing) &&
          !(!IsLoad && EfficientVectorLoadStore))
        append_range(OpsToScalarize, operands());

      if (!EfficientVectorLoadStore)
        ResultTy = Ctx.Types.inferScalarType(this);
    }

        (ScalarCost * VF.getFixedValue()) +
        Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, VIC, true);

    const VPRegionBlock *ParentRegion = getRegion();
    if (ParentRegion && ParentRegion->isReplicator()) {
      if (!PtrSCEV)
        break;
      // Scale the cost by the probability of executing the predicated block,
      // then add the predication overhead: the conditional branch plus the
      // extraction of the i1 mask bits.
      Cost /= Ctx.getPredBlockCostDivisor(UI->getParent());
      Cost += Ctx.TTI.getCFInstrCost(Instruction::CondBr, Ctx.CostKind);

      auto *VecI1Ty = VectorType::get(
          IntegerType::getInt1Ty(Ctx.L->getHeader()->getContext()), VF);
      Cost += Ctx.TTI.getScalarizationOverhead(
          VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
          /*Insert=*/false, /*Extract=*/true, Ctx.CostKind);

      if (Ctx.useEmulatedMaskMemRefHack(this, VF)) {
        // Artificially setting to a high enough value to practically disable
        // vectorization with such operations.
        return 3000000;
      }
    }
    return Cost;
  }
  case Instruction::SExt:
  case Instruction::ZExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::PtrToAddr:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::Select:
  case Instruction::AddrSpaceCast: {
               Ctx) *
           (isSingleScalar() ? 1 : VF.getFixedValue());
  }
  case Instruction::ExtractValue:
  case Instruction::InsertValue:
    return Ctx.TTI.getInsertExtractValueCost(getOpcode(), Ctx.CostKind);
  }

  // Anything not handled above: defer to the legacy cost model.
  return Ctx.getLegacyCost(UI, VF);
}
3671
3672#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
                              VPSlotTracker &SlotTracker) const {
  // CLONE = single-scalar replication, REPLICATE = per-lane replication.
  O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");

  if (!getUnderlyingInstr()->getType()->isVoidTy()) {
    O << " = ";
  }
  // Calls print as "call @callee(args...)"; other instructions print via the
  // generic path below.
  if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
    O << "call";
    printFlags(O);
    O << "@" << CB->getCalledFunction()->getName() << "(";
        O, [&O, &SlotTracker](VPValue *Op) {
          Op->printAsOperand(O, SlotTracker);
        });
    O << ")";
  } else {
    printFlags(O);
  }

  // Mark recipes whose scalar results get packed back into a vector.
  if (shouldPack())
    O << " (S->V)";
}
3699#endif
3700
  assert(State.Lane && "Branch on Mask works only on single instance.");

  // Extract the mask bit for the current lane to use as branch condition.
  VPValue *BlockInMask = getOperand(0);
  Value *ConditionBit = State.get(BlockInMask, *State.Lane);

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto CondBr =
      State.Builder.CreateCondBr(ConditionBit, State.CFG.PrevBB, nullptr);
  // Both successors are wired up later; PrevBB above is only a placeholder.
  CondBr->setSuccessor(0, nullptr);
  CurrentTerminator->eraseFromParent();
}
3717
                                                  VPCostContext &Ctx) const {
  // The legacy cost model doesn't assign costs to branches for individual
  // replicate regions. Match the current behavior in the VPlan cost model for
  // now, i.e. treat the branch itself as free.
  return 0;
}
3725
  assert(State.Lane && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Lane));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  if (State.hasVectorValue(getOperand(0))) {
    auto *VecI = cast<Instruction>(State.get(getOperand(0)));
           "Packed operands must generate an insertelement or insertvalue");

    // If VectorI is a struct, it will be a sequence like:
    // %1 = insertvalue %unmodified, %x, 0
    // %2 = insertvalue %1, %y, 1
    // %VectorI = insertvalue %2, %z, 2
    // To get the unmodified vector we need to look through the chain.
    if (auto *StructTy = dyn_cast<StructType>(VecI->getType()))
      for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++)
        VecI = cast<InsertValueInst>(VecI->getOperand(0));

    PHINode *VPhi = State.Builder.CreatePHI(VecI->getType(), 2);
    VPhi->addIncoming(VecI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(VecI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this))
      State.reset(this, VPhi);
    else
      State.set(this, VPhi);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi);
  } else {
    // Scalar phi: skip redundant lanes if only the first lane is used.
    if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane())
      return;

    Type *PredInstType = State.TypeAnalysis.inferScalarType(getOperand(0));
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Lane))
      State.reset(this, Phi, *State.Lane);
    else
      State.set(this, Phi, *State.Lane);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Lane);
  }
}
3784
3785#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
                                 VPSlotTracker &SlotTracker) const {
  // Debug dump for the predicated-instruction phi recipe.
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  O << " = ";
}
3793#endif
3794
                                                VPCostContext &Ctx) const {
  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
                    ->getAddressSpace();
  // Load-flavored recipes cost as loads, everything else as stores.
  unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
                        ? Instruction::Load
                        : Instruction::Store;

  if (!Consecutive) {
    // TODO: Using the original IR may not be accurate.
    // Currently, ARM will use the underlying IR to calculate gather/scatter
    // instruction cost.
    [[maybe_unused]] auto IsReverseMask = [this]() {
      VPValue *Mask = getMask();
      if (!Mask)
        return false;


      return match(Mask, m_Reverse(m_VPValue()));
    };
    assert(!IsReverseMask() &&
           "Inconsecutive memory access should not have reverse order");
    Type *PtrTy = Ptr->getType();

    // If the address value is uniform across all lanes, then the address can be
    // calculated with scalar type and broadcast.
      PtrTy = toVectorTy(PtrTy, VF);

    // Non-consecutive accesses cost as gather/scatter intrinsics; the EVL
    // variants use the vp_* flavors.
    unsigned IID = isa<VPWidenLoadRecipe>(this) ? Intrinsic::masked_gather
                   : isa<VPWidenStoreRecipe>(this) ? Intrinsic::masked_scatter
                   : isa<VPWidenLoadEVLRecipe>(this) ? Intrinsic::vp_gather
                                                     : Intrinsic::vp_scatter;
    return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
                                             Ctx.CostKind) +
           Ctx.TTI.getMemIntrinsicInstrCost(
                   &Ingredient),
               Ctx.CostKind);
  }

  if (IsMasked) {
    unsigned IID = isa<VPWidenLoadRecipe>(this) ? Intrinsic::masked_load
                                                : Intrinsic::masked_store;
    Cost += Ctx.TTI.getMemIntrinsicInstrCost(
        MemIntrinsicCostAttributes(IID, Ty, Alignment, AS), Ctx.CostKind);
  } else {
    // Unmasked consecutive access: plain memory-op cost, with operand info
    // for the stored value (operand 1) where applicable.
    TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
                                       : getOperand(1));
    Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
                                    OpInfo, &Ingredient);
  }
  return Cost;
}
3855
  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
  // Non-consecutive accesses are emitted as gathers.
  bool CreateGather = !isConsecutive();

  auto &Builder = State.Builder;
  Value *Mask = nullptr;
  if (auto *VPMask = getMask())
    Mask = State.get(VPMask);

  // Gathers take a vector of pointers; consecutive loads a scalar pointer.
  Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
  Value *NewLI;
  if (CreateGather) {
    NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
                                       "wide.masked.gather");
  } else if (Mask) {
    NewLI =
        Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
                                 PoisonValue::get(DataTy), "wide.masked.load");
  } else {
    NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
  }
  State.set(this, NewLI);
}
3881
3882#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
                               VPSlotTracker &SlotTracker) const {
  // Debug dump for the widened load recipe.
  O << Indent << "WIDEN ";
  O << " = load ";
}
3890#endif
3891
  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
  // Non-consecutive accesses are emitted as vp.gather.
  bool CreateGather = !isConsecutive();

  auto &Builder = State.Builder;
  CallInst *NewLI;
  // The explicit vector length is always taken from lane 0.
  Value *EVL = State.get(getEVL(), VPLane(0));
  Value *Addr = State.get(getAddr(), !CreateGather);
  Value *Mask = nullptr;
  if (VPValue *VPMask = getMask())
    Mask = State.get(VPMask);
  else
    // vp intrinsics require a mask; use an all-true splat when unmasked.
    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());

  if (CreateGather) {
    NewLI =
        Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
                                nullptr, "wide.masked.gather");
  } else {
    NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
                                    {Addr, Mask, EVL}, nullptr, "vp.op.load");
  }
  NewLI->addParamAttr(
  applyMetadata(*NewLI);
  Instruction *Res = NewLI;
  State.set(this, Res);
}
3921
                                                  VPCostContext &Ctx) const {
  // Non-consecutive or masked accesses are costed by the base class.
  if (!Consecutive || IsMasked)
    return VPWidenMemoryRecipe::computeCost(VF, Ctx);

  // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost()
  // here because the EVL recipes using EVL to replace the tail mask. But in the
  // legacy model, it will always calculate the cost of mask.
  // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we
  // don't need to compare to the legacy cost model.
  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
                    ->getAddressSpace();
  return Ctx.TTI.getMemIntrinsicInstrCost(
      MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS),
      Ctx.CostKind);
}
3939
3940#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
                                  VPSlotTracker &SlotTracker) const {
  // Debug dump for the EVL-based widened load recipe.
  O << Indent << "WIDEN ";
  O << " = vp.load ";
}
3948#endif
3949
  VPValue *StoredVPValue = getStoredValue();
  // Non-consecutive accesses are emitted as scatters.
  bool CreateScatter = !isConsecutive();

  auto &Builder = State.Builder;

  Value *Mask = nullptr;
  if (auto *VPMask = getMask())
    Mask = State.get(VPMask);

  Value *StoredVal = State.get(StoredVPValue);
  // Scatters take a vector of pointers; consecutive stores a scalar pointer.
  Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
  Instruction *NewSI = nullptr;
  if (CreateScatter)
    NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
  else if (Mask)
    NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
  else
    NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
  applyMetadata(*NewSI);
}
3971
3972#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
                                VPSlotTracker &SlotTracker) const {
  // Debug dump for the widened store recipe.
  O << Indent << "WIDEN store ";
}
3978#endif
3979
  VPValue *StoredValue = getStoredValue();
  // Non-consecutive accesses are emitted as vp.scatter.
  bool CreateScatter = !isConsecutive();

  auto &Builder = State.Builder;

  CallInst *NewSI = nullptr;
  Value *StoredVal = State.get(StoredValue);
  // The explicit vector length is always taken from lane 0.
  Value *EVL = State.get(getEVL(), VPLane(0));
  Value *Mask = nullptr;
  if (VPValue *VPMask = getMask())
    Mask = State.get(VPMask);
  else
    // vp intrinsics require a mask; use an all-true splat when unmasked.
    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());

  Value *Addr = State.get(getAddr(), !CreateScatter);
  if (CreateScatter) {
    NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
                                    Intrinsic::vp_scatter,
                                    {StoredVal, Addr, Mask, EVL});
  } else {
    NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
                                    Intrinsic::vp_store,
                                    {StoredVal, Addr, Mask, EVL});
  }
  NewSI->addParamAttr(
  applyMetadata(*NewSI);
}
4009
                                                   VPCostContext &Ctx) const {
  // Non-consecutive or masked accesses are costed by the base class.
  if (!Consecutive || IsMasked)
    return VPWidenMemoryRecipe::computeCost(VF, Ctx);

  // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost()
  // here because the EVL recipes using EVL to replace the tail mask. But in the
  // legacy model, it will always calculate the cost of mask.
  // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we
  // don't need to compare to the legacy cost model.
  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
                    ->getAddressSpace();
  return Ctx.TTI.getMemIntrinsicInstrCost(
      MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS),
      Ctx.CostKind);
}
4027
4028#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
                                   VPSlotTracker &SlotTracker) const {
  // Debug dump for the EVL-based widened store recipe.
  O << Indent << "WIDEN vp.store ";
}
4034#endif
4035
                                      VectorType *DstVTy, const DataLayout &DL) {
  // Verify that V is a vector type with same number of elements as DstVTy.
  auto VF = DstVTy->getElementCount();
  auto *SrcVecTy = cast<VectorType>(V->getType());
  assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");

  // Do a direct cast if element types are castable.
  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
    return Builder.CreateBitOrPointerCast(V, DstVTy);
  }
  // V cannot be directly casted to desired vector type.
  // May happen when V is a floating point vector but DstVTy is a vector of
  // pointers or vice-versa. Handle this using a two-step bitcast using an
  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
         "Only one type should be a pointer type");
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
         "Only one type should be a floating point type");
  // Same-width integer vector acts as the bridge between the two layouts.
  Type *IntTy =
      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
  auto *VecIntTy = VectorType::get(IntTy, VF);
  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
}
4065
4066/// Return a vector containing interleaved elements from multiple
4067/// smaller input vectors.
                                const Twine &Name) {
  unsigned Factor = Vals.size();
  assert(Factor > 1 && "Tried to interleave invalid number of vectors");

  // All inputs must share the same vector type.
  VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
#ifndef NDEBUG
  for (Value *Val : Vals)
    assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
#endif

  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
  // must use intrinsics to interleave.
  if (VecTy->isScalableTy()) {
    assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
    return Builder.CreateVectorInterleave(Vals, Name);
  }

  // Fixed length. Start by concatenating all vectors into a wide vector.
  Value *WideVec = concatenateVectors(Builder, Vals);

  // Interleave the elements into the wide vector.
  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
  return Builder.CreateShuffleVector(
      WideVec, createInterleaveMask(NumElts, Factor), Name);
}
4094
4095// Try to vectorize the interleave group that \p Instr belongs to.
4096//
4097// E.g. Translate following interleaved load group (factor = 3):
4098// for (i = 0; i < N; i+=3) {
4099// R = Pic[i]; // Member of index 0
4100// G = Pic[i+1]; // Member of index 1
4101// B = Pic[i+2]; // Member of index 2
4102// ... // do something to R, G, B
4103// }
4104// To:
4105// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
4106// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
4107// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
4108// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
4109//
4110// Or translate following interleaved store group (factor = 3):
4111// for (i = 0; i < N; i+=3) {
4112// ... do something to R, G, B
4113// Pic[i] = R; // Member of index 0
4114// Pic[i+1] = G; // Member of index 1
4115// Pic[i+2] = B; // Member of index 2
4116// }
4117// To:
4118// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
4119// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
4120// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
4121// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
4122// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
4124 assert(!State.Lane && "Interleave group being replicated.");
4125 assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
4126 "Masking gaps for scalable vectors is not yet supported.");
4128 Instruction *Instr = Group->getInsertPos();
4129
4130 // Prepare for the vector type of the interleaved load/store.
4131 Type *ScalarTy = getLoadStoreType(Instr);
4132 unsigned InterleaveFactor = Group->getFactor();
4133 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
4134
4135 VPValue *BlockInMask = getMask();
4136 VPValue *Addr = getAddr();
4137 Value *ResAddr = State.get(Addr, VPLane(0));
4138
4139 auto CreateGroupMask = [&BlockInMask, &State,
4140 &InterleaveFactor](Value *MaskForGaps) -> Value * {
4141 if (State.VF.isScalable()) {
4142 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
4143 assert(InterleaveFactor <= 8 &&
4144 "Unsupported deinterleave factor for scalable vectors");
4145 auto *ResBlockInMask = State.get(BlockInMask);
4146 SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
4147 return interleaveVectors(State.Builder, Ops, "interleaved.mask");
4148 }
4149
4150 if (!BlockInMask)
4151 return MaskForGaps;
4152
4153 Value *ResBlockInMask = State.get(BlockInMask);
4154 Value *ShuffledMask = State.Builder.CreateShuffleVector(
4155 ResBlockInMask,
4156 createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
4157 "interleaved.mask");
4158 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
4159 ShuffledMask, MaskForGaps)
4160 : ShuffledMask;
4161 };
4162
4163 const DataLayout &DL = Instr->getDataLayout();
4164 // Vectorize the interleaved load group.
4165 if (isa<LoadInst>(Instr)) {
4166 Value *MaskForGaps = nullptr;
4167 if (needsMaskForGaps()) {
4168 MaskForGaps =
4169 createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
4170 assert(MaskForGaps && "Mask for Gaps is required but it is null");
4171 }
4172
4173 Instruction *NewLoad;
4174 if (BlockInMask || MaskForGaps) {
4175 Value *GroupMask = CreateGroupMask(MaskForGaps);
4176 Value *PoisonVec = PoisonValue::get(VecTy);
4177 NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
4178 Group->getAlign(), GroupMask,
4179 PoisonVec, "wide.masked.vec");
4180 } else
4181 NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
4182 Group->getAlign(), "wide.vec");
4183 applyMetadata(*NewLoad);
4184 // TODO: Also manage existing metadata using VPIRMetadata.
4185 Group->addMetadata(NewLoad);
4186
4188 if (VecTy->isScalableTy()) {
4189 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4190 // so must use intrinsics to deinterleave.
4191 assert(InterleaveFactor <= 8 &&
4192 "Unsupported deinterleave factor for scalable vectors");
4193 NewLoad = State.Builder.CreateIntrinsic(
4194 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
4195 NewLoad->getType(), NewLoad,
4196 /*FMFSource=*/nullptr, "strided.vec");
4197 }
4198
4199 auto CreateStridedVector = [&InterleaveFactor, &State,
4200 &NewLoad](unsigned Index) -> Value * {
4201 assert(Index < InterleaveFactor && "Illegal group index");
4202 if (State.VF.isScalable())
4203 return State.Builder.CreateExtractValue(NewLoad, Index);
4204
4205 // For fixed length VF, use shuffle to extract the sub-vectors from the
4206 // wide load.
4207 auto StrideMask =
4208 createStrideMask(Index, InterleaveFactor, State.VF.getFixedValue());
4209 return State.Builder.CreateShuffleVector(NewLoad, StrideMask,
4210 "strided.vec");
4211 };
4212
4213 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4214 Instruction *Member = Group->getMember(I);
4215
4216 // Skip the gaps in the group.
4217 if (!Member)
4218 continue;
4219
4220 Value *StridedVec = CreateStridedVector(I);
4221
4222 // If this member has different type, cast the result type.
4223 if (Member->getType() != ScalarTy) {
4224 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
4225 StridedVec =
4226 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
4227 }
4228
4229 if (Group->isReverse())
4230 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
4231
4232 State.set(VPDefs[J], StridedVec);
4233 ++J;
4234 }
4235 return;
4236 }
4237
4238 // The sub vector type for current instruction.
4239 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4240
4241 // Vectorize the interleaved store group.
4242 Value *MaskForGaps =
4243 createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
4244 assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
4245 "Mismatch between NeedsMaskForGaps and MaskForGaps");
4246 ArrayRef<VPValue *> StoredValues = getStoredValues();
4247 // Collect the stored vector from each member.
4248 SmallVector<Value *, 4> StoredVecs;
4249 unsigned StoredIdx = 0;
4250 for (unsigned i = 0; i < InterleaveFactor; i++) {
4251 assert((Group->getMember(i) || MaskForGaps) &&
4252 "Fail to get a member from an interleaved store group");
4253 Instruction *Member = Group->getMember(i);
4254
4255 // Skip the gaps in the group.
4256 if (!Member) {
4257 Value *Undef = PoisonValue::get(SubVT);
4258 StoredVecs.push_back(Undef);
4259 continue;
4260 }
4261
4262 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4263 ++StoredIdx;
4264
4265 if (Group->isReverse())
4266 StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
4267
4268 // If this member has different type, cast it to a unified type.
4269
4270 if (StoredVec->getType() != SubVT)
4271 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4272
4273 StoredVecs.push_back(StoredVec);
4274 }
4275
4276 // Interleave all the smaller vectors into one wider vector.
4277 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4278 Instruction *NewStoreInstr;
4279 if (BlockInMask || MaskForGaps) {
4280 Value *GroupMask = CreateGroupMask(MaskForGaps);
4281 NewStoreInstr = State.Builder.CreateMaskedStore(
4282 IVec, ResAddr, Group->getAlign(), GroupMask);
4283 } else
4284 NewStoreInstr =
4285 State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
4286
4287 applyMetadata(*NewStoreInstr);
4288 // TODO: Also manage existing metadata using VPIRMetadata.
4289 Group->addMetadata(NewStoreInstr);
4290}
4291
4292#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// VPInterleaveRecipe::print — debug dump ("INTERLEAVE-GROUP with factor …").
// NOTE(review): the doc extraction dropped several hyperlinked lines here
// (source 4293, 4295, 4299, 4312, 4316) — the signature's first line and the
// printAsOperand calls for the address and per-member values; verify against
// upstream before relying on this listing.
4294 VPSlotTracker &SlotTracker) const {
4296 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4297 IG->getInsertPos()->printAsOperand(O, false);
4298 O << ", ";
4300 VPValue *Mask = getMask();
4301 if (Mask) {
4302 O << ", ";
4303 Mask->printAsOperand(O, SlotTracker);
4304 }
4305
// One line per existing member; gaps in the group are skipped.
4306 unsigned OpIdx = 0;
4307 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4308 if (!IG->getMember(i))
4309 continue;
4310 if (getNumStoreOperands() > 0) {
4311 O << "\n" << Indent << " store ";
4313 O << " to index " << i;
4314 } else {
4315 O << "\n" << Indent << " ";
4317 O << " = load from index " << i;
4318 }
4319 ++OpIdx;
4320 }
4321}
4322#endif
4323
// --- Body of VPInterleaveEVLRecipe::execute. NOTE(review): the opening
// signature line (source 4324) was dropped by the doc extraction; verify
// against upstream. EVL (explicit-vector-length) variant: the group is
// widened with vp.load/vp.store intrinsics whose effective length is
// EVL * InterleaveFactor.
4325 assert(!State.Lane && "Interleave group being replicated.");
4326 assert(State.VF.isScalable() &&
4327 "Only support scalable VF for EVL tail-folding.");
// NOTE(review): source line 4328 is missing here — presumably the first half
// of an `assert(!needsMaskForGaps() && ...)`; confirm upstream.
4329 "Masking gaps for scalable vectors is not yet supported.");
// NOTE(review): source line 4330 is missing here — it defines `Group` (the
// interleave-group accessor) used throughout the body; confirm upstream.
4331 Instruction *Instr = Group->getInsertPos();
4332
4333 // Prepare for the vector type of the interleaved load/store.
4334 Type *ScalarTy = getLoadStoreType(Instr);
4335 unsigned InterleaveFactor = Group->getFactor();
4336 assert(InterleaveFactor <= 8 &&
4337 "Unsupported deinterleave/interleave factor for scalable vectors");
4338 ElementCount WideVF = State.VF * InterleaveFactor;
4339 auto *VecTy = VectorType::get(ScalarTy, WideVF);
4340
4341 VPValue *Addr = getAddr();
4342 Value *ResAddr = State.get(Addr, VPLane(0));
4343 Value *EVL = State.get(getEVL(), VPLane(0));
// The wide access covers InterleaveFactor lanes per original element, so the
// effective length is scaled; NUW/NSW assert the multiply cannot wrap.
4344 Value *InterleaveEVL = State.Builder.CreateMul(
4345 EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl",
4346 /* NUW= */ true, /* NSW= */ true);
4347 LLVMContext &Ctx = State.Builder.getContext();
4348
// Group mask: the block mask interleaved with itself Factor times, or an
// all-true splat when the recipe is unmasked.
4349 Value *GroupMask = nullptr;
4350 if (VPValue *BlockInMask = getMask()) {
4351 SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask));
4352 GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask");
4353 } else {
4354 GroupMask =
4355 State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
4356 }
4357
4358 // Vectorize the interleaved load group.
4359 if (isa<LoadInst>(Instr)) {
4360 CallInst *NewLoad = State.Builder.CreateIntrinsic(
4361 VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr,
4362 "wide.vp.load");
// vp.load takes alignment as a parameter attribute on the pointer operand.
4363 NewLoad->addParamAttr(0,
4364 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4365
4366 applyMetadata(*NewLoad);
4367 // TODO: Also manage existing metadata using VPIRMetadata.
4368 Group->addMetadata(NewLoad);
4369
4370 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4371 // so must use intrinsics to deinterleave.
4372 NewLoad = State.Builder.CreateIntrinsic(
4373 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
4374 NewLoad->getType(), NewLoad,
4375 /*FMFSource=*/nullptr, "strided.vec");
4376
4377 const DataLayout &DL = Instr->getDataLayout();
4378 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4379 Instruction *Member = Group->getMember(I);
4380 // Skip the gaps in the group.
4381 if (!Member)
4382 continue;
4383
4384 Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
4385 // If this member has different type, cast the result type.
4386 if (Member->getType() != ScalarTy) {
4387 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
4388 StridedVec =
4389 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
4390 }
4391
4392 State.set(getVPValue(J), StridedVec);
4393 ++J;
4394 }
4395 return;
4396 } // End for interleaved load.
4397
4398 // The sub vector type for current instruction.
4399 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4400 // Vectorize the interleaved store group.
4401 ArrayRef<VPValue *> StoredValues = getStoredValues();
4402 // Collect the stored vector from each member.
4403 SmallVector<Value *, 4> StoredVecs;
4404 const DataLayout &DL = Instr->getDataLayout();
4405 for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
4406 Instruction *Member = Group->getMember(I);
4407 // Skip the gaps in the group.
4408 if (!Member) {
4409 StoredVecs.push_back(PoisonValue::get(SubVT));
4410 continue;
4411 }
4412
4413 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4414 // If this member has different type, cast it to a unified type.
4415 if (StoredVec->getType() != SubVT)
4416 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4417
4418 StoredVecs.push_back(StoredVec);
4419 ++StoredIdx;
4420 }
4421
4422 // Interleave all the smaller vectors into one wider vector.
4423 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4424 CallInst *NewStore =
4425 State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store,
4426 {IVec, ResAddr, GroupMask, InterleaveEVL});
// vp.store's pointer is operand 1; alignment goes there as an attribute.
4427 NewStore->addParamAttr(1,
4428 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4429
4430 applyMetadata(*NewStore);
4431 // TODO: Also manage existing metadata using VPIRMetadata.
4432 Group->addMetadata(NewStore);
4433}
4434
4435#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// VPInterleaveEVLRecipe::print — debug dump (vp.load/vp.store flavor).
// NOTE(review): the doc extraction dropped hyperlinked lines here (source
// 4436, 4438, 4442, 4444, 4456, 4460) — the signature's first line and the
// printAsOperand calls for the address, EVL and per-member values; verify
// against upstream before relying on this listing.
4437 VPSlotTracker &SlotTracker) const {
4439 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4440 IG->getInsertPos()->printAsOperand(O, false);
4441 O << ", ";
4443 O << ", ";
4445 if (VPValue *Mask = getMask()) {
4446 O << ", ";
4447 Mask->printAsOperand(O, SlotTracker);
4448 }
4449
// One line per existing member; gaps in the group are skipped.
4450 unsigned OpIdx = 0;
4451 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4452 if (!IG->getMember(i))
4453 continue;
4454 if (getNumStoreOperands() > 0) {
4455 O << "\n" << Indent << " vp.store ";
4457 O << " to index " << i;
4458 } else {
4459 O << "\n" << Indent << " ";
4461 O << " = vp.load from index " << i;
4462 }
4463 ++OpIdx;
4464 }
4465}
4466#endif
4467
4469 VPCostContext &Ctx) const {
4470 Instruction *InsertPos = getInsertPos();
4471 // Find the VPValue index of the interleave group. We need to skip gaps.
4472 unsigned InsertPosIdx = 0;
4473 for (unsigned Idx = 0; IG->getFactor(); ++Idx)
4474 if (auto *Member = IG->getMember(Idx)) {
4475 if (Member == InsertPos)
4476 break;
4477 InsertPosIdx++;
4478 }
4479 Type *ValTy = Ctx.Types.inferScalarType(
4480 getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
4481 : getStoredValues()[InsertPosIdx]);
4482 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
4483 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
4484 ->getAddressSpace();
4485
4486 unsigned InterleaveFactor = IG->getFactor();
4487 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
4488
4489 // Holds the indices of existing members in the interleaved group.
4491 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
4492 if (IG->getMember(IF))
4493 Indices.push_back(IF);
4494
4495 // Calculate the cost of the whole interleaved group.
4496 InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
4497 InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
4498 IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
4499
4500 if (!IG->isReverse())
4501 return Cost;
4502
4503 return Cost + IG->getNumMembers() *
4504 Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
4505 VectorTy, VectorTy, {}, Ctx.CostKind,
4506 0);
4507}
4508
// --- Body of (apparently) VPWidenPointerInductionRecipe::onlyScalarsGenerated;
// the signature line (source 4509) was dropped by the doc extraction — verify
// upstream. True when every user needs only scalar values (and, for scalable
// VFs, only the first lane), per the vputils queries below.
4510 return vputils::onlyScalarValuesUsed(this) &&
4511 (!IsScalable || vputils::onlyFirstLaneUsed(this));
4512}
4513
4514#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Widen-pointer-induction print — debug dump ("EMIT … = WIDEN-POINTER-
// INDUCTION"). NOTE(review): the doc extraction dropped the signature's first
// line (source 4515) and the operand printAsOperand calls (4520, 4522, 4524,
// 4526, 4529, 4531); verify against upstream.
4516 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4517 assert((getNumOperands() == 3 || getNumOperands() == 5) &&
4518 "unexpected number of operands");
4519 O << Indent << "EMIT ";
4521 O << " = WIDEN-POINTER-INDUCTION ";
4523 O << ", ";
4525 O << ", ";
4527 if (getNumOperands() == 5) {
4528 O << ", ";
4530 O << ", ";
4532 }
4533}
4534
// Expand-SCEV print — debug dump of the expanded SCEV expression.
// NOTE(review): the signature's first line (source 4535) and the result's
// printAsOperand call (source 4538) were dropped by the doc extraction.
4536 VPSlotTracker &SlotTracker) const {
4537 O << Indent << "EMIT ";
4539 O << " = EXPAND SCEV " << *Expr;
4540}
4541#endif
4542
// --- Body of (apparently) VPWidenCanonicalIVRecipe::execute — the signature
// line (source 4543) was dropped by the doc extraction; verify upstream.
// Builds the widened canonical IV: splat the scalar canonical IV, then add a
// step vector offset by VF scaled with the unroll part.
4544 Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
4545 Type *STy = CanonicalIV->getType();
// Emit at the end of the preheader-ish predecessor block, not at the current
// insert point.
4546 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
4547 ElementCount VF = State.VF;
4548 Value *VStart = VF.isScalar()
4549 ? CanonicalIV
4550 : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
4551 Value *VStep = Builder.CreateElementCount(
4552 STy, VF.multiplyCoefficientBy(getUnrollPart(*this)));
4553 if (VF.isVector()) {
4554 VStep = Builder.CreateVectorSplat(VF, VStep);
4555 VStep =
4556 Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
4557 }
4558 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
4559 State.set(this, CanonicalVectorIV);
4560}
4561
4562#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug dump ("EMIT … = WIDEN-CANONICAL-INDUCTION"). NOTE(review): the
// extraction dropped the signature's first line (4563) and the printAsOperand
// calls (4566, 4568); verify upstream.
4564 VPSlotTracker &SlotTracker) const {
4565 O << Indent << "EMIT ";
4567 O << " = WIDEN-CANONICAL-INDUCTION ";
4569}
4570#endif
4571
// --- Body of (apparently) VPFirstOrderRecurrencePHIRecipe::execute — the
// signature line (source 4572) was dropped by the doc extraction; verify
// upstream. Creates the "vector.recur" phi: the initial scalar value is
// placed in the last lane of a poison vector so it lines up with the
// recurrence's splice in the loop.
4573 auto &Builder = State.Builder;
4574 // Create a vector from the initial value.
4575 auto *VectorInit = getStartValue()->getLiveInIRValue();
4576
4577 Type *VecTy = State.VF.isScalar()
4578 ? VectorInit->getType()
4579 : VectorType::get(VectorInit->getType(), State.VF);
4580
4581 BasicBlock *VectorPH =
4582 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4583 if (State.VF.isVector()) {
4584 auto *IdxTy = Builder.getInt32Ty();
4585 auto *One = ConstantInt::get(IdxTy, 1);
// The init vector is materialized in the preheader, not in the loop body.
4586 IRBuilder<>::InsertPointGuard Guard(Builder);
4587 Builder.SetInsertPoint(VectorPH->getTerminator());
4588 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
4589 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4590 VectorInit = Builder.CreateInsertElement(
4591 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
4592 }
4593
4594 // Create a phi node for the new recurrence.
4595 PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
4596 Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
4597 Phi->addIncoming(VectorInit, VectorPH);
4598 State.set(this, Phi);
4599}
4600
// --- computeCost body; the signature's opening lines (source 4601-4602,
// presumably VPFirstOrderRecurrencePHIRecipe::computeCost) were dropped by
// the doc extraction — verify upstream.
4603 VPCostContext &Ctx) const {
// Scalar VFs pay for the phi itself; vector VFs return 0 here —
// NOTE(review): presumably the recurrence's shuffle cost is accounted for
// elsewhere; confirm.
4604 if (VF.isScalar())
4605 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
4606
4607 return 0;
4608}
4609
4610#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug dump ("FIRST-ORDER-RECURRENCE-PHI … = phi …"). NOTE(review): the
// extraction dropped the printAsOperand/printOperands lines (4614, 4616);
// verify upstream.
4612 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4613 O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
4615 O << " = phi ";
4617}
4618#endif
4619
// --- Body of (apparently) VPReductionPHIRecipe::execute — the signature line
// (source 4620) was dropped by the doc extraction; verify upstream.
// Stage #1 of phi vectorization: create the header "vec.phi" with only the
// preheader incoming value; the backedge value is filled in later.
4621 // Reductions do not have to start at zero. They can start with
4622 // any loop invariant values.
4623 VPValue *StartVPV = getStartValue();
4624
4625 // In order to support recurrences we need to be able to vectorize Phi nodes.
4626 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4627 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4628 // this value when we vectorize all of the instructions that use the PHI.
4629 BasicBlock *VectorPH =
4630 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
// In-loop reductions (and scalar VFs) keep the phi scalar.
4631 bool ScalarPHI = State.VF.isScalar() || isInLoop();
4632 Value *StartV = State.get(StartVPV, ScalarPHI);
4633 Type *VecTy = StartV->getType();
4634
4635 BasicBlock *HeaderBB = State.CFG.PrevBB;
4636 assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
4637 "recipe must be in the vector loop header");
4638 auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
4639 Phi->insertBefore(HeaderBB->getFirstInsertionPt());
4640 State.set(this, Phi, isInLoop());
4641
4642 Phi->addIncoming(StartV, VectorPH);
4643}
4644
4645#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug dump ("WIDEN-REDUCTION-PHI … = phi …"), including flags and the VF
// scale factor when the phi is scaled. NOTE(review): the extraction dropped
// the signature's first line (4646) and the printAsOperand/printOperands
// lines (4650, 4653); verify upstream.
4647 VPSlotTracker &SlotTracker) const {
4648 O << Indent << "WIDEN-REDUCTION-PHI ";
4649
4651 O << " = phi";
4652 printFlags(O);
4654 if (getVFScaleFactor() > 1)
4655 O << " (VF scaled by 1/" << getVFScaleFactor() << ")";
4656}
4657#endif
4658
// --- Body of an onlyFirstLaneUsed(VPValue *Op) override; the signature line
// (source 4659) was dropped by the doc extraction — the owning class (the
// preceding definitions belong to VPReductionPHIRecipe) should be verified
// upstream. Any operand is first-lane-only iff the recipe itself is.
4660 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
4661 return vputils::onlyFirstLaneUsed(this);
4662}
4663
// --- Body of (apparently) VPWidenPHIRecipe::execute — the signature line
// (source 4664) was dropped by the doc extraction; verify upstream.
// Creates an empty wide phi (incoming values are wired up later) whose type
// is taken from the first operand's widened value.
4665 Value *Op0 = State.get(getOperand(0));
4666 Type *VecTy = Op0->getType();
4667 Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
4668 State.set(this, VecPhi);
4669}
4670
// --- computeCost tail: the signature's opening line (source 4671) was
// dropped by the doc extraction — verify upstream. A phi costs one
// control-flow instruction.
4672 VPCostContext &Ctx) const {
4673 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
4674}
4675
4676#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug dump ("WIDEN-PHI … = phi …"). NOTE(review): the extraction dropped
// the signature's first line (4677) and the printAsOperand/printOperands
// lines (4681, 4683); verify upstream.
4678 VPSlotTracker &SlotTracker) const {
4679 O << Indent << "WIDEN-PHI ";
4680
4682 O << " = phi ";
4684}
4685#endif
4686
// --- Body of (apparently) VPActiveLaneMaskPHIRecipe::execute — the signature
// line (source 4687) was dropped by the doc extraction; verify upstream.
// Creates the "active.lane.mask" phi seeded with the entry mask from the
// predecessor; the backedge value is added later.
4688 BasicBlock *VectorPH =
4689 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4690 Value *StartMask = State.get(getOperand(0));
4691 PHINode *Phi =
4692 State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
4693 Phi->addIncoming(StartMask, VectorPH);
4694 State.set(this, Phi);
4695}
4696
4697#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug dump ("ACTIVE-LANE-MASK-PHI … = phi …"). NOTE(review): the extraction
// dropped the signature's first line (4698) and the printAsOperand/
// printOperands lines (4702, 4704); verify upstream.
4699 VPSlotTracker &SlotTracker) const {
4700 O << Indent << "ACTIVE-LANE-MASK-PHI ";
4701
4703 O << " = phi ";
4705}
4706#endif
4707
4708#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug dump ("CURRENT-ITERATION-PHI … = phi …"). NOTE(review): the owning
// recipe's name and several lines (4709, 4713, 4715) were dropped by the doc
// extraction; verify upstream.
4710 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4711 O << Indent << "CURRENT-ITERATION-PHI ";
4712
4714 O << " = phi ";
4716}
4717#endif
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static MCDisassembler::DecodeStatus addOperand(MCInst &Inst, const MCOperand &Opnd)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Value * getPointer(Value *Ptr)
iv users
Definition IVUsers.cpp:48
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file provides a LoopVectorizationPlanner class.
static const SCEV * getAddressAccessSCEV(Value *Ptr, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets the address access SCEV for Ptr, if it should be used for cost modeling according to isAddressSC...
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
static bool isOrdered(const Instruction *I)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file contains the declarations of different VPlan-related auxiliary helpers.
static bool isPredicatedUniformMemOpAfterTailFolding(const VPReplicateRecipe &R, const SCEV *PtrSCEV, VPCostContext &Ctx)
Return true if R is a predicated load/store with a loop-invariant address only masked by the header m...
static Value * interleaveVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vals, const Twine &Name)
Return a vector containing interleaved elements from multiple smaller input vectors.
static InstructionCost getCostForIntrinsics(Intrinsic::ID ID, ArrayRef< const VPValue * > Operands, const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx)
Compute the cost for the intrinsic ID with Operands, produced by R.
static Value * createBitOrPointerCast(IRBuilderBase &Builder, Value *V, VectorType *DstVTy, const DataLayout &DL)
SmallVector< Value *, 2 > VectorParts
static bool isUsedByLoadStoreAddress(const VPUser *V)
Returns true if V is used as part of the address of another load or store.
static void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPLane &Lane, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
static unsigned getCalledFnOperandIndex(const VPInstruction &VPI)
For call VPInstructions, return the operand index of the called function.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
This file contains the declarations of the Vectorization Plan base classes:
void printAsOperand(OutputBuffer &OB, Prec P=Prec::Default, bool StrictlyWorse=false) const
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:986
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
static LLVM_ABI StringRef getPredicateName(Predicate P)
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getUnknown()
Definition DebugLoc.h:161
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
LLVM_ABI void print(raw_ostream &O) const
Print fast-math flags to O.
Definition Operator.cpp:283
void setAllowContract(bool B=true)
Definition FMF.h:93
bool noSignedZeros() const
Definition FMF.h:70
bool noInfs() const
Definition FMF.h:69
void setAllowReciprocal(bool B=true)
Definition FMF.h:90
bool allowReciprocal() const
Definition FMF.h:71
void setNoSignedZeros(bool B=true)
Definition FMF.h:87
bool allowReassoc() const
Flag queries.
Definition FMF.h:67
bool approxFunc() const
Definition FMF.h:73
void setNoNaNs(bool B=true)
Definition FMF.h:81
void setAllowReassoc(bool B=true)
Flag setters.
Definition FMF.h:78
bool noNaNs() const
Definition FMF.h:68
void setApproxFunc(bool B=true)
Definition FMF.h:96
void setNoInfs(bool B=true)
Definition FMF.h:84
bool allowContract() const
Definition FMF.h:72
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
bool willReturn() const
Determine if the function will return.
Definition Function.h:669
bool doesNotThrow() const
Determine if the function cannot unwind.
Definition Function.h:602
bool doesNotAccessMemory() const
Determine if the function does not access memory.
Definition Function.cpp:867
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:216
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags none()
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2595
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:564
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2649
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2583
LLVM_ABI Value * CreateVectorSpliceRight(Value *V1, Value *V2, Value *Offset, const Twine &Name="")
Create a vector.splice.right intrinsic call, or a shufflevector that produces the same result if the ...
CondBrInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition IRBuilder.h:1224
LLVM_ABI Value * CreateSelectFMF(Value *C, Value *True, Value *False, FMFSource FMFSource, const Twine &Name="", Instruction *MDFrom=nullptr)
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2642
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2661
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:579
Value * CreatePtrAdd(Value *Ptr, Value *Offset, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:2059
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
LLVM_ABI Value * CreateVectorReverse(Value *V, const Twine &Name="")
Return a vector value that contains the vector V reversed.
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2346
LLVM_ABI CallInst * CreateOrReduce(Value *Src)
Create a vector int OR reduction intrinsic of the source vector.
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
Value * CreateLogicalAnd(Value *Cond1, Value *Cond2, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.h:1752
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:522
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2476
Value * CreateNot(Value *V, const Twine &Name="")
Definition IRBuilder.h:1836
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2342
Value * CreateCountTrailingZeroElems(Type *ResTy, Value *Mask, bool ZeroIsPoison=true, const Twine &Name="")
Create a call to llvm.experimental_cttz_elts.
Definition IRBuilder.h:1162
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1447
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2088
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1430
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition IRBuilder.h:507
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1735
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2354
Value * CreateLogicalOr(Value *Cond1, Value *Cond2, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.h:1760
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2452
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1600
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1464
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2822
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isUnaryOp() const
The group of interleaved loads/stores sharing the same stride and close to each other.
uint32_t getFactor() const
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
bool isReverse() const
InstTy * getInsertPos() const
void addMetadata(InstTy *NewInst) const
Add metadata (e.g.
Align getAlign() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Information for memory intrinsic cost model.
Root of the metadata hierarchy.
Definition Metadata.h:64
LLVM_ABI void print(raw_ostream &OS, const Module *M=nullptr, bool IsForDebug=false) const
Print.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
This class represents the LLVM 'select' instruction.
This class provides computation of slot numbers for LLVM Assembly writing.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
@ Load
The value being inserted comes from a load (InsertElement only).
@ Store
The extracted value is stored (ExtractElement only).
LLVM_ABI InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
@ TCC_Free
Expected to fold away in lowering.
LLVM_ABI InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:290
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:65
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:286
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
value_op_iterator value_op_end()
Definition User.h:288
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
value_op_iterator value_op_begin()
Definition User.h:285
void execute(VPTransformState &State) override
Generate the active lane mask phi of the vector loop.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4168
RecipeListTy & getRecipeList()
Returns a reference to the list of recipes.
Definition VPlan.h:4221
iterator end()
Definition VPlan.h:4205
const VPRecipeBase & front() const
Definition VPlan.h:4215
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition VPlan.h:4234
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenMemoryRecipe.
VPValue * getIncomingValue(unsigned Idx) const
Return incoming value number Idx.
Definition VPlan.h:2819
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2814
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2810
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:98
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:226
VPlan * getPlan()
Definition VPlan.cpp:177
void printAsOperand(raw_ostream &OS, bool PrintType=false) const
Definition VPlan.h:368
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:182
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPBranchOnMaskRecipe.
void execute(VPTransformState &State) override
Generate the extraction of the appropriate bit from the block mask and the conditional branch.
VPlan-based builder utility analogous to IRBuilder.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:498
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:471
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:483
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:493
void execute(VPTransformState &State) override
Generate the transformed value of the induction at offset StartValue (1.
VPIRValue * getStartValue() const
Definition VPlan.h:3963
VPValue * getStepValue() const
Definition VPlan.h:3964
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void decompose()
Insert the recipes of the expression back into the VPlan, directly before the current recipe.
bool isSingleScalar() const
Returns true if the result of this VPExpressionRecipe is a single-scalar.
bool mayHaveSideEffects() const
Returns true if this expression contains recipes that may have side effects.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Compute the cost of this recipe either using a recipe's specialized implementation or using the legac...
bool mayReadOrWriteMemory() const
Returns true if this expression contains recipes that may read from or write to memory.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this header phi recipe.
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2331
void execute(VPTransformState &State) override
Produce a vectorized histogram operation.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPHistogramRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getMask() const
Return the mask operand if one was provided, or a null pointer if all lanes should be executed uncond...
Definition VPlan.h:2074
Class to record and manage LLVM IR flags.
Definition VPlan.h:688
FastMathFlagsTy FMFs
Definition VPlan.h:776
ReductionFlagsTy ReductionFlags
Definition VPlan.h:778
LLVM_ABI_FOR_TEST bool hasRequiredFlagsForOpcode(unsigned Opcode) const
Returns true if Opcode has its required flags set.
LLVM_ABI_FOR_TEST bool flagsValidForOpcode(unsigned Opcode) const
Returns true if the set flags are valid for Opcode.
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
WrapFlagsTy WrapFlags
Definition VPlan.h:770
void printFlags(raw_ostream &O) const
bool hasFastMathFlags() const
Returns true if the recipe has fast-math flags.
Definition VPlan.h:993
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
bool isReductionOrdered() const
Definition VPlan.h:1057
TruncFlagsTy TruncFlags
Definition VPlan.h:771
CmpInst::Predicate getPredicate() const
Definition VPlan.h:965
ExactFlagsTy ExactFlags
Definition VPlan.h:773
void intersectFlags(const VPIRFlags &Other)
Only keep flags also present in Other.
uint8_t GEPFlagsStorage
Definition VPlan.h:774
GEPNoWrapFlags getGEPNoWrapFlags() const
Definition VPlan.h:983
bool hasPredicate() const
Returns true if the recipe has a comparison predicate.
Definition VPlan.h:988
DisjointFlagsTy DisjointFlags
Definition VPlan.h:772
FCmpFlagsTy FCmpFlags
Definition VPlan.h:777
NonNegFlagsTy NonNegFlags
Definition VPlan.h:775
bool isReductionInLoop() const
Definition VPlan.h:1063
void applyFlags(Instruction &I) const
Apply the IR flags to I.
Definition VPlan.h:922
uint8_t CmpPredStorage
Definition VPlan.h:769
RecurKind getRecurKind() const
Definition VPlan.h:1051
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPIRInstruction.
VPIRInstruction(Instruction &I)
VPIRInstruction::create() should be used to create VPIRInstructions, as subclasses may need to be cre...
Definition VPlan.h:1691
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
VPIRMetadata()=default
void print(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print metadata with node IDs.
void applyMetadata(Instruction &I) const
Add all metadata to I.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the instruction.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1223
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPInstruction.
bool doesGeneratePerAllLanes() const
Returns true if this VPInstruction generates scalar values for all lanes.
@ ExtractLastActive
Extracts the last active lane from a set of vectors.
Definition VPlan.h:1333
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1324
@ ExitingIVValue
Compute the exiting value of a wide induction after vectorization, that is the value of the last lane...
Definition VPlan.h:1340
@ WideIVStep
Scale the first operand (vector step) by the second operand (scalar-step).
Definition VPlan.h:1314
@ ResumeForEpilogue
Explicit user for the resume phi of the canonical induction in the main VPlan, used by the epilogue v...
Definition VPlan.h:1327
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1267
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1318
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1262
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1259
@ VScale
Returns the value for vscale.
Definition VPlan.h:1336
@ CanonicalIVIncrementForPart
Definition VPlan.h:1243
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1270
bool hasResult() const
Definition VPlan.h:1418
bool opcodeMayReadOrWriteFromMemory() const
Returns true if the underlying opcode may read from or write to memory.
LLVM_DUMP_METHOD void dump() const
Print the VPInstruction to dbgs() (for debugging).
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the VPInstruction to O.
StringRef getName() const
Returns the symbolic name assigned to the VPInstruction.
Definition VPlan.h:1498
unsigned getOpcode() const
Definition VPlan.h:1402
VPInstruction(unsigned Opcode, ArrayRef< VPValue * > Operands, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
bool isVectorToScalar() const
Returns true if this VPInstruction produces a scalar value from a vector, e.g.
bool isSingleScalar() const
Returns true if this VPInstruction's operands are single scalars and the result is also a single scal...
unsigned getNumOperandsForOpcode() const
Return the number of operands determined by the opcode of the VPInstruction, excluding mask.
bool isMasked() const
Returns true if the VPInstruction has a mask operand.
Definition VPlan.h:1443
void execute(VPTransformState &State) override
Generate the instruction.
bool usesFirstPartOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first part of operand Op.
bool needsMaskForGaps() const
Return true if the access needs a mask because of the gaps.
Definition VPlan.h:2924
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this recipe.
Instruction * getInsertPos() const
Definition VPlan.h:2928
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2926
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2918
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2947
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:2912
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:3021
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:3034
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:2984
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
static VPLane getLastLaneForVF(const ElementCount &VF)
static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset)
static VPLane getFirstLane()
virtual const VPRecipeBase * getAsRecipe() const =0
Return a VPRecipeBase* to the current object.
VPValue * getIncomingValueForBlock(const VPBasicBlock *VPBB) const
Returns the incoming value for VPBB. VPBB must be an incoming block.
virtual unsigned getNumIncoming() const
Returns the number of incoming values, also number of incoming blocks.
Definition VPlan.h:1605
void removeIncomingValueFor(VPBlockBase *IncomingBlock) const
Removes the incoming value for IncomingBlock, which must be a predecessor.
const VPBasicBlock * getIncomingBlock(unsigned Idx) const
Returns the incoming block with index Idx.
Definition VPlan.h:4312
detail::zippy< llvm::detail::zip_first, VPUser::const_operand_range, const_incoming_blocks_range > incoming_values_and_blocks() const
Returns an iterator range over pairs of incoming values and corresponding incoming blocks.
Definition VPlan.h:1630
VPValue * getIncomingValue(unsigned Idx) const
Returns the incoming VPValue with index Idx.
Definition VPlan.h:1590
void printPhiOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the recipe.
void setIncomingValueForBlock(const VPBasicBlock *VPBB, VPValue *V) const
Sets the incoming value for VPBB to V.
void execute(VPTransformState &State) override
Generates phi nodes for live-outs (from a replicate region) as needed to retain SSA form.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:406
bool mayReadFromMemory() const
Returns true if the recipe may read from memory.
bool mayHaveSideEffects() const
Returns true if the recipe may have side-effects.
virtual void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const =0
Each concrete VPRecipe prints itself, without printing common information, like debug info or metadat...
VPRegionBlock * getRegion()
Definition VPlan.h:4513
LLVM_ABI_FOR_TEST void dump() const
Dump the recipe to stderr (for debugging).
Definition VPlan.cpp:116
bool isPhi() const
Returns true for PHI-like recipes.
bool mayWriteToMemory() const
Returns true if the recipe may write to memory.
virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const
Compute the cost of this recipe either using a recipe's specialized implementation or using the legac...
VPBasicBlock * getParent()
Definition VPlan.h:480
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:554
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this recipe, taking into account if the cost computation should be skipped and the...
bool isScalarCast() const
Return true if the recipe is a scalar cast.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const
Print the recipe, delegating to printRecipe().
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
unsigned getVPRecipeID() const
Definition VPlan.h:526
void moveAfter(VPRecipeBase *MovePos)
Unlink this recipe from its current VPBasicBlock and insert it into the VPBasicBlock that MovePos liv...
VPRecipeBase(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:470
friend class VPValue
Definition VPlanValue.h:304
void execute(VPTransformState &State) override
Generate the reduction in the loop.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:3182
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2734
bool isInLoop() const
Returns true if the phi is part of an in-loop reduction.
Definition VPlan.h:2758
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool isConditional() const
Return true if the in-loop reduction is conditional.
Definition VPlan.h:3124
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of VPReductionRecipe.
VPValue * getVecOp() const
The VPValue of the vector value to be reduced.
Definition VPlan.h:3135
VPValue * getCondOp() const
The VPValue of the condition for the block.
Definition VPlan.h:3137
RecurKind getRecurrenceKind() const
Return the recurrence kind for the in-loop reduction.
Definition VPlan.h:3120
bool isPartialReduction() const
Returns true if the reduction outputs a vector with a scaled down VF.
Definition VPlan.h:3126
VPValue * getChainOp() const
The VPValue of the scalar Chain being accumulated.
Definition VPlan.h:3133
bool isInLoop() const
Returns true if the reduction is in-loop.
Definition VPlan.h:3128
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the reduction in the loop.
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4378
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4454
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3204
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool isSingleScalar() const
Definition VPlan.h:3245
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPReplicateRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getOpcode() const
Definition VPlan.h:3274
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
VPValue * getStepValue() const
Definition VPlan.h:4032
VPValue * getStartIndex() const
Return the StartIndex, or null if known to be zero, valid only after unrolling.
Definition VPlan.h:4040
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the scalarized versions of the phi node as needed by their users.
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:606
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:673
LLVM_ABI_FOR_TEST LLVM_DUMP_METHOD void dump() const
Print this VPSingleDefRecipe to dbgs() (for debugging).
VPSingleDefRecipe(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:608
This class can be used to assign names to VPValues.
An analysis for type-inference for VPValues.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
Helper to access the operand that contains the unroll part for this recipe after unrolling.
Definition VPlan.h:1156
VPValue * getUnrollPartOperand(const VPUser &U) const
Return the VPValue operand containing the unroll part or null if there is no such operand.
unsigned getUnrollPart(const VPUser &U) const
Return the unroll part.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:329
void printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the operands to O.
Definition VPlan.cpp:1544
operand_range operands()
Definition VPlanValue.h:397
unsigned getNumOperands() const
Definition VPlanValue.h:367
operand_iterator op_begin()
Definition VPlanValue.h:393
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:368
virtual bool usesFirstLaneOnly(const VPValue *Op) const
Returns true if the VPUser only uses the first lane of operand Op.
Definition VPlanValue.h:412
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:49
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:137
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1495
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:127
void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const
Definition VPlan.cpp:1540
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:74
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:202
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1498
VPValue * getVFValue() const
Definition VPlan.h:2172
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getSourceElementType() const
Definition VPlan.h:2169
int64_t getStride() const
Definition VPlan.h:2170
void materializeOffset(unsigned Part=0)
Adds the offset operand to the recipe.
Type * getSourceElementType() const
Definition VPlan.h:2241
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
operand_range args()
Definition VPlan.h:2029
Function * getCalledScalarFunction() const
Definition VPlan.h:2025
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCallRecipe.
void execute(VPTransformState &State) override
Produce a widened version of the call instruction.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate a canonical vector induction variable of the vector loop, with start = {<Part*VF,...
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getResultType() const
Returns the result type of the cast.
Definition VPlan.h:1878
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Produce widened copies of the cast.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCastRecipe.
void execute(VPTransformState &State) override
Generate the gep nodes.
Type * getSourceElementType() const
Definition VPlan.h:2126
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2394
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2397
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2495
TruncInst * getTruncInst()
Returns the first defined value as TruncInst, if it is one or nullptr otherwise.
Definition VPlan.h:2510
Type * getScalarType() const
Returns the scalar type of the induction.
Definition VPlan.h:2519
bool isCanonical() const
Returns true if the induction is canonical, i.e.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Intrinsic::ID getVectorIntrinsicID() const
Return the ID of the intrinsic.
Definition VPlan.h:1960
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
StringRef getIntrinsicName() const
Return the name of the intrinsic as a string.
LLVM_ABI_FOR_TEST bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the VPUser only uses the first lane of operand Op.
Type * getResultType() const
Return the scalar return type of the intrinsic.
Definition VPlan.h:1963
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Produce a widened version of the vector intrinsic.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this vector intrinsic.
bool IsMasked
Whether the memory access is masked.
Definition VPlan.h:3526
bool isConsecutive() const
Return whether the loaded-from / stored-to addresses are consecutive.
Definition VPlan.h:3561
Instruction & Ingredient
Definition VPlan.h:3517
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenMemoryRecipe.
bool Consecutive
Whether the accessed addresses are consecutive.
Definition VPlan.h:3523
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3571
Align Alignment
Alignment information for this memory access.
Definition VPlan.h:3520
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:3564
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenPHIRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool onlyScalarsGenerated(bool IsScalable)
Returns true if only scalar values will be generated.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
void execute(VPTransformState &State) override
Produce a widened instruction using the opcode and operands of the recipe, processing State....
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4526
const DataLayout & getDataLayout() const
Definition VPlan.h:4722
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1095
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4824
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:393
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
void mutateType(Type *Ty)
Mutate the type of this Value to be of the specified type.
Definition Value.h:816
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
iterator erase(iterator where)
Definition ilist.h:204
pointer remove(iterator &IT)
Definition ilist.h:188
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor)
Returns the corresponding llvm.vector.deinterleaveN intrinsic for factor N.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI StringRef getBaseName(ID id)
Return the LLVM name for an intrinsic, without encoded types for overloading, such as "llvm....
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
auto m_Cmp()
Matches any compare instruction and ignore it.
bool match(Val *V, const Pattern &P)
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::Or, true > m_c_LogicalOr(const LHS &L, const RHS &R)
Matches L || R with LHS and RHS in either order.
specific_intval< 1 > m_False()
specific_intval< 1 > m_True()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isAddressSCEVForCost(const SCEV *Addr, ScalarEvolution &SE, const Loop *L)
Returns true if Addr is an address SCEV that can be passed to TTI::getAddressComputationCost,...
bool onlyFirstPartUsed(const VPValue *Def)
Returns true if only the first part of Def is used.
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto cast_if_present(const Y &Val)
cast_if_present<X> - Functionally identical to cast, except that a null value is accepted.
Definition Casting.h:683
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
InstructionCost Cost
@ Undef
Value of the register doesn't matter.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2313
auto cast_or_null(const Y &Val)
Definition Casting.h:714
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI Value * createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, Value *Right)
Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI Constant * createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Type * toVectorizedTy(Type *Ty, ElementCount EC)
A helper for converting to vectorized types.
cl::opt< unsigned > ForceTargetInstructionCost
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
LLVM_ABI bool isVectorIntrinsicWithStructReturnOverloadAtField(Intrinsic::ID ID, int RetIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic that returns a struct is overloaded at the struct elem...
@ Other
Any other memory.
Definition ModRef.h:68
bool canVectorizeTy(Type *Ty)
Returns true if Ty is a valid vector element type, void, or an unpacked literal struct where all elem...
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FMinimumNum
FP min with llvm.minimumnum semantics.
@ FMinimum
FP min with llvm.minimum semantics.
@ FMaxNum
FP max with llvm.maxnum semantics including NaNs.
@ Mul
Product of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ FindLast
FindLast reduction with select(cmp(),x,y) where x and y.
@ FMaximum
FP max with llvm.maximum semantics.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMinNum
FP min with llvm.minnum semantics including NaNs.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
@ FMaximumNum
FP max with llvm.maximumnum semantics.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about an recurrence kind, return the identity for the @llvm.vector....
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
Value * emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, InductionDescriptor::InductionKind InductionKind, const BinaryOperator *InductionBinOp)
Compute the transformed value of Index at offset StartValue using step StepValue.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
LLVM_ABI Value * createOrderedReduction(IRBuilderBase &B, RecurKind RdxKind, Value *Src, Value *Start)
Create an ordered reduction intrinsic using the given recurrence kind RdxKind.
ArrayRef< Type * > getContainedTypes(Type *const &Ty)
Returns the types contained in Ty.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Struct to hold various analysis needed for cost computations.
LLVMContext & LLVMCtx
TargetTransformInfo::TargetCostKind CostKind
VPTypeAnalysis Types
const TargetTransformInfo & TTI
void execute(VPTransformState &State) override
Generate the phi nodes.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this first-order recurrence phi recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
An overlay for VPIRInstructions wrapping PHI nodes enabling convenient use cast/dyn_cast/isa and exec...
Definition VPlan.h:1749
PHINode & getIRPhi()
Definition VPlan.h:1762
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void execute(VPTransformState &State) override
Generate the instruction.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
A pure-virtual common base class for recipes defining a single VPValue and using IR flags.
Definition VPlan.h:1110
InstructionCost getCostForRecipeWithOpcode(unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const
Compute the cost for this recipe for VF, using Opcode and Ctx.
VPRecipeWithIRFlags(const unsigned char SC, ArrayRef< VPValue * > Operands, const VPIRFlags &Flags, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:1111
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:280
SmallDenseMap< const VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
struct llvm::VPTransformState::CFGState CFG
Value * get(const VPValue *Def, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def if IsScalar is false, otherwise return the gen...
Definition VPlan.cpp:279
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
ElementCount VF
The chosen Vectorization Factor of the loop being vectorized.
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Generate the wide load or gather.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenLoadEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3647
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate a wide load or gather.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition VPlan.h:3730
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Generate the wide store or scatter.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenStoreEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3733
void execute(VPTransformState &State) override
Generate a wide store or scatter.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition VPlan.h:3693