LLVM 23.0.0git
VPlanRecipes.cpp
Go to the documentation of this file.
1//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains implementations for different VPlan recipes.
11///
12//===----------------------------------------------------------------------===//
13
15#include "VPlan.h"
16#include "VPlanAnalysis.h"
17#include "VPlanHelpers.h"
18#include "VPlanPatternMatch.h"
19#include "VPlanUtils.h"
20#include "llvm/ADT/STLExtras.h"
22#include "llvm/ADT/Twine.h"
27#include "llvm/IR/BasicBlock.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/Instruction.h"
31#include "llvm/IR/Intrinsics.h"
32#include "llvm/IR/Type.h"
33#include "llvm/IR/Value.h"
36#include "llvm/Support/Debug.h"
40#include <cassert>
41
42using namespace llvm;
43using namespace llvm::VPlanPatternMatch;
44
46
47#define LV_NAME "loop-vectorize"
48#define DEBUG_TYPE LV_NAME
49
51 switch (getVPRecipeID()) {
52 case VPExpressionSC:
53 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
54 case VPInstructionSC: {
55 auto *VPI = cast<VPInstruction>(this);
56 // Loads read from memory but don't write to memory.
57 if (VPI->getOpcode() == Instruction::Load)
58 return false;
59 return VPI->opcodeMayReadOrWriteFromMemory();
60 }
61 case VPInterleaveEVLSC:
62 case VPInterleaveSC:
63 return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
64 case VPWidenStoreEVLSC:
65 case VPWidenStoreSC:
66 return true;
67 case VPReplicateSC:
68 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
69 ->mayWriteToMemory();
70 case VPWidenCallSC:
71 return !cast<VPWidenCallRecipe>(this)
72 ->getCalledScalarFunction()
73 ->onlyReadsMemory();
74 case VPWidenIntrinsicSC:
75 return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
76 case VPActiveLaneMaskPHISC:
77 case VPCurrentIterationPHISC:
78 case VPBranchOnMaskSC:
79 case VPDerivedIVSC:
80 case VPFirstOrderRecurrencePHISC:
81 case VPReductionPHISC:
82 case VPScalarIVStepsSC:
83 case VPPredInstPHISC:
84 return false;
85 case VPBlendSC:
86 case VPReductionEVLSC:
87 case VPReductionSC:
88 case VPVectorPointerSC:
89 case VPWidenCanonicalIVSC:
90 case VPWidenCastSC:
91 case VPWidenGEPSC:
92 case VPWidenIntOrFpInductionSC:
93 case VPWidenLoadEVLSC:
94 case VPWidenLoadSC:
95 case VPWidenPHISC:
96 case VPWidenPointerInductionSC:
97 case VPWidenSC: {
98 const Instruction *I =
99 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
100 (void)I;
101 assert((!I || !I->mayWriteToMemory()) &&
102 "underlying instruction may write to memory");
103 return false;
104 }
105 default:
106 return true;
107 }
108}
109
111 switch (getVPRecipeID()) {
112 case VPExpressionSC:
113 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
114 case VPInstructionSC:
115 return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
116 case VPWidenLoadEVLSC:
117 case VPWidenLoadSC:
118 return true;
119 case VPReplicateSC:
120 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
121 ->mayReadFromMemory();
122 case VPWidenCallSC:
123 return !cast<VPWidenCallRecipe>(this)
124 ->getCalledScalarFunction()
125 ->onlyWritesMemory();
126 case VPWidenIntrinsicSC:
127 return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
128 case VPBranchOnMaskSC:
129 case VPDerivedIVSC:
130 case VPCurrentIterationPHISC:
131 case VPFirstOrderRecurrencePHISC:
132 case VPReductionPHISC:
133 case VPPredInstPHISC:
134 case VPScalarIVStepsSC:
135 case VPWidenStoreEVLSC:
136 case VPWidenStoreSC:
137 return false;
138 case VPBlendSC:
139 case VPReductionEVLSC:
140 case VPReductionSC:
141 case VPVectorPointerSC:
142 case VPWidenCanonicalIVSC:
143 case VPWidenCastSC:
144 case VPWidenGEPSC:
145 case VPWidenIntOrFpInductionSC:
146 case VPWidenPHISC:
147 case VPWidenPointerInductionSC:
148 case VPWidenSC: {
149 const Instruction *I =
150 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
151 (void)I;
152 assert((!I || !I->mayReadFromMemory()) &&
153 "underlying instruction may read from memory");
154 return false;
155 }
156 default:
157 // FIXME: Return false if the recipe represents an interleaved store.
158 return true;
159 }
160}
161
163 switch (getVPRecipeID()) {
164 case VPExpressionSC:
165 return cast<VPExpressionRecipe>(this)->mayHaveSideEffects();
166 case VPActiveLaneMaskPHISC:
167 case VPDerivedIVSC:
168 case VPCurrentIterationPHISC:
169 case VPFirstOrderRecurrencePHISC:
170 case VPReductionPHISC:
171 case VPPredInstPHISC:
172 case VPVectorEndPointerSC:
173 return false;
174 case VPInstructionSC: {
175 auto *VPI = cast<VPInstruction>(this);
176 return mayWriteToMemory() ||
177 VPI->getOpcode() == VPInstruction::BranchOnCount ||
178 VPI->getOpcode() == VPInstruction::BranchOnCond ||
179 VPI->getOpcode() == VPInstruction::BranchOnTwoConds;
180 }
181 case VPWidenCallSC: {
182 Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
183 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
184 }
185 case VPWidenIntrinsicSC:
186 return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
187 case VPBlendSC:
188 case VPReductionEVLSC:
189 case VPReductionSC:
190 case VPScalarIVStepsSC:
191 case VPVectorPointerSC:
192 case VPWidenCanonicalIVSC:
193 case VPWidenCastSC:
194 case VPWidenGEPSC:
195 case VPWidenIntOrFpInductionSC:
196 case VPWidenPHISC:
197 case VPWidenPointerInductionSC:
198 case VPWidenSC: {
199 const Instruction *I =
200 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
201 (void)I;
202 assert((!I || !I->mayHaveSideEffects()) &&
203 "underlying instruction has side-effects");
204 return false;
205 }
206 case VPInterleaveEVLSC:
207 case VPInterleaveSC:
208 return mayWriteToMemory();
209 case VPWidenLoadEVLSC:
210 case VPWidenLoadSC:
211 case VPWidenStoreEVLSC:
212 case VPWidenStoreSC:
213 assert(
214 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
216 "mayHaveSideffects result for ingredient differs from this "
217 "implementation");
218 return mayWriteToMemory();
219 case VPReplicateSC: {
220 auto *R = cast<VPReplicateRecipe>(this);
221 return R->getUnderlyingInstr()->mayHaveSideEffects();
222 }
223 default:
224 return true;
225 }
226}
227
229 assert(!Parent && "Recipe already in some VPBasicBlock");
230 assert(InsertPos->getParent() &&
231 "Insertion position not in any VPBasicBlock");
232 InsertPos->getParent()->insert(this, InsertPos->getIterator());
233}
234
235void VPRecipeBase::insertBefore(VPBasicBlock &BB,
237 assert(!Parent && "Recipe already in some VPBasicBlock");
238 assert(I == BB.end() || I->getParent() == &BB);
239 BB.insert(this, I);
240}
241
243 assert(!Parent && "Recipe already in some VPBasicBlock");
244 assert(InsertPos->getParent() &&
245 "Insertion position not in any VPBasicBlock");
246 InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
247}
248
250 assert(getParent() && "Recipe not in any VPBasicBlock");
252 Parent = nullptr;
253}
254
256 assert(getParent() && "Recipe not in any VPBasicBlock");
258}
259
262 insertAfter(InsertPos);
263}
264
270
272 // Get the underlying instruction for the recipe, if there is one. It is used
273 // to
274 // * decide if cost computation should be skipped for this recipe,
275 // * apply forced target instruction cost.
276 Instruction *UI = nullptr;
277 if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
278 UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
279 else if (auto *IG = dyn_cast<VPInterleaveBase>(this))
280 UI = IG->getInsertPos();
281 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
282 UI = &WidenMem->getIngredient();
283
284 InstructionCost RecipeCost;
285 if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
286 RecipeCost = 0;
287 } else {
288 RecipeCost = computeCost(VF, Ctx);
289 if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
290 RecipeCost.isValid()) {
291 if (UI)
293 else
294 RecipeCost = InstructionCost(0);
295 }
296 }
297
298 LLVM_DEBUG({
299 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
300 dump();
301 });
302 return RecipeCost;
303}
304
306 VPCostContext &Ctx) const {
307 llvm_unreachable("subclasses should implement computeCost");
308}
309
311 return (getVPRecipeID() >= VPFirstPHISC && getVPRecipeID() <= VPLastPHISC) ||
313}
314
316 auto *VPI = dyn_cast<VPInstruction>(this);
317 return VPI && Instruction::isCast(VPI->getOpcode());
318}
319
321 assert(OpType == Other.OpType && "OpType must match");
322 switch (OpType) {
323 case OperationType::OverflowingBinOp:
324 WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
325 WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
326 break;
327 case OperationType::Trunc:
328 TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
329 TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
330 break;
331 case OperationType::DisjointOp:
332 DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
333 break;
334 case OperationType::PossiblyExactOp:
335 ExactFlags.IsExact &= Other.ExactFlags.IsExact;
336 break;
337 case OperationType::GEPOp:
338 GEPFlagsStorage &= Other.GEPFlagsStorage;
339 break;
340 case OperationType::FPMathOp:
341 case OperationType::FCmp:
342 assert((OpType != OperationType::FCmp ||
343 FCmpFlags.CmpPredStorage == Other.FCmpFlags.CmpPredStorage) &&
344 "Cannot drop CmpPredicate");
345 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
346 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
347 break;
348 case OperationType::NonNegOp:
349 NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
350 break;
351 case OperationType::Cmp:
352 assert(CmpPredStorage == Other.CmpPredStorage &&
353 "Cannot drop CmpPredicate");
354 break;
355 case OperationType::ReductionOp:
356 assert(ReductionFlags.Kind == Other.ReductionFlags.Kind &&
357 "Cannot change RecurKind");
358 assert(ReductionFlags.IsOrdered == Other.ReductionFlags.IsOrdered &&
359 "Cannot change IsOrdered");
360 assert(ReductionFlags.IsInLoop == Other.ReductionFlags.IsInLoop &&
361 "Cannot change IsInLoop");
362 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
363 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
364 break;
365 case OperationType::Other:
366 break;
367 }
368}
369
371 assert((OpType == OperationType::FPMathOp || OpType == OperationType::FCmp ||
372 OpType == OperationType::ReductionOp ||
373 OpType == OperationType::Other) &&
374 "recipe doesn't have fast math flags");
375 if (OpType == OperationType::Other)
376 return FastMathFlags();
377 const FastMathFlagsTy &F = getFMFsRef();
378 FastMathFlags Res;
379 Res.setAllowReassoc(F.AllowReassoc);
380 Res.setNoNaNs(F.NoNaNs);
381 Res.setNoInfs(F.NoInfs);
382 Res.setNoSignedZeros(F.NoSignedZeros);
383 Res.setAllowReciprocal(F.AllowReciprocal);
384 Res.setAllowContract(F.AllowContract);
385 Res.setApproxFunc(F.ApproxFunc);
386 return Res;
387}
388
389#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
391
392void VPRecipeBase::print(raw_ostream &O, const Twine &Indent,
393 VPSlotTracker &SlotTracker) const {
394 printRecipe(O, Indent, SlotTracker);
395 if (auto DL = getDebugLoc()) {
396 O << ", !dbg ";
397 DL.print(O);
398 }
399
400 if (auto *Metadata = dyn_cast<VPIRMetadata>(this))
402}
403#endif
404
405template <unsigned PartOpIdx>
406VPValue *
408 if (U.getNumOperands() == PartOpIdx + 1)
409 return U.getOperand(PartOpIdx);
410 return nullptr;
411}
412
413template <unsigned PartOpIdx>
415 if (auto *UnrollPartOp = getUnrollPartOperand(U))
416 return cast<VPConstantInt>(UnrollPartOp)->getZExtValue();
417 return 0;
418}
419
420namespace llvm {
421template class VPUnrollPartAccessor<1>;
422template class VPUnrollPartAccessor<2>;
423template class VPUnrollPartAccessor<3>;
424}
425
427 const VPIRFlags &Flags, const VPIRMetadata &MD,
428 DebugLoc DL, const Twine &Name)
429 : VPRecipeWithIRFlags(VPRecipeBase::VPInstructionSC, Operands, Flags, DL),
430 VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) {
432 "Set flags not supported for the provided opcode");
434 "Opcode requires specific flags to be set");
438 "number of operands does not match opcode");
439}
440
441/// For call VPInstructions, return the operand index of the called function.
442/// The function is either the last operand (for unmasked calls) or the
443/// second-to-last operand (for masked calls).
444static unsigned getCalledFnOperandIndex(const VPInstruction &VPI) {
445 assert(VPI.getOpcode() == Instruction::Call && "must be a call");
446 unsigned NumOps = VPI.getNumOperands();
447 auto *LastOp = dyn_cast<VPIRValue>(VPI.getOperand(NumOps - 1));
448 if (LastOp && isa<Function>(LastOp->getValue()))
449 return NumOps - 1;
450 assert(
451 isa<Function>(cast<VPIRValue>(VPI.getOperand(NumOps - 2))->getValue()) &&
452 "expected function operand");
453 return NumOps - 2;
454}
455
456/// For call VPInstructions, return the called function.
458 unsigned Idx = getCalledFnOperandIndex(VPI);
459 return cast<Function>(cast<VPIRValue>(VPI.getOperand(Idx))->getValue());
460}
461
463 if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
464 return 1;
465
466 if (Instruction::isBinaryOp(Opcode))
467 return 2;
468
469 switch (Opcode) {
472 return 0;
473 case Instruction::Alloca:
474 case Instruction::ExtractValue:
475 case Instruction::Freeze:
476 case Instruction::Load:
489 return 1;
490 case Instruction::ICmp:
491 case Instruction::FCmp:
492 case Instruction::ExtractElement:
493 case Instruction::Store:
503 return 2;
504 case Instruction::Select:
507 return 3;
508 case Instruction::Call:
509 return getCalledFnOperandIndex(*this) + 1;
510 case Instruction::GetElementPtr:
511 case Instruction::PHI:
512 case Instruction::Switch:
522 // Cannot determine the number of operands from the opcode.
523 return -1u;
524 }
525 llvm_unreachable("all cases should be handled above");
526}
527
531
532bool VPInstruction::canGenerateScalarForFirstLane() const {
534 return true;
536 return true;
537 switch (Opcode) {
538 case Instruction::Freeze:
539 case Instruction::ICmp:
540 case Instruction::PHI:
541 case Instruction::Select:
551 return true;
552 default:
553 return false;
554 }
555}
556
557Value *VPInstruction::generate(VPTransformState &State) {
558 IRBuilderBase &Builder = State.Builder;
559
561 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
562 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
563 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
564 auto *Res =
565 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
566 if (auto *I = dyn_cast<Instruction>(Res))
567 applyFlags(*I);
568 return Res;
569 }
570
571 switch (getOpcode()) {
572 case VPInstruction::Not: {
573 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
574 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
575 return Builder.CreateNot(A, Name);
576 }
577 case Instruction::ExtractElement: {
578 assert(State.VF.isVector() && "Only extract elements from vectors");
579 if (auto *Idx = dyn_cast<VPConstantInt>(getOperand(1)))
580 return State.get(getOperand(0), VPLane(Idx->getZExtValue()));
581 Value *Vec = State.get(getOperand(0));
582 Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
583 return Builder.CreateExtractElement(Vec, Idx, Name);
584 }
585 case Instruction::Freeze: {
587 return Builder.CreateFreeze(Op, Name);
588 }
589 case Instruction::FCmp:
590 case Instruction::ICmp: {
591 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
592 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
593 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
594 return Builder.CreateCmp(getPredicate(), A, B, Name);
595 }
596 case Instruction::PHI: {
597 llvm_unreachable("should be handled by VPPhi::execute");
598 }
599 case Instruction::Select: {
600 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
601 Value *Cond =
602 State.get(getOperand(0),
603 OnlyFirstLaneUsed || vputils::isSingleScalar(getOperand(0)));
604 Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
605 Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
606 return Builder.CreateSelectFMF(Cond, Op1, Op2, getFastMathFlags(), Name);
607 }
609 // Get first lane of vector induction variable.
610 Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
611 // Get the original loop tripcount.
612 Value *ScalarTC = State.get(getOperand(1), VPLane(0));
613
614 // If this part of the active lane mask is scalar, generate the CMP directly
615 // to avoid unnecessary extracts.
616 if (State.VF.isScalar())
617 return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
618 Name);
619
620 ElementCount EC = State.VF.multiplyCoefficientBy(
621 cast<VPConstantInt>(getOperand(2))->getZExtValue());
622 auto *PredTy = VectorType::get(Builder.getInt1Ty(), EC);
623 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
624 {PredTy, ScalarTC->getType()},
625 {VIVElem0, ScalarTC}, nullptr, Name);
626 }
628 // Generate code to combine the previous and current values in vector v3.
629 //
630 // vector.ph:
631 // v_init = vector(..., ..., ..., a[-1])
632 // br vector.body
633 //
634 // vector.body
635 // i = phi [0, vector.ph], [i+4, vector.body]
636 // v1 = phi [v_init, vector.ph], [v2, vector.body]
637 // v2 = a[i, i+1, i+2, i+3];
638 // v3 = vector(v1(3), v2(0, 1, 2))
639
640 auto *V1 = State.get(getOperand(0));
641 if (!V1->getType()->isVectorTy())
642 return V1;
643 Value *V2 = State.get(getOperand(1));
644 return Builder.CreateVectorSpliceRight(V1, V2, 1, Name);
645 }
647 Value *ScalarTC = State.get(getOperand(0), VPLane(0));
648 Value *VFxUF = State.get(getOperand(1), VPLane(0));
649 Value *Sub = Builder.CreateSub(ScalarTC, VFxUF);
650 Value *Cmp =
651 Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, VFxUF);
653 return Builder.CreateSelect(Cmp, Sub, Zero);
654 }
656 // TODO: Restructure this code with an explicit remainder loop, vsetvli can
657 // be outside of the main loop.
658 Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
659 // Compute EVL
660 assert(AVL->getType()->isIntegerTy() &&
661 "Requested vector length should be an integer.");
662
663 assert(State.VF.isScalable() && "Expected scalable vector factor.");
664 Value *VFArg = Builder.getInt32(State.VF.getKnownMinValue());
665
666 Value *EVL = Builder.CreateIntrinsic(
667 Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
668 {AVL, VFArg, Builder.getTrue()});
669 return EVL;
670 }
672 Value *Cond = State.get(getOperand(0), VPLane(0));
673 // Replace the temporary unreachable terminator with a new conditional
674 // branch, hooking it up to backward destination for latch blocks now, and
675 // to forward destination(s) later when they are created.
676 // Second successor may be backwards - iff it is already in VPBB2IRBB.
677 VPBasicBlock *SecondVPSucc =
678 cast<VPBasicBlock>(getParent()->getSuccessors()[1]);
679 BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc);
680 BasicBlock *IRBB = State.CFG.VPBB2IRBB[getParent()];
681 auto *Br = Builder.CreateCondBr(Cond, IRBB, SecondIRSucc);
682 // First successor is always forward, reset it to nullptr.
683 Br->setSuccessor(0, nullptr);
685 applyMetadata(*Br);
686 return Br;
687 }
689 return Builder.CreateVectorSplat(
690 State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
691 }
693 // For struct types, we need to build a new 'wide' struct type, where each
694 // element is widened, i.e., we create a struct of vectors.
695 auto *StructTy =
697 Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF));
698 for (const auto &[LaneIndex, Op] : enumerate(operands())) {
699 for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
700 FieldIndex++) {
701 Value *ScalarValue =
702 Builder.CreateExtractValue(State.get(Op, true), FieldIndex);
703 Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex);
704 VectorValue =
705 Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex);
706 Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex);
707 }
708 }
709 return Res;
710 }
712 auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0));
713 auto NumOfElements = ElementCount::getFixed(getNumOperands());
714 Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements));
715 for (const auto &[Idx, Op] : enumerate(operands()))
716 Res = Builder.CreateInsertElement(Res, State.get(Op, true),
717 Builder.getInt32(Idx));
718 return Res;
719 }
721 if (State.VF.isScalar())
722 return State.get(getOperand(0), true);
723 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
725 // If this start vector is scaled then it should produce a vector with fewer
726 // elements than the VF.
727 ElementCount VF = State.VF.divideCoefficientBy(
728 cast<VPConstantInt>(getOperand(2))->getZExtValue());
729 auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true));
730 return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true),
731 Builder.getInt32(0));
732 }
734 RecurKind RK = getRecurKind();
735 bool IsOrdered = isReductionOrdered();
736 bool IsInLoop = isReductionInLoop();
738 "FindIV should use min/max reduction kinds");
739
740 // The recipe may have multiple operands to be reduced together.
741 unsigned NumOperandsToReduce = getNumOperands();
742 VectorParts RdxParts(NumOperandsToReduce);
743 for (unsigned Part = 0; Part < NumOperandsToReduce; ++Part)
744 RdxParts[Part] = State.get(getOperand(Part), IsInLoop);
745
746 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
748
749 // Reduce multiple operands into one.
750 Value *ReducedPartRdx = RdxParts[0];
751 if (IsOrdered) {
752 ReducedPartRdx = RdxParts[NumOperandsToReduce - 1];
753 } else {
754 // Floating-point operations should have some FMF to enable the reduction.
755 for (unsigned Part = 1; Part < NumOperandsToReduce; ++Part) {
756 Value *RdxPart = RdxParts[Part];
758 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
759 else {
760 // For sub-recurrences, each part's reduction variable is already
761 // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1)
763 RK == RecurKind::Sub
764 ? Instruction::Add
766 ReducedPartRdx =
767 Builder.CreateBinOp(Opcode, RdxPart, ReducedPartRdx, "bin.rdx");
768 }
769 }
770 }
771
772 // Create the reduction after the loop. Note that inloop reductions create
773 // the target reduction in the loop using a Reduction recipe.
774 if (State.VF.isVector() && !IsInLoop) {
775 // TODO: Support in-order reductions based on the recurrence descriptor.
776 // All ops in the reduction inherit fast-math-flags from the recurrence
777 // descriptor.
778 ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK);
779 }
780
781 return ReducedPartRdx;
782 }
785 unsigned Offset =
787 Value *Res;
788 if (State.VF.isVector()) {
789 assert(Offset <= State.VF.getKnownMinValue() &&
790 "invalid offset to extract from");
791 // Extract lane VF - Offset from the operand.
792 Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
793 } else {
794 // TODO: Remove ExtractLastLane for scalar VFs.
795 assert(Offset <= 1 && "invalid offset to extract from");
796 Res = State.get(getOperand(0));
797 }
799 Res->setName(Name);
800 return Res;
801 }
803 Value *A = State.get(getOperand(0));
804 Value *B = State.get(getOperand(1));
805 return Builder.CreateLogicalAnd(A, B, Name);
806 }
808 Value *A = State.get(getOperand(0));
809 Value *B = State.get(getOperand(1));
810 return Builder.CreateLogicalOr(A, B, Name);
811 }
813 assert((State.VF.isScalar() || vputils::onlyFirstLaneUsed(this)) &&
814 "can only generate first lane for PtrAdd");
815 Value *Ptr = State.get(getOperand(0), VPLane(0));
816 Value *Addend = State.get(getOperand(1), VPLane(0));
817 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
818 }
820 Value *Ptr =
822 Value *Addend = State.get(getOperand(1));
823 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
824 }
826 Value *Res = Builder.CreateFreeze(State.get(getOperand(0)));
827 for (VPValue *Op : drop_begin(operands()))
828 Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op)));
829 return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
830 }
832 assert(getNumOperands() != 2 && "ExtractLane from single source should be "
833 "simplified to ExtractElement.");
834 Value *LaneToExtract = State.get(getOperand(0), true);
835 Type *IdxTy = State.TypeAnalysis.inferScalarType(getOperand(0));
836 Value *Res = nullptr;
837 Value *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
838
839 for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
840 Value *VectorStart =
841 Builder.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Idx - 1));
842 Value *VectorIdx = Idx == 1
843 ? LaneToExtract
844 : Builder.CreateSub(LaneToExtract, VectorStart);
845 Value *Ext = State.VF.isScalar()
846 ? State.get(getOperand(Idx))
847 : Builder.CreateExtractElement(
848 State.get(getOperand(Idx)), VectorIdx);
849 if (Res) {
850 Value *Cmp = Builder.CreateICmpUGE(LaneToExtract, VectorStart);
851 Res = Builder.CreateSelect(Cmp, Ext, Res);
852 } else {
853 Res = Ext;
854 }
855 }
856 return Res;
857 }
859 Type *Ty = State.TypeAnalysis.inferScalarType(this);
860 if (getNumOperands() == 1) {
861 Value *Mask = State.get(getOperand(0));
862 return Builder.CreateCountTrailingZeroElems(Ty, Mask,
863 /*ZeroIsPoison=*/false, Name);
864 }
865 // If there are multiple operands, create a chain of selects to pick the
866 // first operand with an active lane and add the number of lanes of the
867 // preceding operands.
868 Value *RuntimeVF = getRuntimeVF(Builder, Ty, State.VF);
869 unsigned LastOpIdx = getNumOperands() - 1;
870 Value *Res = nullptr;
871 for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
872 Value *TrailingZeros =
873 State.VF.isScalar()
874 ? Builder.CreateZExt(
875 Builder.CreateICmpEQ(State.get(getOperand(Idx)),
876 Builder.getFalse()),
877 Ty)
879 Ty, State.get(getOperand(Idx)),
880 /*ZeroIsPoison=*/false, Name);
881 Value *Current = Builder.CreateAdd(
882 Builder.CreateMul(RuntimeVF, ConstantInt::get(Ty, Idx)),
883 TrailingZeros);
884 if (Res) {
885 Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF);
886 Res = Builder.CreateSelect(Cmp, Current, Res);
887 } else {
888 Res = Current;
889 }
890 }
891
892 return Res;
893 }
895 return State.get(getOperand(0), true);
897 return Builder.CreateVectorReverse(State.get(getOperand(0)), "reverse");
899 Value *Result = State.get(getOperand(0), /*IsScalar=*/true);
900 for (unsigned Idx = 1; Idx < getNumOperands(); Idx += 2) {
901 Value *Data = State.get(getOperand(Idx));
902 Value *Mask = State.get(getOperand(Idx + 1));
903 Type *VTy = Data->getType();
904
905 if (State.VF.isScalar())
906 Result = Builder.CreateSelect(Mask, Data, Result);
907 else
908 Result = Builder.CreateIntrinsic(
909 Intrinsic::experimental_vector_extract_last_active, {VTy},
910 {Data, Mask, Result});
911 }
912
913 return Result;
914 }
915 default:
916 llvm_unreachable("Unsupported opcode for instruction");
917 }
918}
919
921 unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
922 Type *ScalarTy = Ctx.Types.inferScalarType(this);
923 Type *ResultTy = VF.isVector() ? toVectorTy(ScalarTy, VF) : ScalarTy;
924 switch (Opcode) {
925 case Instruction::FNeg:
926 return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, Ctx.CostKind);
927 case Instruction::UDiv:
928 case Instruction::SDiv:
929 case Instruction::SRem:
930 case Instruction::URem:
931 case Instruction::Add:
932 case Instruction::FAdd:
933 case Instruction::Sub:
934 case Instruction::FSub:
935 case Instruction::Mul:
936 case Instruction::FMul:
937 case Instruction::FDiv:
938 case Instruction::FRem:
939 case Instruction::Shl:
940 case Instruction::LShr:
941 case Instruction::AShr:
942 case Instruction::And:
943 case Instruction::Or:
944 case Instruction::Xor: {
945 // Certain instructions can be cheaper if they have a constant second
946 // operand. One example of this are shifts on x86.
947 VPValue *RHS = getOperand(1);
948 TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(RHS);
949
950 if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
953
956 if (CtxI)
957 Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
958 return Ctx.TTI.getArithmeticInstrCost(
959 Opcode, ResultTy, Ctx.CostKind,
960 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
961 RHSInfo, Operands, CtxI, &Ctx.TLI);
962 }
963 case Instruction::Freeze:
964 // This opcode is unknown. Assume that it is the same as 'mul'.
965 return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, ResultTy,
966 Ctx.CostKind);
967 case Instruction::ExtractValue:
968 return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
969 Ctx.CostKind);
970 case Instruction::ICmp:
971 case Instruction::FCmp: {
972 Type *ScalarOpTy = Ctx.Types.inferScalarType(getOperand(0));
973 Type *OpTy = VF.isVector() ? toVectorTy(ScalarOpTy, VF) : ScalarOpTy;
975 return Ctx.TTI.getCmpSelInstrCost(
976 Opcode, OpTy, CmpInst::makeCmpResultType(OpTy), getPredicate(),
977 Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None},
978 {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
979 }
980 case Instruction::BitCast: {
981 Type *ScalarTy = Ctx.Types.inferScalarType(this);
982 if (ScalarTy->isPointerTy())
983 return 0;
984 [[fallthrough]];
985 }
986 case Instruction::SExt:
987 case Instruction::ZExt:
988 case Instruction::FPToUI:
989 case Instruction::FPToSI:
990 case Instruction::FPExt:
991 case Instruction::PtrToInt:
992 case Instruction::PtrToAddr:
993 case Instruction::IntToPtr:
994 case Instruction::SIToFP:
995 case Instruction::UIToFP:
996 case Instruction::Trunc:
997 case Instruction::FPTrunc:
998 case Instruction::AddrSpaceCast: {
999 // Computes the CastContextHint from a recipe that may access memory.
1000 auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
1001 if (isa<VPInterleaveBase>(R))
1003 if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R)) {
1004 // Only compute CCH for memory operations, matching the legacy model
1005 // which only considers loads/stores for cast context hints.
1006 auto *UI = cast<Instruction>(ReplicateRecipe->getUnderlyingValue());
1007 if (!isa<LoadInst, StoreInst>(UI))
1009 return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
1011 }
1012 const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
1013 if (WidenMemoryRecipe == nullptr)
1015 if (VF.isScalar())
1017 if (!WidenMemoryRecipe->isConsecutive())
1019 if (WidenMemoryRecipe->isMasked())
1022 };
1023
1024 VPValue *Operand = getOperand(0);
1026 bool IsReverse = false;
1027 // For Trunc/FPTrunc, get the context from the only user.
1028 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
1029 auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * {
1030 if (R->getNumUsers() == 0 || R->hasMoreThanOneUniqueUser())
1031 return nullptr;
1032 return dyn_cast<VPRecipeBase>(*R->user_begin());
1033 };
1034 if (VPRecipeBase *Recipe = GetOnlyUser(this)) {
1035 if (match(Recipe,
1039 Recipe = GetOnlyUser(cast<VPSingleDefRecipe>(Recipe));
1040 IsReverse = true;
1041 }
1042 if (Recipe)
1043 CCH = ComputeCCH(Recipe);
1044 }
1045 }
1046 // For Z/Sext, get the context from the operand.
1047 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
1048 Opcode == Instruction::FPExt) {
1049 if (auto *Recipe = Operand->getDefiningRecipe()) {
1050 VPValue *ReverseOp;
1051 if (match(Recipe,
1052 m_CombineOr(m_Reverse(m_VPValue(ReverseOp)),
1054 m_VPValue(ReverseOp))))) {
1055 Recipe = ReverseOp->getDefiningRecipe();
1056 IsReverse = true;
1057 }
1058 if (Recipe)
1059 CCH = ComputeCCH(Recipe);
1060 }
1061 }
1062 if (IsReverse && CCH != TTI::CastContextHint::None)
1064
1065 auto *ScalarSrcTy = Ctx.Types.inferScalarType(Operand);
1066 Type *SrcTy = VF.isVector() ? toVectorTy(ScalarSrcTy, VF) : ScalarSrcTy;
1067 // Arm TTI will use the underlying instruction to determine the cost.
1068 return Ctx.TTI.getCastInstrCost(
1069 Opcode, ResultTy, SrcTy, CCH, Ctx.CostKind,
1071 }
1072 case Instruction::Select: {
1074 bool IsScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
1075 Type *ScalarTy = Ctx.Types.inferScalarType(this);
1076
1077 VPValue *Op0, *Op1;
1078 bool IsLogicalAnd =
1079 match(this, m_c_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1)));
1080 bool IsLogicalOr =
1081 match(this, m_c_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
1082 // Also match the inverted forms:
1083 // select x, false, y --> !x & y (still AND)
1084 // select x, y, true --> !x | y (still OR)
1085 IsLogicalAnd |=
1086 match(this, m_Select(m_VPValue(Op0), m_False(), m_VPValue(Op1)));
1087 IsLogicalOr |=
1088 match(this, m_Select(m_VPValue(Op0), m_VPValue(Op1), m_True()));
1089
1090 if (!IsScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
1091 (IsLogicalAnd || IsLogicalOr)) {
1092 // select x, y, false --> x & y
1093 // select x, true, y --> x | y
1094 const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0);
1095 const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1);
1096
1098 if (SI && all_of(operands(),
1099 [](VPValue *Op) { return Op->getUnderlyingValue(); }))
1100 append_range(Operands, SI->operands());
1101 return Ctx.TTI.getArithmeticInstrCost(
1102 IsLogicalOr ? Instruction::Or : Instruction::And, ResultTy,
1103 Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
1104 }
1105
1106 Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
1107 if (!IsScalarCond && VF.isVector())
1108 CondTy = VectorType::get(CondTy, VF);
1109
1110 llvm::CmpPredicate Pred;
1111 if (!match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue())))
1112 if (auto *CondIRV = dyn_cast<VPIRValue>(getOperand(0)))
1113 if (auto *Cmp = dyn_cast<CmpInst>(CondIRV->getValue()))
1114 Pred = Cmp->getPredicate();
1115 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1116 return Ctx.TTI.getCmpSelInstrCost(
1117 Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
1118 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
1119 }
1120 }
1121 llvm_unreachable("called for unsupported opcode");
1122}
1123
1125 VPCostContext &Ctx) const {
1127 if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) {
1128 // TODO: Compute cost for VPInstructions without underlying values once
1129 // the legacy cost model has been retired.
1130 return 0;
1131 }
1132
1134 "Should only generate a vector value or single scalar, not scalars "
1135 "for all lanes.");
1137 getOpcode(),
1139 }
1140
1141 switch (getOpcode()) {
1142 case Instruction::Select: {
1144 match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue()));
1145 auto *CondTy = Ctx.Types.inferScalarType(getOperand(0));
1146 auto *VecTy = Ctx.Types.inferScalarType(getOperand(1));
1147 if (!vputils::onlyFirstLaneUsed(this)) {
1148 CondTy = toVectorTy(CondTy, VF);
1149 VecTy = toVectorTy(VecTy, VF);
1150 }
1151 return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred,
1152 Ctx.CostKind);
1153 }
1154 case Instruction::ExtractElement:
1156 if (VF.isScalar()) {
1157 // ExtractLane with VF=1 takes care of handling extracting across multiple
1158 // parts.
1159 return 0;
1160 }
1161
1162 // Add on the cost of extracting the element.
1163 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1164 return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
1165 Ctx.CostKind);
1166 }
1167 case VPInstruction::AnyOf: {
1168 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1169 return Ctx.TTI.getArithmeticReductionCost(
1170 Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
1171 }
1173 Type *Ty = Ctx.Types.inferScalarType(this);
1174 Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
1175 if (VF.isScalar())
1176 return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
1178 CmpInst::ICMP_EQ, Ctx.CostKind);
1179 // Calculate the cost of determining the lane index.
1180 auto *PredTy = toVectorTy(ScalarTy, VF);
1181 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
1182 {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
1183 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1184 }
1186 Type *Ty = Ctx.Types.inferScalarType(this);
1187 Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
1188 if (VF.isScalar())
1189 return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
1192 // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB.
1193 auto *PredTy = toVectorTy(ScalarTy, VF);
1194 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
1195 {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
1197 // Add cost of NOT operation on the predicate.
1199 Instruction::Xor, PredTy, Ctx.CostKind,
1200 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1201 {TargetTransformInfo::OK_UniformConstantValue,
1202 TargetTransformInfo::OP_None});
1203 // Add cost of SUB operation on the index.
1204 Cost += Ctx.TTI.getArithmeticInstrCost(Instruction::Sub, Ty, Ctx.CostKind);
1205 return Cost;
1206 }
1208 Type *ScalarTy = Ctx.Types.inferScalarType(this);
1209 Type *VecTy = toVectorTy(ScalarTy, VF);
1210 Type *MaskTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
1211 IntrinsicCostAttributes ICA(
1212 Intrinsic::experimental_vector_extract_last_active, ScalarTy,
1213 {VecTy, MaskTy, ScalarTy});
1214 return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind);
1215 }
1217 assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
1218 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1219 return Ctx.TTI.getShuffleCost(
1221 cast<VectorType>(VectorTy), {}, Ctx.CostKind, -1);
1222 }
1224 Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
1225 unsigned Multiplier = cast<VPConstantInt>(getOperand(2))->getZExtValue();
1226 Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
1227 IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
1228 {ArgTy, ArgTy});
1229 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1230 }
1232 Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0));
1233 Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
1234 Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
1235 IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
1236 I32Ty, {Arg0Ty, I32Ty, I1Ty});
1237 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1238 }
1240 assert(VF.isVector() && "Reverse operation must be vector type");
1241 Type *EltTy = Ctx.Types.inferScalarType(this);
1242 // Skip the reverse operation cost for the mask.
1243 // FIXME: Remove this once redundant mask reverse operations can be
1244 // eliminated by VPlanTransforms::cse before cost computation.
1245 if (EltTy->isIntegerTy(1))
1246 return 0;
1247 auto *VectorTy = cast<VectorType>(toVectorTy(EltTy, VF));
1249 VectorTy, /*Mask=*/{}, Ctx.CostKind,
1250 /*Index=*/0);
1251 }
1253 // Add on the cost of extracting the element.
1254 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1255 return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
1256 VecTy, Ctx.CostKind, 0);
1257 }
1258 case Instruction::FCmp:
1259 case Instruction::ICmp:
1260 // FIXME: We don't handle scalar compares here yet. Scalar compares used for
1261 // the loop exit condition are handled by the legacy cost model, but other
1262 // scalar compares (e.g. in the middle block deciding whether to execute the
1263 // scalar epilogue) aren't accounted for.
1265 return 0;
1266 return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
1268 if (VF == ElementCount::getScalable(1))
1270 [[fallthrough]];
1271 default:
1272 // TODO: Compute cost other VPInstructions once the legacy cost model has
1273 // been retired.
1275 "unexpected VPInstruction witht underlying value");
1276 return 0;
1277 }
1278}
1279
1291
1293 switch (getOpcode()) {
1294 case Instruction::Load:
1295 case Instruction::PHI:
1299 return true;
1300 default:
1301 return isScalarCast();
1302 }
1303}
1304
1306 assert(!isMasked() && "cannot execute masked VPInstruction");
1307 assert(!State.Lane && "VPInstruction executing an Lane");
1308 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1310 "Set flags not supported for the provided opcode");
1312 "Opcode requires specific flags to be set");
1313 if (hasFastMathFlags())
1314 State.Builder.setFastMathFlags(getFastMathFlags());
1315 Value *GeneratedValue = generate(State);
1316 if (!hasResult())
1317 return;
1318 assert(GeneratedValue && "generate must produce a value");
1319 bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
1322 assert((((GeneratedValue->getType()->isVectorTy() ||
1323 GeneratedValue->getType()->isStructTy()) ==
1324 !GeneratesPerFirstLaneOnly) ||
1325 State.VF.isScalar()) &&
1326 "scalar value but not only first lane defined");
1327 State.set(this, GeneratedValue,
1328 /*IsScalar*/ GeneratesPerFirstLaneOnly);
1330 // FIXME: This is a workaround to enable reliable updates of the scalar loop
1331 // resume phis, when vectorizing the epilogue. Must be removed once epilogue
1332 // vectorization explicitly connects VPlans.
1333 setUnderlyingValue(GeneratedValue);
1334 }
1335}
1336
1340 return false;
1341 switch (getOpcode()) {
1342 case Instruction::ExtractValue:
1343 case Instruction::InsertValue:
1344 case Instruction::GetElementPtr:
1345 case Instruction::ExtractElement:
1346 case Instruction::Freeze:
1347 case Instruction::FCmp:
1348 case Instruction::ICmp:
1349 case Instruction::Select:
1350 case Instruction::PHI:
1374 case VPInstruction::Not:
1383 return false;
1384 case Instruction::Call:
1385 return !getCalledFunction(*this)->doesNotAccessMemory();
1386 default:
1387 return true;
1388 }
1389}
1390
1392 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1394 return vputils::onlyFirstLaneUsed(this);
1395
1396 switch (getOpcode()) {
1397 default:
1398 return false;
1399 case Instruction::ExtractElement:
1400 return Op == getOperand(1);
1401 case Instruction::PHI:
1402 return true;
1403 case Instruction::FCmp:
1404 case Instruction::ICmp:
1405 case Instruction::Select:
1406 case Instruction::Or:
1407 case Instruction::Freeze:
1408 case VPInstruction::Not:
1409 // TODO: Cover additional opcodes.
1410 return vputils::onlyFirstLaneUsed(this);
1411 case Instruction::Load:
1421 return true;
1424 // Before replicating by VF, Build(Struct)Vector uses all lanes of the
1425 // operand, after replicating its operands only the first lane is used.
1426 // Before replicating, it will have only a single operand.
1427 return getNumOperands() > 1;
1429 return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
1431 // WidePtrAdd supports scalar and vector base addresses.
1432 return false;
1435 return Op == getOperand(0);
1436 };
1437 llvm_unreachable("switch should return");
1438}
1439
1441 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1443 return vputils::onlyFirstPartUsed(this);
1444
1445 switch (getOpcode()) {
1446 default:
1447 return false;
1448 case Instruction::FCmp:
1449 case Instruction::ICmp:
1450 case Instruction::Select:
1451 return vputils::onlyFirstPartUsed(this);
1456 return true;
1457 };
1458 llvm_unreachable("switch should return");
1459}
1460
1461#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1463 VPSlotTracker SlotTracker(getParent()->getPlan());
1465}
1466
1468 VPSlotTracker &SlotTracker) const {
1469 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1470
1471 if (hasResult()) {
1473 O << " = ";
1474 }
1475
1476 switch (getOpcode()) {
1477 case VPInstruction::Not:
1478 O << "not";
1479 break;
1481 O << "active lane mask";
1482 break;
1484 O << "EXPLICIT-VECTOR-LENGTH";
1485 break;
1487 O << "first-order splice";
1488 break;
1490 O << "branch-on-cond";
1491 break;
1493 O << "branch-on-two-conds";
1494 break;
1496 O << "TC > VF ? TC - VF : 0";
1497 break;
1499 O << "VF * Part +";
1500 break;
1502 O << "branch-on-count";
1503 break;
1505 O << "broadcast";
1506 break;
1508 O << "buildstructvector";
1509 break;
1511 O << "buildvector";
1512 break;
1514 O << "exiting-iv-value";
1515 break;
1517 O << "masked-cond";
1518 break;
1520 O << "extract-lane";
1521 break;
1523 O << "extract-last-lane";
1524 break;
1526 O << "extract-last-part";
1527 break;
1529 O << "extract-penultimate-element";
1530 break;
1532 O << "compute-reduction-result";
1533 break;
1535 O << "logical-and";
1536 break;
1538 O << "logical-or";
1539 break;
1541 O << "ptradd";
1542 break;
1544 O << "wide-ptradd";
1545 break;
1547 O << "any-of";
1548 break;
1550 O << "first-active-lane";
1551 break;
1553 O << "last-active-lane";
1554 break;
1556 O << "reduction-start-vector";
1557 break;
1559 O << "resume-for-epilogue";
1560 break;
1562 O << "reverse";
1563 break;
1565 O << "unpack";
1566 break;
1568 O << "extract-last-active";
1569 break;
1570 default:
1572 }
1573
1574 printFlags(O);
1576}
1577#endif
1578
1580 State.setDebugLocFrom(getDebugLoc());
1581 if (isScalarCast()) {
1582 Value *Op = State.get(getOperand(0), VPLane(0));
1583 Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
1584 Op, ResultTy);
1585 State.set(this, Cast, VPLane(0));
1586 return;
1587 }
1588 switch (getOpcode()) {
1590 Value *StepVector =
1591 State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF));
1592 State.set(this, StepVector);
1593 break;
1594 }
1595 case VPInstruction::VScale: {
1596 Value *VScale = State.Builder.CreateVScale(ResultTy);
1597 State.set(this, VScale, true);
1598 break;
1599 }
1600
1601 default:
1602 llvm_unreachable("opcode not implemented yet");
1603 }
1604}
1605
1606#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1608 VPSlotTracker &SlotTracker) const {
1609 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1611 O << " = ";
1612
1613 switch (getOpcode()) {
1615 O << "wide-iv-step ";
1617 break;
1619 O << "step-vector " << *ResultTy;
1620 break;
1622 O << "vscale " << *ResultTy;
1623 break;
1624 case Instruction::Load:
1625 O << "load ";
1627 break;
1628 default:
1629 assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
1632 O << " to " << *ResultTy;
1633 }
1634}
1635#endif
1636
1638 State.setDebugLocFrom(getDebugLoc());
1639 PHINode *NewPhi = State.Builder.CreatePHI(
1640 State.TypeAnalysis.inferScalarType(this), 2, getName());
1641 unsigned NumIncoming = getNumIncoming();
1642 // Detect header phis: the parent block dominates its second incoming block
1643 // (the latch). Those IR incoming values have not been generated yet and need
1644 // to be added after they have been executed.
1645 if (NumIncoming == 2 &&
1646 State.VPDT.dominates(getParent(), getIncomingBlock(1))) {
1647 NumIncoming = 1;
1648 }
1649 for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
1650 Value *IncV = State.get(getIncomingValue(Idx), VPLane(0));
1651 BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(getIncomingBlock(Idx));
1652 NewPhi->addIncoming(IncV, PredBB);
1653 }
1654 State.set(this, NewPhi, VPLane(0));
1655}
1656
1657#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1658void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent,
1659 VPSlotTracker &SlotTracker) const {
1660 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1662 O << " = phi";
1663 printFlags(O);
1665}
1666#endif
1667
1668VPIRInstruction *VPIRInstruction ::create(Instruction &I) {
1669 if (auto *Phi = dyn_cast<PHINode>(&I))
1670 return new VPIRPhi(*Phi);
1671 return new VPIRInstruction(I);
1672}
1673
1675 assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
1676 "PHINodes must be handled by VPIRPhi");
1677 // Advance the insert point after the wrapped IR instruction. This allows
1678 // interleaving VPIRInstructions and other recipes.
1679 State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
1680}
1681
1683 VPCostContext &Ctx) const {
1684 // The recipe wraps an existing IR instruction on the border of VPlan's scope,
1685 // hence it does not contribute to the cost-modeling for the VPlan.
1686 return 0;
1687}
1688
1689#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1691 VPSlotTracker &SlotTracker) const {
1692 O << Indent << "IR " << I;
1693}
1694#endif
1695
1697 PHINode *Phi = &getIRPhi();
1698 for (const auto &[Idx, Op] : enumerate(operands())) {
1699 VPValue *ExitValue = Op;
1700 auto Lane = vputils::isSingleScalar(ExitValue)
1702 : VPLane::getLastLaneForVF(State.VF);
1703 VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
1704 auto *PredVPBB = Pred->getExitingBasicBlock();
1705 BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
1706 // Set insertion point in PredBB in case an extract needs to be generated.
1707 // TODO: Model extracts explicitly.
1708 State.Builder.SetInsertPoint(PredBB->getTerminator());
1709 Value *V = State.get(ExitValue, VPLane(Lane));
1710 // If there is no existing block for PredBB in the phi, add a new incoming
1711 // value. Otherwise update the existing incoming value for PredBB.
1712 if (Phi->getBasicBlockIndex(PredBB) == -1)
1713 Phi->addIncoming(V, PredBB);
1714 else
1715 Phi->setIncomingValueForBlock(PredBB, V);
1716 }
1717
1718 // Advance the insert point after the wrapped IR instruction. This allows
1719 // interleaving VPIRInstructions and other recipes.
1720 State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator()));
1721}
1722
1724 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1725 assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
1726 "Number of phi operands must match number of predecessors");
1727 unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock);
1728 R->removeOperand(Position);
1729}
1730
1731VPValue *
1733 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1734 return getIncomingValue(R->getParent()->getIndexForPredecessor(VPBB));
1735}
1736
1738 VPValue *V) const {
1739 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1740 R->setOperand(R->getParent()->getIndexForPredecessor(VPBB), V);
1741}
1742
1743#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1745 VPSlotTracker &SlotTracker) const {
1746 interleaveComma(enumerate(getAsRecipe()->operands()), O,
1747 [this, &O, &SlotTracker](auto Op) {
1748 O << "[ ";
1749 Op.value()->printAsOperand(O, SlotTracker);
1750 O << ", ";
1751 getIncomingBlock(Op.index())->printAsOperand(O);
1752 O << " ]";
1753 });
1754}
1755#endif
1756
1757#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1759 VPSlotTracker &SlotTracker) const {
1761
1762 if (getNumOperands() != 0) {
1763 O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
1765 [&O, &SlotTracker](auto Op) {
1766 std::get<0>(Op)->printAsOperand(O, SlotTracker);
1767 O << " from ";
1768 std::get<1>(Op)->printAsOperand(O);
1769 });
1770 O << ")";
1771 }
1772}
1773#endif
1774
1776 for (const auto &[Kind, Node] : Metadata)
1777 I.setMetadata(Kind, Node);
1778}
1779
1781 SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection;
1782 for (const auto &[KindA, MDA] : Metadata) {
1783 for (const auto &[KindB, MDB] : Other.Metadata) {
1784 if (KindA == KindB && MDA == MDB) {
1785 MetadataIntersection.emplace_back(KindA, MDA);
1786 break;
1787 }
1788 }
1789 }
1790 Metadata = std::move(MetadataIntersection);
1791}
1792
1793#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1795 const Module *M = SlotTracker.getModule();
1796 if (Metadata.empty() || !M)
1797 return;
1798
1799 ArrayRef<StringRef> MDNames = SlotTracker.getMDNames();
1800 O << " (";
1801 interleaveComma(Metadata, O, [&](const auto &KindNodePair) {
1802 auto [Kind, Node] = KindNodePair;
1803 assert(Kind < MDNames.size() && !MDNames[Kind].empty() &&
1804 "Unexpected unnamed metadata kind");
1805 O << "!" << MDNames[Kind] << " ";
1806 Node->printAsOperand(O, M);
1807 });
1808 O << ")";
1809}
1810#endif
1811
1813 assert(State.VF.isVector() && "not widening");
1814 assert(Variant != nullptr && "Can't create vector function.");
1815
1816 FunctionType *VFTy = Variant->getFunctionType();
1817 // Add return type if intrinsic is overloaded on it.
1819 for (const auto &I : enumerate(args())) {
1820 Value *Arg;
1821 // Some vectorized function variants may also take a scalar argument,
1822 // e.g. linear parameters for pointers. This needs to be the scalar value
1823 // from the start of the respective part when interleaving.
1824 if (!VFTy->getParamType(I.index())->isVectorTy())
1825 Arg = State.get(I.value(), VPLane(0));
1826 else
1827 Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
1828 Args.push_back(Arg);
1829 }
1830
1833 if (CI)
1834 CI->getOperandBundlesAsDefs(OpBundles);
1835
1836 CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
1837 applyFlags(*V);
1838 applyMetadata(*V);
1839 V->setCallingConv(Variant->getCallingConv());
1840
1841 if (!V->getType()->isVoidTy())
1842 State.set(this, V);
1843}
1844
1846 VPCostContext &Ctx) const {
1847 return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
1848 Variant->getFunctionType()->params(),
1849 Ctx.CostKind);
1850}
1851
1852#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1854 VPSlotTracker &SlotTracker) const {
1855 O << Indent << "WIDEN-CALL ";
1856
1857 Function *CalledFn = getCalledScalarFunction();
1858 if (CalledFn->getReturnType()->isVoidTy())
1859 O << "void ";
1860 else {
1862 O << " = ";
1863 }
1864
1865 O << "call";
1866 printFlags(O);
1867 O << " @" << CalledFn->getName() << "(";
1868 interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
1869 Op->printAsOperand(O, SlotTracker);
1870 });
1871 O << ")";
1872
1873 O << " (using library function";
1874 if (Variant->hasName())
1875 O << ": " << Variant->getName();
1876 O << ")";
1877}
1878#endif
1879
1881 assert(State.VF.isVector() && "not widening");
1882
1883 SmallVector<Type *, 2> TysForDecl;
1884 // Add return type if intrinsic is overloaded on it.
1885 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1,
1886 State.TTI)) {
1887 Type *RetTy = toVectorizedTy(getResultType(), State.VF);
1888 ArrayRef<Type *> ContainedTys = getContainedTypes(RetTy);
1889 for (auto [Idx, Ty] : enumerate(ContainedTys)) {
1891 Idx, State.TTI))
1892 TysForDecl.push_back(Ty);
1893 }
1894 }
1896 for (const auto &I : enumerate(operands())) {
1897 // Some intrinsics have a scalar argument - don't replace it with a
1898 // vector.
1899 Value *Arg;
1900 if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
1901 State.TTI))
1902 Arg = State.get(I.value(), VPLane(0));
1903 else
1904 Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
1905 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
1906 State.TTI))
1907 TysForDecl.push_back(Arg->getType());
1908 Args.push_back(Arg);
1909 }
1910
1911 // Use vector version of the intrinsic.
1912 Module *M = State.Builder.GetInsertBlock()->getModule();
1913 Function *VectorF =
1914 Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
1915 assert(VectorF &&
1916 "Can't retrieve vector intrinsic or vector-predication intrinsics.");
1917
1920 if (CI)
1921 CI->getOperandBundlesAsDefs(OpBundles);
1922
1923 CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
1924
1925 applyFlags(*V);
1926 applyMetadata(*V);
1927
1928 if (!V->getType()->isVoidTy())
1929 State.set(this, V);
1930}
1931
1932/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R.
1935 const VPRecipeWithIRFlags &R,
1936 ElementCount VF,
1937 VPCostContext &Ctx) {
1938 Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
1939 // Skip the reverse operation cost for the mask.
1940 // FIXME: Remove this once redundant mask reverse operations can be eliminated
1941 // by VPlanTransforms::cse before cost computation.
1942 if (ID == Intrinsic::experimental_vp_reverse && ScalarRetTy->isIntegerTy(1))
1943 return InstructionCost(0);
1944
1945 // Some backends analyze intrinsic arguments to determine cost. Use the
1946 // underlying value for the operand if it has one. Otherwise try to use the
1947 // operand of the underlying call instruction, if there is one. Otherwise
1948 // clear Arguments.
1949 // TODO: Rework TTI interface to be independent of concrete IR values.
1951 for (const auto &[Idx, Op] : enumerate(Operands)) {
1952 auto *V = Op->getUnderlyingValue();
1953 if (!V) {
1954 if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) {
1955 Arguments.push_back(UI->getArgOperand(Idx));
1956 continue;
1957 }
1958 Arguments.clear();
1959 break;
1960 }
1961 Arguments.push_back(V);
1962 }
1963
1964 Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
1965 SmallVector<Type *> ParamTys;
1966 for (const VPValue *Op : Operands) {
1967 ParamTys.push_back(VF.isVector()
1968 ? toVectorTy(Ctx.Types.inferScalarType(Op), VF)
1969 : Ctx.Types.inferScalarType(Op));
1970 }
1971
1972 // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
1973 IntrinsicCostAttributes CostAttrs(
1974 ID, RetTy, Arguments, ParamTys, R.getFastMathFlags(),
1975 dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()),
1977 return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
1978}
1979
1981 VPCostContext &Ctx) const {
1983 return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx);
1984}
1985
1987 return Intrinsic::getBaseName(VectorIntrinsicID);
1988}
1989
1991 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1992 return all_of(enumerate(operands()), [this, &Op](const auto &X) {
1993 auto [Idx, V] = X;
1995 Idx, nullptr);
1996 });
1997}
1998
1999#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2001 VPSlotTracker &SlotTracker) const {
2002 O << Indent << "WIDEN-INTRINSIC ";
2003 if (ResultTy->isVoidTy()) {
2004 O << "void ";
2005 } else {
2007 O << " = ";
2008 }
2009
2010 O << "call";
2011 printFlags(O);
2012 O << getIntrinsicName() << "(";
2013
2015 Op->printAsOperand(O, SlotTracker);
2016 });
2017 O << ")";
2018}
2019#endif
2020
2022 IRBuilderBase &Builder = State.Builder;
2023
2024 Value *Address = State.get(getOperand(0));
2025 Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
2026 VectorType *VTy = cast<VectorType>(Address->getType());
2027
2028 // The histogram intrinsic requires a mask even if the recipe doesn't;
2029 // if the mask operand was omitted then all lanes should be executed and
2030 // we just need to synthesize an all-true mask.
2031 Value *Mask = nullptr;
2032 if (VPValue *VPMask = getMask())
2033 Mask = State.get(VPMask);
2034 else
2035 Mask =
2036 Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));
2037
2038 // If this is a subtract, we want to invert the increment amount. We may
2039 // add a separate intrinsic in future, but for now we'll try this.
2040 if (Opcode == Instruction::Sub)
2041 IncAmt = Builder.CreateNeg(IncAmt);
2042 else
2043 assert(Opcode == Instruction::Add && "only add or sub supported for now");
2044
2045 State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
2046 {VTy, IncAmt->getType()},
2047 {Address, IncAmt, Mask});
2048}
2049
2051 VPCostContext &Ctx) const {
2052 // FIXME: Take the gather and scatter into account as well. For now we're
2053 // generating the same cost as the fallback path, but we'll likely
2054 // need to create a new TTI method for determining the cost, including
2055 // whether we can use base + vec-of-smaller-indices or just
2056 // vec-of-pointers.
2057 assert(VF.isVector() && "Invalid VF for histogram cost");
2058 Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
2059 VPValue *IncAmt = getOperand(1);
2060 Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
2061 VectorType *VTy = VectorType::get(IncTy, VF);
2062
2063 // Assume that a non-constant update value (or a constant != 1) requires
2064 // a multiply, and add that into the cost.
2065 InstructionCost MulCost =
2066 Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
2067 if (match(IncAmt, m_One()))
2068 MulCost = TTI::TCC_Free;
2069
2070 // Find the cost of the histogram operation itself.
2071 Type *PtrTy = VectorType::get(AddressTy, VF);
2072 Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
2073 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
2074 Type::getVoidTy(Ctx.LLVMCtx),
2075 {PtrTy, IncTy, MaskTy});
2076
2077 // Add the costs together with the add/sub operation.
2078 return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
2079 Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
2080}
2081
2082#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2084 VPSlotTracker &SlotTracker) const {
2085 O << Indent << "WIDEN-HISTOGRAM buckets: ";
2087
2088 if (Opcode == Instruction::Sub)
2089 O << ", dec: ";
2090 else {
2091 assert(Opcode == Instruction::Add);
2092 O << ", inc: ";
2093 }
2095
2096 if (VPValue *Mask = getMask()) {
2097 O << ", mask: ";
2098 Mask->printAsOperand(O, SlotTracker);
2099 }
2100}
2101#endif
2102
2103VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
2104 AllowReassoc = FMF.allowReassoc();
2105 NoNaNs = FMF.noNaNs();
2106 NoInfs = FMF.noInfs();
2107 NoSignedZeros = FMF.noSignedZeros();
2108 AllowReciprocal = FMF.allowReciprocal();
2109 AllowContract = FMF.allowContract();
2110 ApproxFunc = FMF.approxFunc();
2111}
2112
2114 switch (Opcode) {
2115 case Instruction::Add:
2116 case Instruction::Sub:
2117 case Instruction::Mul:
2118 case Instruction::Shl:
2120 return WrapFlagsTy(false, false);
2121 case Instruction::Trunc:
2122 return TruncFlagsTy(false, false);
2123 case Instruction::Or:
2124 return DisjointFlagsTy(false);
2125 case Instruction::AShr:
2126 case Instruction::LShr:
2127 case Instruction::UDiv:
2128 case Instruction::SDiv:
2129 return ExactFlagsTy(false);
2130 case Instruction::GetElementPtr:
2133 return GEPNoWrapFlags::none();
2134 case Instruction::ZExt:
2135 case Instruction::UIToFP:
2136 return NonNegFlagsTy(false);
2137 case Instruction::FAdd:
2138 case Instruction::FSub:
2139 case Instruction::FMul:
2140 case Instruction::FDiv:
2141 case Instruction::FRem:
2142 case Instruction::FNeg:
2143 case Instruction::FPExt:
2144 case Instruction::FPTrunc:
2145 return FastMathFlags();
2146 case Instruction::ICmp:
2147 case Instruction::FCmp:
2149 llvm_unreachable("opcode requires explicit flags");
2150 default:
2151 return VPIRFlags();
2152 }
2153}
2154
2155#if !defined(NDEBUG)
2156bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
2157 switch (OpType) {
2158 case OperationType::OverflowingBinOp:
2159 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
2160 Opcode == Instruction::Mul || Opcode == Instruction::Shl ||
2161 Opcode == VPInstruction::VPInstruction::CanonicalIVIncrementForPart;
2162 case OperationType::Trunc:
2163 return Opcode == Instruction::Trunc;
2164 case OperationType::DisjointOp:
2165 return Opcode == Instruction::Or;
2166 case OperationType::PossiblyExactOp:
2167 return Opcode == Instruction::AShr || Opcode == Instruction::LShr ||
2168 Opcode == Instruction::UDiv || Opcode == Instruction::SDiv;
2169 case OperationType::GEPOp:
2170 return Opcode == Instruction::GetElementPtr ||
2171 Opcode == VPInstruction::PtrAdd ||
2172 Opcode == VPInstruction::WidePtrAdd;
2173 case OperationType::FPMathOp:
2174 return Opcode == Instruction::Call || Opcode == Instruction::FAdd ||
2175 Opcode == Instruction::FMul || Opcode == Instruction::FSub ||
2176 Opcode == Instruction::FNeg || Opcode == Instruction::FDiv ||
2177 Opcode == Instruction::FRem || Opcode == Instruction::FPExt ||
2178 Opcode == Instruction::FPTrunc || Opcode == Instruction::PHI ||
2179 Opcode == Instruction::Select ||
2180 Opcode == VPInstruction::WideIVStep ||
2182 case OperationType::FCmp:
2183 return Opcode == Instruction::FCmp;
2184 case OperationType::NonNegOp:
2185 return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP;
2186 case OperationType::Cmp:
2187 return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
2188 case OperationType::ReductionOp:
2190 case OperationType::Other:
2191 return true;
2192 }
2193 llvm_unreachable("Unknown OperationType enum");
2194}
2195
2196bool VPIRFlags::hasRequiredFlagsForOpcode(unsigned Opcode) const {
2197 // Handle opcodes without default flags.
2198 if (Opcode == Instruction::ICmp)
2199 return OpType == OperationType::Cmp;
2200 if (Opcode == Instruction::FCmp)
2201 return OpType == OperationType::FCmp;
2203 return OpType == OperationType::ReductionOp;
2204
2205 OperationType Required = getDefaultFlags(Opcode).OpType;
2206 return Required == OperationType::Other || Required == OpType;
2207}
2208#endif
2209
2210#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2212 switch (OpType) {
2213 case OperationType::Cmp:
2215 break;
2216 case OperationType::FCmp:
2219 break;
2220 case OperationType::DisjointOp:
2221 if (DisjointFlags.IsDisjoint)
2222 O << " disjoint";
2223 break;
2224 case OperationType::PossiblyExactOp:
2225 if (ExactFlags.IsExact)
2226 O << " exact";
2227 break;
2228 case OperationType::OverflowingBinOp:
2229 if (WrapFlags.HasNUW)
2230 O << " nuw";
2231 if (WrapFlags.HasNSW)
2232 O << " nsw";
2233 break;
2234 case OperationType::Trunc:
2235 if (TruncFlags.HasNUW)
2236 O << " nuw";
2237 if (TruncFlags.HasNSW)
2238 O << " nsw";
2239 break;
2240 case OperationType::FPMathOp:
2242 break;
2243 case OperationType::GEPOp: {
2245 if (Flags.isInBounds())
2246 O << " inbounds";
2247 else if (Flags.hasNoUnsignedSignedWrap())
2248 O << " nusw";
2249 if (Flags.hasNoUnsignedWrap())
2250 O << " nuw";
2251 break;
2252 }
2253 case OperationType::NonNegOp:
2254 if (NonNegFlags.NonNeg)
2255 O << " nneg";
2256 break;
2257 case OperationType::ReductionOp: {
2258 RecurKind RK = getRecurKind();
2259 O << " (";
2260 switch (RK) {
2261 case RecurKind::AnyOf:
2262 O << "any-of";
2263 break;
2265 O << "find-last";
2266 break;
2267 case RecurKind::SMax:
2268 O << "smax";
2269 break;
2270 case RecurKind::SMin:
2271 O << "smin";
2272 break;
2273 case RecurKind::UMax:
2274 O << "umax";
2275 break;
2276 case RecurKind::UMin:
2277 O << "umin";
2278 break;
2279 case RecurKind::FMinNum:
2280 O << "fminnum";
2281 break;
2282 case RecurKind::FMaxNum:
2283 O << "fmaxnum";
2284 break;
2286 O << "fminimum";
2287 break;
2289 O << "fmaximum";
2290 break;
2292 O << "fminimumnum";
2293 break;
2295 O << "fmaximumnum";
2296 break;
2297 default:
2299 break;
2300 }
2301 if (isReductionInLoop())
2302 O << ", in-loop";
2303 if (isReductionOrdered())
2304 O << ", ordered";
2305 O << ")";
2307 break;
2308 }
2309 case OperationType::Other:
2310 break;
2311 }
2312 O << " ";
2313}
2314#endif
2315
2317 auto &Builder = State.Builder;
2318 switch (Opcode) {
2319 case Instruction::Call:
2320 case Instruction::UncondBr:
2321 case Instruction::CondBr:
2322 case Instruction::PHI:
2323 case Instruction::GetElementPtr:
2324 llvm_unreachable("This instruction is handled by a different recipe.");
2325 case Instruction::UDiv:
2326 case Instruction::SDiv:
2327 case Instruction::SRem:
2328 case Instruction::URem:
2329 case Instruction::Add:
2330 case Instruction::FAdd:
2331 case Instruction::Sub:
2332 case Instruction::FSub:
2333 case Instruction::FNeg:
2334 case Instruction::Mul:
2335 case Instruction::FMul:
2336 case Instruction::FDiv:
2337 case Instruction::FRem:
2338 case Instruction::Shl:
2339 case Instruction::LShr:
2340 case Instruction::AShr:
2341 case Instruction::And:
2342 case Instruction::Or:
2343 case Instruction::Xor: {
2344 // Just widen unops and binops.
2346 for (VPValue *VPOp : operands())
2347 Ops.push_back(State.get(VPOp));
2348
2349 Value *V = Builder.CreateNAryOp(Opcode, Ops);
2350
2351 if (auto *VecOp = dyn_cast<Instruction>(V)) {
2352 applyFlags(*VecOp);
2353 applyMetadata(*VecOp);
2354 }
2355
2356 // Use this vector value for all users of the original instruction.
2357 State.set(this, V);
2358 break;
2359 }
2360 case Instruction::ExtractValue: {
2361 assert(getNumOperands() == 2 && "expected single level extractvalue");
2362 Value *Op = State.get(getOperand(0));
2363 Value *Extract = Builder.CreateExtractValue(
2364 Op, cast<VPConstantInt>(getOperand(1))->getZExtValue());
2365 State.set(this, Extract);
2366 break;
2367 }
2368 case Instruction::Freeze: {
2369 Value *Op = State.get(getOperand(0));
2370 Value *Freeze = Builder.CreateFreeze(Op);
2371 State.set(this, Freeze);
2372 break;
2373 }
2374 case Instruction::ICmp:
2375 case Instruction::FCmp: {
2376 // Widen compares. Generate vector compares.
2377 bool FCmp = Opcode == Instruction::FCmp;
2378 Value *A = State.get(getOperand(0));
2379 Value *B = State.get(getOperand(1));
2380 Value *C = nullptr;
2381 if (FCmp) {
2382 C = Builder.CreateFCmp(getPredicate(), A, B);
2383 } else {
2384 C = Builder.CreateICmp(getPredicate(), A, B);
2385 }
2386 if (auto *I = dyn_cast<Instruction>(C)) {
2387 applyFlags(*I);
2388 applyMetadata(*I);
2389 }
2390 State.set(this, C);
2391 break;
2392 }
2393 case Instruction::Select: {
2394 VPValue *CondOp = getOperand(0);
2395 Value *Cond = State.get(CondOp, vputils::isSingleScalar(CondOp));
2396 Value *Op0 = State.get(getOperand(1));
2397 Value *Op1 = State.get(getOperand(2));
2398 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
2399 State.set(this, Sel);
2400 if (auto *I = dyn_cast<Instruction>(Sel)) {
2402 applyFlags(*I);
2403 applyMetadata(*I);
2404 }
2405 break;
2406 }
2407 default:
2408 // This instruction is not vectorized by simple widening.
2409 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
2410 << Instruction::getOpcodeName(Opcode));
2411 llvm_unreachable("Unhandled instruction!");
2412 } // end of switch.
2413
2414#if !defined(NDEBUG)
2415 // Verify that VPlan type inference results agree with the type of the
2416 // generated values.
2417 assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
2418 State.get(this)->getType() &&
2419 "inferred type and type from generated instructions do not match");
2420#endif
2421}
2422
2424 VPCostContext &Ctx) const {
2425 switch (Opcode) {
2426 case Instruction::UDiv:
2427 case Instruction::SDiv:
2428 case Instruction::SRem:
2429 case Instruction::URem:
2430 // If the div/rem operation isn't safe to speculate and requires
2431 // predication, then the only way we can even create a vplan is to insert
2432 // a select on the second input operand to ensure we use the value of 1
2433 // for the inactive lanes. The select will be costed separately.
2434 case Instruction::FNeg:
2435 case Instruction::Add:
2436 case Instruction::FAdd:
2437 case Instruction::Sub:
2438 case Instruction::FSub:
2439 case Instruction::Mul:
2440 case Instruction::FMul:
2441 case Instruction::FDiv:
2442 case Instruction::FRem:
2443 case Instruction::Shl:
2444 case Instruction::LShr:
2445 case Instruction::AShr:
2446 case Instruction::And:
2447 case Instruction::Or:
2448 case Instruction::Xor:
2449 case Instruction::Freeze:
2450 case Instruction::ExtractValue:
2451 case Instruction::ICmp:
2452 case Instruction::FCmp:
2453 case Instruction::Select:
2454 return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
2455 default:
2456 llvm_unreachable("Unsupported opcode for instruction");
2457 }
2458}
2459
2460#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2462 VPSlotTracker &SlotTracker) const {
2463 O << Indent << "WIDEN ";
2465 O << " = " << Instruction::getOpcodeName(Opcode);
2466 printFlags(O);
2468}
2469#endif
2470
2472 auto &Builder = State.Builder;
2473 /// Vectorize casts.
2474 assert(State.VF.isVector() && "Not vectorizing?");
2475 Type *DestTy = VectorType::get(getResultType(), State.VF);
2476 VPValue *Op = getOperand(0);
2477 Value *A = State.get(Op);
2478 Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
2479 State.set(this, Cast);
2480 if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
2481 applyFlags(*CastOp);
2482 applyMetadata(*CastOp);
2483 }
2484}
2485
2487 VPCostContext &Ctx) const {
2488 // TODO: In some cases, VPWidenCastRecipes are created but not considered in
2489 // the legacy cost model, including truncates/extends when evaluating a
2490 // reduction in a smaller type.
2491 if (!getUnderlyingValue())
2492 return 0;
2493 return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
2494}
2495
2496#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2498 VPSlotTracker &SlotTracker) const {
2499 O << Indent << "WIDEN-CAST ";
2501 O << " = " << Instruction::getOpcodeName(Opcode);
2502 printFlags(O);
2504 O << " to " << *getResultType();
2505}
2506#endif
2507
2509 VPCostContext &Ctx) const {
2510 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
2511}
2512
2513#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2515 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
2516 O << Indent;
2518 O << " = WIDEN-INDUCTION";
2519 printFlags(O);
2521
2522 if (auto *TI = getTruncInst())
2523 O << " (truncated to " << *TI->getType() << ")";
2524}
2525#endif
2526
2528 // The step may be defined by a recipe in the preheader (e.g. if it requires
2529 // SCEV expansion), but for the canonical induction the step is required to be
2530 // 1, which is represented as live-in.
2531 return match(getStartValue(), m_ZeroInt()) &&
2532 match(getStepValue(), m_One()) &&
2533 getScalarType() == getRegion()->getCanonicalIVType();
2534}
2535
2536#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2538 VPSlotTracker &SlotTracker) const {
2539 O << Indent;
2541 O << " = DERIVED-IV ";
2542 getStartValue()->printAsOperand(O, SlotTracker);
2543 O << " + ";
2544 getOperand(1)->printAsOperand(O, SlotTracker);
2545 O << " * ";
2546 getStepValue()->printAsOperand(O, SlotTracker);
2547}
2548#endif
2549
2551 // Fast-math-flags propagate from the original induction instruction.
2552 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
2553 State.Builder.setFastMathFlags(getFastMathFlags());
2554
2555 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
2556 /// variable on which to base the steps, \p Step is the size of the step.
2557
2558 Value *BaseIV = State.get(getOperand(0), VPLane(0));
2559 Value *Step = State.get(getStepValue(), VPLane(0));
2560 IRBuilderBase &Builder = State.Builder;
2561
2562 // Ensure step has the same type as that of scalar IV.
2563 Type *BaseIVTy = BaseIV->getType()->getScalarType();
2564 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
2565
2566 // We build scalar steps for both integer and floating-point induction
2567 // variables. Here, we determine the kind of arithmetic we will perform.
2570 if (BaseIVTy->isIntegerTy()) {
2571 AddOp = Instruction::Add;
2572 MulOp = Instruction::Mul;
2573 } else {
2574 AddOp = InductionOpcode;
2575 MulOp = Instruction::FMul;
2576 }
2577
2578 // Determine the number of scalars we need to generate.
2579 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
2580 // Compute the scalar steps and save the results in State.
2581
2582 unsigned StartLane = 0;
2583 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2584 if (State.Lane) {
2585 StartLane = State.Lane->getKnownLane();
2586 EndLane = StartLane + 1;
2587 }
2588 Value *StartIdx0 = getStartIndex() ? State.get(getStartIndex(), true)
2589 : Constant::getNullValue(BaseIVTy);
2590
2591 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
2592 // It is okay if the induction variable type cannot hold the lane number,
2593 // we expect truncation in this case.
2594 Constant *LaneValue =
2595 BaseIVTy->isIntegerTy()
2596 ? ConstantInt::get(BaseIVTy, Lane, /*IsSigned=*/false,
2597 /*ImplicitTrunc=*/true)
2598 : ConstantFP::get(BaseIVTy, Lane);
2599 Value *StartIdx = Builder.CreateBinOp(AddOp, StartIdx0, LaneValue);
2600 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2601 "Expected StartIdx to be folded to a constant when VF is not "
2602 "scalable");
2603 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2604 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
2605 State.set(this, Add, VPLane(Lane));
2606 }
2607}
2608
2609#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2611 VPSlotTracker &SlotTracker) const {
2612 O << Indent;
2614 O << " = SCALAR-STEPS ";
2616}
2617#endif
2618
2620 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
2622}
2623
2625 assert(State.VF.isVector() && "not widening");
2626 // Construct a vector GEP by widening the operands of the scalar GEP as
2627 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
2628 // results in a vector of pointers when at least one operand of the GEP
2629 // is vector-typed. Thus, to keep the representation compact, we only use
2630 // vector-typed operands for loop-varying values.
2631
2632 bool AllOperandsAreInvariant = all_of(operands(), [](VPValue *Op) {
2633 return Op->isDefinedOutsideLoopRegions();
2634 });
2635 if (AllOperandsAreInvariant) {
2636 // If we are vectorizing, but the GEP has only loop-invariant operands,
2637 // the GEP we build (by only using vector-typed operands for
2638 // loop-varying values) would be a scalar pointer. Thus, to ensure we
2639 // produce a vector of pointers, we need to either arbitrarily pick an
2640 // operand to broadcast, or broadcast a clone of the original GEP.
2641 // Here, we broadcast a clone of the original.
2642
2644 for (unsigned I = 0, E = getNumOperands(); I != E; I++)
2645 Ops.push_back(State.get(getOperand(I), VPLane(0)));
2646
2647 auto *NewGEP =
2648 State.Builder.CreateGEP(getSourceElementType(), Ops[0], drop_begin(Ops),
2649 "", getGEPNoWrapFlags());
2650 Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
2651 State.set(this, Splat);
2652 return;
2653 }
2654
2655 // If the GEP has at least one loop-varying operand, we are sure to
2656 // produce a vector of pointers unless VF is scalar.
2657 // The pointer operand of the new GEP. If it's loop-invariant, we
2658 // won't broadcast it.
2659 auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant());
2660
2661 // Collect all the indices for the new GEP. If any index is
2662 // loop-invariant, we won't broadcast it.
2664 for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
2665 VPValue *Operand = getOperand(I);
2666 Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1)));
2667 }
2668
2669 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
2670 // but it should be a vector, otherwise.
2671 auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices,
2672 "", getGEPNoWrapFlags());
2673 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
2674 "NewGEP is not a pointer vector");
2675 State.set(this, NewGEP);
2676}
2677
2678#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2680 VPSlotTracker &SlotTracker) const {
2681 O << Indent << "WIDEN-GEP ";
2682 O << (isPointerLoopInvariant() ? "Inv" : "Var");
2683 for (size_t I = 0; I < getNumOperands() - 1; ++I)
2684 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
2685
2686 O << " ";
2688 O << " = getelementptr";
2689 printFlags(O);
2691}
2692#endif
2693
2695 assert(!getOffset() && "Unexpected offset operand");
2696 VPBuilder Builder(this);
2697 VPlan &Plan = *getParent()->getPlan();
2698 VPValue *VFVal = getVFValue();
2699 VPTypeAnalysis TypeInfo(Plan);
2700 const DataLayout &DL = Plan.getDataLayout();
2701 Type *IndexTy = DL.getIndexType(TypeInfo.inferScalarType(this));
2702 VPValue *Stride =
2703 Plan.getConstantInt(IndexTy, getStride(), /*IsSigned=*/true);
2704 Type *VFTy = TypeInfo.inferScalarType(VFVal);
2705 VPValue *VF = Builder.createScalarZExtOrTrunc(VFVal, IndexTy, VFTy,
2707
2708 // Offset for Part0 = Offset0 = Stride * (VF - 1).
2709 VPInstruction *VFMinusOne =
2710 Builder.createSub(VF, Plan.getConstantInt(IndexTy, 1u),
2711 DebugLoc::getUnknown(), "", {true, true});
2712 VPInstruction *Offset0 =
2713 Builder.createOverflowingOp(Instruction::Mul, {VFMinusOne, Stride});
2714
2715 // Offset for PartN = Offset0 + Part * Stride * VF.
2716 VPValue *PartxStride =
2717 Plan.getConstantInt(IndexTy, Part * getStride(), /*IsSigned=*/true);
2718 VPValue *Offset = Builder.createAdd(
2719 Offset0,
2720 Builder.createOverflowingOp(Instruction::Mul, {PartxStride, VF}));
2722}
2723
2725 auto &Builder = State.Builder;
2726 assert(getOffset() && "Expected prior materialization of offset");
2727 Value *Ptr = State.get(getPointer(), true);
2728 Value *Offset = State.get(getOffset(), true);
2729 Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Offset, "",
2731 State.set(this, ResultPtr, /*IsScalar*/ true);
2732}
2733
2734#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2736 VPSlotTracker &SlotTracker) const {
2737 O << Indent;
2739 O << " = vector-end-pointer";
2740 printFlags(O);
2742}
2743#endif
2744
2746 auto &Builder = State.Builder;
2747 assert(getOffset() &&
2748 "Expected prior simplification of recipe without offset");
2749 Value *Ptr = State.get(getOperand(0), VPLane(0));
2750 Value *Offset = State.get(getOffset(), true);
2751 Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Offset, "",
2753 State.set(this, ResultPtr, /*IsScalar*/ true);
2754}
2755
2756#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2758 VPSlotTracker &SlotTracker) const {
2759 O << Indent;
2761 O << " = vector-pointer";
2762 printFlags(O);
2764}
2765#endif
2766
2768 VPCostContext &Ctx) const {
2769 // A blend will be expanded to a select VPInstruction, which will generate a
2770 // scalar select if only the first lane is used.
2772 VF = ElementCount::getFixed(1);
2773
2774 Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
2775 Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
2776 return (getNumIncomingValues() - 1) *
2777 Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
2778 CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
2779}
2780
2781#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2783 VPSlotTracker &SlotTracker) const {
2784 O << Indent << "BLEND ";
2786 O << " =";
2787 printFlags(O);
2788 if (getNumIncomingValues() == 1) {
2789 // Not a User of any mask: not really blending, this is a
2790 // single-predecessor phi.
2791 getIncomingValue(0)->printAsOperand(O, SlotTracker);
2792 } else {
2793 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
2794 if (I != 0)
2795 O << " ";
2796 getIncomingValue(I)->printAsOperand(O, SlotTracker);
2797 if (I == 0 && isNormalized())
2798 continue;
2799 O << "/";
2800 getMask(I)->printAsOperand(O, SlotTracker);
2801 }
2802 }
2803}
2804#endif
2805
2807 assert(!State.Lane && "Reduction being replicated.");
2810 "In-loop AnyOf reductions aren't currently supported");
2811 // Propagate the fast-math flags carried by the underlying instruction.
2812 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
2813 State.Builder.setFastMathFlags(getFastMathFlags());
2814 Value *NewVecOp = State.get(getVecOp());
2815 if (VPValue *Cond = getCondOp()) {
2816 Value *NewCond = State.get(Cond, State.VF.isScalar());
2817 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
2818 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
2819
2820 Value *Start = getRecurrenceIdentity(Kind, ElementTy, getFastMathFlags());
2821 if (State.VF.isVector())
2822 Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
2823
2824 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
2825 NewVecOp = Select;
2826 }
2827 Value *NewRed;
2828 Value *NextInChain;
2829 if (isOrdered()) {
2830 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2831 if (State.VF.isVector())
2832 NewRed =
2833 createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain);
2834 else
2835 NewRed = State.Builder.CreateBinOp(
2837 PrevInChain, NewVecOp);
2838 PrevInChain = NewRed;
2839 NextInChain = NewRed;
2840 } else if (isPartialReduction()) {
2841 assert((Kind == RecurKind::Add || Kind == RecurKind::FAdd) &&
2842 "Unexpected partial reduction kind");
2843 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ false);
2844 NewRed = State.Builder.CreateIntrinsic(
2845 PrevInChain->getType(),
2846 Kind == RecurKind::Add ? Intrinsic::vector_partial_reduce_add
2847 : Intrinsic::vector_partial_reduce_fadd,
2848 {PrevInChain, NewVecOp}, State.Builder.getFastMathFlags(),
2849 "partial.reduce");
2850 PrevInChain = NewRed;
2851 NextInChain = NewRed;
2852 } else {
2853 assert(isInLoop() &&
2854 "The reduction must either be ordered, partial or in-loop");
2855 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2856 NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind);
2858 NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain);
2859 else
2860 NextInChain = State.Builder.CreateBinOp(
2862 PrevInChain, NewRed);
2863 }
2864 State.set(this, NextInChain, /*IsScalar*/ !isPartialReduction());
2865}
2866
2868 assert(!State.Lane && "Reduction being replicated.");
2869
2870 auto &Builder = State.Builder;
2871 // Propagate the fast-math flags carried by the underlying instruction.
2872 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
2873 Builder.setFastMathFlags(getFastMathFlags());
2874
2876 Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
2877 Value *VecOp = State.get(getVecOp());
2878 Value *EVL = State.get(getEVL(), VPLane(0));
2879
2880 Value *Mask;
2881 if (VPValue *CondOp = getCondOp())
2882 Mask = State.get(CondOp);
2883 else
2884 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2885
2886 Value *NewRed;
2887 if (isOrdered()) {
2888 NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL);
2889 } else {
2890 NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL);
2892 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
2893 else
2894 NewRed = Builder.CreateBinOp(
2896 Prev);
2897 }
2898 State.set(this, NewRed, /*IsScalar*/ true);
2899}
2900
2902 VPCostContext &Ctx) const {
2903 RecurKind RdxKind = getRecurrenceKind();
2904 Type *ElementTy = Ctx.Types.inferScalarType(this);
2905 auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
2906 unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
2908 std::optional<FastMathFlags> OptionalFMF =
2909 ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
2910
2911 if (isPartialReduction()) {
2912 InstructionCost CondCost = 0;
2913 if (isConditional()) {
2915 auto *CondTy = cast<VectorType>(
2916 toVectorTy(Ctx.Types.inferScalarType(getCondOp()), VF));
2917 CondCost = Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VectorTy,
2918 CondTy, Pred, Ctx.CostKind);
2919 }
2920 return CondCost + Ctx.TTI.getPartialReductionCost(
2921 Opcode, ElementTy, ElementTy, ElementTy, VF,
2922 TTI::PR_None, TTI::PR_None, {}, Ctx.CostKind,
2923 OptionalFMF);
2924 }
2925
2926 // TODO: Support any-of reductions.
2927 assert(
2929 ForceTargetInstructionCost.getNumOccurrences() > 0) &&
2930 "Any-of reduction not implemented in VPlan-based cost model currently.");
2931
2932 // Note that TTI should model the cost of moving result to the scalar register
2933 // and the BinOp cost in the getMinMaxReductionCost().
2936 return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
2937 }
2938
2939 // Note that TTI should model the cost of moving result to the scalar register
2940 // and the BinOp cost in the getArithmeticReductionCost().
2941 return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
2942 Ctx.CostKind);
2943}
2944
2945VPExpressionRecipe::VPExpressionRecipe(
2946 ExpressionTypes ExpressionType,
2947 ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
2948 : VPSingleDefRecipe(VPRecipeBase::VPExpressionSC, {}, {}),
2949 ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) {
2950 assert(!ExpressionRecipes.empty() && "Nothing to combine?");
2951 assert(
2952 none_of(ExpressionRecipes,
2953 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
2954 "expression cannot contain recipes with side-effects");
2955
2956 // Maintain a copy of the expression recipes as a set of users.
2957 SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
2958 for (auto *R : ExpressionRecipes)
2959 ExpressionRecipesAsSetOfUsers.insert(R);
2960
2961 // Recipes in the expression, except the last one, must only be used by
2962 // (other) recipes inside the expression. If there are other users, external
2963 // to the expression, use a clone of the recipe for external users.
2964 for (VPSingleDefRecipe *R : reverse(ExpressionRecipes)) {
2965 if (R != ExpressionRecipes.back() &&
2966 any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
2967 return !ExpressionRecipesAsSetOfUsers.contains(U);
2968 })) {
2969 // There are users outside of the expression. Clone the recipe and use the
2970 // clone those external users.
2971 VPSingleDefRecipe *CopyForExtUsers = R->clone();
2972 R->replaceUsesWithIf(CopyForExtUsers, [&ExpressionRecipesAsSetOfUsers](
2973 VPUser &U, unsigned) {
2974 return !ExpressionRecipesAsSetOfUsers.contains(&U);
2975 });
2976 CopyForExtUsers->insertBefore(R);
2977 }
2978 if (R->getParent())
2979 R->removeFromParent();
2980 }
2981
2982 // Internalize all external operands to the expression recipes. To do so,
2983 // create new temporary VPValues for all operands defined by a recipe outside
2984 // the expression. The original operands are added as operands of the
2985 // VPExpressionRecipe itself.
2986 for (auto *R : ExpressionRecipes) {
2987 for (const auto &[Idx, Op] : enumerate(R->operands())) {
2988 auto *Def = Op->getDefiningRecipe();
2989 if (Def && ExpressionRecipesAsSetOfUsers.contains(Def))
2990 continue;
2991 addOperand(Op);
2992 LiveInPlaceholders.push_back(new VPSymbolicValue());
2993 }
2994 }
2995
2996 // Replace each external operand with the first one created for it in
2997 // LiveInPlaceholders.
2998 for (auto *R : ExpressionRecipes)
2999 for (auto const &[LiveIn, Tmp] : zip(operands(), LiveInPlaceholders))
3000 R->replaceUsesOfWith(LiveIn, Tmp);
3001}
3002
3004 for (auto *R : ExpressionRecipes)
3005 // Since the list could contain duplicates, make sure the recipe hasn't
3006 // already been inserted.
3007 if (!R->getParent())
3008 R->insertBefore(this);
3009
3010 for (const auto &[Idx, Op] : enumerate(operands()))
3011 LiveInPlaceholders[Idx]->replaceAllUsesWith(Op);
3012
3013 replaceAllUsesWith(ExpressionRecipes.back());
3014 ExpressionRecipes.clear();
3015}
3016
3018 VPCostContext &Ctx) const {
3019 Type *RedTy = Ctx.Types.inferScalarType(this);
3020 auto *SrcVecTy = cast<VectorType>(
3021 toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
3022 unsigned Opcode = RecurrenceDescriptor::getOpcode(
3023 cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
3024 switch (ExpressionType) {
3025 case ExpressionTypes::ExtendedReduction: {
3026 unsigned Opcode = RecurrenceDescriptor::getOpcode(
3027 cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
3028 auto *ExtR = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3029 auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());
3030
3031 if (RedR->isPartialReduction())
3032 return Ctx.TTI.getPartialReductionCost(
3033 Opcode, Ctx.Types.inferScalarType(getOperand(0)), nullptr, RedTy, VF,
3035 TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind,
3036 RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
3037 : std::nullopt);
3038 else if (!RedTy->isFloatingPointTy())
3039 // TTI::getExtendedReductionCost only supports integer types.
3040 return Ctx.TTI.getExtendedReductionCost(
3041 Opcode, ExtR->getOpcode() == Instruction::ZExt, RedTy, SrcVecTy,
3042 std::nullopt, Ctx.CostKind);
3043 else
3045 }
3046 case ExpressionTypes::MulAccReduction:
3047 return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
3048 Ctx.CostKind);
3049
3050 case ExpressionTypes::ExtNegatedMulAccReduction:
3051 assert(Opcode == Instruction::Add && "Unexpected opcode");
3052 Opcode = Instruction::Sub;
3053 [[fallthrough]];
3054 case ExpressionTypes::ExtMulAccReduction: {
3055 auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());
3056 if (RedR->isPartialReduction()) {
3057 auto *Ext0R = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3058 auto *Ext1R = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3059 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
3060 return Ctx.TTI.getPartialReductionCost(
3061 Opcode, Ctx.Types.inferScalarType(getOperand(0)),
3062 Ctx.Types.inferScalarType(getOperand(1)), RedTy, VF,
3064 Ext0R->getOpcode()),
3066 Ext1R->getOpcode()),
3067 Mul->getOpcode(), Ctx.CostKind,
3068 RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
3069 : std::nullopt);
3070 }
3071 return Ctx.TTI.getMulAccReductionCost(
3072 cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
3073 Instruction::ZExt,
3074 Opcode, RedTy, SrcVecTy, Ctx.CostKind);
3075 }
3076 }
3077 llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
3078}
3079
3081 return any_of(ExpressionRecipes, [](VPSingleDefRecipe *R) {
3082 return R->mayReadFromMemory() || R->mayWriteToMemory();
3083 });
3084}
3085
3087 assert(
3088 none_of(ExpressionRecipes,
3089 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
3090 "expression cannot contain recipes with side-effects");
3091 return false;
3092}
3093
3095 // Cannot use vputils::isSingleScalar(), because all external operands
3096 // of the expression will be live-ins while bundled.
3097 auto *RR = dyn_cast<VPReductionRecipe>(ExpressionRecipes.back());
3098 return RR && !RR->isPartialReduction();
3099}
3100
3101#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3102
3104 VPSlotTracker &SlotTracker) const {
3105 O << Indent << "EXPRESSION ";
3107 O << " = ";
3108 auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
3109 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3110
3111 switch (ExpressionType) {
3112 case ExpressionTypes::ExtendedReduction: {
3114 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3115 O << Instruction::getOpcodeName(Opcode) << " (";
3117 Red->printFlags(O);
3118
3119 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3120 O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3121 << *Ext0->getResultType();
3122 if (Red->isConditional()) {
3123 O << ", ";
3124 Red->getCondOp()->printAsOperand(O, SlotTracker);
3125 }
3126 O << ")";
3127 break;
3128 }
3129 case ExpressionTypes::ExtNegatedMulAccReduction: {
3131 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3133 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
3134 << " (sub (0, mul";
3135 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
3136 Mul->printFlags(O);
3137 O << "(";
3139 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3140 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3141 << *Ext0->getResultType() << "), (";
3143 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3144 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3145 << *Ext1->getResultType() << ")";
3146 if (Red->isConditional()) {
3147 O << ", ";
3148 Red->getCondOp()->printAsOperand(O, SlotTracker);
3149 }
3150 O << "))";
3151 break;
3152 }
3153 case ExpressionTypes::MulAccReduction:
3154 case ExpressionTypes::ExtMulAccReduction: {
3156 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3158 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
3159 << " (";
3160 O << "mul";
3161 bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
3162 auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
3163 : ExpressionRecipes[0]);
3164 Mul->printFlags(O);
3165 if (IsExtended)
3166 O << "(";
3168 if (IsExtended) {
3169 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3170 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3171 << *Ext0->getResultType() << "), (";
3172 } else {
3173 O << ", ";
3174 }
3176 if (IsExtended) {
3177 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3178 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3179 << *Ext1->getResultType() << ")";
3180 }
3181 if (Red->isConditional()) {
3182 O << ", ";
3183 Red->getCondOp()->printAsOperand(O, SlotTracker);
3184 }
3185 O << ")";
3186 break;
3187 }
3188 }
3189}
3190
3192 VPSlotTracker &SlotTracker) const {
3193 if (isPartialReduction())
3194 O << Indent << "PARTIAL-REDUCE ";
3195 else
3196 O << Indent << "REDUCE ";
3198 O << " = ";
3200 O << " +";
3201 printFlags(O);
3202 O << " reduce."
3205 << " (";
3207 if (isConditional()) {
3208 O << ", ";
3210 }
3211 O << ")";
3212}
3213
3215 VPSlotTracker &SlotTracker) const {
3216 O << Indent << "REDUCE ";
3218 O << " = ";
3220 O << " +";
3221 printFlags(O);
3222 O << " vp.reduce."
3225 << " (";
3227 O << ", ";
3229 if (isConditional()) {
3230 O << ", ";
3232 }
3233 O << ")";
3234}
3235
3236#endif
3237
3238/// A helper function to scalarize a single Instruction in the innermost loop.
3239/// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue
3240/// operands from \p RepRecipe instead of \p Instr's operands.
3241static void scalarizeInstruction(const Instruction *Instr,
3242 VPReplicateRecipe *RepRecipe,
3243 const VPLane &Lane, VPTransformState &State) {
3244 assert((!Instr->getType()->isAggregateType() ||
3245 canVectorizeTy(Instr->getType())) &&
3246 "Expected vectorizable or non-aggregate type.");
3247
3248 // Does this instruction return a value ?
3249 bool IsVoidRetTy = Instr->getType()->isVoidTy();
3250
3251 Instruction *Cloned = Instr->clone();
3252 if (!IsVoidRetTy) {
3253 Cloned->setName(Instr->getName() + ".cloned");
3254 Type *ResultTy = State.TypeAnalysis.inferScalarType(RepRecipe);
3255 // The operands of the replicate recipe may have been narrowed, resulting in
3256 // a narrower result type. Update the type of the cloned instruction to the
3257 // correct type.
3258 if (ResultTy != Cloned->getType())
3259 Cloned->mutateType(ResultTy);
3260 }
3261
3262 RepRecipe->applyFlags(*Cloned);
3263 RepRecipe->applyMetadata(*Cloned);
3264
3265 if (RepRecipe->hasPredicate())
3266 cast<CmpInst>(Cloned)->setPredicate(RepRecipe->getPredicate());
3267
3268 if (auto DL = RepRecipe->getDebugLoc())
3269 State.setDebugLocFrom(DL);
3270
3271 // Replace the operands of the cloned instructions with their scalar
3272 // equivalents in the new loop.
3273 for (const auto &I : enumerate(RepRecipe->operands())) {
3274 auto InputLane = Lane;
3275 VPValue *Operand = I.value();
3276 if (vputils::isSingleScalar(Operand))
3277 InputLane = VPLane::getFirstLane();
3278 Cloned->setOperand(I.index(), State.get(Operand, InputLane));
3279 }
3280
3281 // Place the cloned scalar in the new loop.
3282 State.Builder.Insert(Cloned);
3283
3284 State.set(RepRecipe, Cloned, Lane);
3285
3286 // If we just cloned a new assumption, add it the assumption cache.
3287 if (auto *II = dyn_cast<AssumeInst>(Cloned))
3288 State.AC->registerAssumption(II);
3289
3290 assert(
3291 (RepRecipe->getRegion() ||
3292 !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
3293 all_of(RepRecipe->operands(),
3294 [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
3295 "Expected a recipe is either within a region or all of its operands "
3296 "are defined outside the vectorized region.");
3297}
3298
3301
3302 if (!State.Lane) {
3303 assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
3304 "must have already been unrolled");
3305 scalarizeInstruction(UI, this, VPLane(0), State);
3306 return;
3307 }
3308
3309 assert((State.VF.isScalar() || !isSingleScalar()) &&
3310 "uniform recipe shouldn't be predicated");
3311 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
3312 scalarizeInstruction(UI, this, *State.Lane, State);
3313 // Insert scalar instance packing it into a vector.
3314 if (State.VF.isVector() && shouldPack()) {
3315 Value *WideValue =
3316 State.Lane->isFirstLane()
3317 ? PoisonValue::get(toVectorizedTy(UI->getType(), State.VF))
3318 : State.get(this);
3319 State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
3320 *State.Lane));
3321 }
3322}
3323
3325 // Find if the recipe is used by a widened recipe via an intervening
3326 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
3327 return any_of(users(), [](const VPUser *U) {
3328 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
3329 return !vputils::onlyScalarValuesUsed(PredR);
3330 return false;
3331 });
3332}
3333
3334/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
3335/// which the legacy cost model computes a SCEV expression when computing the
3336/// address cost. Computing SCEVs for VPValues is incomplete and returns
3337/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In
3338/// those cases we fall back to the legacy cost model. Otherwise return nullptr.
3339static const SCEV *getAddressAccessSCEV(const VPValue *Ptr,
3341 const Loop *L) {
3342 const SCEV *Addr = vputils::getSCEVExprForVPValue(Ptr, PSE, L);
3343 if (isa<SCEVCouldNotCompute>(Addr))
3344 return Addr;
3345
3346 return vputils::isAddressSCEVForCost(Addr, *PSE.getSE(), L) ? Addr : nullptr;
3347}
3348
3349/// Returns true if \p V is used as part of the address of another load or
3350/// store.
3351static bool isUsedByLoadStoreAddress(const VPUser *V) {
3353 SmallVector<const VPUser *> WorkList = {V};
3354
3355 while (!WorkList.empty()) {
3356 auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
3357 if (!Cur || !Seen.insert(Cur).second)
3358 continue;
3359
3360 auto *Blend = dyn_cast<VPBlendRecipe>(Cur);
3361 // Skip blends that use V only through a compare by checking if any incoming
3362 // value was already visited.
3363 if (Blend && none_of(seq<unsigned>(0, Blend->getNumIncomingValues()),
3364 [&](unsigned I) {
3365 return Seen.contains(
3366 Blend->getIncomingValue(I)->getDefiningRecipe());
3367 }))
3368 continue;
3369
3370 for (VPUser *U : Cur->users()) {
3371 if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
3372 if (InterleaveR->getAddr() == Cur)
3373 return true;
3374 if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
3375 if (RepR->getOpcode() == Instruction::Load &&
3376 RepR->getOperand(0) == Cur)
3377 return true;
3378 if (RepR->getOpcode() == Instruction::Store &&
3379 RepR->getOperand(1) == Cur)
3380 return true;
3381 }
3382 if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
3383 if (MemR->getAddr() == Cur && MemR->isConsecutive())
3384 return true;
3385 }
3386 }
3387
3388 // The legacy cost model only supports scalarization loads/stores with phi
3389 // addresses, if the phi is directly used as load/store address. Don't
3390 // traverse further for Blends.
3391 if (Blend)
3392 continue;
3393
3394 append_range(WorkList, Cur->users());
3395 }
3396 return false;
3397}
3398
3399/// Return true if \p R is a predicated load/store with a loop-invariant address
3400/// only masked by the header mask.
3402 const SCEV *PtrSCEV,
3403 VPCostContext &Ctx) {
3404 const VPRegionBlock *ParentRegion = R.getRegion();
3405 if (!ParentRegion || !ParentRegion->isReplicator() || !PtrSCEV ||
3406 !Ctx.PSE.getSE()->isLoopInvariant(PtrSCEV, Ctx.L))
3407 return false;
3408 auto *BOM =
3410 return vputils::isHeaderMask(BOM->getOperand(0), *ParentRegion->getPlan());
3411}
3412
3414 VPCostContext &Ctx) const {
3416 // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
3417 // transform, avoid computing their cost multiple times for now.
3418 Ctx.SkipCostComputation.insert(UI);
3419
3420 if (VF.isScalable() && !isSingleScalar())
3422
3423 switch (UI->getOpcode()) {
3424 case Instruction::Alloca:
3425 if (VF.isScalable())
3427 return Ctx.TTI.getArithmeticInstrCost(
3428 Instruction::Mul, Ctx.Types.inferScalarType(this), Ctx.CostKind);
3429 case Instruction::GetElementPtr:
3430 // We mark this instruction as zero-cost because the cost of GEPs in
3431 // vectorized code depends on whether the corresponding memory instruction
3432 // is scalarized or not. Therefore, we handle GEPs with the memory
3433 // instruction cost.
3434 return 0;
3435 case Instruction::Call: {
3436 auto *CalledFn =
3438
3441 for (const VPValue *ArgOp : ArgOps)
3442 Tys.push_back(Ctx.Types.inferScalarType(ArgOp));
3443
3444 if (CalledFn->isIntrinsic())
3445 // Various pseudo-intrinsics with costs of 0 are scalarized instead of
3446 // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early.
3447 switch (CalledFn->getIntrinsicID()) {
3448 case Intrinsic::assume:
3449 case Intrinsic::lifetime_end:
3450 case Intrinsic::lifetime_start:
3451 case Intrinsic::sideeffect:
3452 case Intrinsic::pseudoprobe:
3453 case Intrinsic::experimental_noalias_scope_decl: {
3454 assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
3455 ElementCount::getFixed(1), Ctx) == 0 &&
3456 "scalarizing intrinsic should be free");
3457 return InstructionCost(0);
3458 }
3459 default:
3460 break;
3461 }
3462
3463 Type *ResultTy = Ctx.Types.inferScalarType(this);
3464 InstructionCost ScalarCallCost =
3465 Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
3466 if (isSingleScalar()) {
3467 if (CalledFn->isIntrinsic())
3468 ScalarCallCost = std::min(
3469 ScalarCallCost,
3470 getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
3471 ElementCount::getFixed(1), Ctx));
3472 return ScalarCallCost;
3473 }
3474
3475 return ScalarCallCost * VF.getFixedValue() +
3476 Ctx.getScalarizationOverhead(ResultTy, ArgOps, VF);
3477 }
3478 case Instruction::Add:
3479 case Instruction::Sub:
3480 case Instruction::FAdd:
3481 case Instruction::FSub:
3482 case Instruction::Mul:
3483 case Instruction::FMul:
3484 case Instruction::FDiv:
3485 case Instruction::FRem:
3486 case Instruction::Shl:
3487 case Instruction::LShr:
3488 case Instruction::AShr:
3489 case Instruction::And:
3490 case Instruction::Or:
3491 case Instruction::Xor:
3492 case Instruction::ICmp:
3493 case Instruction::FCmp:
3495 Ctx) *
3496 (isSingleScalar() ? 1 : VF.getFixedValue());
3497 case Instruction::SDiv:
3498 case Instruction::UDiv:
3499 case Instruction::SRem:
3500 case Instruction::URem: {
3501 InstructionCost ScalarCost =
3503 if (isSingleScalar())
3504 return ScalarCost;
3505
3506 // If any of the operands is from a different replicate region and has its
3507 // cost skipped, it may have been forced to scalar. Fall back to legacy cost
3508 // model to avoid cost mis-match.
3509 if (any_of(operands(), [&Ctx, VF](VPValue *Op) {
3510 auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Op);
3511 if (!PredR)
3512 return false;
3513 return Ctx.skipCostComputation(
3515 PredR->getOperand(0)->getUnderlyingValue()),
3516 VF.isVector());
3517 }))
3518 break;
3519
3520 ScalarCost = ScalarCost * VF.getFixedValue() +
3521 Ctx.getScalarizationOverhead(Ctx.Types.inferScalarType(this),
3522 to_vector(operands()), VF);
3523 // If the recipe is not predicated (i.e. not in a replicate region), return
3524 // the scalar cost. Otherwise handle predicated cost.
3525 if (!getRegion()->isReplicator())
3526 return ScalarCost;
3527
3528 // Account for the phi nodes that we will create.
3529 ScalarCost += VF.getFixedValue() *
3530 Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
3531 // Scale the cost by the probability of executing the predicated blocks.
3532 // This assumes the predicated block for each vector lane is equally
3533 // likely.
3534 ScalarCost /= Ctx.getPredBlockCostDivisor(UI->getParent());
3535 return ScalarCost;
3536 }
3537 case Instruction::Load:
3538 case Instruction::Store: {
3539 bool IsLoad = UI->getOpcode() == Instruction::Load;
3540 const VPValue *PtrOp = getOperand(!IsLoad);
3541 const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.PSE, Ctx.L);
3543 break;
3544
3545 Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
3546 Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
3547 const Align Alignment = getLoadStoreAlignment(UI);
3548 unsigned AS = cast<PointerType>(ScalarPtrTy)->getAddressSpace();
3550 bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
3551 bool UsedByLoadStoreAddress =
3552 !PreferVectorizedAddressing && isUsedByLoadStoreAddress(this);
3553 InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
3554 UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo,
3555 UsedByLoadStoreAddress ? UI : nullptr);
3556
3557 // Check if this is a predicated load/store with a loop-invariant address
3558 // only masked by the header mask. If so, return the uniform mem op cost.
3559 if (isPredicatedUniformMemOpAfterTailFolding(*this, PtrSCEV, Ctx)) {
3560 InstructionCost UniformCost =
3561 ScalarMemOpCost +
3562 Ctx.TTI.getAddressComputationCost(ScalarPtrTy, /*SE=*/nullptr,
3563 /*Ptr=*/nullptr, Ctx.CostKind);
3564 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
3565 if (IsLoad) {
3566 return UniformCost +
3567 Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast,
3568 VectorTy, VectorTy, {}, Ctx.CostKind);
3569 }
3570
3571 VPValue *StoredVal = getOperand(0);
3572 if (!StoredVal->isDefinedOutsideLoopRegions())
3573 UniformCost += Ctx.TTI.getIndexedVectorInstrCostFromEnd(
3574 Instruction::ExtractElement, VectorTy, Ctx.CostKind, 0);
3575 return UniformCost;
3576 }
3577
3578 Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
3579 InstructionCost ScalarCost =
3580 ScalarMemOpCost +
3581 Ctx.TTI.getAddressComputationCost(
3582 PtrTy, UsedByLoadStoreAddress ? nullptr : Ctx.PSE.getSE(), PtrSCEV,
3583 Ctx.CostKind);
3584 if (isSingleScalar())
3585 return ScalarCost;
3586
3587 SmallVector<const VPValue *> OpsToScalarize;
3588 Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
3589 // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
3590 // don't assign scalarization overhead in general, if the target prefers
3591 // vectorized addressing or the loaded value is used as part of an address
3592 // of another load or store.
3593 if (!UsedByLoadStoreAddress) {
3594 bool EfficientVectorLoadStore =
3595 Ctx.TTI.supportsEfficientVectorElementLoadStore();
3596 if (!(IsLoad && !PreferVectorizedAddressing) &&
3597 !(!IsLoad && EfficientVectorLoadStore))
3598 append_range(OpsToScalarize, operands());
3599
3600 if (!EfficientVectorLoadStore)
3601 ResultTy = Ctx.Types.inferScalarType(this);
3602 }
3603
3607 (ScalarCost * VF.getFixedValue()) +
3608 Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, VIC, true);
3609
3610 const VPRegionBlock *ParentRegion = getRegion();
3611 if (ParentRegion && ParentRegion->isReplicator()) {
3612 if (!PtrSCEV)
3613 break;
3614 Cost /= Ctx.getPredBlockCostDivisor(UI->getParent());
3615 Cost += Ctx.TTI.getCFInstrCost(Instruction::CondBr, Ctx.CostKind);
3616
3617 auto *VecI1Ty = VectorType::get(
3618 IntegerType::getInt1Ty(Ctx.L->getHeader()->getContext()), VF);
3619 Cost += Ctx.TTI.getScalarizationOverhead(
3620 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
3621 /*Insert=*/false, /*Extract=*/true, Ctx.CostKind);
3622
3623 if (Ctx.useEmulatedMaskMemRefHack(this, VF)) {
3624 // Artificially setting to a high enough value to practically disable
3625 // vectorization with such operations.
3626 return 3000000;
3627 }
3628 }
3629 return Cost;
3630 }
3631 case Instruction::SExt:
3632 case Instruction::ZExt:
3633 case Instruction::FPToUI:
3634 case Instruction::FPToSI:
3635 case Instruction::FPExt:
3636 case Instruction::PtrToInt:
3637 case Instruction::PtrToAddr:
3638 case Instruction::IntToPtr:
3639 case Instruction::SIToFP:
3640 case Instruction::UIToFP:
3641 case Instruction::Trunc:
3642 case Instruction::FPTrunc:
3643 case Instruction::Select:
3644 case Instruction::AddrSpaceCast: {
3646 Ctx) *
3647 (isSingleScalar() ? 1 : VF.getFixedValue());
3648 }
3649 case Instruction::ExtractValue:
3650 case Instruction::InsertValue:
3651 return Ctx.TTI.getInsertExtractValueCost(getOpcode(), Ctx.CostKind);
3652 }
3653
3654 return Ctx.getLegacyCost(UI, VF);
3655}
3656
3657#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3659 VPSlotTracker &SlotTracker) const {
3660 O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");
3661
3662 if (!getUnderlyingInstr()->getType()->isVoidTy()) {
3664 O << " = ";
3665 }
3666 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
3667 O << "call";
3668 printFlags(O);
3669 O << "@" << CB->getCalledFunction()->getName() << "(";
3671 O, [&O, &SlotTracker](VPValue *Op) {
3672 Op->printAsOperand(O, SlotTracker);
3673 });
3674 O << ")";
3675 } else {
3677 printFlags(O);
3679 }
3680
3681 if (shouldPack())
3682 O << " (S->V)";
3683}
3684#endif
3685
3687 assert(State.Lane && "Branch on Mask works only on single instance.");
3688
3689 VPValue *BlockInMask = getOperand(0);
3690 Value *ConditionBit = State.get(BlockInMask, *State.Lane);
3691
3692 // Replace the temporary unreachable terminator with a new conditional branch,
3693 // whose two destinations will be set later when they are created.
3694 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
3695 assert(isa<UnreachableInst>(CurrentTerminator) &&
3696 "Expected to replace unreachable terminator with conditional branch.");
3697 auto CondBr =
3698 State.Builder.CreateCondBr(ConditionBit, State.CFG.PrevBB, nullptr);
3699 CondBr->setSuccessor(0, nullptr);
3700 CurrentTerminator->eraseFromParent();
3701}
3702
3704 VPCostContext &Ctx) const {
3705 // The legacy cost model doesn't assign costs to branches for individual
3706 // replicate regions. Match the current behavior in the VPlan cost model for
3707 // now.
3708 return 0;
3709}
3710
3712 assert(State.Lane && "Predicated instruction PHI works per instance.");
3713 Instruction *ScalarPredInst =
3714 cast<Instruction>(State.get(getOperand(0), *State.Lane));
3715 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
3716 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
3717 assert(PredicatingBB && "Predicated block has no single predecessor.");
3719 "operand must be VPReplicateRecipe");
3720
3721 // By current pack/unpack logic we need to generate only a single phi node: if
3722 // a vector value for the predicated instruction exists at this point it means
3723 // the instruction has vector users only, and a phi for the vector value is
3724 // needed. In this case the recipe of the predicated instruction is marked to
3725 // also do that packing, thereby "hoisting" the insert-element sequence.
3726 // Otherwise, a phi node for the scalar value is needed.
3727 if (State.hasVectorValue(getOperand(0))) {
3728 auto *VecI = cast<Instruction>(State.get(getOperand(0)));
3730 "Packed operands must generate an insertelement or insertvalue");
3731
3732 // If VectorI is a struct, it will be a sequence like:
3733 // %1 = insertvalue %unmodified, %x, 0
3734 // %2 = insertvalue %1, %y, 1
3735 // %VectorI = insertvalue %2, %z, 2
3736 // To get the unmodified vector we need to look through the chain.
3737 if (auto *StructTy = dyn_cast<StructType>(VecI->getType()))
3738 for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++)
3739 VecI = cast<InsertValueInst>(VecI->getOperand(0));
3740
3741 PHINode *VPhi = State.Builder.CreatePHI(VecI->getType(), 2);
3742 VPhi->addIncoming(VecI->getOperand(0), PredicatingBB); // Unmodified vector.
3743 VPhi->addIncoming(VecI, PredicatedBB); // New vector with inserted element.
3744 if (State.hasVectorValue(this))
3745 State.reset(this, VPhi);
3746 else
3747 State.set(this, VPhi);
3748 // NOTE: Currently we need to update the value of the operand, so the next
3749 // predicated iteration inserts its generated value in the correct vector.
3750 State.reset(getOperand(0), VPhi);
3751 } else {
3752 if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane())
3753 return;
3754
3755 Type *PredInstType = State.TypeAnalysis.inferScalarType(getOperand(0));
3756 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
3757 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
3758 PredicatingBB);
3759 Phi->addIncoming(ScalarPredInst, PredicatedBB);
3760 if (State.hasScalarValue(this, *State.Lane))
3761 State.reset(this, Phi, *State.Lane);
3762 else
3763 State.set(this, Phi, *State.Lane);
3764 // NOTE: Currently we need to update the value of the operand, so the next
3765 // predicated iteration inserts its generated value in the correct vector.
3766 State.reset(getOperand(0), Phi, *State.Lane);
3767 }
3768}
3769
3770#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3772 VPSlotTracker &SlotTracker) const {
3773 O << Indent << "PHI-PREDICATED-INSTRUCTION ";
3775 O << " = ";
3777}
3778#endif
3779
3781 VPCostContext &Ctx) const {
3783 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
3784 ->getAddressSpace();
3785 unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
3786 ? Instruction::Load
3787 : Instruction::Store;
3788
3789 if (!Consecutive) {
3790 // TODO: Using the original IR may not be accurate.
3791 // Currently, ARM will use the underlying IR to calculate gather/scatter
3792 // instruction cost.
3793 [[maybe_unused]] auto IsReverseMask = [this]() {
3794 VPValue *Mask = getMask();
3795 if (!Mask)
3796 return false;
3797
3800
3801 return match(Mask, m_Reverse(m_VPValue()));
3802 };
3803 assert(!IsReverseMask() &&
3804 "Inconsecutive memory access should not have reverse order");
3806 Type *PtrTy = Ptr->getType();
3807
3808 // If the address value is uniform across all lanes, then the address can be
3809 // calculated with scalar type and broadcast.
3811 PtrTy = toVectorTy(PtrTy, VF);
3812
3813 unsigned IID = isa<VPWidenLoadRecipe>(this) ? Intrinsic::masked_gather
3814 : isa<VPWidenStoreRecipe>(this) ? Intrinsic::masked_scatter
3815 : isa<VPWidenLoadEVLRecipe>(this) ? Intrinsic::vp_gather
3816 : Intrinsic::vp_scatter;
3817 return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
3818 Ctx.CostKind) +
3819 Ctx.TTI.getMemIntrinsicInstrCost(
3821 &Ingredient),
3822 Ctx.CostKind);
3823 }
3824
3826 if (IsMasked) {
3827 unsigned IID = isa<VPWidenLoadRecipe>(this) ? Intrinsic::masked_load
3828 : Intrinsic::masked_store;
3829 Cost += Ctx.TTI.getMemIntrinsicInstrCost(
3830 MemIntrinsicCostAttributes(IID, Ty, Alignment, AS), Ctx.CostKind);
3831 } else {
3832 TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
3834 : getOperand(1));
3835 Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
3836 OpInfo, &Ingredient);
3837 }
3838 return Cost;
3839}
3840
3842 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
3843 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
3844 bool CreateGather = !isConsecutive();
3845
3846 auto &Builder = State.Builder;
3847 Value *Mask = nullptr;
3848 if (auto *VPMask = getMask())
3849 Mask = State.get(VPMask);
3850
3851 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
3852 Value *NewLI;
3853 if (CreateGather) {
3854 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
3855 "wide.masked.gather");
3856 } else if (Mask) {
3857 NewLI =
3858 Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
3859 PoisonValue::get(DataTy), "wide.masked.load");
3860 } else {
3861 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
3862 }
3864 State.set(this, NewLI);
3865}
3866
3867#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3869 VPSlotTracker &SlotTracker) const {
3870 O << Indent << "WIDEN ";
3872 O << " = load ";
3874}
3875#endif
3876
3878 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
3879 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
3880 bool CreateGather = !isConsecutive();
3881
3882 auto &Builder = State.Builder;
3883 CallInst *NewLI;
3884 Value *EVL = State.get(getEVL(), VPLane(0));
3885 Value *Addr = State.get(getAddr(), !CreateGather);
3886 Value *Mask = nullptr;
3887 if (VPValue *VPMask = getMask())
3888 Mask = State.get(VPMask);
3889 else
3890 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
3891
3892 if (CreateGather) {
3893 NewLI =
3894 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
3895 nullptr, "wide.masked.gather");
3896 } else {
3897 NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
3898 {Addr, Mask, EVL}, nullptr, "vp.op.load");
3899 }
3900 NewLI->addParamAttr(
3902 applyMetadata(*NewLI);
3903 Instruction *Res = NewLI;
3904 State.set(this, Res);
3905}
3906
3908 VPCostContext &Ctx) const {
3909 if (!Consecutive || IsMasked)
3910 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
3911
3912 // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost()
3913 // here because the EVL recipes using EVL to replace the tail mask. But in the
3914 // legacy model, it will always calculate the cost of mask.
3915 // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we
3916 // don't need to compare to the legacy cost model.
3918 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
3919 ->getAddressSpace();
3920 return Ctx.TTI.getMemIntrinsicInstrCost(
3921 MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS),
3922 Ctx.CostKind);
3923}
3924
3925#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3927 VPSlotTracker &SlotTracker) const {
3928 O << Indent << "WIDEN ";
3930 O << " = vp.load ";
3932}
3933#endif
3934
3936 VPValue *StoredVPValue = getStoredValue();
3937 bool CreateScatter = !isConsecutive();
3938
3939 auto &Builder = State.Builder;
3940
3941 Value *Mask = nullptr;
3942 if (auto *VPMask = getMask())
3943 Mask = State.get(VPMask);
3944
3945 Value *StoredVal = State.get(StoredVPValue);
3946 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
3947 Instruction *NewSI = nullptr;
3948 if (CreateScatter)
3949 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
3950 else if (Mask)
3951 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
3952 else
3953 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
3954 applyMetadata(*NewSI);
3955}
3956
3957#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3959 VPSlotTracker &SlotTracker) const {
3960 O << Indent << "WIDEN store ";
3962}
3963#endif
3964
3966 VPValue *StoredValue = getStoredValue();
3967 bool CreateScatter = !isConsecutive();
3968
3969 auto &Builder = State.Builder;
3970
3971 CallInst *NewSI = nullptr;
3972 Value *StoredVal = State.get(StoredValue);
3973 Value *EVL = State.get(getEVL(), VPLane(0));
3974 Value *Mask = nullptr;
3975 if (VPValue *VPMask = getMask())
3976 Mask = State.get(VPMask);
3977 else
3978 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
3979
3980 Value *Addr = State.get(getAddr(), !CreateScatter);
3981 if (CreateScatter) {
3982 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
3983 Intrinsic::vp_scatter,
3984 {StoredVal, Addr, Mask, EVL});
3985 } else {
3986 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
3987 Intrinsic::vp_store,
3988 {StoredVal, Addr, Mask, EVL});
3989 }
3990 NewSI->addParamAttr(
3992 applyMetadata(*NewSI);
3993}
3994
3996 VPCostContext &Ctx) const {
3997 if (!Consecutive || IsMasked)
3998 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
3999
4000 // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost()
4001 // here because the EVL recipes using EVL to replace the tail mask. But in the
4002 // legacy model, it will always calculate the cost of mask.
4003 // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we
4004 // don't need to compare to the legacy cost model.
4006 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
4007 ->getAddressSpace();
4008 return Ctx.TTI.getMemIntrinsicInstrCost(
4009 MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS),
4010 Ctx.CostKind);
4011}
4012
4013#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4015 VPSlotTracker &SlotTracker) const {
4016 O << Indent << "WIDEN vp.store ";
4018}
4019#endif
4020
4022 VectorType *DstVTy, const DataLayout &DL) {
4023 // Verify that V is a vector type with same number of elements as DstVTy.
4024 auto VF = DstVTy->getElementCount();
4025 auto *SrcVecTy = cast<VectorType>(V->getType());
4026 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
4027 Type *SrcElemTy = SrcVecTy->getElementType();
4028 Type *DstElemTy = DstVTy->getElementType();
4029 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
4030 "Vector elements must have same size");
4031
4032 // Do a direct cast if element types are castable.
4033 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
4034 return Builder.CreateBitOrPointerCast(V, DstVTy);
4035 }
4036 // V cannot be directly casted to desired vector type.
4037 // May happen when V is a floating point vector but DstVTy is a vector of
4038 // pointers or vice-versa. Handle this using a two-step bitcast using an
4039 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
4040 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
4041 "Only one type should be a pointer type");
4042 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
4043 "Only one type should be a floating point type");
4044 Type *IntTy =
4045 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
4046 auto *VecIntTy = VectorType::get(IntTy, VF);
4047 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
4048 return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
4049}
4050
4051/// Return a vector containing interleaved elements from multiple
4052/// smaller input vectors.
4054 const Twine &Name) {
4055 unsigned Factor = Vals.size();
4056 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
4057
4058 VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
4059#ifndef NDEBUG
4060 for (Value *Val : Vals)
4061 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
4062#endif
4063
4064 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
4065 // must use intrinsics to interleave.
4066 if (VecTy->isScalableTy()) {
4067 assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
4068 return Builder.CreateVectorInterleave(Vals, Name);
4069 }
4070
4071 // Fixed length. Start by concatenating all vectors into a wide vector.
4072 Value *WideVec = concatenateVectors(Builder, Vals);
4073
4074 // Interleave the elements into the wide vector.
4075 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
4076 return Builder.CreateShuffleVector(
4077 WideVec, createInterleaveMask(NumElts, Factor), Name);
4078}
4079
4080// Try to vectorize the interleave group that \p Instr belongs to.
4081//
4082// E.g. Translate following interleaved load group (factor = 3):
4083// for (i = 0; i < N; i+=3) {
4084// R = Pic[i]; // Member of index 0
4085// G = Pic[i+1]; // Member of index 1
4086// B = Pic[i+2]; // Member of index 2
4087// ... // do something to R, G, B
4088// }
4089// To:
4090// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
4091// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
4092// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
4093// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
4094//
4095// Or translate following interleaved store group (factor = 3):
4096// for (i = 0; i < N; i+=3) {
4097// ... do something to R, G, B
4098// Pic[i] = R; // Member of index 0
4099// Pic[i+1] = G; // Member of index 1
4100// Pic[i+2] = B; // Member of index 2
4101// }
4102// To:
4103// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
4104// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
4105// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
4106// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
4107// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
4109 assert(!State.Lane && "Interleave group being replicated.");
4110 assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
4111 "Masking gaps for scalable vectors is not yet supported.");
4113 Instruction *Instr = Group->getInsertPos();
4114
4115 // Prepare for the vector type of the interleaved load/store.
4116 Type *ScalarTy = getLoadStoreType(Instr);
4117 unsigned InterleaveFactor = Group->getFactor();
4118 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
4119
4120 VPValue *BlockInMask = getMask();
4121 VPValue *Addr = getAddr();
4122 Value *ResAddr = State.get(Addr, VPLane(0));
4123
4124 auto CreateGroupMask = [&BlockInMask, &State,
4125 &InterleaveFactor](Value *MaskForGaps) -> Value * {
4126 if (State.VF.isScalable()) {
4127 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
4128 assert(InterleaveFactor <= 8 &&
4129 "Unsupported deinterleave factor for scalable vectors");
4130 auto *ResBlockInMask = State.get(BlockInMask);
4131 SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
4132 return interleaveVectors(State.Builder, Ops, "interleaved.mask");
4133 }
4134
4135 if (!BlockInMask)
4136 return MaskForGaps;
4137
4138 Value *ResBlockInMask = State.get(BlockInMask);
4139 Value *ShuffledMask = State.Builder.CreateShuffleVector(
4140 ResBlockInMask,
4141 createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
4142 "interleaved.mask");
4143 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
4144 ShuffledMask, MaskForGaps)
4145 : ShuffledMask;
4146 };
4147
4148 const DataLayout &DL = Instr->getDataLayout();
4149 // Vectorize the interleaved load group.
4150 if (isa<LoadInst>(Instr)) {
4151 Value *MaskForGaps = nullptr;
4152 if (needsMaskForGaps()) {
4153 MaskForGaps =
4154 createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
4155 assert(MaskForGaps && "Mask for Gaps is required but it is null");
4156 }
4157
4158 Instruction *NewLoad;
4159 if (BlockInMask || MaskForGaps) {
4160 Value *GroupMask = CreateGroupMask(MaskForGaps);
4161 Value *PoisonVec = PoisonValue::get(VecTy);
4162 NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
4163 Group->getAlign(), GroupMask,
4164 PoisonVec, "wide.masked.vec");
4165 } else
4166 NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
4167 Group->getAlign(), "wide.vec");
4168 applyMetadata(*NewLoad);
4169 // TODO: Also manage existing metadata using VPIRMetadata.
4170 Group->addMetadata(NewLoad);
4171
4173 if (VecTy->isScalableTy()) {
4174 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4175 // so must use intrinsics to deinterleave.
4176 assert(InterleaveFactor <= 8 &&
4177 "Unsupported deinterleave factor for scalable vectors");
4178 NewLoad = State.Builder.CreateIntrinsic(
4179 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
4180 NewLoad->getType(), NewLoad,
4181 /*FMFSource=*/nullptr, "strided.vec");
4182 }
4183
4184 auto CreateStridedVector = [&InterleaveFactor, &State,
4185 &NewLoad](unsigned Index) -> Value * {
4186 assert(Index < InterleaveFactor && "Illegal group index");
4187 if (State.VF.isScalable())
4188 return State.Builder.CreateExtractValue(NewLoad, Index);
4189
4190 // For fixed length VF, use shuffle to extract the sub-vectors from the
4191 // wide load.
4192 auto StrideMask =
4193 createStrideMask(Index, InterleaveFactor, State.VF.getFixedValue());
4194 return State.Builder.CreateShuffleVector(NewLoad, StrideMask,
4195 "strided.vec");
4196 };
4197
4198 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4199 Instruction *Member = Group->getMember(I);
4200
4201 // Skip the gaps in the group.
4202 if (!Member)
4203 continue;
4204
4205 Value *StridedVec = CreateStridedVector(I);
4206
4207 // If this member has different type, cast the result type.
4208 if (Member->getType() != ScalarTy) {
4209 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
4210 StridedVec =
4211 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
4212 }
4213
4214 if (Group->isReverse())
4215 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
4216
4217 State.set(VPDefs[J], StridedVec);
4218 ++J;
4219 }
4220 return;
4221 }
4222
4223 // The sub vector type for current instruction.
4224 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4225
4226 // Vectorize the interleaved store group.
4227 Value *MaskForGaps =
4228 createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
4229 assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
4230 "Mismatch between NeedsMaskForGaps and MaskForGaps");
4231 ArrayRef<VPValue *> StoredValues = getStoredValues();
4232 // Collect the stored vector from each member.
4233 SmallVector<Value *, 4> StoredVecs;
4234 unsigned StoredIdx = 0;
4235 for (unsigned i = 0; i < InterleaveFactor; i++) {
4236 assert((Group->getMember(i) || MaskForGaps) &&
4237 "Fail to get a member from an interleaved store group");
4238 Instruction *Member = Group->getMember(i);
4239
4240 // Skip the gaps in the group.
4241 if (!Member) {
4242 Value *Undef = PoisonValue::get(SubVT);
4243 StoredVecs.push_back(Undef);
4244 continue;
4245 }
4246
4247 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4248 ++StoredIdx;
4249
4250 if (Group->isReverse())
4251 StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
4252
4253 // If this member has different type, cast it to a unified type.
4254
4255 if (StoredVec->getType() != SubVT)
4256 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4257
4258 StoredVecs.push_back(StoredVec);
4259 }
4260
4261 // Interleave all the smaller vectors into one wider vector.
4262 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4263 Instruction *NewStoreInstr;
4264 if (BlockInMask || MaskForGaps) {
4265 Value *GroupMask = CreateGroupMask(MaskForGaps);
4266 NewStoreInstr = State.Builder.CreateMaskedStore(
4267 IVec, ResAddr, Group->getAlign(), GroupMask);
4268 } else
4269 NewStoreInstr =
4270 State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
4271
4272 applyMetadata(*NewStoreInstr);
4273 // TODO: Also manage existing metadata using VPIRMetadata.
4274 Group->addMetadata(NewStoreInstr);
4275}
4276
4277#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4279 VPSlotTracker &SlotTracker) const {
4281 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4282 IG->getInsertPos()->printAsOperand(O, false);
4283 O << ", ";
4285 VPValue *Mask = getMask();
4286 if (Mask) {
4287 O << ", ";
4288 Mask->printAsOperand(O, SlotTracker);
4289 }
4290
4291 unsigned OpIdx = 0;
4292 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4293 if (!IG->getMember(i))
4294 continue;
4295 if (getNumStoreOperands() > 0) {
4296 O << "\n" << Indent << " store ";
4298 O << " to index " << i;
4299 } else {
4300 O << "\n" << Indent << " ";
4302 O << " = load from index " << i;
4303 }
4304 ++OpIdx;
4305 }
4306}
4307#endif
4308
4310 assert(!State.Lane && "Interleave group being replicated.");
4311 assert(State.VF.isScalable() &&
4312 "Only support scalable VF for EVL tail-folding.");
4314 "Masking gaps for scalable vectors is not yet supported.");
4316 Instruction *Instr = Group->getInsertPos();
4317
4318 // Prepare for the vector type of the interleaved load/store.
4319 Type *ScalarTy = getLoadStoreType(Instr);
4320 unsigned InterleaveFactor = Group->getFactor();
4321 assert(InterleaveFactor <= 8 &&
4322 "Unsupported deinterleave/interleave factor for scalable vectors");
4323 ElementCount WideVF = State.VF * InterleaveFactor;
4324 auto *VecTy = VectorType::get(ScalarTy, WideVF);
4325
4326 VPValue *Addr = getAddr();
4327 Value *ResAddr = State.get(Addr, VPLane(0));
4328 Value *EVL = State.get(getEVL(), VPLane(0));
4329 Value *InterleaveEVL = State.Builder.CreateMul(
4330 EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl",
4331 /* NUW= */ true, /* NSW= */ true);
4332 LLVMContext &Ctx = State.Builder.getContext();
4333
4334 Value *GroupMask = nullptr;
4335 if (VPValue *BlockInMask = getMask()) {
4336 SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask));
4337 GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask");
4338 } else {
4339 GroupMask =
4340 State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
4341 }
4342
4343 // Vectorize the interleaved load group.
4344 if (isa<LoadInst>(Instr)) {
4345 CallInst *NewLoad = State.Builder.CreateIntrinsic(
4346 VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr,
4347 "wide.vp.load");
4348 NewLoad->addParamAttr(0,
4349 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4350
4351 applyMetadata(*NewLoad);
4352 // TODO: Also manage existing metadata using VPIRMetadata.
4353 Group->addMetadata(NewLoad);
4354
4355 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4356 // so must use intrinsics to deinterleave.
4357 NewLoad = State.Builder.CreateIntrinsic(
4358 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
4359 NewLoad->getType(), NewLoad,
4360 /*FMFSource=*/nullptr, "strided.vec");
4361
4362 const DataLayout &DL = Instr->getDataLayout();
4363 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4364 Instruction *Member = Group->getMember(I);
4365 // Skip the gaps in the group.
4366 if (!Member)
4367 continue;
4368
4369 Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
4370 // If this member has different type, cast the result type.
4371 if (Member->getType() != ScalarTy) {
4372 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
4373 StridedVec =
4374 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
4375 }
4376
4377 State.set(getVPValue(J), StridedVec);
4378 ++J;
4379 }
4380 return;
4381 } // End for interleaved load.
4382
4383 // The sub vector type for current instruction.
4384 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4385 // Vectorize the interleaved store group.
4386 ArrayRef<VPValue *> StoredValues = getStoredValues();
4387 // Collect the stored vector from each member.
4388 SmallVector<Value *, 4> StoredVecs;
4389 const DataLayout &DL = Instr->getDataLayout();
4390 for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
4391 Instruction *Member = Group->getMember(I);
4392 // Skip the gaps in the group.
4393 if (!Member) {
4394 StoredVecs.push_back(PoisonValue::get(SubVT));
4395 continue;
4396 }
4397
4398 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4399 // If this member has different type, cast it to a unified type.
4400 if (StoredVec->getType() != SubVT)
4401 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4402
4403 StoredVecs.push_back(StoredVec);
4404 ++StoredIdx;
4405 }
4406
4407 // Interleave all the smaller vectors into one wider vector.
4408 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4409 CallInst *NewStore =
4410 State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store,
4411 {IVec, ResAddr, GroupMask, InterleaveEVL});
4412 NewStore->addParamAttr(1,
4413 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4414
4415 applyMetadata(*NewStore);
4416 // TODO: Also manage existing metadata using VPIRMetadata.
4417 Group->addMetadata(NewStore);
4418}
4419
4420#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4422 VPSlotTracker &SlotTracker) const {
4424 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4425 IG->getInsertPos()->printAsOperand(O, false);
4426 O << ", ";
4428 O << ", ";
4430 if (VPValue *Mask = getMask()) {
4431 O << ", ";
4432 Mask->printAsOperand(O, SlotTracker);
4433 }
4434
4435 unsigned OpIdx = 0;
4436 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4437 if (!IG->getMember(i))
4438 continue;
4439 if (getNumStoreOperands() > 0) {
4440 O << "\n" << Indent << " vp.store ";
4442 O << " to index " << i;
4443 } else {
4444 O << "\n" << Indent << " ";
4446 O << " = vp.load from index " << i;
4447 }
4448 ++OpIdx;
4449 }
4450}
4451#endif
4452
4454 VPCostContext &Ctx) const {
4455 Instruction *InsertPos = getInsertPos();
4456 // Find the VPValue index of the interleave group. We need to skip gaps.
4457 unsigned InsertPosIdx = 0;
4458 for (unsigned Idx = 0; IG->getFactor(); ++Idx)
4459 if (auto *Member = IG->getMember(Idx)) {
4460 if (Member == InsertPos)
4461 break;
4462 InsertPosIdx++;
4463 }
4464 Type *ValTy = Ctx.Types.inferScalarType(
4465 getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
4466 : getStoredValues()[InsertPosIdx]);
4467 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
4468 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
4469 ->getAddressSpace();
4470
4471 unsigned InterleaveFactor = IG->getFactor();
4472 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
4473
4474 // Holds the indices of existing members in the interleaved group.
4476 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
4477 if (IG->getMember(IF))
4478 Indices.push_back(IF);
4479
4480 // Calculate the cost of the whole interleaved group.
4481 InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
4482 InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
4483 IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
4484
4485 if (!IG->isReverse())
4486 return Cost;
4487
4488 return Cost + IG->getNumMembers() *
4489 Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
4490 VectorTy, VectorTy, {}, Ctx.CostKind,
4491 0);
4492}
4493
4495 return vputils::onlyScalarValuesUsed(this) &&
4496 (!IsScalable || vputils::onlyFirstLaneUsed(this));
4497}
4498
4499#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4501 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4502 assert((getNumOperands() == 3 || getNumOperands() == 5) &&
4503 "unexpected number of operands");
4504 O << Indent << "EMIT ";
4506 O << " = WIDEN-POINTER-INDUCTION ";
4508 O << ", ";
4510 O << ", ";
4512 if (getNumOperands() == 5) {
4513 O << ", ";
4515 O << ", ";
4517 }
4518}
4519
4521 VPSlotTracker &SlotTracker) const {
4522 O << Indent << "EMIT ";
4524 O << " = EXPAND SCEV " << *Expr;
4525}
4526#endif
4527
4529 Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
4530 Type *STy = CanonicalIV->getType();
4531 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
4532 ElementCount VF = State.VF;
4533 Value *VStart = VF.isScalar()
4534 ? CanonicalIV
4535 : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
4536 Value *VStep = Builder.CreateElementCount(
4537 STy, VF.multiplyCoefficientBy(getUnrollPart(*this)));
4538 if (VF.isVector()) {
4539 VStep = Builder.CreateVectorSplat(VF, VStep);
4540 VStep =
4541 Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
4542 }
4543 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
4544 State.set(this, CanonicalVectorIV);
4545}
4546
4547#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4549 VPSlotTracker &SlotTracker) const {
4550 O << Indent << "EMIT ";
4552 O << " = WIDEN-CANONICAL-INDUCTION ";
4554}
4555#endif
4556
4558 auto &Builder = State.Builder;
4559 // Create a vector from the initial value.
4560 auto *VectorInit = getStartValue()->getLiveInIRValue();
4561
4562 Type *VecTy = State.VF.isScalar()
4563 ? VectorInit->getType()
4564 : VectorType::get(VectorInit->getType(), State.VF);
4565
4566 BasicBlock *VectorPH =
4567 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4568 if (State.VF.isVector()) {
4569 auto *IdxTy = Builder.getInt32Ty();
4570 auto *One = ConstantInt::get(IdxTy, 1);
4571 IRBuilder<>::InsertPointGuard Guard(Builder);
4572 Builder.SetInsertPoint(VectorPH->getTerminator());
4573 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
4574 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4575 VectorInit = Builder.CreateInsertElement(
4576 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
4577 }
4578
4579 // Create a phi node for the new recurrence.
4580 PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
4581 Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
4582 Phi->addIncoming(VectorInit, VectorPH);
4583 State.set(this, Phi);
4584}
4585
4588 VPCostContext &Ctx) const {
4589 if (VF.isScalar())
4590 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
4591
4592 return 0;
4593}
4594
4595#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4597 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4598 O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
4600 O << " = phi ";
4602}
4603#endif
4604
4606 // Reductions do not have to start at zero. They can start with
4607 // any loop invariant values.
4608 VPValue *StartVPV = getStartValue();
4609
4610 // In order to support recurrences we need to be able to vectorize Phi nodes.
4611 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4612 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4613 // this value when we vectorize all of the instructions that use the PHI.
4614 BasicBlock *VectorPH =
4615 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4616 bool ScalarPHI = State.VF.isScalar() || isInLoop();
4617 Value *StartV = State.get(StartVPV, ScalarPHI);
4618 Type *VecTy = StartV->getType();
4619
4620 BasicBlock *HeaderBB = State.CFG.PrevBB;
4621 assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
4622 "recipe must be in the vector loop header");
4623 auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
4624 Phi->insertBefore(HeaderBB->getFirstInsertionPt());
4625 State.set(this, Phi, isInLoop());
4626
4627 Phi->addIncoming(StartV, VectorPH);
4628}
4629
4630#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4632 VPSlotTracker &SlotTracker) const {
4633 O << Indent << "WIDEN-REDUCTION-PHI ";
4634
4636 O << " = phi";
4637 printFlags(O);
4639 if (getVFScaleFactor() > 1)
4640 O << " (VF scaled by 1/" << getVFScaleFactor() << ")";
4641}
4642#endif
4643
4645 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
4646 return vputils::onlyFirstLaneUsed(this);
4647}
4648
4650 Value *Op0 = State.get(getOperand(0));
4651 Type *VecTy = Op0->getType();
4652 Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
4653 State.set(this, VecPhi);
4654}
4655
4657 VPCostContext &Ctx) const {
4658 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
4659}
4660
4661#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4663 VPSlotTracker &SlotTracker) const {
4664 O << Indent << "WIDEN-PHI ";
4665
4667 O << " = phi ";
4669}
4670#endif
4671
4673 BasicBlock *VectorPH =
4674 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4675 Value *StartMask = State.get(getOperand(0));
4676 PHINode *Phi =
4677 State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
4678 Phi->addIncoming(StartMask, VectorPH);
4679 State.set(this, Phi);
4680}
4681
4682#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4684 VPSlotTracker &SlotTracker) const {
4685 O << Indent << "ACTIVE-LANE-MASK-PHI ";
4686
4688 O << " = phi ";
4690}
4691#endif
4692
4693#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4695 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4696 O << Indent << "CURRENT-ITERATION-PHI ";
4697
4699 O << " = phi ";
4701}
4702#endif
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static MCDisassembler::DecodeStatus addOperand(MCInst &Inst, const MCOperand &Opnd)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Value * getPointer(Value *Ptr)
iv users
Definition IVUsers.cpp:48
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file provides a LoopVectorizationPlanner class.
static const SCEV * getAddressAccessSCEV(Value *Ptr, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets the address access SCEV for Ptr, if it should be used for cost modeling according to isAddressSC...
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
static bool isOrdered(const Instruction *I)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file contains the declarations of different VPlan-related auxiliary helpers.
static bool isPredicatedUniformMemOpAfterTailFolding(const VPReplicateRecipe &R, const SCEV *PtrSCEV, VPCostContext &Ctx)
Return true if R is a predicated load/store with a loop-invariant address only masked by the header m...
static Value * interleaveVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vals, const Twine &Name)
Return a vector containing interleaved elements from multiple smaller input vectors.
static InstructionCost getCostForIntrinsics(Intrinsic::ID ID, ArrayRef< const VPValue * > Operands, const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx)
Compute the cost for the intrinsic ID with Operands, produced by R.
static Value * createBitOrPointerCast(IRBuilderBase &Builder, Value *V, VectorType *DstVTy, const DataLayout &DL)
SmallVector< Value *, 2 > VectorParts
static bool isUsedByLoadStoreAddress(const VPUser *V)
Returns true if V is used as part of the address of another load or store.
static void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPLane &Lane, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
static unsigned getCalledFnOperandIndex(const VPInstruction &VPI)
For call VPInstructions, return the operand index of the called function.
This file contains the declarations of the Vectorization Plan base classes:
void printAsOperand(OutputBuffer &OB, Prec P=Prec::Default, bool StrictlyWorse=false) const
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:986
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
static LLVM_ABI StringRef getPredicateName(Predicate P)
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getUnknown()
Definition DebugLoc.h:161
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
LLVM_ABI void print(raw_ostream &O) const
Print fast-math flags to O.
Definition Operator.cpp:283
void setAllowContract(bool B=true)
Definition FMF.h:93
bool noSignedZeros() const
Definition FMF.h:70
bool noInfs() const
Definition FMF.h:69
void setAllowReciprocal(bool B=true)
Definition FMF.h:90
bool allowReciprocal() const
Definition FMF.h:71
void setNoSignedZeros(bool B=true)
Definition FMF.h:87
bool allowReassoc() const
Flag queries.
Definition FMF.h:67
bool approxFunc() const
Definition FMF.h:73
void setNoNaNs(bool B=true)
Definition FMF.h:81
void setAllowReassoc(bool B=true)
Flag setters.
Definition FMF.h:78
bool noNaNs() const
Definition FMF.h:68
void setApproxFunc(bool B=true)
Definition FMF.h:96
void setNoInfs(bool B=true)
Definition FMF.h:84
bool allowContract() const
Definition FMF.h:72
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
bool willReturn() const
Determine if the function will return.
Definition Function.h:669
bool doesNotThrow() const
Determine if the function cannot unwind.
Definition Function.h:602
bool doesNotAccessMemory() const
Determine if the function does not access memory.
Definition Function.cpp:867
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:216
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags none()
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2620
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:571
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2674
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2608
LLVM_ABI Value * CreateVectorSpliceRight(Value *V1, Value *V2, Value *Offset, const Twine &Name="")
Create a vector.splice.right intrinsic call, or a shufflevector that produces the same result if the ...
CondBrInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition IRBuilder.h:1237
LLVM_ABI Value * CreateSelectFMF(Value *C, Value *True, Value *False, FMFSource FMFSource, const Twine &Name="", Instruction *MDFrom=nullptr)
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2667
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2686
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:586
Value * CreatePtrAdd(Value *Ptr, Value *Offset, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:2084
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:352
LLVM_ABI Value * CreateVectorReverse(Value *V, const Twine &Name="")
Return a vector value that contains the vector V reversed.
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2371
LLVM_ABI CallInst * CreateOrReduce(Value *Src)
Create a vector int OR reduction intrinsic of the source vector.
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
Value * CreateLogicalAnd(Value *Cond1, Value *Cond2, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.h:1777
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:529
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2501
Value * CreateNot(Value *V, const Twine &Name="")
Definition IRBuilder.h:1861
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2367
Value * CreateCountTrailingZeroElems(Type *ResTy, Value *Mask, bool ZeroIsPoison=true, const Twine &Name="")
Create a call to llvm.experimental_cttz_elts.
Definition IRBuilder.h:1175
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1460
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2113
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1443
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition IRBuilder.h:514
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1748
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2379
Value * CreateLogicalOr(Value *Cond1, Value *Cond2, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.h:1785
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2477
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1613
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1477
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2847
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isUnaryOp() const
The group of interleaved loads/stores sharing the same stride and close to each other.
uint32_t getFactor() const
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
bool isReverse() const
InstTy * getInsertPos() const
void addMetadata(InstTy *NewInst) const
Add metadata (e.g.
Align getAlign() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Information for memory intrinsic cost model.
Root of the metadata hierarchy.
Definition Metadata.h:64
LLVM_ABI void print(raw_ostream &OS, const Module *M=nullptr, bool IsForDebug=false) const
Print.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
This class represents the LLVM 'select' instruction.
This class provides computation of slot numbers for LLVM Assembly writing.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
@ Load
The value being inserted comes from a load (InsertElement only).
@ Store
The extracted value is stored (ExtractElement only).
LLVM_ABI InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
@ TCC_Free
Expected to fold away in lowering.
LLVM_ABI InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:290
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:65
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:286
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
value_op_iterator value_op_end()
Definition User.h:288
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
value_op_iterator value_op_begin()
Definition User.h:285
void execute(VPTransformState &State) override
Generate the active lane mask phi of the vector loop.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4154
RecipeListTy & getRecipeList()
Returns a reference to the list of recipes.
Definition VPlan.h:4207
iterator end()
Definition VPlan.h:4191
const VPRecipeBase & front() const
Definition VPlan.h:4201
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition VPlan.h:4220
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenMemoryRecipe.
VPValue * getIncomingValue(unsigned Idx) const
Return incoming value number Idx.
Definition VPlan.h:2806
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2801
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2797
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:97
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:225
VPlan * getPlan()
Definition VPlan.cpp:178
void printAsOperand(raw_ostream &OS, bool PrintType=false) const
Definition VPlan.h:367
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:183
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPBranchOnMaskRecipe.
void execute(VPTransformState &State) override
Generate the extraction of the appropriate bit from the block mask and the conditional branch.
VPlan-based builder utility analogous to IRBuilder.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:498
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:471
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:483
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:493
VPIRValue * getStartValue() const
Definition VPlan.h:3946
VPValue * getStepValue() const
Definition VPlan.h:3948
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void decompose()
Insert the recipes of the expression back into the VPlan, directly before the current recipe.
bool isSingleScalar() const
Returns true if the result of this VPExpressionRecipe is a single-scalar.
bool mayHaveSideEffects() const
Returns true if this expression contains recipes that may have side effects.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Compute the cost of this recipe either using a recipe's specialized implementation or using the legac...
bool mayReadOrWriteMemory() const
Returns true if this expression contains recipes that may read from or write to memory.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this header phi recipe.
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2326
void execute(VPTransformState &State) override
Produce a vectorized histogram operation.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPHistogramRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getMask() const
Return the mask operand if one was provided, or a null pointer if all lanes should be executed uncond...
Definition VPlan.h:2069
Class to record and manage LLVM IR flags.
Definition VPlan.h:687
FastMathFlagsTy FMFs
Definition VPlan.h:775
ReductionFlagsTy ReductionFlags
Definition VPlan.h:777
LLVM_ABI_FOR_TEST bool hasRequiredFlagsForOpcode(unsigned Opcode) const
Returns true if Opcode has its required flags set.
LLVM_ABI_FOR_TEST bool flagsValidForOpcode(unsigned Opcode) const
Returns true if the set flags are valid for Opcode.
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
WrapFlagsTy WrapFlags
Definition VPlan.h:769
void printFlags(raw_ostream &O) const
bool hasFastMathFlags() const
Returns true if the recipe has fast-math flags.
Definition VPlan.h:992
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
bool isReductionOrdered() const
Definition VPlan.h:1056
TruncFlagsTy TruncFlags
Definition VPlan.h:770
CmpInst::Predicate getPredicate() const
Definition VPlan.h:964
ExactFlagsTy ExactFlags
Definition VPlan.h:772
void intersectFlags(const VPIRFlags &Other)
Only keep flags also present in Other.
uint8_t GEPFlagsStorage
Definition VPlan.h:773
GEPNoWrapFlags getGEPNoWrapFlags() const
Definition VPlan.h:982
bool hasPredicate() const
Returns true if the recipe has a comparison predicate.
Definition VPlan.h:987
DisjointFlagsTy DisjointFlags
Definition VPlan.h:771
FCmpFlagsTy FCmpFlags
Definition VPlan.h:776
NonNegFlagsTy NonNegFlags
Definition VPlan.h:774
bool isReductionInLoop() const
Definition VPlan.h:1062
void applyFlags(Instruction &I) const
Apply the IR flags to I.
Definition VPlan.h:921
uint8_t CmpPredStorage
Definition VPlan.h:768
RecurKind getRecurKind() const
Definition VPlan.h:1050
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPIRInstruction.
VPIRInstruction(Instruction &I)
VPIRInstruction::create() should be used to create VPIRInstructions, as subclasses may need to be cre...
Definition VPlan.h:1686
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
VPIRMetadata()=default
void print(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print metadata with node IDs.
void applyMetadata(Instruction &I) const
Add all metadata to I.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the instruction.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1222
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPInstruction.
bool doesGeneratePerAllLanes() const
Returns true if this VPInstruction generates scalar values for all lanes.
@ ExtractLastActive
Extracts the last active lane from a set of vectors.
Definition VPlan.h:1328
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1319
@ ExitingIVValue
Compute the exiting value of a wide induction after vectorization, that is the value of the last lane...
Definition VPlan.h:1335
@ WideIVStep
Scale the first operand (vector step) by the second operand (scalar-step).
Definition VPlan.h:1309
@ ResumeForEpilogue
Explicit user for the resume phi of the canonical induction in the main VPlan, used by the epilogue v...
Definition VPlan.h:1322
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1262
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1313
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1257
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1254
@ VScale
Returns the value for vscale.
Definition VPlan.h:1331
@ CanonicalIVIncrementForPart
Definition VPlan.h:1238
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1265
bool hasResult() const
Definition VPlan.h:1413
bool opcodeMayReadOrWriteFromMemory() const
Returns true if the underlying opcode may read from or write to memory.
LLVM_DUMP_METHOD void dump() const
Print the VPInstruction to dbgs() (for debugging).
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the VPInstruction to O.
StringRef getName() const
Returns the symbolic name assigned to the VPInstruction.
Definition VPlan.h:1493
unsigned getOpcode() const
Definition VPlan.h:1397
VPInstruction(unsigned Opcode, ArrayRef< VPValue * > Operands, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
bool isVectorToScalar() const
Returns true if this VPInstruction produces a scalar value from a vector, e.g.
bool isSingleScalar() const
Returns true if this VPInstruction's operands are single scalars and the result is also a single scal...
unsigned getNumOperandsForOpcode() const
Return the number of operands determined by the opcode of the VPInstruction, excluding mask.
bool isMasked() const
Returns true if the VPInstruction has a mask operand.
Definition VPlan.h:1438
void execute(VPTransformState &State) override
Generate the instruction.
bool usesFirstPartOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first part of operand Op.
bool needsMaskForGaps() const
Return true if the access needs a mask because of the gaps.
Definition VPlan.h:2910
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this recipe.
Instruction * getInsertPos() const
Definition VPlan.h:2914
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2912
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2904
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2933
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:2898
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:3007
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:3020
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:2970
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
static VPLane getLastLaneForVF(const ElementCount &VF)
static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset)
static VPLane getFirstLane()
virtual const VPRecipeBase * getAsRecipe() const =0
Return a VPRecipeBase* to the current object.
VPValue * getIncomingValueForBlock(const VPBasicBlock *VPBB) const
Returns the incoming value for VPBB. VPBB must be an incoming block.
virtual unsigned getNumIncoming() const
Returns the number of incoming values, also number of incoming blocks.
Definition VPlan.h:1600
void removeIncomingValueFor(VPBlockBase *IncomingBlock) const
Removes the incoming value for IncomingBlock, which must be a predecessor.
const VPBasicBlock * getIncomingBlock(unsigned Idx) const
Returns the incoming block with index Idx.
Definition VPlan.h:4298
detail::zippy< llvm::detail::zip_first, VPUser::const_operand_range, const_incoming_blocks_range > incoming_values_and_blocks() const
Returns an iterator range over pairs of incoming values and corresponding incoming blocks.
Definition VPlan.h:1625
VPValue * getIncomingValue(unsigned Idx) const
Returns the incoming VPValue with index Idx.
Definition VPlan.h:1585
void printPhiOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the recipe.
void setIncomingValueForBlock(const VPBasicBlock *VPBB, VPValue *V) const
Sets the incoming value for VPBB to V.
void execute(VPTransformState &State) override
Generates phi nodes for live-outs (from a replicate region) as needed to retain SSA form.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:405
bool mayReadFromMemory() const
Returns true if the recipe may read from memory.
bool mayHaveSideEffects() const
Returns true if the recipe may have side-effects.
virtual void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const =0
Each concrete VPRecipe prints itself, without printing common information, like debug info or metadat...
VPRegionBlock * getRegion()
Definition VPlan.h:4499
LLVM_ABI_FOR_TEST void dump() const
Dump the recipe to stderr (for debugging).
Definition VPlan.cpp:117
bool isPhi() const
Returns true for PHI-like recipes.
bool mayWriteToMemory() const
Returns true if the recipe may write to memory.
virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const
Compute the cost of this recipe either using a recipe's specialized implementation or using the legac...
VPBasicBlock * getParent()
Definition VPlan.h:479
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:553
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this recipe, taking into account if the cost computation should be skipped and the...
bool isScalarCast() const
Return true if the recipe is a scalar cast.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const
Print the recipe, delegating to printRecipe().
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
unsigned getVPRecipeID() const
Definition VPlan.h:525
void moveAfter(VPRecipeBase *MovePos)
Unlink this recipe from its current VPBasicBlock and insert it into the VPBasicBlock that MovePos liv...
VPRecipeBase(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:469
friend class VPValue
Definition VPlanValue.h:304
void execute(VPTransformState &State) override
Generate the reduction in the loop.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:3168
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2721
bool isInLoop() const
Returns true if the phi is part of an in-loop reduction.
Definition VPlan.h:2745
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool isConditional() const
Return true if the in-loop reduction is conditional.
Definition VPlan.h:3110
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of VPReductionRecipe.
VPValue * getVecOp() const
The VPValue of the vector value to be reduced.
Definition VPlan.h:3121
VPValue * getCondOp() const
The VPValue of the condition for the block.
Definition VPlan.h:3123
RecurKind getRecurrenceKind() const
Return the recurrence kind for the in-loop reduction.
Definition VPlan.h:3106
bool isPartialReduction() const
Returns true if the reduction outputs a vector with a scaled down VF.
Definition VPlan.h:3112
VPValue * getChainOp() const
The VPValue of the scalar Chain being accumulated.
Definition VPlan.h:3119
bool isInLoop() const
Returns true if the reduction is in-loop.
Definition VPlan.h:3114
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the reduction in the loop.
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4364
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4440
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3190
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool isSingleScalar() const
Definition VPlan.h:3231
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPReplicateRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getOpcode() const
Definition VPlan.h:3260
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
VPValue * getStepValue() const
Definition VPlan.h:4018
VPValue * getStartIndex() const
Return the StartIndex, or null if known to be zero, valid only after unrolling.
Definition VPlan.h:4026
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the scalarized versions of the phi node as needed by their users.
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:605
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:672
LLVM_ABI_FOR_TEST LLVM_DUMP_METHOD void dump() const
Print this VPSingleDefRecipe to dbgs() (for debugging).
VPSingleDefRecipe(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:607
This class can be used to assign names to VPValues.
An analysis for type-inference for VPValues.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
Helper to access the operand that contains the unroll part for this recipe after unrolling.
Definition VPlan.h:1155
VPValue * getUnrollPartOperand(const VPUser &U) const
Return the VPValue operand containing the unroll part or null if there is no such operand.
unsigned getUnrollPart(const VPUser &U) const
Return the unroll part.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:329
void printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the operands to O.
Definition VPlan.cpp:1545
operand_range operands()
Definition VPlanValue.h:397
unsigned getNumOperands() const
Definition VPlanValue.h:367
operand_iterator op_begin()
Definition VPlanValue.h:393
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:368
virtual bool usesFirstLaneOnly(const VPValue *Op) const
Returns true if the VPUser only uses the first lane of operand Op.
Definition VPlanValue.h:412
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:49
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:138
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1496
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:128
void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const
Definition VPlan.cpp:1541
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:74
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:202
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1499
VPValue * getVFValue() const
Definition VPlan.h:2167
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getSourceElementType() const
Definition VPlan.h:2164
int64_t getStride() const
Definition VPlan.h:2165
void materializeOffset(unsigned Part=0)
Adds the offset operand to the recipe.
Type * getSourceElementType() const
Definition VPlan.h:2236
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
operand_range args()
Definition VPlan.h:2024
Function * getCalledScalarFunction() const
Definition VPlan.h:2020
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCallRecipe.
void execute(VPTransformState &State) override
Produce a widened version of the call instruction.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate a canonical vector induction variable of the vector loop, with start = {<Part*VF,...
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Instruction::CastOps getOpcode() const
Definition VPlan.h:1870
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getResultType() const
Returns the result type of the cast.
Definition VPlan.h:1873
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Produce widened copies of the cast.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCastRecipe.
void execute(VPTransformState &State) override
Generate the gep nodes.
Type * getSourceElementType() const
Definition VPlan.h:2121
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2389
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2392
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2490
TruncInst * getTruncInst()
Returns the first defined value as TruncInst, if it is one or nullptr otherwise.
Definition VPlan.h:2505
Type * getScalarType() const
Returns the scalar type of the induction.
Definition VPlan.h:2514
bool isCanonical() const
Returns true if the induction is canonical, i.e.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Intrinsic::ID getVectorIntrinsicID() const
Return the ID of the intrinsic.
Definition VPlan.h:1955
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
StringRef getIntrinsicName() const
Return to name of the intrinsic as string.
LLVM_ABI_FOR_TEST bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the VPUser only uses the first lane of operand Op.
Type * getResultType() const
Return the scalar return type of the intrinsic.
Definition VPlan.h:1958
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Produce a widened version of the vector intrinsic.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this vector intrinsic.
bool IsMasked
Whether the memory access is masked.
Definition VPlan.h:3512
bool isConsecutive() const
Return whether the loaded-from / stored-to addresses are consecutive.
Definition VPlan.h:3547
Instruction & Ingredient
Definition VPlan.h:3503
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenMemoryRecipe.
bool Consecutive
Whether the accessed addresses are consecutive.
Definition VPlan.h:3509
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3557
Align Alignment
Alignment information for this memory access.
Definition VPlan.h:3506
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:3550
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenPHIRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool onlyScalarsGenerated(bool IsScalable)
Returns true if only scalar values will be generated.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
void execute(VPTransformState &State) override
Produce a widened instruction using the opcode and operands of the recipe, processing State....
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getOpcode() const
Definition VPlan.h:1813
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition VPlan.h:4512
const DataLayout & getDataLayout() const
Definition VPlan.h:4708
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1096
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4810
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:393
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
void mutateType(Type *Ty)
Mutate the type of this Value to be of the specified type.
Definition Value.h:816
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
iterator erase(iterator where)
Definition ilist.h:204
pointer remove(iterator &IT)
Definition ilist.h:188
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor)
Returns the corresponding llvm.vector.deinterleaveN intrinsic for factor N.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI StringRef getBaseName(ID id)
Return the LLVM name for an intrinsic, without encoded types for overloading, such as "llvm....
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
auto m_Cmp()
Matches any compare instruction and ignore it.
bool match(Val *V, const Pattern &P)
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::Or, true > m_c_LogicalOr(const LHS &L, const RHS &R)
Matches L || R with LHS and RHS in either order.
specific_intval< 1 > m_False()
specific_intval< 1 > m_True()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isAddressSCEVForCost(const SCEV *Addr, ScalarEvolution &SE, const Loop *L)
Returns true if Addr is an address SCEV that can be passed to TTI::getAddressComputationCost,...
bool onlyFirstPartUsed(const VPValue *Def)
Returns true if only the first part of Def is used.
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
@ Offset
Definition DWP.cpp:557
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:830
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
InstructionCost Cost
@ Undef
Value of the register doesn't matter.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2312
auto cast_or_null(const Y &Val)
Definition Casting.h:714
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI Value * createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, Value *Right)
Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI Constant * createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Type * toVectorizedTy(Type *Ty, ElementCount EC)
A helper for converting to vectorized types.
cl::opt< unsigned > ForceTargetInstructionCost
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
LLVM_ABI bool isVectorIntrinsicWithStructReturnOverloadAtField(Intrinsic::ID ID, int RetIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic that returns a struct is overloaded at the struct elem...
@ Other
Any other memory.
Definition ModRef.h:68
bool canVectorizeTy(Type *Ty)
Returns true if Ty is a valid vector element type, void, or an unpacked literal struct where all elem...
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FMinimumNum
FP min with llvm.minimumnum semantics.
@ FMinimum
FP min with llvm.minimum semantics.
@ FMaxNum
FP max with llvm.maxnum semantics including NaNs.
@ Mul
Product of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ FindLast
FindLast reduction with select(cmp(),x,y) where x and y.
@ FMaximum
FP max with llvm.maximum semantics.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMinNum
FP min with llvm.minnum semantics including NaNs.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
@ FMaximumNum
FP max with llvm.maximumnum semantics.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about an recurrence kind, return the identity for the @llvm.vector....
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
LLVM_ABI Value * createOrderedReduction(IRBuilderBase &B, RecurKind RdxKind, Value *Src, Value *Start)
Create an ordered reduction intrinsic using the given recurrence kind RdxKind.
ArrayRef< Type * > getContainedTypes(Type *const &Ty)
Returns the types contained in Ty.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Struct to hold various analysis needed for cost computations.
LLVMContext & LLVMCtx
TargetTransformInfo::TargetCostKind CostKind
VPTypeAnalysis Types
const TargetTransformInfo & TTI
void execute(VPTransformState &State) override
Generate the phi nodes.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this first-order recurrence phi recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
An overlay for VPIRInstructions wrapping PHI nodes enabling convenient use cast/dyn_cast/isa and exec...
Definition VPlan.h:1744
PHINode & getIRPhi()
Definition VPlan.h:1757
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void execute(VPTransformState &State) override
Generate the instruction.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
A pure-virtual common base class for recipes defining a single VPValue and using IR flags.
Definition VPlan.h:1109
InstructionCost getCostForRecipeWithOpcode(unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const
Compute the cost for this recipe for VF, using Opcode and Ctx.
VPRecipeWithIRFlags(const unsigned char SC, ArrayRef< VPValue * > Operands, const VPIRFlags &Flags, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:1110
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:280
SmallDenseMap< const VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
struct llvm::VPTransformState::CFGState CFG
Value * get(const VPValue *Def, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def if IsScalar is false, otherwise return the gen...
Definition VPlan.cpp:280
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
ElementCount VF
The chosen Vectorization Factor of the loop being vectorized.
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Generate the wide load or gather.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenLoadEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3633
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate a wide load or gather.
VPValue * getStoredValue() const
Return the address accessed by this recipe.
Definition VPlan.h:3716
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Generate the wide store or scatter.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenStoreEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3719
void execute(VPTransformState &State) override
Generate a wide store or scatter.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition VPlan.h:3679