LLVM 23.0.0git
VPlanRecipes.cpp
Go to the documentation of this file.
1//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains implementations for different VPlan recipes.
11///
12//===----------------------------------------------------------------------===//
13
15#include "VPlan.h"
16#include "VPlanAnalysis.h"
17#include "VPlanHelpers.h"
18#include "VPlanPatternMatch.h"
19#include "VPlanUtils.h"
20#include "llvm/ADT/STLExtras.h"
22#include "llvm/ADT/Twine.h"
27#include "llvm/IR/BasicBlock.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/Instruction.h"
31#include "llvm/IR/Intrinsics.h"
32#include "llvm/IR/Type.h"
33#include "llvm/IR/Value.h"
36#include "llvm/Support/Debug.h"
40#include <cassert>
41
42using namespace llvm;
43using namespace llvm::VPlanPatternMatch;
44
46
47#define LV_NAME "loop-vectorize"
48#define DEBUG_TYPE LV_NAME
49
51 switch (getVPRecipeID()) {
52 case VPExpressionSC:
53 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
54 case VPInstructionSC: {
55 auto *VPI = cast<VPInstruction>(this);
56 // Loads read from memory but don't write to memory.
57 if (VPI->getOpcode() == Instruction::Load)
58 return false;
59 return VPI->opcodeMayReadOrWriteFromMemory();
60 }
61 case VPInterleaveEVLSC:
62 case VPInterleaveSC:
63 return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
64 case VPWidenStoreEVLSC:
65 case VPWidenStoreSC:
66 return true;
67 case VPReplicateSC:
68 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
69 ->mayWriteToMemory();
70 case VPWidenCallSC:
71 return !cast<VPWidenCallRecipe>(this)
72 ->getCalledScalarFunction()
73 ->onlyReadsMemory();
74 case VPWidenIntrinsicSC:
75 return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
76 case VPActiveLaneMaskPHISC:
77 case VPCanonicalIVPHISC:
78 case VPCurrentIterationPHISC:
79 case VPBranchOnMaskSC:
80 case VPDerivedIVSC:
81 case VPFirstOrderRecurrencePHISC:
82 case VPReductionPHISC:
83 case VPScalarIVStepsSC:
84 case VPPredInstPHISC:
85 return false;
86 case VPBlendSC:
87 case VPReductionEVLSC:
88 case VPReductionSC:
89 case VPVectorPointerSC:
90 case VPWidenCanonicalIVSC:
91 case VPWidenCastSC:
92 case VPWidenGEPSC:
93 case VPWidenIntOrFpInductionSC:
94 case VPWidenLoadEVLSC:
95 case VPWidenLoadSC:
96 case VPWidenPHISC:
97 case VPWidenPointerInductionSC:
98 case VPWidenSC: {
99 const Instruction *I =
100 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
101 (void)I;
102 assert((!I || !I->mayWriteToMemory()) &&
103 "underlying instruction may write to memory");
104 return false;
105 }
106 default:
107 return true;
108 }
109}
110
112 switch (getVPRecipeID()) {
113 case VPExpressionSC:
114 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
115 case VPInstructionSC:
116 return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
117 case VPWidenLoadEVLSC:
118 case VPWidenLoadSC:
119 return true;
120 case VPReplicateSC:
121 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
122 ->mayReadFromMemory();
123 case VPWidenCallSC:
124 return !cast<VPWidenCallRecipe>(this)
125 ->getCalledScalarFunction()
126 ->onlyWritesMemory();
127 case VPWidenIntrinsicSC:
128 return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
129 case VPCanonicalIVPHISC:
130 case VPBranchOnMaskSC:
131 case VPDerivedIVSC:
132 case VPCurrentIterationPHISC:
133 case VPFirstOrderRecurrencePHISC:
134 case VPReductionPHISC:
135 case VPPredInstPHISC:
136 case VPScalarIVStepsSC:
137 case VPWidenStoreEVLSC:
138 case VPWidenStoreSC:
139 return false;
140 case VPBlendSC:
141 case VPReductionEVLSC:
142 case VPReductionSC:
143 case VPVectorPointerSC:
144 case VPWidenCanonicalIVSC:
145 case VPWidenCastSC:
146 case VPWidenGEPSC:
147 case VPWidenIntOrFpInductionSC:
148 case VPWidenPHISC:
149 case VPWidenPointerInductionSC:
150 case VPWidenSC: {
151 const Instruction *I =
152 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
153 (void)I;
154 assert((!I || !I->mayReadFromMemory()) &&
155 "underlying instruction may read from memory");
156 return false;
157 }
158 default:
159 // FIXME: Return false if the recipe represents an interleaved store.
160 return true;
161 }
162}
163
165 switch (getVPRecipeID()) {
166 case VPExpressionSC:
167 return cast<VPExpressionRecipe>(this)->mayHaveSideEffects();
168 case VPActiveLaneMaskPHISC:
169 case VPDerivedIVSC:
170 case VPCurrentIterationPHISC:
171 case VPFirstOrderRecurrencePHISC:
172 case VPReductionPHISC:
173 case VPPredInstPHISC:
174 case VPVectorEndPointerSC:
175 return false;
176 case VPInstructionSC: {
177 auto *VPI = cast<VPInstruction>(this);
178 return mayWriteToMemory() ||
179 VPI->getOpcode() == VPInstruction::BranchOnCount ||
180 VPI->getOpcode() == VPInstruction::BranchOnCond ||
181 VPI->getOpcode() == VPInstruction::BranchOnTwoConds;
182 }
183 case VPWidenCallSC: {
184 Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
185 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
186 }
187 case VPWidenIntrinsicSC:
188 return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
189 case VPBlendSC:
190 case VPReductionEVLSC:
191 case VPReductionSC:
192 case VPScalarIVStepsSC:
193 case VPVectorPointerSC:
194 case VPWidenCanonicalIVSC:
195 case VPWidenCastSC:
196 case VPWidenGEPSC:
197 case VPWidenIntOrFpInductionSC:
198 case VPWidenPHISC:
199 case VPWidenPointerInductionSC:
200 case VPWidenSC: {
201 const Instruction *I =
202 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
203 (void)I;
204 assert((!I || !I->mayHaveSideEffects()) &&
205 "underlying instruction has side-effects");
206 return false;
207 }
208 case VPInterleaveEVLSC:
209 case VPInterleaveSC:
210 return mayWriteToMemory();
211 case VPWidenLoadEVLSC:
212 case VPWidenLoadSC:
213 case VPWidenStoreEVLSC:
214 case VPWidenStoreSC:
215 assert(
216 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
218 "mayHaveSideffects result for ingredient differs from this "
219 "implementation");
220 return mayWriteToMemory();
221 case VPReplicateSC: {
222 auto *R = cast<VPReplicateRecipe>(this);
223 return R->getUnderlyingInstr()->mayHaveSideEffects();
224 }
225 default:
226 return true;
227 }
228}
229
231 assert(!Parent && "Recipe already in some VPBasicBlock");
232 assert(InsertPos->getParent() &&
233 "Insertion position not in any VPBasicBlock");
234 InsertPos->getParent()->insert(this, InsertPos->getIterator());
235}
236
237void VPRecipeBase::insertBefore(VPBasicBlock &BB,
239 assert(!Parent && "Recipe already in some VPBasicBlock");
240 assert(I == BB.end() || I->getParent() == &BB);
241 BB.insert(this, I);
242}
243
245 assert(!Parent && "Recipe already in some VPBasicBlock");
246 assert(InsertPos->getParent() &&
247 "Insertion position not in any VPBasicBlock");
248 InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
249}
250
252 assert(getParent() && "Recipe not in any VPBasicBlock");
254 Parent = nullptr;
255}
256
258 assert(getParent() && "Recipe not in any VPBasicBlock");
260}
261
264 insertAfter(InsertPos);
265}
266
272
274 // Get the underlying instruction for the recipe, if there is one. It is used
275 // to
276 // * decide if cost computation should be skipped for this recipe,
277 // * apply forced target instruction cost.
278 Instruction *UI = nullptr;
279 if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
280 UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
281 else if (auto *IG = dyn_cast<VPInterleaveBase>(this))
282 UI = IG->getInsertPos();
283 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
284 UI = &WidenMem->getIngredient();
285
286 InstructionCost RecipeCost;
287 if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
288 RecipeCost = 0;
289 } else {
290 RecipeCost = computeCost(VF, Ctx);
291 if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
292 RecipeCost.isValid()) {
293 if (UI)
295 else
296 RecipeCost = InstructionCost(0);
297 }
298 }
299
300 LLVM_DEBUG({
301 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
302 dump();
303 });
304 return RecipeCost;
305}
306
308 VPCostContext &Ctx) const {
309 llvm_unreachable("subclasses should implement computeCost");
310}
311
313 return (getVPRecipeID() >= VPFirstPHISC && getVPRecipeID() <= VPLastPHISC) ||
315}
316
318 auto *VPI = dyn_cast<VPInstruction>(this);
319 return VPI && Instruction::isCast(VPI->getOpcode());
320}
321
323 assert(OpType == Other.OpType && "OpType must match");
324 switch (OpType) {
325 case OperationType::OverflowingBinOp:
326 WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
327 WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
328 break;
329 case OperationType::Trunc:
330 TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
331 TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
332 break;
333 case OperationType::DisjointOp:
334 DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
335 break;
336 case OperationType::PossiblyExactOp:
337 ExactFlags.IsExact &= Other.ExactFlags.IsExact;
338 break;
339 case OperationType::GEPOp:
340 GEPFlagsStorage &= Other.GEPFlagsStorage;
341 break;
342 case OperationType::FPMathOp:
343 case OperationType::FCmp:
344 assert((OpType != OperationType::FCmp ||
345 FCmpFlags.CmpPredStorage == Other.FCmpFlags.CmpPredStorage) &&
346 "Cannot drop CmpPredicate");
347 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
348 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
349 break;
350 case OperationType::NonNegOp:
351 NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
352 break;
353 case OperationType::Cmp:
354 assert(CmpPredStorage == Other.CmpPredStorage &&
355 "Cannot drop CmpPredicate");
356 break;
357 case OperationType::ReductionOp:
358 assert(ReductionFlags.Kind == Other.ReductionFlags.Kind &&
359 "Cannot change RecurKind");
360 assert(ReductionFlags.IsOrdered == Other.ReductionFlags.IsOrdered &&
361 "Cannot change IsOrdered");
362 assert(ReductionFlags.IsInLoop == Other.ReductionFlags.IsInLoop &&
363 "Cannot change IsInLoop");
364 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
365 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
366 break;
367 case OperationType::Other:
368 break;
369 }
370}
371
373 assert((OpType == OperationType::FPMathOp || OpType == OperationType::FCmp ||
374 OpType == OperationType::ReductionOp ||
375 OpType == OperationType::Other) &&
376 "recipe doesn't have fast math flags");
377 if (OpType == OperationType::Other)
378 return FastMathFlags();
379 const FastMathFlagsTy &F = getFMFsRef();
380 FastMathFlags Res;
381 Res.setAllowReassoc(F.AllowReassoc);
382 Res.setNoNaNs(F.NoNaNs);
383 Res.setNoInfs(F.NoInfs);
384 Res.setNoSignedZeros(F.NoSignedZeros);
385 Res.setAllowReciprocal(F.AllowReciprocal);
386 Res.setAllowContract(F.AllowContract);
387 Res.setApproxFunc(F.ApproxFunc);
388 return Res;
389}
390
391#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
393
394void VPRecipeBase::print(raw_ostream &O, const Twine &Indent,
395 VPSlotTracker &SlotTracker) const {
396 printRecipe(O, Indent, SlotTracker);
397 if (auto DL = getDebugLoc()) {
398 O << ", !dbg ";
399 DL.print(O);
400 }
401
402 if (auto *Metadata = dyn_cast<VPIRMetadata>(this))
404}
405#endif
406
407template <unsigned PartOpIdx>
408VPValue *
410 if (U.getNumOperands() == PartOpIdx + 1)
411 return U.getOperand(PartOpIdx);
412 return nullptr;
413}
414
415template <unsigned PartOpIdx>
417 if (auto *UnrollPartOp = getUnrollPartOperand(U))
418 return cast<VPConstantInt>(UnrollPartOp)->getZExtValue();
419 return 0;
420}
421
422namespace llvm {
423template class VPUnrollPartAccessor<1>;
424template class VPUnrollPartAccessor<2>;
425template class VPUnrollPartAccessor<3>;
426}
427
429 const VPIRFlags &Flags, const VPIRMetadata &MD,
430 DebugLoc DL, const Twine &Name)
431 : VPRecipeWithIRFlags(VPRecipeBase::VPInstructionSC, Operands, Flags, DL),
432 VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) {
434 "Set flags not supported for the provided opcode");
436 "Opcode requires specific flags to be set");
440 "number of operands does not match opcode");
441}
442
443/// For call VPInstructions, return the operand index of the called function.
444/// The function is either the last operand (for unmasked calls) or the
445/// second-to-last operand (for masked calls).
446static unsigned getCalledFnOperandIndex(const VPInstruction &VPI) {
447 assert(VPI.getOpcode() == Instruction::Call && "must be a call");
448 unsigned NumOps = VPI.getNumOperands();
449 auto *LastOp = dyn_cast<VPIRValue>(VPI.getOperand(NumOps - 1));
450 if (LastOp && isa<Function>(LastOp->getValue()))
451 return NumOps - 1;
452 assert(
453 isa<Function>(cast<VPIRValue>(VPI.getOperand(NumOps - 2))->getValue()) &&
454 "expected function operand");
455 return NumOps - 2;
456}
457
458/// For call VPInstructions, return the called function.
460 unsigned Idx = getCalledFnOperandIndex(VPI);
461 return cast<Function>(cast<VPIRValue>(VPI.getOperand(Idx))->getValue());
462}
463
465 if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
466 return 1;
467
468 if (Instruction::isBinaryOp(Opcode))
469 return 2;
470
471 switch (Opcode) {
474 return 0;
475 case Instruction::Alloca:
476 case Instruction::ExtractValue:
477 case Instruction::Freeze:
478 case Instruction::Load:
491 return 1;
492 case Instruction::ICmp:
493 case Instruction::FCmp:
494 case Instruction::ExtractElement:
495 case Instruction::Store:
505 return 2;
506 case Instruction::Select:
509 return 3;
510 case Instruction::Call:
511 return getCalledFnOperandIndex(*this) + 1;
512 case Instruction::GetElementPtr:
513 case Instruction::PHI:
514 case Instruction::Switch:
527 // Cannot determine the number of operands from the opcode.
528 return -1u;
529 }
530 llvm_unreachable("all cases should be handled above");
531}
532
536
537bool VPInstruction::canGenerateScalarForFirstLane() const {
539 return true;
541 return true;
542 switch (Opcode) {
543 case Instruction::Freeze:
544 case Instruction::ICmp:
545 case Instruction::PHI:
546 case Instruction::Select:
556 return true;
557 default:
558 return false;
559 }
560}
561
562Value *VPInstruction::generate(VPTransformState &State) {
563 IRBuilderBase &Builder = State.Builder;
564
566 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
567 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
568 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
569 auto *Res =
570 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
571 if (auto *I = dyn_cast<Instruction>(Res))
572 applyFlags(*I);
573 return Res;
574 }
575
576 switch (getOpcode()) {
577 case VPInstruction::Not: {
578 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
579 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
580 return Builder.CreateNot(A, Name);
581 }
582 case Instruction::ExtractElement: {
583 assert(State.VF.isVector() && "Only extract elements from vectors");
584 if (auto *Idx = dyn_cast<VPConstantInt>(getOperand(1)))
585 return State.get(getOperand(0), VPLane(Idx->getZExtValue()));
586 Value *Vec = State.get(getOperand(0));
587 Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
588 return Builder.CreateExtractElement(Vec, Idx, Name);
589 }
590 case Instruction::Freeze: {
592 return Builder.CreateFreeze(Op, Name);
593 }
594 case Instruction::FCmp:
595 case Instruction::ICmp: {
596 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
597 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
598 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
599 return Builder.CreateCmp(getPredicate(), A, B, Name);
600 }
601 case Instruction::PHI: {
602 llvm_unreachable("should be handled by VPPhi::execute");
603 }
604 case Instruction::Select: {
605 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
606 Value *Cond =
607 State.get(getOperand(0),
608 OnlyFirstLaneUsed || vputils::isSingleScalar(getOperand(0)));
609 Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
610 Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
611 return Builder.CreateSelectFMF(Cond, Op1, Op2, getFastMathFlags(), Name);
612 }
614 // Get first lane of vector induction variable.
615 Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
616 // Get the original loop tripcount.
617 Value *ScalarTC = State.get(getOperand(1), VPLane(0));
618
619 // If this part of the active lane mask is scalar, generate the CMP directly
620 // to avoid unnecessary extracts.
621 if (State.VF.isScalar())
622 return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
623 Name);
624
625 ElementCount EC = State.VF.multiplyCoefficientBy(
626 cast<VPConstantInt>(getOperand(2))->getZExtValue());
627 auto *PredTy = VectorType::get(Builder.getInt1Ty(), EC);
628 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
629 {PredTy, ScalarTC->getType()},
630 {VIVElem0, ScalarTC}, nullptr, Name);
631 }
633 // Generate code to combine the previous and current values in vector v3.
634 //
635 // vector.ph:
636 // v_init = vector(..., ..., ..., a[-1])
637 // br vector.body
638 //
639 // vector.body
640 // i = phi [0, vector.ph], [i+4, vector.body]
641 // v1 = phi [v_init, vector.ph], [v2, vector.body]
642 // v2 = a[i, i+1, i+2, i+3];
643 // v3 = vector(v1(3), v2(0, 1, 2))
644
645 auto *V1 = State.get(getOperand(0));
646 if (!V1->getType()->isVectorTy())
647 return V1;
648 Value *V2 = State.get(getOperand(1));
649 return Builder.CreateVectorSpliceRight(V1, V2, 1, Name);
650 }
652 Value *ScalarTC = State.get(getOperand(0), VPLane(0));
653 Value *VFxUF = State.get(getOperand(1), VPLane(0));
654 Value *Sub = Builder.CreateSub(ScalarTC, VFxUF);
655 Value *Cmp =
656 Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, VFxUF);
658 return Builder.CreateSelect(Cmp, Sub, Zero);
659 }
661 // TODO: Restructure this code with an explicit remainder loop, vsetvli can
662 // be outside of the main loop.
663 Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
664 // Compute EVL
665 assert(AVL->getType()->isIntegerTy() &&
666 "Requested vector length should be an integer.");
667
668 assert(State.VF.isScalable() && "Expected scalable vector factor.");
669 Value *VFArg = Builder.getInt32(State.VF.getKnownMinValue());
670
671 Value *EVL = Builder.CreateIntrinsic(
672 Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
673 {AVL, VFArg, Builder.getTrue()});
674 return EVL;
675 }
677 Value *Cond = State.get(getOperand(0), VPLane(0));
678 // Replace the temporary unreachable terminator with a new conditional
679 // branch, hooking it up to backward destination for latch blocks now, and
680 // to forward destination(s) later when they are created.
681 // Second successor may be backwards - iff it is already in VPBB2IRBB.
682 VPBasicBlock *SecondVPSucc =
683 cast<VPBasicBlock>(getParent()->getSuccessors()[1]);
684 BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc);
685 BasicBlock *IRBB = State.CFG.VPBB2IRBB[getParent()];
686 auto *Br = Builder.CreateCondBr(Cond, IRBB, SecondIRSucc);
687 // First successor is always forward, reset it to nullptr.
688 Br->setSuccessor(0, nullptr);
690 applyMetadata(*Br);
691 return Br;
692 }
694 return Builder.CreateVectorSplat(
695 State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
696 }
698 // For struct types, we need to build a new 'wide' struct type, where each
699 // element is widened, i.e., we create a struct of vectors.
700 auto *StructTy =
702 Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF));
703 for (const auto &[LaneIndex, Op] : enumerate(operands())) {
704 for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
705 FieldIndex++) {
706 Value *ScalarValue =
707 Builder.CreateExtractValue(State.get(Op, true), FieldIndex);
708 Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex);
709 VectorValue =
710 Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex);
711 Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex);
712 }
713 }
714 return Res;
715 }
717 auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0));
718 auto NumOfElements = ElementCount::getFixed(getNumOperands());
719 Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements));
720 for (const auto &[Idx, Op] : enumerate(operands()))
721 Res = Builder.CreateInsertElement(Res, State.get(Op, true),
722 Builder.getInt32(Idx));
723 return Res;
724 }
726 if (State.VF.isScalar())
727 return State.get(getOperand(0), true);
728 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
730 // If this start vector is scaled then it should produce a vector with fewer
731 // elements than the VF.
732 ElementCount VF = State.VF.divideCoefficientBy(
733 cast<VPConstantInt>(getOperand(2))->getZExtValue());
734 auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true));
735 return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true),
736 Builder.getInt32(0));
737 }
739 Value *Start = State.get(getOperand(0), VPLane(0));
740 Value *NewVal = State.get(getOperand(1), VPLane(0));
741 Value *ReducedResult = State.get(getOperand(2), VPLane(0));
742 // The compares in the loop may yield poison, which propagates through the
743 // bitwise ORs. Freeze it here before the condition is used.
744 ReducedResult = Builder.CreateFreeze(ReducedResult);
745 return Builder.CreateSelect(ReducedResult, NewVal, Start, "rdx.select");
746 }
748 RecurKind RK = getRecurKind();
749 bool IsOrdered = isReductionOrdered();
750 bool IsInLoop = isReductionInLoop();
752 "FindIV should use min/max reduction kinds");
753
754 // The recipe may have multiple operands to be reduced together.
755 unsigned NumOperandsToReduce = getNumOperands();
756 VectorParts RdxParts(NumOperandsToReduce);
757 for (unsigned Part = 0; Part < NumOperandsToReduce; ++Part)
758 RdxParts[Part] = State.get(getOperand(Part), IsInLoop);
759
760 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
762
763 // Reduce multiple operands into one.
764 Value *ReducedPartRdx = RdxParts[0];
765 if (IsOrdered) {
766 ReducedPartRdx = RdxParts[NumOperandsToReduce - 1];
767 } else {
768 // Floating-point operations should have some FMF to enable the reduction.
769 for (unsigned Part = 1; Part < NumOperandsToReduce; ++Part) {
770 Value *RdxPart = RdxParts[Part];
772 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
773 else {
774 // For sub-recurrences, each part's reduction variable is already
775 // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1)
777 RK == RecurKind::Sub
778 ? Instruction::Add
780 ReducedPartRdx =
781 Builder.CreateBinOp(Opcode, RdxPart, ReducedPartRdx, "bin.rdx");
782 }
783 }
784 }
785
786 // Create the reduction after the loop. Note that inloop reductions create
787 // the target reduction in the loop using a Reduction recipe.
788 if (State.VF.isVector() && !IsInLoop) {
789 // TODO: Support in-order reductions based on the recurrence descriptor.
790 // All ops in the reduction inherit fast-math-flags from the recurrence
791 // descriptor.
792 ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK);
793 }
794
795 return ReducedPartRdx;
796 }
799 unsigned Offset =
801 Value *Res;
802 if (State.VF.isVector()) {
803 assert(Offset <= State.VF.getKnownMinValue() &&
804 "invalid offset to extract from");
805 // Extract lane VF - Offset from the operand.
806 Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
807 } else {
808 // TODO: Remove ExtractLastLane for scalar VFs.
809 assert(Offset <= 1 && "invalid offset to extract from");
810 Res = State.get(getOperand(0));
811 }
813 Res->setName(Name);
814 return Res;
815 }
817 Value *A = State.get(getOperand(0));
818 Value *B = State.get(getOperand(1));
819 return Builder.CreateLogicalAnd(A, B, Name);
820 }
822 Value *A = State.get(getOperand(0));
823 Value *B = State.get(getOperand(1));
824 return Builder.CreateLogicalOr(A, B, Name);
825 }
827 assert((State.VF.isScalar() || vputils::onlyFirstLaneUsed(this)) &&
828 "can only generate first lane for PtrAdd");
829 Value *Ptr = State.get(getOperand(0), VPLane(0));
830 Value *Addend = State.get(getOperand(1), VPLane(0));
831 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
832 }
834 Value *Ptr =
836 Value *Addend = State.get(getOperand(1));
837 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
838 }
840 Value *Res = Builder.CreateFreeze(State.get(getOperand(0)));
841 for (VPValue *Op : drop_begin(operands()))
842 Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op)));
843 return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
844 }
846 assert(getNumOperands() != 2 && "ExtractLane from single source should be "
847 "simplified to ExtractElement.");
848 Value *LaneToExtract = State.get(getOperand(0), true);
849 Type *IdxTy = State.TypeAnalysis.inferScalarType(getOperand(0));
850 Value *Res = nullptr;
851 Value *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
852
853 for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
854 Value *VectorStart =
855 Builder.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Idx - 1));
856 Value *VectorIdx = Idx == 1
857 ? LaneToExtract
858 : Builder.CreateSub(LaneToExtract, VectorStart);
859 Value *Ext = State.VF.isScalar()
860 ? State.get(getOperand(Idx))
861 : Builder.CreateExtractElement(
862 State.get(getOperand(Idx)), VectorIdx);
863 if (Res) {
864 Value *Cmp = Builder.CreateICmpUGE(LaneToExtract, VectorStart);
865 Res = Builder.CreateSelect(Cmp, Ext, Res);
866 } else {
867 Res = Ext;
868 }
869 }
870 return Res;
871 }
873 Type *Ty = State.TypeAnalysis.inferScalarType(this);
874 if (getNumOperands() == 1) {
875 Value *Mask = State.get(getOperand(0));
876 return Builder.CreateCountTrailingZeroElems(Ty, Mask,
877 /*ZeroIsPoison=*/false, Name);
878 }
879 // If there are multiple operands, create a chain of selects to pick the
880 // first operand with an active lane and add the number of lanes of the
881 // preceding operands.
882 Value *RuntimeVF = getRuntimeVF(Builder, Ty, State.VF);
883 unsigned LastOpIdx = getNumOperands() - 1;
884 Value *Res = nullptr;
885 for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
886 Value *TrailingZeros =
887 State.VF.isScalar()
888 ? Builder.CreateZExt(
889 Builder.CreateICmpEQ(State.get(getOperand(Idx)),
890 Builder.getFalse()),
891 Ty)
893 Ty, State.get(getOperand(Idx)),
894 /*ZeroIsPoison=*/false, Name);
895 Value *Current = Builder.CreateAdd(
896 Builder.CreateMul(RuntimeVF, ConstantInt::get(Ty, Idx)),
897 TrailingZeros);
898 if (Res) {
899 Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF);
900 Res = Builder.CreateSelect(Cmp, Current, Res);
901 } else {
902 Res = Current;
903 }
904 }
905
906 return Res;
907 }
909 return State.get(getOperand(0), true);
911 return Builder.CreateVectorReverse(State.get(getOperand(0)), "reverse");
913 Value *Result = State.get(getOperand(0), /*IsScalar=*/true);
914 for (unsigned Idx = 1; Idx < getNumOperands(); Idx += 2) {
915 Value *Data = State.get(getOperand(Idx));
916 Value *Mask = State.get(getOperand(Idx + 1));
917 Type *VTy = Data->getType();
918
919 if (State.VF.isScalar())
920 Result = Builder.CreateSelect(Mask, Data, Result);
921 else
922 Result = Builder.CreateIntrinsic(
923 Intrinsic::experimental_vector_extract_last_active, {VTy},
924 {Data, Mask, Result});
925 }
926
927 return Result;
928 }
929 default:
930 llvm_unreachable("Unsupported opcode for instruction");
931 }
932}
933
935 unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
936 Type *ScalarTy = Ctx.Types.inferScalarType(this);
937 Type *ResultTy = VF.isVector() ? toVectorTy(ScalarTy, VF) : ScalarTy;
938 switch (Opcode) {
939 case Instruction::FNeg:
940 return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, Ctx.CostKind);
941 case Instruction::UDiv:
942 case Instruction::SDiv:
943 case Instruction::SRem:
944 case Instruction::URem:
945 case Instruction::Add:
946 case Instruction::FAdd:
947 case Instruction::Sub:
948 case Instruction::FSub:
949 case Instruction::Mul:
950 case Instruction::FMul:
951 case Instruction::FDiv:
952 case Instruction::FRem:
953 case Instruction::Shl:
954 case Instruction::LShr:
955 case Instruction::AShr:
956 case Instruction::And:
957 case Instruction::Or:
958 case Instruction::Xor: {
959 // Certain instructions can be cheaper if they have a constant second
960 // operand. One example of this are shifts on x86.
961 VPValue *RHS = getOperand(1);
962 TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(RHS);
963
964 if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
967
970 if (CtxI)
971 Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
972 return Ctx.TTI.getArithmeticInstrCost(
973 Opcode, ResultTy, Ctx.CostKind,
974 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
975 RHSInfo, Operands, CtxI, &Ctx.TLI);
976 }
977 case Instruction::Freeze:
978 // This opcode is unknown. Assume that it is the same as 'mul'.
979 return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, ResultTy,
980 Ctx.CostKind);
981 case Instruction::ExtractValue:
982 return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
983 Ctx.CostKind);
984 case Instruction::ICmp:
985 case Instruction::FCmp: {
986 Type *ScalarOpTy = Ctx.Types.inferScalarType(getOperand(0));
987 Type *OpTy = VF.isVector() ? toVectorTy(ScalarOpTy, VF) : ScalarOpTy;
989 return Ctx.TTI.getCmpSelInstrCost(
990 Opcode, OpTy, CmpInst::makeCmpResultType(OpTy), getPredicate(),
991 Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None},
992 {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
993 }
994 case Instruction::BitCast: {
995 Type *ScalarTy = Ctx.Types.inferScalarType(this);
996 if (ScalarTy->isPointerTy())
997 return 0;
998 [[fallthrough]];
999 }
1000 case Instruction::SExt:
1001 case Instruction::ZExt:
1002 case Instruction::FPToUI:
1003 case Instruction::FPToSI:
1004 case Instruction::FPExt:
1005 case Instruction::PtrToInt:
1006 case Instruction::PtrToAddr:
1007 case Instruction::IntToPtr:
1008 case Instruction::SIToFP:
1009 case Instruction::UIToFP:
1010 case Instruction::Trunc:
1011 case Instruction::FPTrunc:
1012 case Instruction::AddrSpaceCast: {
1013 // Computes the CastContextHint from a recipe that may access memory.
1014 auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
1015 if (isa<VPInterleaveBase>(R))
1017 if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R)) {
1018 // Only compute CCH for memory operations, matching the legacy model
1019 // which only considers loads/stores for cast context hints.
1020 auto *UI = cast<Instruction>(ReplicateRecipe->getUnderlyingValue());
1021 if (!isa<LoadInst, StoreInst>(UI))
1023 return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
1025 }
1026 const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
1027 if (WidenMemoryRecipe == nullptr)
1029 if (VF.isScalar())
1031 if (!WidenMemoryRecipe->isConsecutive())
1033 if (WidenMemoryRecipe->isReverse())
1035 if (WidenMemoryRecipe->isMasked())
1038 };
1039
1040 VPValue *Operand = getOperand(0);
1042 // For Trunc/FPTrunc, get the context from the only user.
1043 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
1044 auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * {
1045 if (R->getNumUsers() == 0 || R->hasMoreThanOneUniqueUser())
1046 return nullptr;
1047 return dyn_cast<VPRecipeBase>(*R->user_begin());
1048 };
1049 if (VPRecipeBase *Recipe = GetOnlyUser(this)) {
1050 if (match(Recipe, m_Reverse(m_VPValue())))
1051 Recipe = GetOnlyUser(cast<VPInstruction>(Recipe));
1052 if (Recipe)
1053 CCH = ComputeCCH(Recipe);
1054 }
1055 }
1056 // For Z/Sext, get the context from the operand.
1057 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
1058 Opcode == Instruction::FPExt) {
1059 if (auto *Recipe = Operand->getDefiningRecipe()) {
1060 VPValue *ReverseOp;
1061 if (match(Recipe, m_Reverse(m_VPValue(ReverseOp))))
1062 Recipe = ReverseOp->getDefiningRecipe();
1063 if (Recipe)
1064 CCH = ComputeCCH(Recipe);
1065 }
1066 }
1067
1068 auto *ScalarSrcTy = Ctx.Types.inferScalarType(Operand);
1069 Type *SrcTy = VF.isVector() ? toVectorTy(ScalarSrcTy, VF) : ScalarSrcTy;
1070 // Arm TTI will use the underlying instruction to determine the cost.
1071 return Ctx.TTI.getCastInstrCost(
1072 Opcode, ResultTy, SrcTy, CCH, Ctx.CostKind,
1074 }
1075 case Instruction::Select: {
1077 bool IsScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
1078 Type *ScalarTy = Ctx.Types.inferScalarType(this);
1079
1080 VPValue *Op0, *Op1;
1081 bool IsLogicalAnd =
1082 match(this, m_c_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1)));
1083 bool IsLogicalOr =
1084 match(this, m_c_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
1085 // Also match the inverted forms:
1086 // select x, false, y --> !x & y (still AND)
1087 // select x, y, true --> !x | y (still OR)
1088 IsLogicalAnd |=
1089 match(this, m_Select(m_VPValue(Op0), m_False(), m_VPValue(Op1)));
1090 IsLogicalOr |=
1091 match(this, m_Select(m_VPValue(Op0), m_VPValue(Op1), m_True()));
1092
1093 if (!IsScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
1094 (IsLogicalAnd || IsLogicalOr)) {
1095 // select x, y, false --> x & y
1096 // select x, true, y --> x | y
1097 const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0);
1098 const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1);
1099
1101 if (SI && all_of(operands(),
1102 [](VPValue *Op) { return Op->getUnderlyingValue(); }))
1103 append_range(Operands, SI->operands());
1104 return Ctx.TTI.getArithmeticInstrCost(
1105 IsLogicalOr ? Instruction::Or : Instruction::And, ResultTy,
1106 Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
1107 }
1108
1109 Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
1110 if (!IsScalarCond && VF.isVector())
1111 CondTy = VectorType::get(CondTy, VF);
1112
1113 llvm::CmpPredicate Pred;
1114 if (!match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue())))
1115 if (auto *CondIRV = dyn_cast<VPIRValue>(getOperand(0)))
1116 if (auto *Cmp = dyn_cast<CmpInst>(CondIRV->getValue()))
1117 Pred = Cmp->getPredicate();
1118 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1119 return Ctx.TTI.getCmpSelInstrCost(
1120 Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
1121 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
1122 }
1123 }
1124 llvm_unreachable("called for unsupported opcode");
1125}
1126
1128 VPCostContext &Ctx) const {
1130 if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) {
1131 // TODO: Compute cost for VPInstructions without underlying values once
1132 // the legacy cost model has been retired.
1133 return 0;
1134 }
1135
1137 "Should only generate a vector value or single scalar, not scalars "
1138 "for all lanes.");
1140 getOpcode(),
1142 }
1143
1144 switch (getOpcode()) {
1145 case Instruction::Select: {
1147 match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue()));
1148 auto *CondTy = Ctx.Types.inferScalarType(getOperand(0));
1149 auto *VecTy = Ctx.Types.inferScalarType(getOperand(1));
1150 if (!vputils::onlyFirstLaneUsed(this)) {
1151 CondTy = toVectorTy(CondTy, VF);
1152 VecTy = toVectorTy(VecTy, VF);
1153 }
1154 return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred,
1155 Ctx.CostKind);
1156 }
1157 case Instruction::ExtractElement:
1159 if (VF.isScalar()) {
1160 // ExtractLane with VF=1 takes care of handling extracting across multiple
1161 // parts.
1162 return 0;
1163 }
1164
1165 // Add on the cost of extracting the element.
1166 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1167 return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
1168 Ctx.CostKind);
1169 }
1170 case VPInstruction::AnyOf: {
1171 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1172 return Ctx.TTI.getArithmeticReductionCost(
1173 Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
1174 }
1176 Type *Ty = Ctx.Types.inferScalarType(this);
1177 Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
1178 if (VF.isScalar())
1179 return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
1181 CmpInst::ICMP_EQ, Ctx.CostKind);
1182 // Calculate the cost of determining the lane index.
1183 auto *PredTy = toVectorTy(ScalarTy, VF);
1184 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
1185 {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
1186 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1187 }
1189 Type *Ty = Ctx.Types.inferScalarType(this);
1190 Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
1191 if (VF.isScalar())
1192 return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
1195 // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB.
1196 auto *PredTy = toVectorTy(ScalarTy, VF);
1197 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
1198 {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
1200 // Add cost of NOT operation on the predicate.
1202 Instruction::Xor, PredTy, Ctx.CostKind,
1203 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1204 {TargetTransformInfo::OK_UniformConstantValue,
1205 TargetTransformInfo::OP_None});
1206 // Add cost of SUB operation on the index.
1207 Cost += Ctx.TTI.getArithmeticInstrCost(Instruction::Sub, Ty, Ctx.CostKind);
1208 return Cost;
1209 }
1211 Type *ScalarTy = Ctx.Types.inferScalarType(this);
1212 Type *VecTy = toVectorTy(ScalarTy, VF);
1213 Type *MaskTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
1214 IntrinsicCostAttributes ICA(
1215 Intrinsic::experimental_vector_extract_last_active, ScalarTy,
1216 {VecTy, MaskTy, ScalarTy});
1217 return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind);
1218 }
1220 assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
1221 SmallVector<int> Mask(VF.getKnownMinValue());
1222 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
1223 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1224
1226 cast<VectorType>(VectorTy),
1227 cast<VectorType>(VectorTy), Mask,
1228 Ctx.CostKind, VF.getKnownMinValue() - 1);
1229 }
1231 Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
1232 unsigned Multiplier = cast<VPConstantInt>(getOperand(2))->getZExtValue();
1233 Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
1234 IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
1235 {ArgTy, ArgTy});
1236 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1237 }
1239 Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0));
1240 Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
1241 Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
1242 IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
1243 I32Ty, {Arg0Ty, I32Ty, I1Ty});
1244 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1245 }
1247 assert(VF.isVector() && "Reverse operation must be vector type");
1248 auto *VectorTy = cast<VectorType>(
1251 VectorTy, /*Mask=*/{}, Ctx.CostKind,
1252 /*Index=*/0);
1253 }
1255 // Add on the cost of extracting the element.
1256 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1257 return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
1258 VecTy, Ctx.CostKind, 0);
1259 }
1261 if (VF == ElementCount::getScalable(1))
1263 [[fallthrough]];
1264 default:
1265 // TODO: Compute cost other VPInstructions once the legacy cost model has
1266 // been retired.
1268 "unexpected VPInstruction witht underlying value");
1269 return 0;
1270 }
1271}
1272
1285
1287 switch (getOpcode()) {
1288 case Instruction::Load:
1289 case Instruction::PHI:
1293 return true;
1294 default:
1295 return isScalarCast();
1296 }
1297}
1298
1300 assert(!isMasked() && "cannot execute masked VPInstruction");
1301 assert(!State.Lane && "VPInstruction executing an Lane");
1302 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1304 "Set flags not supported for the provided opcode");
1306 "Opcode requires specific flags to be set");
1307 if (hasFastMathFlags())
1308 State.Builder.setFastMathFlags(getFastMathFlags());
1309 Value *GeneratedValue = generate(State);
1310 if (!hasResult())
1311 return;
1312 assert(GeneratedValue && "generate must produce a value");
1313 bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
1316 assert((((GeneratedValue->getType()->isVectorTy() ||
1317 GeneratedValue->getType()->isStructTy()) ==
1318 !GeneratesPerFirstLaneOnly) ||
1319 State.VF.isScalar()) &&
1320 "scalar value but not only first lane defined");
1321 State.set(this, GeneratedValue,
1322 /*IsScalar*/ GeneratesPerFirstLaneOnly);
1324 // FIXME: This is a workaround to enable reliable updates of the scalar loop
1325 // resume phis, when vectorizing the epilogue. Must be removed once epilogue
1326 // vectorization explicitly connects VPlans.
1327 setUnderlyingValue(GeneratedValue);
1328 }
1329}
1330
1334 return false;
1335 switch (getOpcode()) {
1336 case Instruction::ExtractValue:
1337 case Instruction::InsertValue:
1338 case Instruction::GetElementPtr:
1339 case Instruction::ExtractElement:
1340 case Instruction::Freeze:
1341 case Instruction::FCmp:
1342 case Instruction::ICmp:
1343 case Instruction::Select:
1344 case Instruction::PHI:
1368 case VPInstruction::Not:
1377 return false;
1378 case Instruction::Call:
1379 return !getCalledFunction(*this)->doesNotAccessMemory();
1380 default:
1381 return true;
1382 }
1383}
1384
1386 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1388 return vputils::onlyFirstLaneUsed(this);
1389
1390 switch (getOpcode()) {
1391 default:
1392 return false;
1393 case Instruction::ExtractElement:
1394 return Op == getOperand(1);
1395 case Instruction::PHI:
1396 return true;
1397 case Instruction::FCmp:
1398 case Instruction::ICmp:
1399 case Instruction::Select:
1400 case Instruction::Or:
1401 case Instruction::Freeze:
1402 case VPInstruction::Not:
1403 // TODO: Cover additional opcodes.
1404 return vputils::onlyFirstLaneUsed(this);
1405 case Instruction::Load:
1414 return true;
1417 // Before replicating by VF, Build(Struct)Vector uses all lanes of the
1418 // operand, after replicating its operands only the first lane is used.
1419 // Before replicating, it will have only a single operand.
1420 return getNumOperands() > 1;
1422 return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
1424 // WidePtrAdd supports scalar and vector base addresses.
1425 return false;
1427 return Op == getOperand(0) || Op == getOperand(1);
1430 return Op == getOperand(0);
1431 };
1432 llvm_unreachable("switch should return");
1433}
1434
1436 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1438 return vputils::onlyFirstPartUsed(this);
1439
1440 switch (getOpcode()) {
1441 default:
1442 return false;
1443 case Instruction::FCmp:
1444 case Instruction::ICmp:
1445 case Instruction::Select:
1446 return vputils::onlyFirstPartUsed(this);
1451 return true;
1452 };
1453 llvm_unreachable("switch should return");
1454}
1455
1456#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1458 VPSlotTracker SlotTracker(getParent()->getPlan());
1460}
1461
1463 VPSlotTracker &SlotTracker) const {
1464 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1465
1466 if (hasResult()) {
1468 O << " = ";
1469 }
1470
1471 switch (getOpcode()) {
1472 case VPInstruction::Not:
1473 O << "not";
1474 break;
1476 O << "combined load";
1477 break;
1479 O << "combined store";
1480 break;
1482 O << "active lane mask";
1483 break;
1485 O << "EXPLICIT-VECTOR-LENGTH";
1486 break;
1488 O << "first-order splice";
1489 break;
1491 O << "branch-on-cond";
1492 break;
1494 O << "branch-on-two-conds";
1495 break;
1497 O << "TC > VF ? TC - VF : 0";
1498 break;
1500 O << "VF * Part +";
1501 break;
1503 O << "branch-on-count";
1504 break;
1506 O << "broadcast";
1507 break;
1509 O << "buildstructvector";
1510 break;
1512 O << "buildvector";
1513 break;
1515 O << "exiting-iv-value";
1516 break;
1518 O << "masked-cond";
1519 break;
1521 O << "extract-lane";
1522 break;
1524 O << "extract-last-lane";
1525 break;
1527 O << "extract-last-part";
1528 break;
1530 O << "extract-penultimate-element";
1531 break;
1533 O << "compute-anyof-result";
1534 break;
1536 O << "compute-reduction-result";
1537 break;
1539 O << "logical-and";
1540 break;
1542 O << "logical-or";
1543 break;
1545 O << "ptradd";
1546 break;
1548 O << "wide-ptradd";
1549 break;
1551 O << "any-of";
1552 break;
1554 O << "first-active-lane";
1555 break;
1557 O << "last-active-lane";
1558 break;
1560 O << "reduction-start-vector";
1561 break;
1563 O << "resume-for-epilogue";
1564 break;
1566 O << "reverse";
1567 break;
1569 O << "unpack";
1570 break;
1572 O << "extract-last-active";
1573 break;
1574 default:
1576 }
1577
1578 printFlags(O);
1580}
1581#endif
1582
1584 State.setDebugLocFrom(getDebugLoc());
1585 if (isScalarCast()) {
1586 Value *Op = State.get(getOperand(0), VPLane(0));
1587 Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
1588 Op, ResultTy);
1589 State.set(this, Cast, VPLane(0));
1590 return;
1591 }
1592 switch (getOpcode()) {
1594 Value *StepVector =
1595 State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF));
1596 State.set(this, StepVector);
1597 break;
1598 }
1599 case VPInstruction::VScale: {
1600 Value *VScale = State.Builder.CreateVScale(ResultTy);
1601 State.set(this, VScale, true);
1602 break;
1603 }
1604
1605 default:
1606 llvm_unreachable("opcode not implemented yet");
1607 }
1608}
1609
1610#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1612 VPSlotTracker &SlotTracker) const {
1613 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1615 O << " = ";
1616
1617 switch (getOpcode()) {
1619 O << "wide-iv-step ";
1621 break;
1623 O << "step-vector " << *ResultTy;
1624 break;
1626 O << "vscale " << *ResultTy;
1627 break;
1628 case Instruction::Load:
1629 O << "load ";
1631 break;
1632 default:
1633 assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
1636 O << " to " << *ResultTy;
1637 }
1638}
1639#endif
1640
1642 State.setDebugLocFrom(getDebugLoc());
1643 PHINode *NewPhi = State.Builder.CreatePHI(
1644 State.TypeAnalysis.inferScalarType(this), 2, getName());
1645 unsigned NumIncoming = getNumIncoming();
1646 // Detect header phis: the parent block dominates its second incoming block
1647 // (the latch). Those IR incoming values have not been generated yet and need
1648 // to be added after they have been executed.
1649 if (NumIncoming == 2 &&
1650 State.VPDT.dominates(getParent(), getIncomingBlock(1))) {
1651 NumIncoming = 1;
1652 }
1653 for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
1654 Value *IncV = State.get(getIncomingValue(Idx), VPLane(0));
1655 BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(getIncomingBlock(Idx));
1656 NewPhi->addIncoming(IncV, PredBB);
1657 }
1658 State.set(this, NewPhi, VPLane(0));
1659}
1660
1661#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1662void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent,
1663 VPSlotTracker &SlotTracker) const {
1664 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1666 O << " = phi";
1667 printFlags(O);
1669}
1670#endif
1671
1672VPIRInstruction *VPIRInstruction ::create(Instruction &I) {
1673 if (auto *Phi = dyn_cast<PHINode>(&I))
1674 return new VPIRPhi(*Phi);
1675 return new VPIRInstruction(I);
1676}
1677
1679 assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
1680 "PHINodes must be handled by VPIRPhi");
1681 // Advance the insert point after the wrapped IR instruction. This allows
1682 // interleaving VPIRInstructions and other recipes.
1683 State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
1684}
1685
1687 VPCostContext &Ctx) const {
1688 // The recipe wraps an existing IR instruction on the border of VPlan's scope,
1689 // hence it does not contribute to the cost-modeling for the VPlan.
1690 return 0;
1691}
1692
1693#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1695 VPSlotTracker &SlotTracker) const {
1696 O << Indent << "IR " << I;
1697}
1698#endif
1699
1701 PHINode *Phi = &getIRPhi();
1702 for (const auto &[Idx, Op] : enumerate(operands())) {
1703 VPValue *ExitValue = Op;
1704 auto Lane = vputils::isSingleScalar(ExitValue)
1706 : VPLane::getLastLaneForVF(State.VF);
1707 VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
1708 auto *PredVPBB = Pred->getExitingBasicBlock();
1709 BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
1710 // Set insertion point in PredBB in case an extract needs to be generated.
1711 // TODO: Model extracts explicitly.
1712 State.Builder.SetInsertPoint(PredBB->getTerminator());
1713 Value *V = State.get(ExitValue, VPLane(Lane));
1714 // If there is no existing block for PredBB in the phi, add a new incoming
1715 // value. Otherwise update the existing incoming value for PredBB.
1716 if (Phi->getBasicBlockIndex(PredBB) == -1)
1717 Phi->addIncoming(V, PredBB);
1718 else
1719 Phi->setIncomingValueForBlock(PredBB, V);
1720 }
1721
1722 // Advance the insert point after the wrapped IR instruction. This allows
1723 // interleaving VPIRInstructions and other recipes.
1724 State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator()));
1725}
1726
1728 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1729 assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
1730 "Number of phi operands must match number of predecessors");
1731 unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock);
1732 R->removeOperand(Position);
1733}
1734
1735VPValue *
1737 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1738 return getIncomingValue(R->getParent()->getIndexForPredecessor(VPBB));
1739}
1740
1742 VPValue *V) const {
1743 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1744 R->setOperand(R->getParent()->getIndexForPredecessor(VPBB), V);
1745}
1746
1747#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1749 VPSlotTracker &SlotTracker) const {
1750 interleaveComma(enumerate(getAsRecipe()->operands()), O,
1751 [this, &O, &SlotTracker](auto Op) {
1752 O << "[ ";
1753 Op.value()->printAsOperand(O, SlotTracker);
1754 O << ", ";
1755 getIncomingBlock(Op.index())->printAsOperand(O);
1756 O << " ]";
1757 });
1758}
1759#endif
1760
1761#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1763 VPSlotTracker &SlotTracker) const {
1765
1766 if (getNumOperands() != 0) {
1767 O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
1769 [&O, &SlotTracker](auto Op) {
1770 std::get<0>(Op)->printAsOperand(O, SlotTracker);
1771 O << " from ";
1772 std::get<1>(Op)->printAsOperand(O);
1773 });
1774 O << ")";
1775 }
1776}
1777#endif
1778
1780 for (const auto &[Kind, Node] : Metadata)
1781 I.setMetadata(Kind, Node);
1782}
1783
1785 SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection;
1786 for (const auto &[KindA, MDA] : Metadata) {
1787 for (const auto &[KindB, MDB] : Other.Metadata) {
1788 if (KindA == KindB && MDA == MDB) {
1789 MetadataIntersection.emplace_back(KindA, MDA);
1790 break;
1791 }
1792 }
1793 }
1794 Metadata = std::move(MetadataIntersection);
1795}
1796
1797#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1799 const Module *M = SlotTracker.getModule();
1800 if (Metadata.empty() || !M)
1801 return;
1802
1803 ArrayRef<StringRef> MDNames = SlotTracker.getMDNames();
1804 O << " (";
1805 interleaveComma(Metadata, O, [&](const auto &KindNodePair) {
1806 auto [Kind, Node] = KindNodePair;
1807 assert(Kind < MDNames.size() && !MDNames[Kind].empty() &&
1808 "Unexpected unnamed metadata kind");
1809 O << "!" << MDNames[Kind] << " ";
1810 Node->printAsOperand(O, M);
1811 });
1812 O << ")";
1813}
1814#endif
1815
1817 assert(State.VF.isVector() && "not widening");
1818 assert(Variant != nullptr && "Can't create vector function.");
1819
1820 FunctionType *VFTy = Variant->getFunctionType();
1821 // Add return type if intrinsic is overloaded on it.
1823 for (const auto &I : enumerate(args())) {
1824 Value *Arg;
1825 // Some vectorized function variants may also take a scalar argument,
1826 // e.g. linear parameters for pointers. This needs to be the scalar value
1827 // from the start of the respective part when interleaving.
1828 if (!VFTy->getParamType(I.index())->isVectorTy())
1829 Arg = State.get(I.value(), VPLane(0));
1830 else
1831 Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
1832 Args.push_back(Arg);
1833 }
1834
1837 if (CI)
1838 CI->getOperandBundlesAsDefs(OpBundles);
1839
1840 CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
1841 applyFlags(*V);
1842 applyMetadata(*V);
1843 V->setCallingConv(Variant->getCallingConv());
1844
1845 if (!V->getType()->isVoidTy())
1846 State.set(this, V);
1847}
1848
1850 VPCostContext &Ctx) const {
1851 return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
1852 Variant->getFunctionType()->params(),
1853 Ctx.CostKind);
1854}
1855
1856#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1858 VPSlotTracker &SlotTracker) const {
1859 O << Indent << "WIDEN-CALL ";
1860
1861 Function *CalledFn = getCalledScalarFunction();
1862 if (CalledFn->getReturnType()->isVoidTy())
1863 O << "void ";
1864 else {
1866 O << " = ";
1867 }
1868
1869 O << "call";
1870 printFlags(O);
1871 O << " @" << CalledFn->getName() << "(";
1872 interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
1873 Op->printAsOperand(O, SlotTracker);
1874 });
1875 O << ")";
1876
1877 O << " (using library function";
1878 if (Variant->hasName())
1879 O << ": " << Variant->getName();
1880 O << ")";
1881}
1882#endif
1883
1885 assert(State.VF.isVector() && "not widening");
1886
1887 SmallVector<Type *, 2> TysForDecl;
1888 // Add return type if intrinsic is overloaded on it.
1889 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1,
1890 State.TTI)) {
1891 Type *RetTy = toVectorizedTy(getResultType(), State.VF);
1892 ArrayRef<Type *> ContainedTys = getContainedTypes(RetTy);
1893 for (auto [Idx, Ty] : enumerate(ContainedTys)) {
1895 Idx, State.TTI))
1896 TysForDecl.push_back(Ty);
1897 }
1898 }
1900 for (const auto &I : enumerate(operands())) {
1901 // Some intrinsics have a scalar argument - don't replace it with a
1902 // vector.
1903 Value *Arg;
1904 if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
1905 State.TTI))
1906 Arg = State.get(I.value(), VPLane(0));
1907 else
1908 Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
1909 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
1910 State.TTI))
1911 TysForDecl.push_back(Arg->getType());
1912 Args.push_back(Arg);
1913 }
1914
1915 // Use vector version of the intrinsic.
1916 Module *M = State.Builder.GetInsertBlock()->getModule();
1917 Function *VectorF =
1918 Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
1919 assert(VectorF &&
1920 "Can't retrieve vector intrinsic or vector-predication intrinsics.");
1921
1924 if (CI)
1925 CI->getOperandBundlesAsDefs(OpBundles);
1926
1927 CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
1928
1929 applyFlags(*V);
1930 applyMetadata(*V);
1931
1932 if (!V->getType()->isVoidTy())
1933 State.set(this, V);
1934}
1935
1936/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R.
1939 const VPRecipeWithIRFlags &R,
1940 ElementCount VF,
1941 VPCostContext &Ctx) {
1942 // Some backends analyze intrinsic arguments to determine cost. Use the
1943 // underlying value for the operand if it has one. Otherwise try to use the
1944 // operand of the underlying call instruction, if there is one. Otherwise
1945 // clear Arguments.
1946 // TODO: Rework TTI interface to be independent of concrete IR values.
1948 for (const auto &[Idx, Op] : enumerate(Operands)) {
1949 auto *V = Op->getUnderlyingValue();
1950 if (!V) {
1951 if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) {
1952 Arguments.push_back(UI->getArgOperand(Idx));
1953 continue;
1954 }
1955 Arguments.clear();
1956 break;
1957 }
1958 Arguments.push_back(V);
1959 }
1960
1961 Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
1962 Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
1963 SmallVector<Type *> ParamTys;
1964 for (const VPValue *Op : Operands) {
1965 ParamTys.push_back(VF.isVector()
1966 ? toVectorTy(Ctx.Types.inferScalarType(Op), VF)
1967 : Ctx.Types.inferScalarType(Op));
1968 }
1969
1970 // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
1971 IntrinsicCostAttributes CostAttrs(
1972 ID, RetTy, Arguments, ParamTys, R.getFastMathFlags(),
1973 dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()),
1975 return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
1976}
1977
1979 VPCostContext &Ctx) const {
1981 return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx);
1982}
1983
1985 return Intrinsic::getBaseName(VectorIntrinsicID);
1986}
1987
1989 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1990 return all_of(enumerate(operands()), [this, &Op](const auto &X) {
1991 auto [Idx, V] = X;
1993 Idx, nullptr);
1994 });
1995}
1996
1997#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1999 VPSlotTracker &SlotTracker) const {
2000 O << Indent << "WIDEN-INTRINSIC ";
2001 if (ResultTy->isVoidTy()) {
2002 O << "void ";
2003 } else {
2005 O << " = ";
2006 }
2007
2008 O << "call";
2009 printFlags(O);
2010 O << getIntrinsicName() << "(";
2011
2013 Op->printAsOperand(O, SlotTracker);
2014 });
2015 O << ")";
2016}
2017#endif
2018
2020 IRBuilderBase &Builder = State.Builder;
2021
2022 Value *Address = State.get(getOperand(0));
2023 Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
2024 VectorType *VTy = cast<VectorType>(Address->getType());
2025
2026 // The histogram intrinsic requires a mask even if the recipe doesn't;
2027 // if the mask operand was omitted then all lanes should be executed and
2028 // we just need to synthesize an all-true mask.
2029 Value *Mask = nullptr;
2030 if (VPValue *VPMask = getMask())
2031 Mask = State.get(VPMask);
2032 else
2033 Mask =
2034 Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));
2035
2036 // If this is a subtract, we want to invert the increment amount. We may
2037 // add a separate intrinsic in future, but for now we'll try this.
2038 if (Opcode == Instruction::Sub)
2039 IncAmt = Builder.CreateNeg(IncAmt);
2040 else
2041 assert(Opcode == Instruction::Add && "only add or sub supported for now");
2042
2043 State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
2044 {VTy, IncAmt->getType()},
2045 {Address, IncAmt, Mask});
2046}
2047
2049 VPCostContext &Ctx) const {
2050 // FIXME: Take the gather and scatter into account as well. For now we're
2051 // generating the same cost as the fallback path, but we'll likely
2052 // need to create a new TTI method for determining the cost, including
2053 // whether we can use base + vec-of-smaller-indices or just
2054 // vec-of-pointers.
2055 assert(VF.isVector() && "Invalid VF for histogram cost");
2056 Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
2057 VPValue *IncAmt = getOperand(1);
2058 Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
2059 VectorType *VTy = VectorType::get(IncTy, VF);
2060
2061 // Assume that a non-constant update value (or a constant != 1) requires
2062 // a multiply, and add that into the cost.
2063 InstructionCost MulCost =
2064 Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
2065 if (match(IncAmt, m_One()))
2066 MulCost = TTI::TCC_Free;
2067
2068 // Find the cost of the histogram operation itself.
2069 Type *PtrTy = VectorType::get(AddressTy, VF);
2070 Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
2071 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
2072 Type::getVoidTy(Ctx.LLVMCtx),
2073 {PtrTy, IncTy, MaskTy});
2074
2075 // Add the costs together with the add/sub operation.
2076 return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
2077 Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
2078}
2079
2080#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2082 VPSlotTracker &SlotTracker) const {
2083 O << Indent << "WIDEN-HISTOGRAM buckets: ";
2085
2086 if (Opcode == Instruction::Sub)
2087 O << ", dec: ";
2088 else {
2089 assert(Opcode == Instruction::Add);
2090 O << ", inc: ";
2091 }
2093
2094 if (VPValue *Mask = getMask()) {
2095 O << ", mask: ";
2096 Mask->printAsOperand(O, SlotTracker);
2097 }
2098}
2099#endif
2100
2101VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
2102 AllowReassoc = FMF.allowReassoc();
2103 NoNaNs = FMF.noNaNs();
2104 NoInfs = FMF.noInfs();
2105 NoSignedZeros = FMF.noSignedZeros();
2106 AllowReciprocal = FMF.allowReciprocal();
2107 AllowContract = FMF.allowContract();
2108 ApproxFunc = FMF.approxFunc();
2109}
2110
2112 switch (Opcode) {
2113 case Instruction::Add:
2114 case Instruction::Sub:
2115 case Instruction::Mul:
2116 case Instruction::Shl:
2118 return WrapFlagsTy(false, false);
2119 case Instruction::Trunc:
2120 return TruncFlagsTy(false, false);
2121 case Instruction::Or:
2122 return DisjointFlagsTy(false);
2123 case Instruction::AShr:
2124 case Instruction::LShr:
2125 case Instruction::UDiv:
2126 case Instruction::SDiv:
2127 return ExactFlagsTy(false);
2128 case Instruction::GetElementPtr:
2131 return GEPNoWrapFlags::none();
2132 case Instruction::ZExt:
2133 case Instruction::UIToFP:
2134 return NonNegFlagsTy(false);
2135 case Instruction::FAdd:
2136 case Instruction::FSub:
2137 case Instruction::FMul:
2138 case Instruction::FDiv:
2139 case Instruction::FRem:
2140 case Instruction::FNeg:
2141 case Instruction::FPExt:
2142 case Instruction::FPTrunc:
2143 return FastMathFlags();
2144 case Instruction::ICmp:
2145 case Instruction::FCmp:
2147 llvm_unreachable("opcode requires explicit flags");
2148 default:
2149 return VPIRFlags();
2150 }
2151}
2152
2153#if !defined(NDEBUG)
2154bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
2155 switch (OpType) {
2156 case OperationType::OverflowingBinOp:
2157 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
2158 Opcode == Instruction::Mul || Opcode == Instruction::Shl ||
2159 Opcode == VPInstruction::VPInstruction::CanonicalIVIncrementForPart;
2160 case OperationType::Trunc:
2161 return Opcode == Instruction::Trunc;
2162 case OperationType::DisjointOp:
2163 return Opcode == Instruction::Or;
2164 case OperationType::PossiblyExactOp:
2165 return Opcode == Instruction::AShr || Opcode == Instruction::LShr ||
2166 Opcode == Instruction::UDiv || Opcode == Instruction::SDiv;
2167 case OperationType::GEPOp:
2168 return Opcode == Instruction::GetElementPtr ||
2169 Opcode == VPInstruction::PtrAdd ||
2170 Opcode == VPInstruction::WidePtrAdd;
2171 case OperationType::FPMathOp:
2172 return Opcode == Instruction::Call || Opcode == Instruction::FAdd ||
2173 Opcode == Instruction::FMul || Opcode == Instruction::FSub ||
2174 Opcode == Instruction::FNeg || Opcode == Instruction::FDiv ||
2175 Opcode == Instruction::FRem || Opcode == Instruction::FPExt ||
2176 Opcode == Instruction::FPTrunc || Opcode == Instruction::PHI ||
2177 Opcode == Instruction::Select ||
2178 Opcode == VPInstruction::WideIVStep ||
2180 case OperationType::FCmp:
2181 return Opcode == Instruction::FCmp;
2182 case OperationType::NonNegOp:
2183 return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP;
2184 case OperationType::Cmp:
2185 return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
2186 case OperationType::ReductionOp:
2188 case OperationType::Other:
2189 return true;
2190 }
2191 llvm_unreachable("Unknown OperationType enum");
2192}
2193
2194bool VPIRFlags::hasRequiredFlagsForOpcode(unsigned Opcode) const {
2195 // Handle opcodes without default flags.
2196 if (Opcode == Instruction::ICmp)
2197 return OpType == OperationType::Cmp;
2198 if (Opcode == Instruction::FCmp)
2199 return OpType == OperationType::FCmp;
2201 return OpType == OperationType::ReductionOp;
2202
2203 OperationType Required = getDefaultFlags(Opcode).OpType;
2204 return Required == OperationType::Other || Required == OpType;
2205}
2206#endif
2207
2208#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2210 switch (OpType) {
2211 case OperationType::Cmp:
2213 break;
2214 case OperationType::FCmp:
2217 break;
2218 case OperationType::DisjointOp:
2219 if (DisjointFlags.IsDisjoint)
2220 O << " disjoint";
2221 break;
2222 case OperationType::PossiblyExactOp:
2223 if (ExactFlags.IsExact)
2224 O << " exact";
2225 break;
2226 case OperationType::OverflowingBinOp:
2227 if (WrapFlags.HasNUW)
2228 O << " nuw";
2229 if (WrapFlags.HasNSW)
2230 O << " nsw";
2231 break;
2232 case OperationType::Trunc:
2233 if (TruncFlags.HasNUW)
2234 O << " nuw";
2235 if (TruncFlags.HasNSW)
2236 O << " nsw";
2237 break;
2238 case OperationType::FPMathOp:
2240 break;
2241 case OperationType::GEPOp: {
2243 if (Flags.isInBounds())
2244 O << " inbounds";
2245 else if (Flags.hasNoUnsignedSignedWrap())
2246 O << " nusw";
2247 if (Flags.hasNoUnsignedWrap())
2248 O << " nuw";
2249 break;
2250 }
2251 case OperationType::NonNegOp:
2252 if (NonNegFlags.NonNeg)
2253 O << " nneg";
2254 break;
2255 case OperationType::ReductionOp: {
2256 RecurKind RK = getRecurKind();
2257 O << " (";
2258 switch (RK) {
2259 case RecurKind::AnyOf:
2260 O << "any-of";
2261 break;
2263 O << "find-last";
2264 break;
2265 case RecurKind::SMax:
2266 O << "smax";
2267 break;
2268 case RecurKind::SMin:
2269 O << "smin";
2270 break;
2271 case RecurKind::UMax:
2272 O << "umax";
2273 break;
2274 case RecurKind::UMin:
2275 O << "umin";
2276 break;
2277 case RecurKind::FMinNum:
2278 O << "fminnum";
2279 break;
2280 case RecurKind::FMaxNum:
2281 O << "fmaxnum";
2282 break;
2284 O << "fminimum";
2285 break;
2287 O << "fmaximum";
2288 break;
2290 O << "fminimumnum";
2291 break;
2293 O << "fmaximumnum";
2294 break;
2295 default:
2297 break;
2298 }
2299 if (isReductionInLoop())
2300 O << ", in-loop";
2301 if (isReductionOrdered())
2302 O << ", ordered";
2303 O << ")";
2305 break;
2306 }
2307 case OperationType::Other:
2308 break;
2309 }
2310 O << " ";
2311}
2312#endif
2313
2315 auto &Builder = State.Builder;
2316 switch (Opcode) {
2317 case Instruction::Call:
2318 case Instruction::UncondBr:
2319 case Instruction::CondBr:
2320 case Instruction::PHI:
2321 case Instruction::GetElementPtr:
2322 llvm_unreachable("This instruction is handled by a different recipe.");
2323 case Instruction::UDiv:
2324 case Instruction::SDiv:
2325 case Instruction::SRem:
2326 case Instruction::URem:
2327 case Instruction::Add:
2328 case Instruction::FAdd:
2329 case Instruction::Sub:
2330 case Instruction::FSub:
2331 case Instruction::FNeg:
2332 case Instruction::Mul:
2333 case Instruction::FMul:
2334 case Instruction::FDiv:
2335 case Instruction::FRem:
2336 case Instruction::Shl:
2337 case Instruction::LShr:
2338 case Instruction::AShr:
2339 case Instruction::And:
2340 case Instruction::Or:
2341 case Instruction::Xor: {
2342 // Just widen unops and binops.
2344 for (VPValue *VPOp : operands())
2345 Ops.push_back(State.get(VPOp));
2346
2347 Value *V = Builder.CreateNAryOp(Opcode, Ops);
2348
2349 if (auto *VecOp = dyn_cast<Instruction>(V)) {
2350 applyFlags(*VecOp);
2351 applyMetadata(*VecOp);
2352 }
2353
2354 // Use this vector value for all users of the original instruction.
2355 State.set(this, V);
2356 break;
2357 }
2358 case Instruction::ExtractValue: {
2359 assert(getNumOperands() == 2 && "expected single level extractvalue");
2360 Value *Op = State.get(getOperand(0));
2361 Value *Extract = Builder.CreateExtractValue(
2362 Op, cast<VPConstantInt>(getOperand(1))->getZExtValue());
2363 State.set(this, Extract);
2364 break;
2365 }
2366 case Instruction::Freeze: {
2367 Value *Op = State.get(getOperand(0));
2368 Value *Freeze = Builder.CreateFreeze(Op);
2369 State.set(this, Freeze);
2370 break;
2371 }
2372 case Instruction::ICmp:
2373 case Instruction::FCmp: {
2374 // Widen compares. Generate vector compares.
2375 bool FCmp = Opcode == Instruction::FCmp;
2376 Value *A = State.get(getOperand(0));
2377 Value *B = State.get(getOperand(1));
2378 Value *C = nullptr;
2379 if (FCmp) {
2380 C = Builder.CreateFCmp(getPredicate(), A, B);
2381 } else {
2382 C = Builder.CreateICmp(getPredicate(), A, B);
2383 }
2384 if (auto *I = dyn_cast<Instruction>(C)) {
2385 applyFlags(*I);
2386 applyMetadata(*I);
2387 }
2388 State.set(this, C);
2389 break;
2390 }
2391 case Instruction::Select: {
2392 VPValue *CondOp = getOperand(0);
2393 Value *Cond = State.get(CondOp, vputils::isSingleScalar(CondOp));
2394 Value *Op0 = State.get(getOperand(1));
2395 Value *Op1 = State.get(getOperand(2));
2396 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
2397 State.set(this, Sel);
2398 if (auto *I = dyn_cast<Instruction>(Sel)) {
2400 applyFlags(*I);
2401 applyMetadata(*I);
2402 }
2403 break;
2404 }
2405 default:
2406 // This instruction is not vectorized by simple widening.
2407 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
2408 << Instruction::getOpcodeName(Opcode));
2409 llvm_unreachable("Unhandled instruction!");
2410 } // end of switch.
2411
2412#if !defined(NDEBUG)
2413 // Verify that VPlan type inference results agree with the type of the
2414 // generated values.
2415 assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
2416 State.get(this)->getType() &&
2417 "inferred type and type from generated instructions do not match");
2418#endif
2419}
2420
2422 VPCostContext &Ctx) const {
2423 switch (Opcode) {
2424 case Instruction::UDiv:
2425 case Instruction::SDiv:
2426 case Instruction::SRem:
2427 case Instruction::URem:
2428 // If the div/rem operation isn't safe to speculate and requires
2429 // predication, then the only way we can even create a vplan is to insert
2430 // a select on the second input operand to ensure we use the value of 1
2431 // for the inactive lanes. The select will be costed separately.
2432 case Instruction::FNeg:
2433 case Instruction::Add:
2434 case Instruction::FAdd:
2435 case Instruction::Sub:
2436 case Instruction::FSub:
2437 case Instruction::Mul:
2438 case Instruction::FMul:
2439 case Instruction::FDiv:
2440 case Instruction::FRem:
2441 case Instruction::Shl:
2442 case Instruction::LShr:
2443 case Instruction::AShr:
2444 case Instruction::And:
2445 case Instruction::Or:
2446 case Instruction::Xor:
2447 case Instruction::Freeze:
2448 case Instruction::ExtractValue:
2449 case Instruction::ICmp:
2450 case Instruction::FCmp:
2451 case Instruction::Select:
2452 return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
2453 default:
2454 llvm_unreachable("Unsupported opcode for instruction");
2455 }
2456}
2457
2458#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2460 VPSlotTracker &SlotTracker) const {
2461 O << Indent << "WIDEN ";
2463 O << " = " << Instruction::getOpcodeName(Opcode);
2464 printFlags(O);
2466}
2467#endif
2468
2470 auto &Builder = State.Builder;
2471 /// Vectorize casts.
2472 assert(State.VF.isVector() && "Not vectorizing?");
2473 Type *DestTy = VectorType::get(getResultType(), State.VF);
2474 VPValue *Op = getOperand(0);
2475 Value *A = State.get(Op);
2476 Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
2477 State.set(this, Cast);
2478 if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
2479 applyFlags(*CastOp);
2480 applyMetadata(*CastOp);
2481 }
2482}
2483
2485 VPCostContext &Ctx) const {
2486 // TODO: In some cases, VPWidenCastRecipes are created but not considered in
2487 // the legacy cost model, including truncates/extends when evaluating a
2488 // reduction in a smaller type.
2489 if (!getUnderlyingValue())
2490 return 0;
2491 return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
2492}
2493
2494#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2496 VPSlotTracker &SlotTracker) const {
2497 O << Indent << "WIDEN-CAST ";
2499 O << " = " << Instruction::getOpcodeName(Opcode);
2500 printFlags(O);
2502 O << " to " << *getResultType();
2503}
2504#endif
2505
2507 VPCostContext &Ctx) const {
2508 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
2509}
2510
2511#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2513 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
2514 O << Indent;
2516 O << " = WIDEN-INDUCTION";
2517 printFlags(O);
2519
2520 if (auto *TI = getTruncInst())
2521 O << " (truncated to " << *TI->getType() << ")";
2522}
2523#endif
2524
2526 // The step may be defined by a recipe in the preheader (e.g. if it requires
2527 // SCEV expansion), but for the canonical induction the step is required to be
2528 // 1, which is represented as live-in.
2529 return match(getStartValue(), m_ZeroInt()) &&
2530 match(getStepValue(), m_One()) &&
2531 getScalarType() == getRegion()->getCanonicalIVType();
2532}
2533
2535 assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
2536
2537 // Fast-math-flags propagate from the original induction instruction.
2538 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
2539 if (FPBinOp)
2540 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
2541
2542 Value *Step = State.get(getStepValue(), VPLane(0));
2543 Value *Index = State.get(getOperand(1), VPLane(0));
2544 Value *DerivedIV = emitTransformedIndex(
2545 State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
2547 DerivedIV->setName(Name);
2548 State.set(this, DerivedIV, VPLane(0));
2549}
2550
2551#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2553 VPSlotTracker &SlotTracker) const {
2554 O << Indent;
2556 O << " = DERIVED-IV ";
2557 getStartValue()->printAsOperand(O, SlotTracker);
2558 O << " + ";
2559 getOperand(1)->printAsOperand(O, SlotTracker);
2560 O << " * ";
2561 getStepValue()->printAsOperand(O, SlotTracker);
2562}
2563#endif
2564
2566 // Fast-math-flags propagate from the original induction instruction.
2567 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
2568 State.Builder.setFastMathFlags(getFastMathFlags());
2569
2570 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
2571 /// variable on which to base the steps, \p Step is the size of the step.
2572
2573 Value *BaseIV = State.get(getOperand(0), VPLane(0));
2574 Value *Step = State.get(getStepValue(), VPLane(0));
2575 IRBuilderBase &Builder = State.Builder;
2576
2577 // Ensure step has the same type as that of scalar IV.
2578 Type *BaseIVTy = BaseIV->getType()->getScalarType();
2579 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
2580
2581 // We build scalar steps for both integer and floating-point induction
2582 // variables. Here, we determine the kind of arithmetic we will perform.
2585 if (BaseIVTy->isIntegerTy()) {
2586 AddOp = Instruction::Add;
2587 MulOp = Instruction::Mul;
2588 } else {
2589 AddOp = InductionOpcode;
2590 MulOp = Instruction::FMul;
2591 }
2592
2593 // Determine the number of scalars we need to generate.
2594 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
2595 // Compute the scalar steps and save the results in State.
2596
2597 unsigned StartLane = 0;
2598 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2599 if (State.Lane) {
2600 StartLane = State.Lane->getKnownLane();
2601 EndLane = StartLane + 1;
2602 }
2603 Value *StartIdx0 = getStartIndex() ? State.get(getStartIndex(), true)
2604 : Constant::getNullValue(BaseIVTy);
2605
2606 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
2607 // It is okay if the induction variable type cannot hold the lane number,
2608 // we expect truncation in this case.
2609 Constant *LaneValue =
2610 BaseIVTy->isIntegerTy()
2611 ? ConstantInt::get(BaseIVTy, Lane, /*IsSigned=*/false,
2612 /*ImplicitTrunc=*/true)
2613 : ConstantFP::get(BaseIVTy, Lane);
2614 Value *StartIdx = Builder.CreateBinOp(AddOp, StartIdx0, LaneValue);
2615 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2616 "Expected StartIdx to be folded to a constant when VF is not "
2617 "scalable");
2618 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2619 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
2620 State.set(this, Add, VPLane(Lane));
2621 }
2622}
2623
2624#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2626 VPSlotTracker &SlotTracker) const {
2627 O << Indent;
2629 O << " = SCALAR-STEPS ";
2631}
2632#endif
2633
2635 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
2637}
2638
2640 assert(State.VF.isVector() && "not widening");
2641 // Construct a vector GEP by widening the operands of the scalar GEP as
2642 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
2643 // results in a vector of pointers when at least one operand of the GEP
2644 // is vector-typed. Thus, to keep the representation compact, we only use
2645 // vector-typed operands for loop-varying values.
2646
2647 bool AllOperandsAreInvariant = all_of(operands(), [](VPValue *Op) {
2648 return Op->isDefinedOutsideLoopRegions();
2649 });
2650 if (AllOperandsAreInvariant) {
2651 // If we are vectorizing, but the GEP has only loop-invariant operands,
2652 // the GEP we build (by only using vector-typed operands for
2653 // loop-varying values) would be a scalar pointer. Thus, to ensure we
2654 // produce a vector of pointers, we need to either arbitrarily pick an
2655 // operand to broadcast, or broadcast a clone of the original GEP.
2656 // Here, we broadcast a clone of the original.
2657
2659 for (unsigned I = 0, E = getNumOperands(); I != E; I++)
2660 Ops.push_back(State.get(getOperand(I), VPLane(0)));
2661
2662 auto *NewGEP =
2663 State.Builder.CreateGEP(getSourceElementType(), Ops[0], drop_begin(Ops),
2664 "", getGEPNoWrapFlags());
2665 Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
2666 State.set(this, Splat);
2667 return;
2668 }
2669
2670 // If the GEP has at least one loop-varying operand, we are sure to
2671 // produce a vector of pointers unless VF is scalar.
2672 // The pointer operand of the new GEP. If it's loop-invariant, we
2673 // won't broadcast it.
2674 auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant());
2675
2676 // Collect all the indices for the new GEP. If any index is
2677 // loop-invariant, we won't broadcast it.
2679 for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
2680 VPValue *Operand = getOperand(I);
2681 Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1)));
2682 }
2683
2684 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
2685 // but it should be a vector, otherwise.
2686 auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices,
2687 "", getGEPNoWrapFlags());
2688 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
2689 "NewGEP is not a pointer vector");
2690 State.set(this, NewGEP);
2691}
2692
2693#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2695 VPSlotTracker &SlotTracker) const {
2696 O << Indent << "WIDEN-GEP ";
2697 O << (isPointerLoopInvariant() ? "Inv" : "Var");
2698 for (size_t I = 0; I < getNumOperands() - 1; ++I)
2699 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
2700
2701 O << " ";
2703 O << " = getelementptr";
2704 printFlags(O);
2706}
2707#endif
2708
2710 assert(!getOffset() && "Unexpected offset operand");
2711 VPBuilder Builder(this);
2712 VPlan &Plan = *getParent()->getPlan();
2713 VPValue *VFVal = getVFValue();
2714 VPTypeAnalysis TypeInfo(Plan);
2715 const DataLayout &DL = Plan.getDataLayout();
2716 Type *IndexTy = DL.getIndexType(TypeInfo.inferScalarType(this));
2717 VPValue *Stride =
2718 Plan.getConstantInt(IndexTy, getStride(), /*IsSigned=*/true);
2719 Type *VFTy = TypeInfo.inferScalarType(VFVal);
2720 VPValue *VF = Builder.createScalarZExtOrTrunc(VFVal, IndexTy, VFTy,
2722
2723 // Offset for Part0 = Offset0 = Stride * (VF - 1).
2724 VPInstruction *VFMinusOne =
2725 Builder.createSub(VF, Plan.getConstantInt(IndexTy, 1u),
2726 DebugLoc::getUnknown(), "", {true, true});
2727 VPInstruction *Offset0 =
2728 Builder.createOverflowingOp(Instruction::Mul, {VFMinusOne, Stride});
2729
2730 // Offset for PartN = Offset0 + Part * Stride * VF.
2731 VPValue *PartxStride =
2732 Plan.getConstantInt(IndexTy, Part * getStride(), /*IsSigned=*/true);
2733 VPValue *Offset = Builder.createAdd(
2734 Offset0,
2735 Builder.createOverflowingOp(Instruction::Mul, {PartxStride, VF}));
2737}
2738
2740 auto &Builder = State.Builder;
2741 assert(getOffset() && "Expected prior materialization of offset");
2742 Value *Ptr = State.get(getPointer(), true);
2743 Value *Offset = State.get(getOffset(), true);
2744 Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Offset, "",
2746 State.set(this, ResultPtr, /*IsScalar*/ true);
2747}
2748
2749#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2751 VPSlotTracker &SlotTracker) const {
2752 O << Indent;
2754 O << " = vector-end-pointer";
2755 printFlags(O);
2757}
2758#endif
2759
2761 auto &Builder = State.Builder;
2762 assert(getOffset() &&
2763 "Expected prior simplification of recipe without offset");
2764 Value *Ptr = State.get(getOperand(0), VPLane(0));
2765 Value *Offset = State.get(getOffset(), true);
2766 Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Offset, "",
2768 State.set(this, ResultPtr, /*IsScalar*/ true);
2769}
2770
2771#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2773 VPSlotTracker &SlotTracker) const {
2774 O << Indent;
2776 O << " = vector-pointer";
2777 printFlags(O);
2779}
2780#endif
2781
2783 VPCostContext &Ctx) const {
2784 // A blend will be expanded to a select VPInstruction, which will generate a
2785 // scalar select if only the first lane is used.
2787 VF = ElementCount::getFixed(1);
2788
2789 Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
2790 Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
2791 return (getNumIncomingValues() - 1) *
2792 Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
2793 CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
2794}
2795
2796#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2798 VPSlotTracker &SlotTracker) const {
2799 O << Indent << "BLEND ";
2801 O << " =";
2802 printFlags(O);
2803 if (getNumIncomingValues() == 1) {
2804 // Not a User of any mask: not really blending, this is a
2805 // single-predecessor phi.
2806 getIncomingValue(0)->printAsOperand(O, SlotTracker);
2807 } else {
2808 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
2809 if (I != 0)
2810 O << " ";
2811 getIncomingValue(I)->printAsOperand(O, SlotTracker);
2812 if (I == 0 && isNormalized())
2813 continue;
2814 O << "/";
2815 getMask(I)->printAsOperand(O, SlotTracker);
2816 }
2817 }
2818}
2819#endif
2820
2822 assert(!State.Lane && "Reduction being replicated.");
2825 "In-loop AnyOf reductions aren't currently supported");
2826 // Propagate the fast-math flags carried by the underlying instruction.
2827 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
2828 State.Builder.setFastMathFlags(getFastMathFlags());
2829 Value *NewVecOp = State.get(getVecOp());
2830 if (VPValue *Cond = getCondOp()) {
2831 Value *NewCond = State.get(Cond, State.VF.isScalar());
2832 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
2833 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
2834
2835 Value *Start = getRecurrenceIdentity(Kind, ElementTy, getFastMathFlags());
2836 if (State.VF.isVector())
2837 Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
2838
2839 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
2840 NewVecOp = Select;
2841 }
2842 Value *NewRed;
2843 Value *NextInChain;
2844 if (isOrdered()) {
2845 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2846 if (State.VF.isVector())
2847 NewRed =
2848 createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain);
2849 else
2850 NewRed = State.Builder.CreateBinOp(
2852 PrevInChain, NewVecOp);
2853 PrevInChain = NewRed;
2854 NextInChain = NewRed;
2855 } else if (isPartialReduction()) {
2856 assert((Kind == RecurKind::Add || Kind == RecurKind::FAdd) &&
2857 "Unexpected partial reduction kind");
2858 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ false);
2859 NewRed = State.Builder.CreateIntrinsic(
2860 PrevInChain->getType(),
2861 Kind == RecurKind::Add ? Intrinsic::vector_partial_reduce_add
2862 : Intrinsic::vector_partial_reduce_fadd,
2863 {PrevInChain, NewVecOp}, State.Builder.getFastMathFlags(),
2864 "partial.reduce");
2865 PrevInChain = NewRed;
2866 NextInChain = NewRed;
2867 } else {
2868 assert(isInLoop() &&
2869 "The reduction must either be ordered, partial or in-loop");
2870 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2871 NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind);
2873 NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain);
2874 else
2875 NextInChain = State.Builder.CreateBinOp(
2877 PrevInChain, NewRed);
2878 }
2879 State.set(this, NextInChain, /*IsScalar*/ !isPartialReduction());
2880}
2881
2883 assert(!State.Lane && "Reduction being replicated.");
2884
2885 auto &Builder = State.Builder;
2886 // Propagate the fast-math flags carried by the underlying instruction.
2887 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
2888 Builder.setFastMathFlags(getFastMathFlags());
2889
2891 Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
2892 Value *VecOp = State.get(getVecOp());
2893 Value *EVL = State.get(getEVL(), VPLane(0));
2894
2895 Value *Mask;
2896 if (VPValue *CondOp = getCondOp())
2897 Mask = State.get(CondOp);
2898 else
2899 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2900
2901 Value *NewRed;
2902 if (isOrdered()) {
2903 NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL);
2904 } else {
2905 NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL);
2907 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
2908 else
2909 NewRed = Builder.CreateBinOp(
2911 Prev);
2912 }
2913 State.set(this, NewRed, /*IsScalar*/ true);
2914}
2915
2917 VPCostContext &Ctx) const {
2918 RecurKind RdxKind = getRecurrenceKind();
2919 Type *ElementTy = Ctx.Types.inferScalarType(this);
2920 auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
2921 unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
2923 std::optional<FastMathFlags> OptionalFMF =
2924 ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
2925
2926 if (isPartialReduction()) {
2927 InstructionCost CondCost = 0;
2928 if (isConditional()) {
2930 auto *CondTy = cast<VectorType>(
2931 toVectorTy(Ctx.Types.inferScalarType(getCondOp()), VF));
2932 CondCost = Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VectorTy,
2933 CondTy, Pred, Ctx.CostKind);
2934 }
2935 return CondCost + Ctx.TTI.getPartialReductionCost(
2936 Opcode, ElementTy, ElementTy, ElementTy, VF,
2937 TTI::PR_None, TTI::PR_None, {}, Ctx.CostKind,
2938 OptionalFMF);
2939 }
2940
2941 // TODO: Support any-of reductions.
2942 assert(
2944 ForceTargetInstructionCost.getNumOccurrences() > 0) &&
2945 "Any-of reduction not implemented in VPlan-based cost model currently.");
2946
2947 // Note that TTI should model the cost of moving result to the scalar register
2948 // and the BinOp cost in the getMinMaxReductionCost().
2951 return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
2952 }
2953
2954 // Note that TTI should model the cost of moving result to the scalar register
2955 // and the BinOp cost in the getArithmeticReductionCost().
2956 return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
2957 Ctx.CostKind);
2958}
2959
2960VPExpressionRecipe::VPExpressionRecipe(
2961 ExpressionTypes ExpressionType,
2962 ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
2963 : VPSingleDefRecipe(VPRecipeBase::VPExpressionSC, {}, {}),
2964 ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) {
2965 assert(!ExpressionRecipes.empty() && "Nothing to combine?");
2966 assert(
2967 none_of(ExpressionRecipes,
2968 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
2969 "expression cannot contain recipes with side-effects");
2970
2971 // Maintain a copy of the expression recipes as a set of users.
2972 SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
2973 for (auto *R : ExpressionRecipes)
2974 ExpressionRecipesAsSetOfUsers.insert(R);
2975
2976 // Recipes in the expression, except the last one, must only be used by
2977 // (other) recipes inside the expression. If there are other users, external
2978 // to the expression, use a clone of the recipe for external users.
2979 for (VPSingleDefRecipe *R : reverse(ExpressionRecipes)) {
2980 if (R != ExpressionRecipes.back() &&
2981 any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
2982 return !ExpressionRecipesAsSetOfUsers.contains(U);
2983 })) {
2984 // There are users outside of the expression. Clone the recipe and use the
2985 // clone those external users.
2986 VPSingleDefRecipe *CopyForExtUsers = R->clone();
2987 R->replaceUsesWithIf(CopyForExtUsers, [&ExpressionRecipesAsSetOfUsers](
2988 VPUser &U, unsigned) {
2989 return !ExpressionRecipesAsSetOfUsers.contains(&U);
2990 });
2991 CopyForExtUsers->insertBefore(R);
2992 }
2993 if (R->getParent())
2994 R->removeFromParent();
2995 }
2996
2997 // Internalize all external operands to the expression recipes. To do so,
2998 // create new temporary VPValues for all operands defined by a recipe outside
2999 // the expression. The original operands are added as operands of the
3000 // VPExpressionRecipe itself.
3001 for (auto *R : ExpressionRecipes) {
3002 for (const auto &[Idx, Op] : enumerate(R->operands())) {
3003 auto *Def = Op->getDefiningRecipe();
3004 if (Def && ExpressionRecipesAsSetOfUsers.contains(Def))
3005 continue;
3006 addOperand(Op);
3007 LiveInPlaceholders.push_back(new VPSymbolicValue());
3008 }
3009 }
3010
3011 // Replace each external operand with the first one created for it in
3012 // LiveInPlaceholders.
3013 for (auto *R : ExpressionRecipes)
3014 for (auto const &[LiveIn, Tmp] : zip(operands(), LiveInPlaceholders))
3015 R->replaceUsesOfWith(LiveIn, Tmp);
3016}
3017
3019 for (auto *R : ExpressionRecipes)
3020 // Since the list could contain duplicates, make sure the recipe hasn't
3021 // already been inserted.
3022 if (!R->getParent())
3023 R->insertBefore(this);
3024
3025 for (const auto &[Idx, Op] : enumerate(operands()))
3026 LiveInPlaceholders[Idx]->replaceAllUsesWith(Op);
3027
3028 replaceAllUsesWith(ExpressionRecipes.back());
3029 ExpressionRecipes.clear();
3030}
3031
3033 VPCostContext &Ctx) const {
3034 Type *RedTy = Ctx.Types.inferScalarType(this);
3035 auto *SrcVecTy = cast<VectorType>(
3036 toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
3037 unsigned Opcode = RecurrenceDescriptor::getOpcode(
3038 cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
3039 switch (ExpressionType) {
3040 case ExpressionTypes::ExtendedReduction: {
3041 unsigned Opcode = RecurrenceDescriptor::getOpcode(
3042 cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
3043 auto *ExtR = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3044 auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());
3045
3046 if (RedR->isPartialReduction())
3047 return Ctx.TTI.getPartialReductionCost(
3048 Opcode, Ctx.Types.inferScalarType(getOperand(0)), nullptr, RedTy, VF,
3050 TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind,
3051 RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
3052 : std::nullopt);
3053 else if (!RedTy->isFloatingPointTy())
3054 // TTI::getExtendedReductionCost only supports integer types.
3055 return Ctx.TTI.getExtendedReductionCost(
3056 Opcode, ExtR->getOpcode() == Instruction::ZExt, RedTy, SrcVecTy,
3057 std::nullopt, Ctx.CostKind);
3058 else
3060 }
3061 case ExpressionTypes::MulAccReduction:
3062 return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
3063 Ctx.CostKind);
3064
3065 case ExpressionTypes::ExtNegatedMulAccReduction:
3066 assert(Opcode == Instruction::Add && "Unexpected opcode");
3067 Opcode = Instruction::Sub;
3068 [[fallthrough]];
3069 case ExpressionTypes::ExtMulAccReduction: {
3070 auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());
3071 if (RedR->isPartialReduction()) {
3072 auto *Ext0R = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3073 auto *Ext1R = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3074 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
3075 return Ctx.TTI.getPartialReductionCost(
3076 Opcode, Ctx.Types.inferScalarType(getOperand(0)),
3077 Ctx.Types.inferScalarType(getOperand(1)), RedTy, VF,
3079 Ext0R->getOpcode()),
3081 Ext1R->getOpcode()),
3082 Mul->getOpcode(), Ctx.CostKind,
3083 RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
3084 : std::nullopt);
3085 }
3086 return Ctx.TTI.getMulAccReductionCost(
3087 cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
3088 Instruction::ZExt,
3089 Opcode, RedTy, SrcVecTy, Ctx.CostKind);
3090 }
3091 }
3092 llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
3093}
3094
3096 return any_of(ExpressionRecipes, [](VPSingleDefRecipe *R) {
3097 return R->mayReadFromMemory() || R->mayWriteToMemory();
3098 });
3099}
3100
3102 assert(
3103 none_of(ExpressionRecipes,
3104 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
3105 "expression cannot contain recipes with side-effects");
3106 return false;
3107}
3108
3110 // Cannot use vputils::isSingleScalar(), because all external operands
3111 // of the expression will be live-ins while bundled.
3112 auto *RR = dyn_cast<VPReductionRecipe>(ExpressionRecipes.back());
3113 return RR && !RR->isPartialReduction();
3114}
3115
3116#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3117
3119 VPSlotTracker &SlotTracker) const {
3120 O << Indent << "EXPRESSION ";
3122 O << " = ";
3123 auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
3124 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3125
3126 switch (ExpressionType) {
3127 case ExpressionTypes::ExtendedReduction: {
3129 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3130 O << Instruction::getOpcodeName(Opcode) << " (";
3132 Red->printFlags(O);
3133
3134 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3135 O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3136 << *Ext0->getResultType();
3137 if (Red->isConditional()) {
3138 O << ", ";
3139 Red->getCondOp()->printAsOperand(O, SlotTracker);
3140 }
3141 O << ")";
3142 break;
3143 }
3144 case ExpressionTypes::ExtNegatedMulAccReduction: {
3146 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3148 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
3149 << " (sub (0, mul";
3150 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
3151 Mul->printFlags(O);
3152 O << "(";
3154 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3155 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3156 << *Ext0->getResultType() << "), (";
3158 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3159 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3160 << *Ext1->getResultType() << ")";
3161 if (Red->isConditional()) {
3162 O << ", ";
3163 Red->getCondOp()->printAsOperand(O, SlotTracker);
3164 }
3165 O << "))";
3166 break;
3167 }
3168 case ExpressionTypes::MulAccReduction:
3169 case ExpressionTypes::ExtMulAccReduction: {
3171 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3173 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
3174 << " (";
3175 O << "mul";
3176 bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
3177 auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
3178 : ExpressionRecipes[0]);
3179 Mul->printFlags(O);
3180 if (IsExtended)
3181 O << "(";
3183 if (IsExtended) {
3184 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3185 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3186 << *Ext0->getResultType() << "), (";
3187 } else {
3188 O << ", ";
3189 }
3191 if (IsExtended) {
3192 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3193 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3194 << *Ext1->getResultType() << ")";
3195 }
3196 if (Red->isConditional()) {
3197 O << ", ";
3198 Red->getCondOp()->printAsOperand(O, SlotTracker);
3199 }
3200 O << ")";
3201 break;
3202 }
3203 }
3204}
3205
3207 VPSlotTracker &SlotTracker) const {
3208 if (isPartialReduction())
3209 O << Indent << "PARTIAL-REDUCE ";
3210 else
3211 O << Indent << "REDUCE ";
3213 O << " = ";
3215 O << " +";
3216 printFlags(O);
3217 O << " reduce."
3220 << " (";
3222 if (isConditional()) {
3223 O << ", ";
3225 }
3226 O << ")";
3227}
3228
3230 VPSlotTracker &SlotTracker) const {
3231 O << Indent << "REDUCE ";
3233 O << " = ";
3235 O << " +";
3236 printFlags(O);
3237 O << " vp.reduce."
3240 << " (";
3242 O << ", ";
3244 if (isConditional()) {
3245 O << ", ";
3247 }
3248 O << ")";
3249}
3250
3251#endif
3252
3253/// A helper function to scalarize a single Instruction in the innermost loop.
3254/// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue
3255/// operands from \p RepRecipe instead of \p Instr's operands.
3256static void scalarizeInstruction(const Instruction *Instr,
3257 VPReplicateRecipe *RepRecipe,
3258 const VPLane &Lane, VPTransformState &State) {
3259 assert((!Instr->getType()->isAggregateType() ||
3260 canVectorizeTy(Instr->getType())) &&
3261 "Expected vectorizable or non-aggregate type.");
3262
3263 // Does this instruction return a value ?
3264 bool IsVoidRetTy = Instr->getType()->isVoidTy();
3265
3266 Instruction *Cloned = Instr->clone();
3267 if (!IsVoidRetTy) {
3268 Cloned->setName(Instr->getName() + ".cloned");
3269 Type *ResultTy = State.TypeAnalysis.inferScalarType(RepRecipe);
3270 // The operands of the replicate recipe may have been narrowed, resulting in
3271 // a narrower result type. Update the type of the cloned instruction to the
3272 // correct type.
3273 if (ResultTy != Cloned->getType())
3274 Cloned->mutateType(ResultTy);
3275 }
3276
3277 RepRecipe->applyFlags(*Cloned);
3278 RepRecipe->applyMetadata(*Cloned);
3279
3280 if (RepRecipe->hasPredicate())
3281 cast<CmpInst>(Cloned)->setPredicate(RepRecipe->getPredicate());
3282
3283 if (auto DL = RepRecipe->getDebugLoc())
3284 State.setDebugLocFrom(DL);
3285
3286 // Replace the operands of the cloned instructions with their scalar
3287 // equivalents in the new loop.
3288 for (const auto &I : enumerate(RepRecipe->operands())) {
3289 auto InputLane = Lane;
3290 VPValue *Operand = I.value();
3291 if (vputils::isSingleScalar(Operand))
3292 InputLane = VPLane::getFirstLane();
3293 Cloned->setOperand(I.index(), State.get(Operand, InputLane));
3294 }
3295
3296 // Place the cloned scalar in the new loop.
3297 State.Builder.Insert(Cloned);
3298
3299 State.set(RepRecipe, Cloned, Lane);
3300
3301 // If we just cloned a new assumption, add it the assumption cache.
3302 if (auto *II = dyn_cast<AssumeInst>(Cloned))
3303 State.AC->registerAssumption(II);
3304
3305 assert(
3306 (RepRecipe->getRegion() ||
3307 !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
3308 all_of(RepRecipe->operands(),
3309 [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
3310 "Expected a recipe is either within a region or all of its operands "
3311 "are defined outside the vectorized region.");
3312}
3313
3316
3317 if (!State.Lane) {
3318 assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
3319 "must have already been unrolled");
3320 scalarizeInstruction(UI, this, VPLane(0), State);
3321 return;
3322 }
3323
3324 assert((State.VF.isScalar() || !isSingleScalar()) &&
3325 "uniform recipe shouldn't be predicated");
3326 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
3327 scalarizeInstruction(UI, this, *State.Lane, State);
3328 // Insert scalar instance packing it into a vector.
3329 if (State.VF.isVector() && shouldPack()) {
3330 Value *WideValue =
3331 State.Lane->isFirstLane()
3332 ? PoisonValue::get(toVectorizedTy(UI->getType(), State.VF))
3333 : State.get(this);
3334 State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
3335 *State.Lane));
3336 }
3337}
3338
3340 // Find if the recipe is used by a widened recipe via an intervening
3341 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
3342 return any_of(users(), [](const VPUser *U) {
3343 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
3344 return !vputils::onlyScalarValuesUsed(PredR);
3345 return false;
3346 });
3347}
3348
3349/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
3350/// which the legacy cost model computes a SCEV expression when computing the
3351/// address cost. Computing SCEVs for VPValues is incomplete and returns
3352/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In
3353/// those cases we fall back to the legacy cost model. Otherwise return nullptr.
3354static const SCEV *getAddressAccessSCEV(const VPValue *Ptr,
3356 const Loop *L) {
3357 const SCEV *Addr = vputils::getSCEVExprForVPValue(Ptr, PSE, L);
3358 if (isa<SCEVCouldNotCompute>(Addr))
3359 return Addr;
3360
3361 return vputils::isAddressSCEVForCost(Addr, *PSE.getSE(), L) ? Addr : nullptr;
3362}
3363
3364/// Returns true if \p V is used as part of the address of another load or
3365/// store.
3366static bool isUsedByLoadStoreAddress(const VPUser *V) {
3368 SmallVector<const VPUser *> WorkList = {V};
3369
3370 while (!WorkList.empty()) {
3371 auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
3372 if (!Cur || !Seen.insert(Cur).second)
3373 continue;
3374
3375 auto *Blend = dyn_cast<VPBlendRecipe>(Cur);
3376 // Skip blends that use V only through a compare by checking if any incoming
3377 // value was already visited.
3378 if (Blend && none_of(seq<unsigned>(0, Blend->getNumIncomingValues()),
3379 [&](unsigned I) {
3380 return Seen.contains(
3381 Blend->getIncomingValue(I)->getDefiningRecipe());
3382 }))
3383 continue;
3384
3385 for (VPUser *U : Cur->users()) {
3386 if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
3387 if (InterleaveR->getAddr() == Cur)
3388 return true;
3389 if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
3390 if (RepR->getOpcode() == Instruction::Load &&
3391 RepR->getOperand(0) == Cur)
3392 return true;
3393 if (RepR->getOpcode() == Instruction::Store &&
3394 RepR->getOperand(1) == Cur)
3395 return true;
3396 }
3397 if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
3398 if (MemR->getAddr() == Cur && MemR->isConsecutive())
3399 return true;
3400 }
3401 }
3402
3403 // The legacy cost model only supports scalarization loads/stores with phi
3404 // addresses, if the phi is directly used as load/store address. Don't
3405 // traverse further for Blends.
3406 if (Blend)
3407 continue;
3408
3409 append_range(WorkList, Cur->users());
3410 }
3411 return false;
3412}
3413
3414/// Return true if \p R is a predicated load/store with a loop-invariant address
3415/// only masked by the header mask.
3417 const SCEV *PtrSCEV,
3418 VPCostContext &Ctx) {
3419 const VPRegionBlock *ParentRegion = R.getRegion();
3420 if (!ParentRegion || !ParentRegion->isReplicator() || !PtrSCEV ||
3421 !Ctx.PSE.getSE()->isLoopInvariant(PtrSCEV, Ctx.L))
3422 return false;
3423 auto *BOM =
3425 return vputils::isHeaderMask(BOM->getOperand(0), *ParentRegion->getPlan());
3426}
3427
3429 VPCostContext &Ctx) const {
3431 // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
3432 // transform, avoid computing their cost multiple times for now.
3433 Ctx.SkipCostComputation.insert(UI);
3434
3435 if (VF.isScalable() && !isSingleScalar())
3437
3438 switch (UI->getOpcode()) {
3439 case Instruction::Alloca:
3440 if (VF.isScalable())
3442 return Ctx.TTI.getArithmeticInstrCost(
3443 Instruction::Mul, Ctx.Types.inferScalarType(this), Ctx.CostKind);
3444 case Instruction::GetElementPtr:
3445 // We mark this instruction as zero-cost because the cost of GEPs in
3446 // vectorized code depends on whether the corresponding memory instruction
3447 // is scalarized or not. Therefore, we handle GEPs with the memory
3448 // instruction cost.
3449 return 0;
3450 case Instruction::Call: {
3451 auto *CalledFn =
3453
3456 for (const VPValue *ArgOp : ArgOps)
3457 Tys.push_back(Ctx.Types.inferScalarType(ArgOp));
3458
3459 if (CalledFn->isIntrinsic())
3460 // Various pseudo-intrinsics with costs of 0 are scalarized instead of
3461 // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early.
3462 switch (CalledFn->getIntrinsicID()) {
3463 case Intrinsic::assume:
3464 case Intrinsic::lifetime_end:
3465 case Intrinsic::lifetime_start:
3466 case Intrinsic::sideeffect:
3467 case Intrinsic::pseudoprobe:
3468 case Intrinsic::experimental_noalias_scope_decl: {
3469 assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
3470 ElementCount::getFixed(1), Ctx) == 0 &&
3471 "scalarizing intrinsic should be free");
3472 return InstructionCost(0);
3473 }
3474 default:
3475 break;
3476 }
3477
3478 Type *ResultTy = Ctx.Types.inferScalarType(this);
3479 InstructionCost ScalarCallCost =
3480 Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
3481 if (isSingleScalar()) {
3482 if (CalledFn->isIntrinsic())
3483 ScalarCallCost = std::min(
3484 ScalarCallCost,
3485 getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
3486 ElementCount::getFixed(1), Ctx));
3487 return ScalarCallCost;
3488 }
3489
3490 return ScalarCallCost * VF.getFixedValue() +
3491 Ctx.getScalarizationOverhead(ResultTy, ArgOps, VF);
3492 }
3493 case Instruction::Add:
3494 case Instruction::Sub:
3495 case Instruction::FAdd:
3496 case Instruction::FSub:
3497 case Instruction::Mul:
3498 case Instruction::FMul:
3499 case Instruction::FDiv:
3500 case Instruction::FRem:
3501 case Instruction::Shl:
3502 case Instruction::LShr:
3503 case Instruction::AShr:
3504 case Instruction::And:
3505 case Instruction::Or:
3506 case Instruction::Xor:
3507 case Instruction::ICmp:
3508 case Instruction::FCmp:
3510 Ctx) *
3511 (isSingleScalar() ? 1 : VF.getFixedValue());
3512 case Instruction::SDiv:
3513 case Instruction::UDiv:
3514 case Instruction::SRem:
3515 case Instruction::URem: {
3516 InstructionCost ScalarCost =
3518 if (isSingleScalar())
3519 return ScalarCost;
3520
3521 // If any of the operands is from a different replicate region and has its
3522 // cost skipped, it may have been forced to scalar. Fall back to legacy cost
3523 // model to avoid cost mis-match.
3524 if (any_of(operands(), [&Ctx, VF](VPValue *Op) {
3525 auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Op);
3526 if (!PredR)
3527 return false;
3528 return Ctx.skipCostComputation(
3530 PredR->getOperand(0)->getUnderlyingValue()),
3531 VF.isVector());
3532 }))
3533 break;
3534
3535 ScalarCost = ScalarCost * VF.getFixedValue() +
3536 Ctx.getScalarizationOverhead(Ctx.Types.inferScalarType(this),
3537 to_vector(operands()), VF);
3538 // If the recipe is not predicated (i.e. not in a replicate region), return
3539 // the scalar cost. Otherwise handle predicated cost.
3540 if (!getRegion()->isReplicator())
3541 return ScalarCost;
3542
3543 // Account for the phi nodes that we will create.
3544 ScalarCost += VF.getFixedValue() *
3545 Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
3546 // Scale the cost by the probability of executing the predicated blocks.
3547 // This assumes the predicated block for each vector lane is equally
3548 // likely.
3549 ScalarCost /= Ctx.getPredBlockCostDivisor(UI->getParent());
3550 return ScalarCost;
3551 }
3552 case Instruction::Load:
3553 case Instruction::Store: {
3554 bool IsLoad = UI->getOpcode() == Instruction::Load;
3555 const VPValue *PtrOp = getOperand(!IsLoad);
3556 const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.PSE, Ctx.L);
3558 break;
3559
3560 Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
3561 Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
3562 const Align Alignment = getLoadStoreAlignment(UI);
3563 unsigned AS = cast<PointerType>(ScalarPtrTy)->getAddressSpace();
3565 bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
3566 bool UsedByLoadStoreAddress =
3567 !PreferVectorizedAddressing && isUsedByLoadStoreAddress(this);
3568 InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
3569 UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo,
3570 UsedByLoadStoreAddress ? UI : nullptr);
3571
3572 // Check if this is a predicated load/store with a loop-invariant address
3573 // only masked by the header mask. If so, return the uniform mem op cost.
3574 if (isPredicatedUniformMemOpAfterTailFolding(*this, PtrSCEV, Ctx)) {
3575 InstructionCost UniformCost =
3576 ScalarMemOpCost +
3577 Ctx.TTI.getAddressComputationCost(ScalarPtrTy, /*SE=*/nullptr,
3578 /*Ptr=*/nullptr, Ctx.CostKind);
3579 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
3580 if (IsLoad) {
3581 return UniformCost +
3582 Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast,
3583 VectorTy, VectorTy, {}, Ctx.CostKind);
3584 }
3585
3586 VPValue *StoredVal = getOperand(0);
3587 if (!StoredVal->isDefinedOutsideLoopRegions())
3588 UniformCost += Ctx.TTI.getIndexedVectorInstrCostFromEnd(
3589 Instruction::ExtractElement, VectorTy, Ctx.CostKind, 0);
3590 return UniformCost;
3591 }
3592
3593 Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
3594 InstructionCost ScalarCost =
3595 ScalarMemOpCost +
3596 Ctx.TTI.getAddressComputationCost(
3597 PtrTy, UsedByLoadStoreAddress ? nullptr : Ctx.PSE.getSE(), PtrSCEV,
3598 Ctx.CostKind);
3599 if (isSingleScalar())
3600 return ScalarCost;
3601
3602 SmallVector<const VPValue *> OpsToScalarize;
3603 Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
3604 // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
3605 // don't assign scalarization overhead in general, if the target prefers
3606 // vectorized addressing or the loaded value is used as part of an address
3607 // of another load or store.
3608 if (!UsedByLoadStoreAddress) {
3609 bool EfficientVectorLoadStore =
3610 Ctx.TTI.supportsEfficientVectorElementLoadStore();
3611 if (!(IsLoad && !PreferVectorizedAddressing) &&
3612 !(!IsLoad && EfficientVectorLoadStore))
3613 append_range(OpsToScalarize, operands());
3614
3615 if (!EfficientVectorLoadStore)
3616 ResultTy = Ctx.Types.inferScalarType(this);
3617 }
3618
3622 (ScalarCost * VF.getFixedValue()) +
3623 Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, VIC, true);
3624
3625 const VPRegionBlock *ParentRegion = getRegion();
3626 if (ParentRegion && ParentRegion->isReplicator()) {
3627 if (!PtrSCEV)
3628 break;
3629 Cost /= Ctx.getPredBlockCostDivisor(UI->getParent());
3630 Cost += Ctx.TTI.getCFInstrCost(Instruction::CondBr, Ctx.CostKind);
3631
3632 auto *VecI1Ty = VectorType::get(
3633 IntegerType::getInt1Ty(Ctx.L->getHeader()->getContext()), VF);
3634 Cost += Ctx.TTI.getScalarizationOverhead(
3635 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
3636 /*Insert=*/false, /*Extract=*/true, Ctx.CostKind);
3637
3638 if (Ctx.useEmulatedMaskMemRefHack(this, VF)) {
3639 // Artificially setting to a high enough value to practically disable
3640 // vectorization with such operations.
3641 return 3000000;
3642 }
3643 }
3644 return Cost;
3645 }
3646 case Instruction::SExt:
3647 case Instruction::ZExt:
3648 case Instruction::FPToUI:
3649 case Instruction::FPToSI:
3650 case Instruction::FPExt:
3651 case Instruction::PtrToInt:
3652 case Instruction::PtrToAddr:
3653 case Instruction::IntToPtr:
3654 case Instruction::SIToFP:
3655 case Instruction::UIToFP:
3656 case Instruction::Trunc:
3657 case Instruction::FPTrunc:
3658 case Instruction::Select:
3659 case Instruction::AddrSpaceCast: {
3661 Ctx) *
3662 (isSingleScalar() ? 1 : VF.getFixedValue());
3663 }
3664 case Instruction::ExtractValue:
3665 case Instruction::InsertValue:
3666 return Ctx.TTI.getInsertExtractValueCost(getOpcode(), Ctx.CostKind);
3667 }
3668
3669 return Ctx.getLegacyCost(UI, VF);
3670}
3671
3672#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3674 VPSlotTracker &SlotTracker) const {
3675 O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");
3676
3677 if (!getUnderlyingInstr()->getType()->isVoidTy()) {
3679 O << " = ";
3680 }
3681 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
3682 O << "call";
3683 printFlags(O);
3684 O << "@" << CB->getCalledFunction()->getName() << "(";
3686 O, [&O, &SlotTracker](VPValue *Op) {
3687 Op->printAsOperand(O, SlotTracker);
3688 });
3689 O << ")";
3690 } else {
3692 printFlags(O);
3694 }
3695
3696 if (shouldPack())
3697 O << " (S->V)";
3698}
3699#endif
3700
3702 assert(State.Lane && "Branch on Mask works only on single instance.");
3703
3704 VPValue *BlockInMask = getOperand(0);
3705 Value *ConditionBit = State.get(BlockInMask, *State.Lane);
3706
3707 // Replace the temporary unreachable terminator with a new conditional branch,
3708 // whose two destinations will be set later when they are created.
3709 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
3710 assert(isa<UnreachableInst>(CurrentTerminator) &&
3711 "Expected to replace unreachable terminator with conditional branch.");
3712 auto CondBr =
3713 State.Builder.CreateCondBr(ConditionBit, State.CFG.PrevBB, nullptr);
3714 CondBr->setSuccessor(0, nullptr);
3715 CurrentTerminator->eraseFromParent();
3716}
3717
3719 VPCostContext &Ctx) const {
3720 // The legacy cost model doesn't assign costs to branches for individual
3721 // replicate regions. Match the current behavior in the VPlan cost model for
3722 // now.
3723 return 0;
3724}
3725
3727 assert(State.Lane && "Predicated instruction PHI works per instance.");
3728 Instruction *ScalarPredInst =
3729 cast<Instruction>(State.get(getOperand(0), *State.Lane));
3730 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
3731 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
3732 assert(PredicatingBB && "Predicated block has no single predecessor.");
3734 "operand must be VPReplicateRecipe");
3735
3736 // By current pack/unpack logic we need to generate only a single phi node: if
3737 // a vector value for the predicated instruction exists at this point it means
3738 // the instruction has vector users only, and a phi for the vector value is
3739 // needed. In this case the recipe of the predicated instruction is marked to
3740 // also do that packing, thereby "hoisting" the insert-element sequence.
3741 // Otherwise, a phi node for the scalar value is needed.
3742 if (State.hasVectorValue(getOperand(0))) {
3743 auto *VecI = cast<Instruction>(State.get(getOperand(0)));
3745 "Packed operands must generate an insertelement or insertvalue");
3746
3747 // If VectorI is a struct, it will be a sequence like:
3748 // %1 = insertvalue %unmodified, %x, 0
3749 // %2 = insertvalue %1, %y, 1
3750 // %VectorI = insertvalue %2, %z, 2
3751 // To get the unmodified vector we need to look through the chain.
3752 if (auto *StructTy = dyn_cast<StructType>(VecI->getType()))
3753 for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++)
3754 VecI = cast<InsertValueInst>(VecI->getOperand(0));
3755
3756 PHINode *VPhi = State.Builder.CreatePHI(VecI->getType(), 2);
3757 VPhi->addIncoming(VecI->getOperand(0), PredicatingBB); // Unmodified vector.
3758 VPhi->addIncoming(VecI, PredicatedBB); // New vector with inserted element.
3759 if (State.hasVectorValue(this))
3760 State.reset(this, VPhi);
3761 else
3762 State.set(this, VPhi);
3763 // NOTE: Currently we need to update the value of the operand, so the next
3764 // predicated iteration inserts its generated value in the correct vector.
3765 State.reset(getOperand(0), VPhi);
3766 } else {
3767 if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane())
3768 return;
3769
3770 Type *PredInstType = State.TypeAnalysis.inferScalarType(getOperand(0));
3771 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
3772 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
3773 PredicatingBB);
3774 Phi->addIncoming(ScalarPredInst, PredicatedBB);
3775 if (State.hasScalarValue(this, *State.Lane))
3776 State.reset(this, Phi, *State.Lane);
3777 else
3778 State.set(this, Phi, *State.Lane);
3779 // NOTE: Currently we need to update the value of the operand, so the next
3780 // predicated iteration inserts its generated value in the correct vector.
3781 State.reset(getOperand(0), Phi, *State.Lane);
3782 }
3783}
3784
3785#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3787 VPSlotTracker &SlotTracker) const {
3788 O << Indent << "PHI-PREDICATED-INSTRUCTION ";
3790 O << " = ";
3792}
3793#endif
3794
3796 VPCostContext &Ctx) const {
3798 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
3799 ->getAddressSpace();
3800 unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
3801 ? Instruction::Load
3802 : Instruction::Store;
3803
3804 if (!Consecutive) {
3805 // TODO: Using the original IR may not be accurate.
3806 // Currently, ARM will use the underlying IR to calculate gather/scatter
3807 // instruction cost.
3808 assert(!Reverse &&
3809 "Inconsecutive memory access should not have the order.");
3810
3812 Type *PtrTy = Ptr->getType();
3813
3814 // If the address value is uniform across all lanes, then the address can be
3815 // calculated with scalar type and broadcast.
3817 PtrTy = toVectorTy(PtrTy, VF);
3818
3819 unsigned IID = isa<VPWidenLoadRecipe>(this) ? Intrinsic::masked_gather
3820 : isa<VPWidenStoreRecipe>(this) ? Intrinsic::masked_scatter
3821 : isa<VPWidenLoadEVLRecipe>(this) ? Intrinsic::vp_gather
3822 : Intrinsic::vp_scatter;
3823 return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
3824 Ctx.CostKind) +
3825 Ctx.TTI.getMemIntrinsicInstrCost(
3827 &Ingredient),
3828 Ctx.CostKind);
3829 }
3830
3832 if (IsMasked) {
3833 unsigned IID = isa<VPWidenLoadRecipe>(this) ? Intrinsic::masked_load
3834 : Intrinsic::masked_store;
3835 Cost += Ctx.TTI.getMemIntrinsicInstrCost(
3836 MemIntrinsicCostAttributes(IID, Ty, Alignment, AS), Ctx.CostKind);
3837 } else {
3838 TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
3840 : getOperand(1));
3841 Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
3842 OpInfo, &Ingredient);
3843 }
3844 return Cost;
3845}
3846
3848 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
3849 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
3850 bool CreateGather = !isConsecutive();
3851
3852 auto &Builder = State.Builder;
3853 Value *Mask = nullptr;
3854 if (auto *VPMask = getMask()) {
3855 // Mask reversal is only needed for non-all-one (null) masks, as reverse
3856 // of a null all-one mask is a null mask.
3857 Mask = State.get(VPMask);
3858 if (isReverse())
3859 Mask = Builder.CreateVectorReverse(Mask, "reverse");
3860 }
3861
3862 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
3863 Value *NewLI;
3864 if (CreateGather) {
3865 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
3866 "wide.masked.gather");
3867 } else if (Mask) {
3868 NewLI =
3869 Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
3870 PoisonValue::get(DataTy), "wide.masked.load");
3871 } else {
3872 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
3873 }
3875 State.set(this, NewLI);
3876}
3877
3878#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3880 VPSlotTracker &SlotTracker) const {
3881 O << Indent << "WIDEN ";
3883 O << " = load ";
3885}
3886#endif
3887
3888/// Use all-true mask for reverse rather than actual mask, as it avoids a
3889/// dependence w/o affecting the result.
3891 Value *EVL, const Twine &Name) {
3892 VectorType *ValTy = cast<VectorType>(Operand->getType());
3893 Value *AllTrueMask =
3894 Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
3895 return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
3896 {Operand, AllTrueMask, EVL}, nullptr, Name);
3897}
3898
3900 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
3901 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
3902 bool CreateGather = !isConsecutive();
3903
3904 auto &Builder = State.Builder;
3905 CallInst *NewLI;
3906 Value *EVL = State.get(getEVL(), VPLane(0));
3907 Value *Addr = State.get(getAddr(), !CreateGather);
3908 Value *Mask = nullptr;
3909 if (VPValue *VPMask = getMask()) {
3910 Mask = State.get(VPMask);
3911 if (isReverse())
3912 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
3913 } else {
3914 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
3915 }
3916
3917 if (CreateGather) {
3918 NewLI =
3919 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
3920 nullptr, "wide.masked.gather");
3921 } else {
3922 NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
3923 {Addr, Mask, EVL}, nullptr, "vp.op.load");
3924 }
3925 NewLI->addParamAttr(
3927 applyMetadata(*NewLI);
3928 Instruction *Res = NewLI;
3929 State.set(this, Res);
3930}
3931
3933 VPCostContext &Ctx) const {
3934 if (!Consecutive || IsMasked)
3935 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
3936
3937 // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost()
3938 // here because the EVL recipes using EVL to replace the tail mask. But in the
3939 // legacy model, it will always calculate the cost of mask.
3940 // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we
3941 // don't need to compare to the legacy cost model.
3943 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
3944 ->getAddressSpace();
3945 return Ctx.TTI.getMemIntrinsicInstrCost(
3946 MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS),
3947 Ctx.CostKind);
3948}
3949
3950#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3952 VPSlotTracker &SlotTracker) const {
3953 O << Indent << "WIDEN ";
3955 O << " = vp.load ";
3957}
3958#endif
3959
3961 VPValue *StoredVPValue = getStoredValue();
3962 bool CreateScatter = !isConsecutive();
3963
3964 auto &Builder = State.Builder;
3965
3966 Value *Mask = nullptr;
3967 if (auto *VPMask = getMask()) {
3968 // Mask reversal is only needed for non-all-one (null) masks, as reverse
3969 // of a null all-one mask is a null mask.
3970 Mask = State.get(VPMask);
3971 if (isReverse())
3972 Mask = Builder.CreateVectorReverse(Mask, "reverse");
3973 }
3974
3975 Value *StoredVal = State.get(StoredVPValue);
3976 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
3977 Instruction *NewSI = nullptr;
3978 if (CreateScatter)
3979 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
3980 else if (Mask)
3981 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
3982 else
3983 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
3984 applyMetadata(*NewSI);
3985}
3986
3987#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3989 VPSlotTracker &SlotTracker) const {
3990 O << Indent << "WIDEN store ";
3992}
3993#endif
3994
3996 VPValue *StoredValue = getStoredValue();
3997 bool CreateScatter = !isConsecutive();
3998
3999 auto &Builder = State.Builder;
4000
4001 CallInst *NewSI = nullptr;
4002 Value *StoredVal = State.get(StoredValue);
4003 Value *EVL = State.get(getEVL(), VPLane(0));
4004 Value *Mask = nullptr;
4005 if (VPValue *VPMask = getMask()) {
4006 Mask = State.get(VPMask);
4007 if (isReverse())
4008 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
4009 } else {
4010 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
4011 }
4012 Value *Addr = State.get(getAddr(), !CreateScatter);
4013 if (CreateScatter) {
4014 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
4015 Intrinsic::vp_scatter,
4016 {StoredVal, Addr, Mask, EVL});
4017 } else {
4018 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
4019 Intrinsic::vp_store,
4020 {StoredVal, Addr, Mask, EVL});
4021 }
4022 NewSI->addParamAttr(
4024 applyMetadata(*NewSI);
4025}
4026
4028 VPCostContext &Ctx) const {
4029 if (!Consecutive || IsMasked)
4030 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
4031
4032 // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost()
4033 // here because the EVL recipes using EVL to replace the tail mask. But in the
4034 // legacy model, it will always calculate the cost of mask.
4035 // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we
4036 // don't need to compare to the legacy cost model.
4038 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
4039 ->getAddressSpace();
4040 return Ctx.TTI.getMemIntrinsicInstrCost(
4041 MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS),
4042 Ctx.CostKind);
4043}
4044
4045#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4047 VPSlotTracker &SlotTracker) const {
4048 O << Indent << "WIDEN vp.store ";
4050}
4051#endif
4052
4054 VectorType *DstVTy, const DataLayout &DL) {
4055 // Verify that V is a vector type with same number of elements as DstVTy.
4056 auto VF = DstVTy->getElementCount();
4057 auto *SrcVecTy = cast<VectorType>(V->getType());
4058 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
4059 Type *SrcElemTy = SrcVecTy->getElementType();
4060 Type *DstElemTy = DstVTy->getElementType();
4061 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
4062 "Vector elements must have same size");
4063
4064 // Do a direct cast if element types are castable.
4065 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
4066 return Builder.CreateBitOrPointerCast(V, DstVTy);
4067 }
4068 // V cannot be directly casted to desired vector type.
4069 // May happen when V is a floating point vector but DstVTy is a vector of
4070 // pointers or vice-versa. Handle this using a two-step bitcast using an
4071 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
4072 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
4073 "Only one type should be a pointer type");
4074 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
4075 "Only one type should be a floating point type");
4076 Type *IntTy =
4077 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
4078 auto *VecIntTy = VectorType::get(IntTy, VF);
4079 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
4080 return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
4081}
4082
4083/// Return a vector containing interleaved elements from multiple
4084/// smaller input vectors.
4086 const Twine &Name) {
4087 unsigned Factor = Vals.size();
4088 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
4089
4090 VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
4091#ifndef NDEBUG
4092 for (Value *Val : Vals)
4093 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
4094#endif
4095
4096 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
4097 // must use intrinsics to interleave.
4098 if (VecTy->isScalableTy()) {
4099 assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
4100 return Builder.CreateVectorInterleave(Vals, Name);
4101 }
4102
4103 // Fixed length. Start by concatenating all vectors into a wide vector.
4104 Value *WideVec = concatenateVectors(Builder, Vals);
4105
4106 // Interleave the elements into the wide vector.
4107 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
4108 return Builder.CreateShuffleVector(
4109 WideVec, createInterleaveMask(NumElts, Factor), Name);
4110}
4111
4112// Try to vectorize the interleave group that \p Instr belongs to.
4113//
4114// E.g. Translate following interleaved load group (factor = 3):
4115// for (i = 0; i < N; i+=3) {
4116// R = Pic[i]; // Member of index 0
4117// G = Pic[i+1]; // Member of index 1
4118// B = Pic[i+2]; // Member of index 2
4119// ... // do something to R, G, B
4120// }
4121// To:
4122// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
4123// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
4124// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
4125// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
4126//
4127// Or translate following interleaved store group (factor = 3):
4128// for (i = 0; i < N; i+=3) {
4129// ... do something to R, G, B
4130// Pic[i] = R; // Member of index 0
4131// Pic[i+1] = G; // Member of index 1
4132// Pic[i+2] = B; // Member of index 2
4133// }
4134// To:
4135// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
4136// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
4137// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
4138// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
4139// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
4141 assert(!State.Lane && "Interleave group being replicated.");
4142 assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
4143 "Masking gaps for scalable vectors is not yet supported.");
4145 Instruction *Instr = Group->getInsertPos();
4146
4147 // Prepare for the vector type of the interleaved load/store.
4148 Type *ScalarTy = getLoadStoreType(Instr);
4149 unsigned InterleaveFactor = Group->getFactor();
4150 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
4151
4152 VPValue *BlockInMask = getMask();
4153 VPValue *Addr = getAddr();
4154 Value *ResAddr = State.get(Addr, VPLane(0));
4155
4156 auto CreateGroupMask = [&BlockInMask, &State,
4157 &InterleaveFactor](Value *MaskForGaps) -> Value * {
4158 if (State.VF.isScalable()) {
4159 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
4160 assert(InterleaveFactor <= 8 &&
4161 "Unsupported deinterleave factor for scalable vectors");
4162 auto *ResBlockInMask = State.get(BlockInMask);
4163 SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
4164 return interleaveVectors(State.Builder, Ops, "interleaved.mask");
4165 }
4166
4167 if (!BlockInMask)
4168 return MaskForGaps;
4169
4170 Value *ResBlockInMask = State.get(BlockInMask);
4171 Value *ShuffledMask = State.Builder.CreateShuffleVector(
4172 ResBlockInMask,
4173 createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
4174 "interleaved.mask");
4175 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
4176 ShuffledMask, MaskForGaps)
4177 : ShuffledMask;
4178 };
4179
4180 const DataLayout &DL = Instr->getDataLayout();
4181 // Vectorize the interleaved load group.
4182 if (isa<LoadInst>(Instr)) {
4183 Value *MaskForGaps = nullptr;
4184 if (needsMaskForGaps()) {
4185 MaskForGaps =
4186 createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
4187 assert(MaskForGaps && "Mask for Gaps is required but it is null");
4188 }
4189
4190 Instruction *NewLoad;
4191 if (BlockInMask || MaskForGaps) {
4192 Value *GroupMask = CreateGroupMask(MaskForGaps);
4193 Value *PoisonVec = PoisonValue::get(VecTy);
4194 NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
4195 Group->getAlign(), GroupMask,
4196 PoisonVec, "wide.masked.vec");
4197 } else
4198 NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
4199 Group->getAlign(), "wide.vec");
4200 applyMetadata(*NewLoad);
4201 // TODO: Also manage existing metadata using VPIRMetadata.
4202 Group->addMetadata(NewLoad);
4203
4205 if (VecTy->isScalableTy()) {
4206 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4207 // so must use intrinsics to deinterleave.
4208 assert(InterleaveFactor <= 8 &&
4209 "Unsupported deinterleave factor for scalable vectors");
4210 NewLoad = State.Builder.CreateIntrinsic(
4211 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
4212 NewLoad->getType(), NewLoad,
4213 /*FMFSource=*/nullptr, "strided.vec");
4214 }
4215
4216 auto CreateStridedVector = [&InterleaveFactor, &State,
4217 &NewLoad](unsigned Index) -> Value * {
4218 assert(Index < InterleaveFactor && "Illegal group index");
4219 if (State.VF.isScalable())
4220 return State.Builder.CreateExtractValue(NewLoad, Index);
4221
4222 // For fixed length VF, use shuffle to extract the sub-vectors from the
4223 // wide load.
4224 auto StrideMask =
4225 createStrideMask(Index, InterleaveFactor, State.VF.getFixedValue());
4226 return State.Builder.CreateShuffleVector(NewLoad, StrideMask,
4227 "strided.vec");
4228 };
4229
4230 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4231 Instruction *Member = Group->getMember(I);
4232
4233 // Skip the gaps in the group.
4234 if (!Member)
4235 continue;
4236
4237 Value *StridedVec = CreateStridedVector(I);
4238
4239 // If this member has different type, cast the result type.
4240 if (Member->getType() != ScalarTy) {
4241 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
4242 StridedVec =
4243 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
4244 }
4245
4246 if (Group->isReverse())
4247 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
4248
4249 State.set(VPDefs[J], StridedVec);
4250 ++J;
4251 }
4252 return;
4253 }
4254
4255 // The sub vector type for current instruction.
4256 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4257
4258 // Vectorize the interleaved store group.
4259 Value *MaskForGaps =
4260 createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
4261 assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
4262 "Mismatch between NeedsMaskForGaps and MaskForGaps");
4263 ArrayRef<VPValue *> StoredValues = getStoredValues();
4264 // Collect the stored vector from each member.
4265 SmallVector<Value *, 4> StoredVecs;
4266 unsigned StoredIdx = 0;
4267 for (unsigned i = 0; i < InterleaveFactor; i++) {
4268 assert((Group->getMember(i) || MaskForGaps) &&
4269 "Fail to get a member from an interleaved store group");
4270 Instruction *Member = Group->getMember(i);
4271
4272 // Skip the gaps in the group.
4273 if (!Member) {
4274 Value *Undef = PoisonValue::get(SubVT);
4275 StoredVecs.push_back(Undef);
4276 continue;
4277 }
4278
4279 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4280 ++StoredIdx;
4281
4282 if (Group->isReverse())
4283 StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
4284
4285 // If this member has different type, cast it to a unified type.
4286
4287 if (StoredVec->getType() != SubVT)
4288 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4289
4290 StoredVecs.push_back(StoredVec);
4291 }
4292
4293 // Interleave all the smaller vectors into one wider vector.
4294 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4295 Instruction *NewStoreInstr;
4296 if (BlockInMask || MaskForGaps) {
4297 Value *GroupMask = CreateGroupMask(MaskForGaps);
4298 NewStoreInstr = State.Builder.CreateMaskedStore(
4299 IVec, ResAddr, Group->getAlign(), GroupMask);
4300 } else
4301 NewStoreInstr =
4302 State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
4303
4304 applyMetadata(*NewStoreInstr);
4305 // TODO: Also manage existing metadata using VPIRMetadata.
4306 Group->addMetadata(NewStoreInstr);
4307}
4308
4309#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4311 VPSlotTracker &SlotTracker) const {
4313 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4314 IG->getInsertPos()->printAsOperand(O, false);
4315 O << ", ";
4317 VPValue *Mask = getMask();
4318 if (Mask) {
4319 O << ", ";
4320 Mask->printAsOperand(O, SlotTracker);
4321 }
4322
4323 unsigned OpIdx = 0;
4324 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4325 if (!IG->getMember(i))
4326 continue;
4327 if (getNumStoreOperands() > 0) {
4328 O << "\n" << Indent << " store ";
4330 O << " to index " << i;
4331 } else {
4332 O << "\n" << Indent << " ";
4334 O << " = load from index " << i;
4335 }
4336 ++OpIdx;
4337 }
4338}
4339#endif
4340
4342 assert(!State.Lane && "Interleave group being replicated.");
4343 assert(State.VF.isScalable() &&
4344 "Only support scalable VF for EVL tail-folding.");
4346 "Masking gaps for scalable vectors is not yet supported.");
4348 Instruction *Instr = Group->getInsertPos();
4349
4350 // Prepare for the vector type of the interleaved load/store.
4351 Type *ScalarTy = getLoadStoreType(Instr);
4352 unsigned InterleaveFactor = Group->getFactor();
4353 assert(InterleaveFactor <= 8 &&
4354 "Unsupported deinterleave/interleave factor for scalable vectors");
4355 ElementCount WideVF = State.VF * InterleaveFactor;
4356 auto *VecTy = VectorType::get(ScalarTy, WideVF);
4357
4358 VPValue *Addr = getAddr();
4359 Value *ResAddr = State.get(Addr, VPLane(0));
4360 Value *EVL = State.get(getEVL(), VPLane(0));
4361 Value *InterleaveEVL = State.Builder.CreateMul(
4362 EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl",
4363 /* NUW= */ true, /* NSW= */ true);
4364 LLVMContext &Ctx = State.Builder.getContext();
4365
4366 Value *GroupMask = nullptr;
4367 if (VPValue *BlockInMask = getMask()) {
4368 SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask));
4369 GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask");
4370 } else {
4371 GroupMask =
4372 State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
4373 }
4374
4375 // Vectorize the interleaved load group.
4376 if (isa<LoadInst>(Instr)) {
4377 CallInst *NewLoad = State.Builder.CreateIntrinsic(
4378 VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr,
4379 "wide.vp.load");
4380 NewLoad->addParamAttr(0,
4381 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4382
4383 applyMetadata(*NewLoad);
4384 // TODO: Also manage existing metadata using VPIRMetadata.
4385 Group->addMetadata(NewLoad);
4386
4387 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4388 // so must use intrinsics to deinterleave.
4389 NewLoad = State.Builder.CreateIntrinsic(
4390 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
4391 NewLoad->getType(), NewLoad,
4392 /*FMFSource=*/nullptr, "strided.vec");
4393
4394 const DataLayout &DL = Instr->getDataLayout();
4395 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4396 Instruction *Member = Group->getMember(I);
4397 // Skip the gaps in the group.
4398 if (!Member)
4399 continue;
4400
4401 Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
4402 // If this member has different type, cast the result type.
4403 if (Member->getType() != ScalarTy) {
4404 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
4405 StridedVec =
4406 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
4407 }
4408
4409 State.set(getVPValue(J), StridedVec);
4410 ++J;
4411 }
4412 return;
4413 } // End for interleaved load.
4414
4415 // The sub vector type for current instruction.
4416 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4417 // Vectorize the interleaved store group.
4418 ArrayRef<VPValue *> StoredValues = getStoredValues();
4419 // Collect the stored vector from each member.
4420 SmallVector<Value *, 4> StoredVecs;
4421 const DataLayout &DL = Instr->getDataLayout();
4422 for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
4423 Instruction *Member = Group->getMember(I);
4424 // Skip the gaps in the group.
4425 if (!Member) {
4426 StoredVecs.push_back(PoisonValue::get(SubVT));
4427 continue;
4428 }
4429
4430 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4431 // If this member has different type, cast it to a unified type.
4432 if (StoredVec->getType() != SubVT)
4433 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4434
4435 StoredVecs.push_back(StoredVec);
4436 ++StoredIdx;
4437 }
4438
4439 // Interleave all the smaller vectors into one wider vector.
4440 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4441 CallInst *NewStore =
4442 State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store,
4443 {IVec, ResAddr, GroupMask, InterleaveEVL});
4444 NewStore->addParamAttr(1,
4445 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4446
4447 applyMetadata(*NewStore);
4448 // TODO: Also manage existing metadata using VPIRMetadata.
4449 Group->addMetadata(NewStore);
4450}
4451
4452#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4454 VPSlotTracker &SlotTracker) const {
4456 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4457 IG->getInsertPos()->printAsOperand(O, false);
4458 O << ", ";
4460 O << ", ";
4462 if (VPValue *Mask = getMask()) {
4463 O << ", ";
4464 Mask->printAsOperand(O, SlotTracker);
4465 }
4466
4467 unsigned OpIdx = 0;
4468 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4469 if (!IG->getMember(i))
4470 continue;
4471 if (getNumStoreOperands() > 0) {
4472 O << "\n" << Indent << " vp.store ";
4474 O << " to index " << i;
4475 } else {
4476 O << "\n" << Indent << " ";
4478 O << " = vp.load from index " << i;
4479 }
4480 ++OpIdx;
4481 }
4482}
4483#endif
4484
4486 VPCostContext &Ctx) const {
4487 Instruction *InsertPos = getInsertPos();
4488 // Find the VPValue index of the interleave group. We need to skip gaps.
4489 unsigned InsertPosIdx = 0;
4490 for (unsigned Idx = 0; IG->getFactor(); ++Idx)
4491 if (auto *Member = IG->getMember(Idx)) {
4492 if (Member == InsertPos)
4493 break;
4494 InsertPosIdx++;
4495 }
4496 Type *ValTy = Ctx.Types.inferScalarType(
4497 getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
4498 : getStoredValues()[InsertPosIdx]);
4499 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
4500 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
4501 ->getAddressSpace();
4502
4503 unsigned InterleaveFactor = IG->getFactor();
4504 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
4505
4506 // Holds the indices of existing members in the interleaved group.
4508 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
4509 if (IG->getMember(IF))
4510 Indices.push_back(IF);
4511
4512 // Calculate the cost of the whole interleaved group.
4513 InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
4514 InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
4515 IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
4516
4517 if (!IG->isReverse())
4518 return Cost;
4519
4520 return Cost + IG->getNumMembers() *
4521 Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
4522 VectorTy, VectorTy, {}, Ctx.CostKind,
4523 0);
4524}
4525
4526#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4528 VPSlotTracker &SlotTracker) const {
4529 O << Indent << "EMIT ";
4531 O << " = CANONICAL-INDUCTION ";
4533}
4534#endif
4535
4537 return vputils::onlyScalarValuesUsed(this) &&
4538 (!IsScalable || vputils::onlyFirstLaneUsed(this));
4539}
4540
4541#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4543 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4544 assert((getNumOperands() == 3 || getNumOperands() == 5) &&
4545 "unexpected number of operands");
4546 O << Indent << "EMIT ";
4548 O << " = WIDEN-POINTER-INDUCTION ";
4550 O << ", ";
4552 O << ", ";
4554 if (getNumOperands() == 5) {
4555 O << ", ";
4557 O << ", ";
4559 }
4560}
4561
4563 VPSlotTracker &SlotTracker) const {
4564 O << Indent << "EMIT ";
4566 O << " = EXPAND SCEV " << *Expr;
4567}
4568#endif
4569
4571 Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
4572 Type *STy = CanonicalIV->getType();
4573 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
4574 ElementCount VF = State.VF;
4575 Value *VStart = VF.isScalar()
4576 ? CanonicalIV
4577 : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
4578 Value *VStep = Builder.CreateElementCount(
4579 STy, VF.multiplyCoefficientBy(getUnrollPart(*this)));
4580 if (VF.isVector()) {
4581 VStep = Builder.CreateVectorSplat(VF, VStep);
4582 VStep =
4583 Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
4584 }
4585 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
4586 State.set(this, CanonicalVectorIV);
4587}
4588
4589#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4591 VPSlotTracker &SlotTracker) const {
4592 O << Indent << "EMIT ";
4594 O << " = WIDEN-CANONICAL-INDUCTION ";
4596}
4597#endif
4598
4600 auto &Builder = State.Builder;
4601 // Create a vector from the initial value.
4602 auto *VectorInit = getStartValue()->getLiveInIRValue();
4603
4604 Type *VecTy = State.VF.isScalar()
4605 ? VectorInit->getType()
4606 : VectorType::get(VectorInit->getType(), State.VF);
4607
4608 BasicBlock *VectorPH =
4609 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4610 if (State.VF.isVector()) {
4611 auto *IdxTy = Builder.getInt32Ty();
4612 auto *One = ConstantInt::get(IdxTy, 1);
4613 IRBuilder<>::InsertPointGuard Guard(Builder);
4614 Builder.SetInsertPoint(VectorPH->getTerminator());
4615 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
4616 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4617 VectorInit = Builder.CreateInsertElement(
4618 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
4619 }
4620
4621 // Create a phi node for the new recurrence.
4622 PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
4623 Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
4624 Phi->addIncoming(VectorInit, VectorPH);
4625 State.set(this, Phi);
4626}
4627
4630 VPCostContext &Ctx) const {
4631 if (VF.isScalar())
4632 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
4633
4634 return 0;
4635}
4636
4637#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4639 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4640 O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
4642 O << " = phi ";
4644}
4645#endif
4646
4648 // Reductions do not have to start at zero. They can start with
4649 // any loop invariant values.
4650 VPValue *StartVPV = getStartValue();
4651
4652 // In order to support recurrences we need to be able to vectorize Phi nodes.
4653 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4654 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4655 // this value when we vectorize all of the instructions that use the PHI.
4656 BasicBlock *VectorPH =
4657 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4658 bool ScalarPHI = State.VF.isScalar() || isInLoop();
4659 Value *StartV = State.get(StartVPV, ScalarPHI);
4660 Type *VecTy = StartV->getType();
4661
4662 BasicBlock *HeaderBB = State.CFG.PrevBB;
4663 assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
4664 "recipe must be in the vector loop header");
4665 auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
4666 Phi->insertBefore(HeaderBB->getFirstInsertionPt());
4667 State.set(this, Phi, isInLoop());
4668
4669 Phi->addIncoming(StartV, VectorPH);
4670}
4671
4672#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4674 VPSlotTracker &SlotTracker) const {
4675 O << Indent << "WIDEN-REDUCTION-PHI ";
4676
4678 O << " = phi";
4679 printFlags(O);
4681 if (getVFScaleFactor() > 1)
4682 O << " (VF scaled by 1/" << getVFScaleFactor() << ")";
4683}
4684#endif
4685
4687 Value *Op0 = State.get(getOperand(0));
4688 Type *VecTy = Op0->getType();
4689 Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
4690 State.set(this, VecPhi);
4691}
4692
4694 VPCostContext &Ctx) const {
4695 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
4696}
4697
4698#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4700 VPSlotTracker &SlotTracker) const {
4701 O << Indent << "WIDEN-PHI ";
4702
4704 O << " = phi ";
4706}
4707#endif
4708
4710 BasicBlock *VectorPH =
4711 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4712 Value *StartMask = State.get(getOperand(0));
4713 PHINode *Phi =
4714 State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
4715 Phi->addIncoming(StartMask, VectorPH);
4716 State.set(this, Phi);
4717}
4718
4719#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4721 VPSlotTracker &SlotTracker) const {
4722 O << Indent << "ACTIVE-LANE-MASK-PHI ";
4723
4725 O << " = phi ";
4727}
4728#endif
4729
4730#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4732 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4733 O << Indent << "CURRENT-ITERATION-PHI ";
4734
4736 O << " = phi ";
4738}
4739#endif
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static MCDisassembler::DecodeStatus addOperand(MCInst &Inst, const MCOperand &Opnd)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Value * getPointer(Value *Ptr)
iv users
Definition IVUsers.cpp:48
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file provides a LoopVectorizationPlanner class.
static const SCEV * getAddressAccessSCEV(Value *Ptr, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets the address access SCEV for Ptr, if it should be used for cost modeling according to isAddressSC...
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
static bool isOrdered(const Instruction *I)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file contains the declarations of different VPlan-related auxiliary helpers.
static bool isPredicatedUniformMemOpAfterTailFolding(const VPReplicateRecipe &R, const SCEV *PtrSCEV, VPCostContext &Ctx)
Return true if R is a predicated load/store with a loop-invariant address only masked by the header m...
static Instruction * createReverseEVL(IRBuilderBase &Builder, Value *Operand, Value *EVL, const Twine &Name)
Use all-true mask for reverse rather than actual mask, as it avoids a dependence w/o affecting the re...
static Value * interleaveVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vals, const Twine &Name)
Return a vector containing interleaved elements from multiple smaller input vectors.
static InstructionCost getCostForIntrinsics(Intrinsic::ID ID, ArrayRef< const VPValue * > Operands, const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx)
Compute the cost for the intrinsic ID with Operands, produced by R.
static Value * createBitOrPointerCast(IRBuilderBase &Builder, Value *V, VectorType *DstVTy, const DataLayout &DL)
SmallVector< Value *, 2 > VectorParts
static bool isUsedByLoadStoreAddress(const VPUser *V)
Returns true if V is used as part of the address of another load or store.
static void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPLane &Lane, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
static unsigned getCalledFnOperandIndex(const VPInstruction &VPI)
For call VPInstructions, return the operand index of the called function.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
This file contains the declarations of the Vectorization Plan base classes:
void printAsOperand(OutputBuffer &OB, Prec P=Prec::Default, bool StrictlyWorse=false) const
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:986
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
static LLVM_ABI StringRef getPredicateName(Predicate P)
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getUnknown()
Definition DebugLoc.h:161
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
LLVM_ABI void print(raw_ostream &O) const
Print fast-math flags to O.
Definition Operator.cpp:283
void setAllowContract(bool B=true)
Definition FMF.h:93
bool noSignedZeros() const
Definition FMF.h:70
bool noInfs() const
Definition FMF.h:69
void setAllowReciprocal(bool B=true)
Definition FMF.h:90
bool allowReciprocal() const
Definition FMF.h:71
void setNoSignedZeros(bool B=true)
Definition FMF.h:87
bool allowReassoc() const
Flag queries.
Definition FMF.h:67
bool approxFunc() const
Definition FMF.h:73
void setNoNaNs(bool B=true)
Definition FMF.h:81
void setAllowReassoc(bool B=true)
Flag setters.
Definition FMF.h:78
bool noNaNs() const
Definition FMF.h:68
void setApproxFunc(bool B=true)
Definition FMF.h:96
void setNoInfs(bool B=true)
Definition FMF.h:84
bool allowContract() const
Definition FMF.h:72
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
bool willReturn() const
Determine if the function will return.
Definition Function.h:669
bool doesNotThrow() const
Determine if the function cannot unwind.
Definition Function.h:602
bool doesNotAccessMemory() const
Determine if the function does not access memory.
Definition Function.cpp:867
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:216
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags none()
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2585
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:564
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2639
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2573
LLVM_ABI Value * CreateVectorSpliceRight(Value *V1, Value *V2, Value *Offset, const Twine &Name="")
Create a vector.splice.right intrinsic call, or a shufflevector that produces the same result if the ...
CondBrInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition IRBuilder.h:1224
LLVM_ABI Value * CreateSelectFMF(Value *C, Value *True, Value *False, FMFSource FMFSource, const Twine &Name="", Instruction *MDFrom=nullptr)
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2632
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2651
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:579
Value * CreatePtrAdd(Value *Ptr, Value *Offset, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:2049
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
LLVM_ABI Value * CreateVectorReverse(Value *V, const Twine &Name="")
Return a vector value that contains the vector V reversed.
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2336
LLVM_ABI CallInst * CreateOrReduce(Value *Src)
Create a vector int OR reduction intrinsic of the source vector.
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
Value * CreateLogicalAnd(Value *Cond1, Value *Cond2, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.h:1752
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:522
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2466
Value * CreateNot(Value *V, const Twine &Name="")
Definition IRBuilder.h:1836
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2332
Value * CreateCountTrailingZeroElems(Type *ResTy, Value *Mask, bool ZeroIsPoison=true, const Twine &Name="")
Create a call to llvm.experimental_cttz_elts.
Definition IRBuilder.h:1162
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1447
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2078
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1430
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition IRBuilder.h:507
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1735
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2344
Value * CreateLogicalOr(Value *Cond1, Value *Cond2, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.h:1760
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2442
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1600
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1464
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2812
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isUnaryOp() const
The group of interleaved loads/stores sharing the same stride and close to each other.
uint32_t getFactor() const
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
bool isReverse() const
InstTy * getInsertPos() const
void addMetadata(InstTy *NewInst) const
Add metadata (e.g.
Align getAlign() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Information for memory intrinsic cost model.
Root of the metadata hierarchy.
Definition Metadata.h:64
LLVM_ABI void print(raw_ostream &OS, const Module *M=nullptr, bool IsForDebug=false) const
Print.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
This class represents the LLVM 'select' instruction.
This class provides computation of slot numbers for LLVM Assembly writing.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
@ Load
The value being inserted comes from a load (InsertElement only).
@ Store
The extracted value is stored (ExtractElement only).
LLVM_ABI InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
@ TCC_Free
Expected to fold away in lowering.
LLVM_ABI InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:290
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:65
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:286
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
value_op_iterator value_op_end()
Definition User.h:288
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
value_op_iterator value_op_begin()
Definition User.h:285
void execute(VPTransformState &State) override
Generate the active lane mask phi of the vector loop.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4253
RecipeListTy & getRecipeList()
Returns a reference to the list of recipes.
Definition VPlan.h:4306
iterator end()
Definition VPlan.h:4290
const VPRecipeBase & front() const
Definition VPlan.h:4300
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition VPlan.h:4319
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenMemoryRecipe.
VPValue * getIncomingValue(unsigned Idx) const
Return incoming value number Idx.
Definition VPlan.h:2825
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2820
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2816
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:98
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:226
VPlan * getPlan()
Definition VPlan.cpp:177
void printAsOperand(raw_ostream &OS, bool PrintType=false) const
Definition VPlan.h:368
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:182
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPBranchOnMaskRecipe.
void execute(VPTransformState &State) override
Generate the extraction of the appropriate bit from the block mask and the conditional branch.
VPlan-based builder utility analogous to IRBuilder.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:465
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:438
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:450
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:460
void execute(VPTransformState &State) override
Generate the transformed value of the induction at offset StartValue (1.
VPIRValue * getStartValue() const
Definition VPlan.h:4048
VPValue * getStepValue() const
Definition VPlan.h:4049
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void decompose()
Insert the recipes of the expression back into the VPlan, directly before the current recipe.
bool isSingleScalar() const
Returns true if the result of this VPExpressionRecipe is a single-scalar.
bool mayHaveSideEffects() const
Returns true if this expression contains recipes that may have side effects.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Compute the cost of this recipe either using a recipe's specialized implementation or using the legac...
bool mayReadOrWriteMemory() const
Returns true if this expression contains recipes that may read from or write to memory.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this header phi recipe.
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2337
void execute(VPTransformState &State) override
Produce a vectorized histogram operation.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPHistogramRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getMask() const
Return the mask operand if one was provided, or a null pointer if all lanes should be executed uncond...
Definition VPlan.h:2077
Class to record and manage LLVM IR flags.
Definition VPlan.h:690
FastMathFlagsTy FMFs
Definition VPlan.h:778
ReductionFlagsTy ReductionFlags
Definition VPlan.h:780
LLVM_ABI_FOR_TEST bool hasRequiredFlagsForOpcode(unsigned Opcode) const
Returns true if Opcode has its required flags set.
LLVM_ABI_FOR_TEST bool flagsValidForOpcode(unsigned Opcode) const
Returns true if the set flags are valid for Opcode.
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
WrapFlagsTy WrapFlags
Definition VPlan.h:772
void printFlags(raw_ostream &O) const
bool hasFastMathFlags() const
Returns true if the recipe has fast-math flags.
Definition VPlan.h:995
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
bool isReductionOrdered() const
Definition VPlan.h:1059
TruncFlagsTy TruncFlags
Definition VPlan.h:773
CmpInst::Predicate getPredicate() const
Definition VPlan.h:967
ExactFlagsTy ExactFlags
Definition VPlan.h:775
void intersectFlags(const VPIRFlags &Other)
Only keep flags also present in Other.
uint8_t GEPFlagsStorage
Definition VPlan.h:776
GEPNoWrapFlags getGEPNoWrapFlags() const
Definition VPlan.h:985
bool hasPredicate() const
Returns true if the recipe has a comparison predicate.
Definition VPlan.h:990
DisjointFlagsTy DisjointFlags
Definition VPlan.h:774
FCmpFlagsTy FCmpFlags
Definition VPlan.h:779
NonNegFlagsTy NonNegFlags
Definition VPlan.h:777
bool isReductionInLoop() const
Definition VPlan.h:1065
void applyFlags(Instruction &I) const
Apply the IR flags to I.
Definition VPlan.h:924
uint8_t CmpPredStorage
Definition VPlan.h:771
RecurKind getRecurKind() const
Definition VPlan.h:1053
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPIRInstruction.
VPIRInstruction(Instruction &I)
VPIRInstruction::create() should be used to create VPIRInstructions, as subclasses may need to be cre...
Definition VPlan.h:1694
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
VPIRMetadata()=default
void print(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print metadata with node IDs.
void applyMetadata(Instruction &I) const
Add all metadata to I.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the instruction.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1225
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPInstruction.
bool doesGeneratePerAllLanes() const
Returns true if this VPInstruction generates scalar values for all lanes.
@ ExtractLastActive
Extracts the last active lane from a set of vectors.
Definition VPlan.h:1336
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1327
@ ExitingIVValue
Compute the exiting value of a wide induction after vectorization, that is the value of the last lane...
Definition VPlan.h:1343
@ ComputeAnyOfResult
Compute the final result of a AnyOf reduction with select(cmp(),x,y), where one of (x,...
Definition VPlan.h:1272
@ WideIVStep
Scale the first operand (vector step) by the second operand (scalar-step).
Definition VPlan.h:1317
@ ResumeForEpilogue
Explicit user for the resume phi of the canonical induction in the main VPlan, used by the epilogue v...
Definition VPlan.h:1330
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1269
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1321
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1264
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1261
@ VScale
Returns the value for vscale.
Definition VPlan.h:1339
@ CanonicalIVIncrementForPart
Definition VPlan.h:1245
bool hasResult() const
Definition VPlan.h:1421
bool opcodeMayReadOrWriteFromMemory() const
Returns true if the underlying opcode may read from or write to memory.
LLVM_DUMP_METHOD void dump() const
Print the VPInstruction to dbgs() (for debugging).
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the VPInstruction to O.
StringRef getName() const
Returns the symbolic name assigned to the VPInstruction.
Definition VPlan.h:1501
unsigned getOpcode() const
Definition VPlan.h:1405
VPInstruction(unsigned Opcode, ArrayRef< VPValue * > Operands, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
bool isVectorToScalar() const
Returns true if this VPInstruction produces a scalar value from a vector, e.g.
bool isSingleScalar() const
Returns true if this VPInstruction's operands are single scalars and the result is also a single scal...
unsigned getNumOperandsForOpcode() const
Return the number of operands determined by the opcode of the VPInstruction, excluding mask.
bool isMasked() const
Returns true if the VPInstruction has a mask operand.
Definition VPlan.h:1446
void execute(VPTransformState &State) override
Generate the instruction.
bool usesFirstPartOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first part of operand Op.
bool needsMaskForGaps() const
Return true if the access needs a mask because of the gaps.
Definition VPlan.h:2937
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this recipe.
Instruction * getInsertPos() const
Definition VPlan.h:2941
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2939
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2931
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2960
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:2925
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:3034
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:3047
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:2997
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
static VPLane getLastLaneForVF(const ElementCount &VF)
static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset)
static VPLane getFirstLane()
virtual const VPRecipeBase * getAsRecipe() const =0
Return a VPRecipeBase* to the current object.
VPValue * getIncomingValueForBlock(const VPBasicBlock *VPBB) const
Returns the incoming value for VPBB. VPBB must be an incoming block.
virtual unsigned getNumIncoming() const
Returns the number of incoming values, also number of incoming blocks.
Definition VPlan.h:1608
void removeIncomingValueFor(VPBlockBase *IncomingBlock) const
Removes the incoming value for IncomingBlock, which must be a predecessor.
const VPBasicBlock * getIncomingBlock(unsigned Idx) const
Returns the incoming block with index Idx.
Definition VPlan.h:4397
detail::zippy< llvm::detail::zip_first, VPUser::const_operand_range, const_incoming_blocks_range > incoming_values_and_blocks() const
Returns an iterator range over pairs of incoming values and corresponding incoming blocks.
Definition VPlan.h:1633
VPValue * getIncomingValue(unsigned Idx) const
Returns the incoming VPValue with index Idx.
Definition VPlan.h:1593
void printPhiOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the recipe.
void setIncomingValueForBlock(const VPBasicBlock *VPBB, VPValue *V) const
Sets the incoming value for VPBB to V.
void execute(VPTransformState &State) override
Generates phi nodes for live-outs (from a replicate region) as needed to retain SSA form.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:406
bool mayReadFromMemory() const
Returns true if the recipe may read from memory.
bool mayHaveSideEffects() const
Returns true if the recipe may have side-effects.
virtual void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const =0
Each concrete VPRecipe prints itself, without printing common information, like debug info or metadat...
VPRegionBlock * getRegion()
Definition VPlan.h:4558
LLVM_ABI_FOR_TEST void dump() const
Dump the recipe to stderr (for debugging).
Definition VPlan.cpp:116
bool isPhi() const
Returns true for PHI-like recipes.
bool mayWriteToMemory() const
Returns true if the recipe may write to memory.
virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const
Compute the cost of this recipe either using a recipe's specialized implementation or using the legac...
VPBasicBlock * getParent()
Definition VPlan.h:481
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:555
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this recipe, taking into account if the cost computation should be skipped and the...
bool isScalarCast() const
Return true if the recipe is a scalar cast.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const
Print the recipe, delegating to printRecipe().
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
unsigned getVPRecipeID() const
Definition VPlan.h:527
void moveAfter(VPRecipeBase *MovePos)
Unlink this recipe from its current VPBasicBlock and insert it into the VPBasicBlock that MovePos liv...
VPRecipeBase(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:471
friend class VPValue
Definition VPlanValue.h:271
void execute(VPTransformState &State) override
Generate the reduction in the loop.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:3195
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2740
bool isInLoop() const
Returns true if the phi is part of an in-loop reduction.
Definition VPlan.h:2764
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool isConditional() const
Return true if the in-loop reduction is conditional.
Definition VPlan.h:3137
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of VPReductionRecipe.
VPValue * getVecOp() const
The VPValue of the vector value to be reduced.
Definition VPlan.h:3148
VPValue * getCondOp() const
The VPValue of the condition for the block.
Definition VPlan.h:3150
RecurKind getRecurrenceKind() const
Return the recurrence kind for the in-loop reduction.
Definition VPlan.h:3133
bool isPartialReduction() const
Returns true if the reduction outputs a vector with a scaled down VF.
Definition VPlan.h:3139
VPValue * getChainOp() const
The VPValue of the scalar Chain being accumulated.
Definition VPlan.h:3146
bool isInLoop() const
Returns true if the reduction is in-loop.
Definition VPlan.h:3141
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the reduction in the loop.
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4441
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4509
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3217
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool isSingleScalar() const
Definition VPlan.h:3258
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPReplicateRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getOpcode() const
Definition VPlan.h:3287
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
VPValue * getStepValue() const
Definition VPlan.h:4117
VPValue * getStartIndex() const
Return the StartIndex, or null if known to be zero, valid only after unrolling.
Definition VPlan.h:4125
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the scalarized versions of the phi node as needed by their users.
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:607
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:675
LLVM_ABI_FOR_TEST LLVM_DUMP_METHOD void dump() const
Print this VPSingleDefRecipe to dbgs() (for debugging).
VPSingleDefRecipe(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:609
This class can be used to assign names to VPValues.
An analysis for type-inference for VPValues.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
Helper to access the operand that contains the unroll part for this recipe after unrolling.
Definition VPlan.h:1158
VPValue * getUnrollPartOperand(const VPUser &U) const
Return the VPValue operand containing the unroll part or null if there is no such operand.
unsigned getUnrollPart(const VPUser &U) const
Return the unroll part.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:296
void printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the operands to O.
Definition VPlan.cpp:1495
operand_range operands()
Definition VPlanValue.h:364
unsigned getNumOperands() const
Definition VPlanValue.h:334
operand_iterator op_begin()
Definition VPlanValue.h:360
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:335
virtual bool usesFirstLaneOnly(const VPValue *Op) const
Returns true if the VPUser only uses the first lane of operand Op.
Definition VPlanValue.h:379
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:46
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:137
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1446
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:127
void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const
Definition VPlan.cpp:1491
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:70
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:196
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1449
VPValue * getVFValue() const
Definition VPlan.h:2175
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getSourceElementType() const
Definition VPlan.h:2172
int64_t getStride() const
Definition VPlan.h:2173
void materializeOffset(unsigned Part=0)
Adds the offset operand to the recipe.
Type * getSourceElementType() const
Definition VPlan.h:2244
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
operand_range args()
Definition VPlan.h:2032
Function * getCalledScalarFunction() const
Definition VPlan.h:2028
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCallRecipe.
void execute(VPTransformState &State) override
Produce a widened version of the call instruction.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate a canonical vector induction variable of the vector loop, with start = {<Part*VF,...
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getResultType() const
Returns the result type of the cast.
Definition VPlan.h:1881
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Produce widened copies of the cast.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCastRecipe.
void execute(VPTransformState &State) override
Generate the gep nodes.
Type * getSourceElementType() const
Definition VPlan.h:2129
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2400
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2403
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2501
TruncInst * getTruncInst()
Returns the first defined value as TruncInst, if it is one or nullptr otherwise.
Definition VPlan.h:2516
Type * getScalarType() const
Returns the scalar type of the induction.
Definition VPlan.h:2525
bool isCanonical() const
Returns true if the induction is canonical, i.e.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Intrinsic::ID getVectorIntrinsicID() const
Return the ID of the intrinsic.
Definition VPlan.h:1963
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
StringRef getIntrinsicName() const
Return to name of the intrinsic as string.
LLVM_ABI_FOR_TEST bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the VPUser only uses the first lane of operand Op.
Type * getResultType() const
Return the scalar return type of the intrinsic.
Definition VPlan.h:1966
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Produce a widened version of the vector intrinsic.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this vector intrinsic.
bool IsMasked
Whether the memory access is masked.
Definition VPlan.h:3542
bool Reverse
Whether the consecutive accessed addresses are in reverse order.
Definition VPlan.h:3539
bool isConsecutive() const
Return whether the loaded-from / stored-to addresses are consecutive.
Definition VPlan.h:3582
Instruction & Ingredient
Definition VPlan.h:3530
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenMemoryRecipe.
bool Consecutive
Whether the accessed addresses are consecutive.
Definition VPlan.h:3536
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3596
Align Alignment
Alignment information for this memory access.
Definition VPlan.h:3533
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:3589
bool isReverse() const
Return whether the consecutive loaded/stored addresses are in reverse order.
Definition VPlan.h:3586
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenPHIRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool onlyScalarsGenerated(bool IsScalable)
Returns true if only scalar values will be generated.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
void execute(VPTransformState &State) override
Produce a widened instruction using the opcode and operands of the recipe, processing State....
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition VPlan.h:4571
const DataLayout & getDataLayout() const
Definition VPlan.h:4766
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1067
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4868
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:393
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
void mutateType(Type *Ty)
Mutate the type of this Value to be of the specified type.
Definition Value.h:816
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
iterator erase(iterator where)
Definition ilist.h:204
pointer remove(iterator &IT)
Definition ilist.h:188
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor)
Returns the corresponding llvm.vector.deinterleaveN intrinsic for factor N.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI StringRef getBaseName(ID id)
Return the LLVM name for an intrinsic, without encoded types for overloading, such as "llvm....
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
auto m_Cmp()
Matches any compare instruction and ignore it.
bool match(Val *V, const Pattern &P)
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::Or, true > m_c_LogicalOr(const LHS &L, const RHS &R)
Matches L || R with LHS and RHS in either order.
specific_intval< 1 > m_False()
specific_intval< 1 > m_True()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isAddressSCEVForCost(const SCEV *Addr, ScalarEvolution &SE, const Loop *L)
Returns true if Addr is an address SCEV that can be passed to TTI::getAddressComputationCost,...
bool onlyFirstPartUsed(const VPValue *Def)
Returns true if only the first part of Def is used.
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto cast_if_present(const Y &Val)
cast_if_present<X> - Functionally identical to cast, except that a null value is accepted.
Definition Casting.h:683
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
InstructionCost Cost
@ Undef
Value of the register doesn't matter.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2313
auto cast_or_null(const Y &Val)
Definition Casting.h:714
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI Value * createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, Value *Right)
Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI Constant * createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Type * toVectorizedTy(Type *Ty, ElementCount EC)
A helper for converting to vectorized types.
cl::opt< unsigned > ForceTargetInstructionCost
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
LLVM_ABI bool isVectorIntrinsicWithStructReturnOverloadAtField(Intrinsic::ID ID, int RetIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic that returns a struct is overloaded at the struct elem...
@ Other
Any other memory.
Definition ModRef.h:68
bool canVectorizeTy(Type *Ty)
Returns true if Ty is a valid vector element type, void, or an unpacked literal struct where all elem...
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FMinimumNum
FP min with llvm.minimumnum semantics.
@ FMinimum
FP min with llvm.minimum semantics.
@ FMaxNum
FP max with llvm.maxnum semantics including NaNs.
@ Mul
Product of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ FindLast
FindLast reduction with select(cmp(),x,y) where x and y.
@ FMaximum
FP max with llvm.maximum semantics.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMinNum
FP min with llvm.minnum semantics including NaNs.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
@ FMaximumNum
FP max with llvm.maximumnum semantics.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about an recurrence kind, return the identity for the @llvm.vector....
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
Value * emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, InductionDescriptor::InductionKind InductionKind, const BinaryOperator *InductionBinOp)
Compute the transformed value of Index at offset StartValue using step StepValue.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
LLVM_ABI Value * createOrderedReduction(IRBuilderBase &B, RecurKind RdxKind, Value *Src, Value *Start)
Create an ordered reduction intrinsic using the given recurrence kind RdxKind.
ArrayRef< Type * > getContainedTypes(Type *const &Ty)
Returns the types contained in Ty.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Struct to hold various analysis needed for cost computations.
LLVMContext & LLVMCtx
TargetTransformInfo::TargetCostKind CostKind
VPTypeAnalysis Types
const TargetTransformInfo & TTI
void execute(VPTransformState &State) override
Generate the phi nodes.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this first-order recurrence phi recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
An overlay for VPIRInstructions wrapping PHI nodes enabling convenient use cast/dyn_cast/isa and exec...
Definition VPlan.h:1752
PHINode & getIRPhi()
Definition VPlan.h:1765
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void execute(VPTransformState &State) override
Generate the instruction.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
A pure-virtual common base class for recipes defining a single VPValue and using IR flags.
Definition VPlan.h:1112
InstructionCost getCostForRecipeWithOpcode(unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const
Compute the cost for this recipe for VF, using Opcode and Ctx.
VPRecipeWithIRFlags(const unsigned char SC, ArrayRef< VPValue * > Operands, const VPIRFlags &Flags, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:1113
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:247
SmallDenseMap< const VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
struct llvm::VPTransformState::CFGState CFG
Value * get(const VPValue *Def, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def if IsScalar is false, otherwise return the gen...
Definition VPlan.cpp:279
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
ElementCount VF
The chosen Vectorization Factor of the loop being vectorized.
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Generate the wide load or gather.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenLoadEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3674
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate a wide load or gather.
VPValue * getStoredValue() const
Return the address accessed by this recipe.
Definition VPlan.h:3758
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Generate the wide store or scatter.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenStoreEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3761
void execute(VPTransformState &State) override
Generate a wide store or scatter.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition VPlan.h:3721