LLVM 23.0.0git
VPlanRecipes.cpp
Go to the documentation of this file.
1//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains implementations for different VPlan recipes.
11///
12//===----------------------------------------------------------------------===//
13
15#include "VPlan.h"
16#include "VPlanAnalysis.h"
17#include "VPlanHelpers.h"
18#include "VPlanPatternMatch.h"
19#include "VPlanUtils.h"
20#include "llvm/ADT/STLExtras.h"
23#include "llvm/ADT/Twine.h"
28#include "llvm/IR/BasicBlock.h"
29#include "llvm/IR/IRBuilder.h"
30#include "llvm/IR/Instruction.h"
32#include "llvm/IR/Intrinsics.h"
33#include "llvm/IR/Type.h"
34#include "llvm/IR/Value.h"
37#include "llvm/Support/Debug.h"
41#include <cassert>
42
43using namespace llvm;
44using namespace llvm::VPlanPatternMatch;
45
47
48#define LV_NAME "loop-vectorize"
49#define DEBUG_TYPE LV_NAME
50
52 switch (getVPRecipeID()) {
53 case VPExpressionSC:
54 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
55 case VPInstructionSC: {
56 auto *VPI = cast<VPInstruction>(this);
57 // Loads read from memory but don't write to memory.
58 if (VPI->getOpcode() == Instruction::Load)
59 return false;
60 return VPI->opcodeMayReadOrWriteFromMemory();
61 }
62 case VPInterleaveEVLSC:
63 case VPInterleaveSC:
64 return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
65 case VPWidenStoreEVLSC:
66 case VPWidenStoreSC:
67 return true;
68 case VPReplicateSC:
69 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
70 ->mayWriteToMemory();
71 case VPWidenCallSC:
72 return !cast<VPWidenCallRecipe>(this)
73 ->getCalledScalarFunction()
74 ->onlyReadsMemory();
75 case VPWidenIntrinsicSC:
76 return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
77 case VPActiveLaneMaskPHISC:
78 case VPCurrentIterationPHISC:
79 case VPBranchOnMaskSC:
80 case VPDerivedIVSC:
81 case VPFirstOrderRecurrencePHISC:
82 case VPReductionPHISC:
83 case VPScalarIVStepsSC:
84 case VPPredInstPHISC:
85 return false;
86 case VPBlendSC:
87 case VPReductionEVLSC:
88 case VPReductionSC:
89 case VPVectorPointerSC:
90 case VPWidenCanonicalIVSC:
91 case VPWidenCastSC:
92 case VPWidenGEPSC:
93 case VPWidenIntOrFpInductionSC:
94 case VPWidenLoadEVLSC:
95 case VPWidenLoadSC:
96 case VPWidenPHISC:
97 case VPWidenPointerInductionSC:
98 case VPWidenSC: {
99 const Instruction *I =
100 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
101 (void)I;
102 assert((!I || !I->mayWriteToMemory()) &&
103 "underlying instruction may write to memory");
104 return false;
105 }
106 default:
107 return true;
108 }
109}
110
112 switch (getVPRecipeID()) {
113 case VPExpressionSC:
114 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
115 case VPInstructionSC:
116 return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
117 case VPWidenLoadEVLSC:
118 case VPWidenLoadSC:
119 return true;
120 case VPReplicateSC:
121 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
122 ->mayReadFromMemory();
123 case VPWidenCallSC:
124 return !cast<VPWidenCallRecipe>(this)
125 ->getCalledScalarFunction()
126 ->onlyWritesMemory();
127 case VPWidenIntrinsicSC:
128 return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
129 case VPBranchOnMaskSC:
130 case VPDerivedIVSC:
131 case VPCurrentIterationPHISC:
132 case VPFirstOrderRecurrencePHISC:
133 case VPReductionPHISC:
134 case VPPredInstPHISC:
135 case VPScalarIVStepsSC:
136 case VPWidenStoreEVLSC:
137 case VPWidenStoreSC:
138 return false;
139 case VPBlendSC:
140 case VPReductionEVLSC:
141 case VPReductionSC:
142 case VPVectorPointerSC:
143 case VPWidenCanonicalIVSC:
144 case VPWidenCastSC:
145 case VPWidenGEPSC:
146 case VPWidenIntOrFpInductionSC:
147 case VPWidenPHISC:
148 case VPWidenPointerInductionSC:
149 case VPWidenSC: {
150 const Instruction *I =
151 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
152 (void)I;
153 assert((!I || !I->mayReadFromMemory()) &&
154 "underlying instruction may read from memory");
155 return false;
156 }
157 default:
158 // FIXME: Return false if the recipe represents an interleaved store.
159 return true;
160 }
161}
162
164 switch (getVPRecipeID()) {
165 case VPExpressionSC:
166 return cast<VPExpressionRecipe>(this)->mayHaveSideEffects();
167 case VPActiveLaneMaskPHISC:
168 case VPDerivedIVSC:
169 case VPCurrentIterationPHISC:
170 case VPFirstOrderRecurrencePHISC:
171 case VPReductionPHISC:
172 case VPPredInstPHISC:
173 case VPVectorEndPointerSC:
174 return false;
175 case VPInstructionSC: {
176 auto *VPI = cast<VPInstruction>(this);
177 return mayWriteToMemory() ||
178 VPI->getOpcode() == VPInstruction::BranchOnCount ||
179 VPI->getOpcode() == VPInstruction::BranchOnCond ||
180 VPI->getOpcode() == VPInstruction::BranchOnTwoConds;
181 }
182 case VPWidenCallSC: {
183 Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
184 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
185 }
186 case VPWidenIntrinsicSC:
187 return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
188 case VPBlendSC:
189 case VPReductionEVLSC:
190 case VPReductionSC:
191 case VPScalarIVStepsSC:
192 case VPVectorPointerSC:
193 case VPWidenCanonicalIVSC:
194 case VPWidenCastSC:
195 case VPWidenGEPSC:
196 case VPWidenIntOrFpInductionSC:
197 case VPWidenPHISC:
198 case VPWidenPointerInductionSC:
199 case VPWidenSC: {
200 const Instruction *I =
201 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
202 (void)I;
203 assert((!I || !I->mayHaveSideEffects()) &&
204 "underlying instruction has side-effects");
205 return false;
206 }
207 case VPInterleaveEVLSC:
208 case VPInterleaveSC:
209 return mayWriteToMemory();
210 case VPWidenLoadEVLSC:
211 case VPWidenLoadSC:
212 case VPWidenStoreEVLSC:
213 case VPWidenStoreSC:
214 assert(
215 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
217 "mayHaveSideffects result for ingredient differs from this "
218 "implementation");
219 return mayWriteToMemory();
220 case VPReplicateSC: {
221 auto *R = cast<VPReplicateRecipe>(this);
222 return R->getUnderlyingInstr()->mayHaveSideEffects();
223 }
224 default:
225 return true;
226 }
227}
228
230 switch (getVPRecipeID()) {
231 default:
232 return false;
233 case VPInstructionSC: {
234 unsigned Opcode = cast<VPInstruction>(this)->getOpcode();
235 if (Instruction::isCast(Opcode))
236 return true;
237
238 switch (Opcode) {
239 default:
240 return false;
241 case Instruction::Add:
242 case Instruction::Sub:
243 case Instruction::Mul:
244 case Instruction::GetElementPtr:
245 return true;
246 }
247 }
248 }
249}
250
252 assert(!Parent && "Recipe already in some VPBasicBlock");
253 assert(InsertPos->getParent() &&
254 "Insertion position not in any VPBasicBlock");
255 InsertPos->getParent()->insert(this, InsertPos->getIterator());
256}
257
258void VPRecipeBase::insertBefore(VPBasicBlock &BB,
260 assert(!Parent && "Recipe already in some VPBasicBlock");
261 assert(I == BB.end() || I->getParent() == &BB);
262 BB.insert(this, I);
263}
264
266 assert(!Parent && "Recipe already in some VPBasicBlock");
267 assert(InsertPos->getParent() &&
268 "Insertion position not in any VPBasicBlock");
269 InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
270}
271
273 assert(getParent() && "Recipe not in any VPBasicBlock");
275 Parent = nullptr;
276}
277
279 assert(getParent() && "Recipe not in any VPBasicBlock");
281}
282
285 insertAfter(InsertPos);
286}
287
293
295 // Get the underlying instruction for the recipe, if there is one. It is used
296 // to
297 // * decide if cost computation should be skipped for this recipe,
298 // * apply forced target instruction cost.
299 Instruction *UI = nullptr;
300 if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
301 UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
302 else if (auto *IG = dyn_cast<VPInterleaveBase>(this))
303 UI = IG->getInsertPos();
304 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
305 UI = &WidenMem->getIngredient();
306
307 InstructionCost RecipeCost;
308 if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
309 RecipeCost = 0;
310 } else {
311 RecipeCost = computeCost(VF, Ctx);
312 if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
313 RecipeCost.isValid()) {
314 if (UI)
316 else
317 RecipeCost = InstructionCost(0);
318 }
319 }
320
321 LLVM_DEBUG({
322 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
323 dump();
324 });
325 return RecipeCost;
326}
327
329 VPCostContext &Ctx) const {
330 llvm_unreachable("subclasses should implement computeCost");
331}
332
334 return (getVPRecipeID() >= VPFirstPHISC && getVPRecipeID() <= VPLastPHISC) ||
336}
337
339 auto *VPI = dyn_cast<VPInstruction>(this);
340 return VPI && Instruction::isCast(VPI->getOpcode());
341}
342
344 assert(OpType == Other.OpType && "OpType must match");
345 switch (OpType) {
346 case OperationType::OverflowingBinOp:
347 WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
348 WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
349 break;
350 case OperationType::Trunc:
351 TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
352 TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
353 break;
354 case OperationType::DisjointOp:
355 DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
356 break;
357 case OperationType::PossiblyExactOp:
358 ExactFlags.IsExact &= Other.ExactFlags.IsExact;
359 break;
360 case OperationType::GEPOp:
361 GEPFlagsStorage &= Other.GEPFlagsStorage;
362 break;
363 case OperationType::FPMathOp:
364 case OperationType::FCmp:
365 assert((OpType != OperationType::FCmp ||
366 FCmpFlags.CmpPredStorage == Other.FCmpFlags.CmpPredStorage) &&
367 "Cannot drop CmpPredicate");
368 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
369 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
370 break;
371 case OperationType::NonNegOp:
372 NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
373 break;
374 case OperationType::Cmp:
375 assert(CmpPredStorage == Other.CmpPredStorage &&
376 "Cannot drop CmpPredicate");
377 break;
378 case OperationType::ReductionOp:
379 assert(ReductionFlags.Kind == Other.ReductionFlags.Kind &&
380 "Cannot change RecurKind");
381 assert(ReductionFlags.IsOrdered == Other.ReductionFlags.IsOrdered &&
382 "Cannot change IsOrdered");
383 assert(ReductionFlags.IsInLoop == Other.ReductionFlags.IsInLoop &&
384 "Cannot change IsInLoop");
385 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
386 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
387 break;
388 case OperationType::Other:
389 break;
390 }
391}
392
394 assert((OpType == OperationType::FPMathOp || OpType == OperationType::FCmp ||
395 OpType == OperationType::ReductionOp ||
396 OpType == OperationType::Other) &&
397 "recipe doesn't have fast math flags");
398 if (OpType == OperationType::Other)
399 return FastMathFlags();
400 const FastMathFlagsTy &F = getFMFsRef();
401 FastMathFlags Res;
402 Res.setAllowReassoc(F.AllowReassoc);
403 Res.setNoNaNs(F.NoNaNs);
404 Res.setNoInfs(F.NoInfs);
405 Res.setNoSignedZeros(F.NoSignedZeros);
406 Res.setAllowReciprocal(F.AllowReciprocal);
407 Res.setAllowContract(F.AllowContract);
408 Res.setApproxFunc(F.ApproxFunc);
409 return Res;
410}
411
412#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
414
415void VPRecipeBase::print(raw_ostream &O, const Twine &Indent,
416 VPSlotTracker &SlotTracker) const {
417 printRecipe(O, Indent, SlotTracker);
418 if (auto DL = getDebugLoc()) {
419 O << ", !dbg ";
420 DL.print(O);
421 }
422
423 if (auto *Metadata = dyn_cast<VPIRMetadata>(this))
425}
426#endif
427
428template <unsigned PartOpIdx>
429VPValue *
431 if (U.getNumOperands() == PartOpIdx + 1)
432 return U.getOperand(PartOpIdx);
433 return nullptr;
434}
435
436template <unsigned PartOpIdx>
438 if (auto *UnrollPartOp = getUnrollPartOperand(U))
439 return cast<VPConstantInt>(UnrollPartOp)->getZExtValue();
440 return 0;
441}
442
443namespace llvm {
444template class VPUnrollPartAccessor<1>;
445template class VPUnrollPartAccessor<2>;
446template class VPUnrollPartAccessor<3>;
447}
448
450 const VPIRFlags &Flags, const VPIRMetadata &MD,
451 DebugLoc DL, const Twine &Name)
452 : VPRecipeWithIRFlags(VPRecipeBase::VPInstructionSC, Operands, Flags, DL),
453 VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) {
455 "Set flags not supported for the provided opcode");
457 "Opcode requires specific flags to be set");
461 "number of operands does not match opcode");
462}
463
464/// For call VPInstructions, return the operand index of the called function.
465/// The function is either the last operand (for unmasked calls) or the
466/// second-to-last operand (for masked calls).
467static unsigned getCalledFnOperandIndex(const VPInstruction &VPI) {
468 assert(VPI.getOpcode() == Instruction::Call && "must be a call");
469 unsigned NumOps = VPI.getNumOperands();
470 auto *LastOp = dyn_cast<VPIRValue>(VPI.getOperand(NumOps - 1));
471 if (LastOp && isa<Function>(LastOp->getValue()))
472 return NumOps - 1;
473 assert(
474 isa<Function>(cast<VPIRValue>(VPI.getOperand(NumOps - 2))->getValue()) &&
475 "expected function operand");
476 return NumOps - 2;
477}
478
479/// For call VPInstructions, return the called function.
481 unsigned Idx = getCalledFnOperandIndex(VPI);
482 return cast<Function>(cast<VPIRValue>(VPI.getOperand(Idx))->getValue());
483}
484
486 if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
487 return 1;
488
489 if (Instruction::isBinaryOp(Opcode))
490 return 2;
491
492 switch (Opcode) {
495 return 0;
496 case Instruction::Alloca:
497 case Instruction::ExtractValue:
498 case Instruction::Freeze:
499 case Instruction::Load:
512 return 1;
513 case Instruction::ICmp:
514 case Instruction::FCmp:
515 case Instruction::ExtractElement:
516 case Instruction::Store:
526 return 2;
527 case Instruction::InsertElement:
528 case Instruction::Select:
531 return 3;
532 case Instruction::Call:
533 return getCalledFnOperandIndex(*this) + 1;
534 case Instruction::GetElementPtr:
535 case Instruction::PHI:
536 case Instruction::Switch:
546 // Cannot determine the number of operands from the opcode.
547 return -1u;
548 }
549 llvm_unreachable("all cases should be handled above");
550}
551
555
556bool VPInstruction::canGenerateScalarForFirstLane() const {
558 return true;
560 return true;
561 switch (Opcode) {
562 case Instruction::Freeze:
563 case Instruction::ICmp:
564 case Instruction::PHI:
565 case Instruction::Select:
575 return true;
576 default:
577 return false;
578 }
579}
580
581Value *VPInstruction::generate(VPTransformState &State) {
582 IRBuilderBase &Builder = State.Builder;
583
585 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
586 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
587 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
588 auto *Res =
589 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
590 if (auto *I = dyn_cast<Instruction>(Res))
591 applyFlags(*I);
592 return Res;
593 }
594
595 switch (getOpcode()) {
596 case VPInstruction::Not: {
597 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
598 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
599 return Builder.CreateNot(A, Name);
600 }
601 case Instruction::ExtractElement: {
602 assert(State.VF.isVector() && "Only extract elements from vectors");
603 if (auto *Idx = dyn_cast<VPConstantInt>(getOperand(1)))
604 return State.get(getOperand(0), VPLane(Idx->getZExtValue()));
605 Value *Vec = State.get(getOperand(0));
606 Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
607 return Builder.CreateExtractElement(Vec, Idx, Name);
608 }
609 case Instruction::InsertElement: {
610 assert(State.VF.isVector() && "Can only insert elements into vectors");
611 Value *Vec = State.get(getOperand(0), /*IsScalar=*/false);
612 Value *Elt = State.get(getOperand(1), /*IsScalar=*/true);
613 Value *Idx = State.get(getOperand(2), /*IsScalar=*/true);
614 return Builder.CreateInsertElement(Vec, Elt, Idx, Name);
615 }
616 case Instruction::Freeze: {
618 return Builder.CreateFreeze(Op, Name);
619 }
620 case Instruction::FCmp:
621 case Instruction::ICmp: {
622 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
623 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
624 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
625 return Builder.CreateCmp(getPredicate(), A, B, Name);
626 }
627 case Instruction::PHI: {
628 llvm_unreachable("should be handled by VPPhi::execute");
629 }
630 case Instruction::Select: {
631 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
632 Value *Cond =
633 State.get(getOperand(0),
634 OnlyFirstLaneUsed || vputils::isSingleScalar(getOperand(0)));
635 Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
636 Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
637 return Builder.CreateSelectFMF(Cond, Op1, Op2, getFastMathFlags(), Name);
638 }
640 // Get first lane of vector induction variable.
641 Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
642 // Get the original loop tripcount.
643 Value *ScalarTC = State.get(getOperand(1), VPLane(0));
644
645 // If this part of the active lane mask is scalar, generate the CMP directly
646 // to avoid unnecessary extracts.
647 if (State.VF.isScalar())
648 return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
649 Name);
650
651 ElementCount EC = State.VF.multiplyCoefficientBy(
652 cast<VPConstantInt>(getOperand(2))->getZExtValue());
653 auto *PredTy = VectorType::get(Builder.getInt1Ty(), EC);
654 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
655 {PredTy, ScalarTC->getType()},
656 {VIVElem0, ScalarTC}, nullptr, Name);
657 }
659 // Generate code to combine the previous and current values in vector v3.
660 //
661 // vector.ph:
662 // v_init = vector(..., ..., ..., a[-1])
663 // br vector.body
664 //
665 // vector.body
666 // i = phi [0, vector.ph], [i+4, vector.body]
667 // v1 = phi [v_init, vector.ph], [v2, vector.body]
668 // v2 = a[i, i+1, i+2, i+3];
669 // v3 = vector(v1(3), v2(0, 1, 2))
670
671 auto *V1 = State.get(getOperand(0));
672 if (!V1->getType()->isVectorTy())
673 return V1;
674 Value *V2 = State.get(getOperand(1));
675 return Builder.CreateVectorSpliceRight(V1, V2, 1, Name);
676 }
678 Value *ScalarTC = State.get(getOperand(0), VPLane(0));
679 Value *VFxUF = State.get(getOperand(1), VPLane(0));
680 Value *Sub = Builder.CreateSub(ScalarTC, VFxUF);
681 Value *Cmp =
682 Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, VFxUF);
684 return Builder.CreateSelect(Cmp, Sub, Zero);
685 }
687 // TODO: Restructure this code with an explicit remainder loop, vsetvli can
688 // be outside of the main loop.
689 Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
690 // Compute EVL
691 assert(AVL->getType()->isIntegerTy() &&
692 "Requested vector length should be an integer.");
693
694 assert(State.VF.isScalable() && "Expected scalable vector factor.");
695 Value *VFArg = Builder.getInt32(State.VF.getKnownMinValue());
696
697 Value *EVL = Builder.CreateIntrinsic(
698 Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
699 {AVL, VFArg, Builder.getTrue()});
700 return EVL;
701 }
703 Value *Cond = State.get(getOperand(0), VPLane(0));
704 // Replace the temporary unreachable terminator with a new conditional
705 // branch, hooking it up to backward destination for latch blocks now, and
706 // to forward destination(s) later when they are created.
707 // Second successor may be backwards - iff it is already in VPBB2IRBB.
708 VPBasicBlock *SecondVPSucc =
709 cast<VPBasicBlock>(getParent()->getSuccessors()[1]);
710 BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc);
711 BasicBlock *IRBB = State.CFG.VPBB2IRBB[getParent()];
712 auto *Br = Builder.CreateCondBr(Cond, IRBB, SecondIRSucc);
713 // First successor is always forward, reset it to nullptr.
714 Br->setSuccessor(0, nullptr);
716 applyMetadata(*Br);
717 return Br;
718 }
720 return Builder.CreateVectorSplat(
721 State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
722 }
724 // For struct types, we need to build a new 'wide' struct type, where each
725 // element is widened, i.e., we create a struct of vectors.
726 auto *StructTy =
728 Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF));
729 for (const auto &[LaneIndex, Op] : enumerate(operands())) {
730 for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
731 FieldIndex++) {
732 Value *ScalarValue =
733 Builder.CreateExtractValue(State.get(Op, true), FieldIndex);
734 Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex);
735 VectorValue =
736 Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex);
737 Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex);
738 }
739 }
740 return Res;
741 }
743 auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0));
744 auto NumOfElements = ElementCount::getFixed(getNumOperands());
745 Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements));
746 for (const auto &[Idx, Op] : enumerate(operands()))
747 Res = Builder.CreateInsertElement(Res, State.get(Op, true),
748 Builder.getInt32(Idx));
749 return Res;
750 }
752 if (State.VF.isScalar())
753 return State.get(getOperand(0), true);
754 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
756 // If this start vector is scaled then it should produce a vector with fewer
757 // elements than the VF.
758 ElementCount VF = State.VF.divideCoefficientBy(
759 cast<VPConstantInt>(getOperand(2))->getZExtValue());
760 auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true));
761 return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true),
762 Builder.getInt32(0));
763 }
765 RecurKind RK = getRecurKind();
766 bool IsOrdered = isReductionOrdered();
767 bool IsInLoop = isReductionInLoop();
769 "FindIV should use min/max reduction kinds");
770
771 // The recipe may have multiple operands to be reduced together.
772 unsigned NumOperandsToReduce = getNumOperands();
773 VectorParts RdxParts(NumOperandsToReduce);
774 for (unsigned Part = 0; Part < NumOperandsToReduce; ++Part)
775 RdxParts[Part] = State.get(getOperand(Part), IsInLoop);
776
777 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
779
780 // Reduce multiple operands into one.
781 Value *ReducedPartRdx = RdxParts[0];
782 if (IsOrdered) {
783 ReducedPartRdx = RdxParts[NumOperandsToReduce - 1];
784 } else {
785 // Floating-point operations should have some FMF to enable the reduction.
786 for (unsigned Part = 1; Part < NumOperandsToReduce; ++Part) {
787 Value *RdxPart = RdxParts[Part];
789 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
790 else {
791 // For sub-recurrences, each part's reduction variable is already
792 // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1)
794 RK == RecurKind::Sub
795 ? Instruction::Add
797 ReducedPartRdx =
798 Builder.CreateBinOp(Opcode, RdxPart, ReducedPartRdx, "bin.rdx");
799 }
800 }
801 }
802
803 // Create the reduction after the loop. Note that inloop reductions create
804 // the target reduction in the loop using a Reduction recipe.
805 if (State.VF.isVector() && !IsInLoop) {
806 // TODO: Support in-order reductions based on the recurrence descriptor.
807 // All ops in the reduction inherit fast-math-flags from the recurrence
808 // descriptor.
809 ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK);
810 }
811
812 return ReducedPartRdx;
813 }
816 unsigned Offset =
818 Value *Res;
819 if (State.VF.isVector()) {
820 assert(Offset <= State.VF.getKnownMinValue() &&
821 "invalid offset to extract from");
822 // Extract lane VF - Offset from the operand.
823 Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
824 } else {
825 // TODO: Remove ExtractLastLane for scalar VFs.
826 assert(Offset <= 1 && "invalid offset to extract from");
827 Res = State.get(getOperand(0));
828 }
830 Res->setName(Name);
831 return Res;
832 }
834 Value *A = State.get(getOperand(0));
835 Value *B = State.get(getOperand(1));
836 return Builder.CreateLogicalAnd(A, B, Name);
837 }
839 Value *A = State.get(getOperand(0));
840 Value *B = State.get(getOperand(1));
841 return Builder.CreateLogicalOr(A, B, Name);
842 }
844 assert((State.VF.isScalar() || vputils::onlyFirstLaneUsed(this)) &&
845 "can only generate first lane for PtrAdd");
846 Value *Ptr = State.get(getOperand(0), VPLane(0));
847 Value *Addend = State.get(getOperand(1), VPLane(0));
848 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
849 }
851 Value *Ptr =
853 Value *Addend = State.get(getOperand(1));
854 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
855 }
857 Value *Res = Builder.CreateFreeze(State.get(getOperand(0)));
858 for (VPValue *Op : drop_begin(operands()))
859 Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op)));
860 return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
861 }
863 assert(getNumOperands() != 2 && "ExtractLane from single source should be "
864 "simplified to ExtractElement.");
865 Value *LaneToExtract = State.get(getOperand(0), true);
866 Type *IdxTy = State.TypeAnalysis.inferScalarType(getOperand(0));
867 Value *Res = nullptr;
868 Value *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
869
870 for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
871 Value *VectorStart =
872 Builder.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Idx - 1));
873 Value *VectorIdx = Idx == 1
874 ? LaneToExtract
875 : Builder.CreateSub(LaneToExtract, VectorStart);
876 Value *Ext = State.VF.isScalar()
877 ? State.get(getOperand(Idx))
878 : Builder.CreateExtractElement(
879 State.get(getOperand(Idx)), VectorIdx);
880 if (Res) {
881 Value *Cmp = Builder.CreateICmpUGE(LaneToExtract, VectorStart);
882 Res = Builder.CreateSelect(Cmp, Ext, Res);
883 } else {
884 Res = Ext;
885 }
886 }
887 return Res;
888 }
890 Type *Ty = State.TypeAnalysis.inferScalarType(this);
891 if (getNumOperands() == 1) {
892 Value *Mask = State.get(getOperand(0));
893 return Builder.CreateCountTrailingZeroElems(Ty, Mask,
894 /*ZeroIsPoison=*/false, Name);
895 }
896 // If there are multiple operands, create a chain of selects to pick the
897 // first operand with an active lane and add the number of lanes of the
898 // preceding operands.
899 Value *RuntimeVF = getRuntimeVF(Builder, Ty, State.VF);
900 unsigned LastOpIdx = getNumOperands() - 1;
901 Value *Res = nullptr;
902 for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
903 Value *TrailingZeros =
904 State.VF.isScalar()
905 ? Builder.CreateZExt(
906 Builder.CreateICmpEQ(State.get(getOperand(Idx)),
907 Builder.getFalse()),
908 Ty)
910 Ty, State.get(getOperand(Idx)),
911 /*ZeroIsPoison=*/false, Name);
912 Value *Current = Builder.CreateAdd(
913 Builder.CreateMul(RuntimeVF, ConstantInt::get(Ty, Idx)),
914 TrailingZeros);
915 if (Res) {
916 Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF);
917 Res = Builder.CreateSelect(Cmp, Current, Res);
918 } else {
919 Res = Current;
920 }
921 }
922
923 return Res;
924 }
926 return State.get(getOperand(0), true);
928 return Builder.CreateVectorReverse(State.get(getOperand(0)), "reverse");
930 Value *Result = State.get(getOperand(0), /*IsScalar=*/true);
931 for (unsigned Idx = 1; Idx < getNumOperands(); Idx += 2) {
932 Value *Data = State.get(getOperand(Idx));
933 Value *Mask = State.get(getOperand(Idx + 1));
934 Type *VTy = Data->getType();
935
936 if (State.VF.isScalar())
937 Result = Builder.CreateSelect(Mask, Data, Result);
938 else
939 Result = Builder.CreateIntrinsic(
940 Intrinsic::experimental_vector_extract_last_active, {VTy},
941 {Data, Mask, Result});
942 }
943
944 return Result;
945 }
946 default:
947 llvm_unreachable("Unsupported opcode for instruction");
948 }
949}
950
952 unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
953 Type *ScalarTy = Ctx.Types.inferScalarType(this);
954 Type *ResultTy = VF.isVector() ? toVectorTy(ScalarTy, VF) : ScalarTy;
955 switch (Opcode) {
956 case Instruction::FNeg:
957 return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, Ctx.CostKind);
958 case Instruction::UDiv:
959 case Instruction::SDiv:
960 case Instruction::SRem:
961 case Instruction::URem:
962 case Instruction::Add:
963 case Instruction::FAdd:
964 case Instruction::Sub:
965 case Instruction::FSub:
966 case Instruction::Mul:
967 case Instruction::FMul:
968 case Instruction::FDiv:
969 case Instruction::FRem:
970 case Instruction::Shl:
971 case Instruction::LShr:
972 case Instruction::AShr:
973 case Instruction::And:
974 case Instruction::Or:
975 case Instruction::Xor: {
976 // Certain instructions can be cheaper if they have a constant second
977 // operand. One example of this are shifts on x86.
978 VPValue *RHS = getOperand(1);
979 TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(RHS);
980
981 if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
984
987 if (CtxI)
988 Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
989 return Ctx.TTI.getArithmeticInstrCost(
990 Opcode, ResultTy, Ctx.CostKind,
991 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
992 RHSInfo, Operands, CtxI, &Ctx.TLI);
993 }
994 case Instruction::Freeze:
995 // This opcode is unknown. Assume that it is the same as 'mul'.
996 return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, ResultTy,
997 Ctx.CostKind);
998 case Instruction::ExtractValue:
999 return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
1000 Ctx.CostKind);
1001 case Instruction::ICmp:
1002 case Instruction::FCmp: {
1003 Type *ScalarOpTy = Ctx.Types.inferScalarType(getOperand(0));
1004 Type *OpTy = VF.isVector() ? toVectorTy(ScalarOpTy, VF) : ScalarOpTy;
1006 return Ctx.TTI.getCmpSelInstrCost(
1007 Opcode, OpTy, CmpInst::makeCmpResultType(OpTy), getPredicate(),
1008 Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None},
1009 {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
1010 }
1011 case Instruction::BitCast: {
1012 Type *ScalarTy = Ctx.Types.inferScalarType(this);
1013 if (ScalarTy->isPointerTy())
1014 return 0;
1015 [[fallthrough]];
1016 }
1017 case Instruction::SExt:
1018 case Instruction::ZExt:
1019 case Instruction::FPToUI:
1020 case Instruction::FPToSI:
1021 case Instruction::FPExt:
1022 case Instruction::PtrToInt:
1023 case Instruction::PtrToAddr:
1024 case Instruction::IntToPtr:
1025 case Instruction::SIToFP:
1026 case Instruction::UIToFP:
1027 case Instruction::Trunc:
1028 case Instruction::FPTrunc:
1029 case Instruction::AddrSpaceCast: {
1030 // Computes the CastContextHint from a recipe that may access memory.
1031 auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
1032 if (isa<VPInterleaveBase>(R))
1034 if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R)) {
1035 // Only compute CCH for memory operations, matching the legacy model
1036 // which only considers loads/stores for cast context hints.
1037 auto *UI = cast<Instruction>(ReplicateRecipe->getUnderlyingValue());
1038 if (!isa<LoadInst, StoreInst>(UI))
1040 return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
1042 }
1043 const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
1044 if (WidenMemoryRecipe == nullptr)
1046 if (VF.isScalar())
1048 if (!WidenMemoryRecipe->isConsecutive())
1050 if (WidenMemoryRecipe->isMasked())
1053 };
1054
1055 VPValue *Operand = getOperand(0);
1057 bool IsReverse = false;
1058 // For Trunc/FPTrunc, get the context from the only user.
1059 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
1060 auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * {
1061 if (R->getNumUsers() == 0 || R->hasMoreThanOneUniqueUser())
1062 return nullptr;
1063 return dyn_cast<VPRecipeBase>(*R->user_begin());
1064 };
1065 if (VPRecipeBase *Recipe = GetOnlyUser(this)) {
1066 if (match(Recipe,
1070 Recipe = GetOnlyUser(cast<VPSingleDefRecipe>(Recipe));
1071 IsReverse = true;
1072 }
1073 if (Recipe)
1074 CCH = ComputeCCH(Recipe);
1075 }
1076 }
1077 // For Z/Sext, get the context from the operand.
1078 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
1079 Opcode == Instruction::FPExt) {
1080 if (auto *Recipe = Operand->getDefiningRecipe()) {
1081 VPValue *ReverseOp;
1082 if (match(Recipe,
1083 m_CombineOr(m_Reverse(m_VPValue(ReverseOp)),
1085 m_VPValue(ReverseOp))))) {
1086 Recipe = ReverseOp->getDefiningRecipe();
1087 IsReverse = true;
1088 }
1089 if (Recipe)
1090 CCH = ComputeCCH(Recipe);
1091 }
1092 }
1093 if (IsReverse && CCH != TTI::CastContextHint::None)
1095
1096 auto *ScalarSrcTy = Ctx.Types.inferScalarType(Operand);
1097 Type *SrcTy = VF.isVector() ? toVectorTy(ScalarSrcTy, VF) : ScalarSrcTy;
1098 // Arm TTI will use the underlying instruction to determine the cost.
1099 return Ctx.TTI.getCastInstrCost(
1100 Opcode, ResultTy, SrcTy, CCH, Ctx.CostKind,
1102 }
1103 case Instruction::Select: {
1105 bool IsScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
1106 Type *ScalarTy = Ctx.Types.inferScalarType(this);
1107
1108 VPValue *Op0, *Op1;
1109 bool IsLogicalAnd =
1110 match(this, m_c_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1)));
1111 bool IsLogicalOr =
1112 match(this, m_c_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
1113 // Also match the inverted forms:
1114 // select x, false, y --> !x & y (still AND)
1115 // select x, y, true --> !x | y (still OR)
1116 IsLogicalAnd |=
1117 match(this, m_Select(m_VPValue(Op0), m_False(), m_VPValue(Op1)));
1118 IsLogicalOr |=
1119 match(this, m_Select(m_VPValue(Op0), m_VPValue(Op1), m_True()));
1120
1121 if (!IsScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
1122 (IsLogicalAnd || IsLogicalOr)) {
1123 // select x, y, false --> x & y
1124 // select x, true, y --> x | y
1125 const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0);
1126 const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1);
1127
1129 if (SI && all_of(operands(),
1130 [](VPValue *Op) { return Op->getUnderlyingValue(); }))
1131 append_range(Operands, SI->operands());
1132 return Ctx.TTI.getArithmeticInstrCost(
1133 IsLogicalOr ? Instruction::Or : Instruction::And, ResultTy,
1134 Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
1135 }
1136
1137 Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
1138 if (!IsScalarCond && VF.isVector())
1139 CondTy = VectorType::get(CondTy, VF);
1140
1141 llvm::CmpPredicate Pred;
1142 if (!match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue())))
1143 if (auto *CondIRV = dyn_cast<VPIRValue>(getOperand(0)))
1144 if (auto *Cmp = dyn_cast<CmpInst>(CondIRV->getValue()))
1145 Pred = Cmp->getPredicate();
1146 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1147 return Ctx.TTI.getCmpSelInstrCost(
1148 Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
1149 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
1150 }
1151 }
1152 llvm_unreachable("called for unsupported opcode");
1153}
1154
1156 VPCostContext &Ctx) const {
1158 if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) {
1159 // TODO: Compute cost for VPInstructions without underlying values once
1160 // the legacy cost model has been retired.
1161 return 0;
1162 }
1165 "Should only generate a vector value or single scalar, not scalars "
1166 "for all lanes.");
1168 getOpcode(),
1170 }
1171
1172 switch (getOpcode()) {
1173 case Instruction::Select: {
1175 match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue()));
1176 auto *CondTy = Ctx.Types.inferScalarType(getOperand(0));
1177 auto *VecTy = Ctx.Types.inferScalarType(getOperand(1));
1178 if (!vputils::onlyFirstLaneUsed(this)) {
1179 CondTy = toVectorTy(CondTy, VF);
1180 VecTy = toVectorTy(VecTy, VF);
1181 }
1182 return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred,
1183 Ctx.CostKind);
1184 }
1185 case Instruction::ExtractElement:
1187 if (VF.isScalar()) {
1188 // ExtractLane with VF=1 takes care of handling extracting across multiple
1189 // parts.
1190 return 0;
1191 }
1192
1193 // Add on the cost of extracting the element.
1194 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1195 return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
1196 Ctx.CostKind);
1197 }
1198 case VPInstruction::AnyOf: {
1199 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1201 Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
1202 }
1204 Type *Ty = Ctx.Types.inferScalarType(this);
1205 Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
1206 if (VF.isScalar())
1207 return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
1210 // Calculate the cost of determining the lane index.
1211 auto *PredTy = toVectorTy(ScalarTy, VF);
1212 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
1213 {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
1214 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1215 }
1217 Type *Ty = Ctx.Types.inferScalarType(this);
1218 Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
1219 if (VF.isScalar())
1220 return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
1223 // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB.
1224 auto *PredTy = toVectorTy(ScalarTy, VF);
1225 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
1226 {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
1228 // Add cost of NOT operation on the predicate.
1230 Instruction::Xor, PredTy, Ctx.CostKind,
1231 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1232 {TargetTransformInfo::OK_UniformConstantValue,
1233 TargetTransformInfo::OP_None});
1234 // Add cost of SUB operation on the index.
1235 Cost += Ctx.TTI.getArithmeticInstrCost(Instruction::Sub, Ty, Ctx.CostKind);
1236 return Cost;
1237 }
1239 Type *ScalarTy = Ctx.Types.inferScalarType(this);
1240 Type *VecTy = toVectorTy(ScalarTy, VF);
1241 Type *MaskTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
1242 IntrinsicCostAttributes ICA(
1243 Intrinsic::experimental_vector_extract_last_active, ScalarTy,
1244 {VecTy, MaskTy, ScalarTy});
1245 return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind);
1246 }
1248 assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
1249 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1250 return Ctx.TTI.getShuffleCost(
1252 cast<VectorType>(VectorTy), {}, Ctx.CostKind, -1);
1253 }
1255 Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
1256 unsigned Multiplier = cast<VPConstantInt>(getOperand(2))->getZExtValue();
1257 Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
1258 IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
1259 {ArgTy, ArgTy});
1260 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1261 }
1263 Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0));
1264 Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
1265 Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
1266 IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
1267 I32Ty, {Arg0Ty, I32Ty, I1Ty});
1268 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1269 }
1271 assert(VF.isVector() && "Reverse operation must be vector type");
1272 Type *EltTy = Ctx.Types.inferScalarType(this);
1273 // Skip the reverse operation cost for the mask.
1274 // FIXME: Remove this once redundant mask reverse operations can be
1275 // eliminated by VPlanTransforms::cse before cost computation.
1276 if (EltTy->isIntegerTy(1))
1277 return 0;
1278 auto *VectorTy = cast<VectorType>(toVectorTy(EltTy, VF));
1280 VectorTy, /*Mask=*/{}, Ctx.CostKind,
1281 /*Index=*/0);
1282 }
1284 // Add on the cost of extracting the element.
1285 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1286 return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
1287 VecTy, Ctx.CostKind, 0);
1288 }
1289 case Instruction::FCmp:
1290 case Instruction::ICmp:
1291 // FIXME: We don't handle scalar compares here yet. Scalar compares used for
1292 // the loop exit condition are handled by the legacy cost model, but other
1293 // scalar compares (e.g. in the middle block deciding whether to execute the
1294 // scalar epilogue) aren't accounted for.
1296 return 0;
1297 return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
1299 if (VF == ElementCount::getScalable(1))
1301 [[fallthrough]];
1302 default:
1303 // TODO: Compute cost other VPInstructions once the legacy cost model has
1304 // been retired.
1306 "unexpected VPInstruction witht underlying value");
1307 return 0;
1308 }
1309}
1310
1322
1324 switch (getOpcode()) {
1325 case Instruction::Load:
1326 case Instruction::PHI:
1330 return true;
1331 default:
1332 return isScalarCast();
1333 }
1334}
1335
1337 assert(!isMasked() && "cannot execute masked VPInstruction");
1338 assert(!State.Lane && "VPInstruction executing an Lane");
1339 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1341 "Set flags not supported for the provided opcode");
1343 "Opcode requires specific flags to be set");
1344 if (hasFastMathFlags())
1345 State.Builder.setFastMathFlags(getFastMathFlags());
1346 Value *GeneratedValue = generate(State);
1347 if (!hasResult())
1348 return;
1349 assert(GeneratedValue && "generate must produce a value");
1350 bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
1353 assert((((GeneratedValue->getType()->isVectorTy() ||
1354 GeneratedValue->getType()->isStructTy()) ==
1355 !GeneratesPerFirstLaneOnly) ||
1356 State.VF.isScalar()) &&
1357 "scalar value but not only first lane defined");
1358 State.set(this, GeneratedValue,
1359 /*IsScalar*/ GeneratesPerFirstLaneOnly);
1361 // FIXME: This is a workaround to enable reliable updates of the scalar loop
1362 // resume phis, when vectorizing the epilogue. Must be removed once epilogue
1363 // vectorization explicitly connects VPlans.
1364 setUnderlyingValue(GeneratedValue);
1365 }
1366}
1367
1371 return false;
1372 switch (getOpcode()) {
1373 case Instruction::ExtractValue:
1374 case Instruction::InsertValue:
1375 case Instruction::GetElementPtr:
1376 case Instruction::ExtractElement:
1377 case Instruction::InsertElement:
1378 case Instruction::Freeze:
1379 case Instruction::FCmp:
1380 case Instruction::ICmp:
1381 case Instruction::Select:
1382 case Instruction::PHI:
1406 case VPInstruction::Not:
1415 return false;
1416 case Instruction::Call:
1417 return !getCalledFunction(*this)->doesNotAccessMemory();
1418 default:
1419 return true;
1420 }
1421}
1422
1424 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1426 return vputils::onlyFirstLaneUsed(this);
1427
1428 switch (getOpcode()) {
1429 default:
1430 return false;
1431 case Instruction::ExtractElement:
1432 return Op == getOperand(1);
1433 case Instruction::InsertElement:
1434 return Op == getOperand(1) || Op == getOperand(2);
1435 case Instruction::PHI:
1436 return true;
1437 case Instruction::FCmp:
1438 case Instruction::ICmp:
1439 case Instruction::Select:
1440 case Instruction::Or:
1441 case Instruction::Freeze:
1442 case VPInstruction::Not:
1443 // TODO: Cover additional opcodes.
1444 return vputils::onlyFirstLaneUsed(this);
1445 case Instruction::Load:
1455 return true;
1458 // Before replicating by VF, Build(Struct)Vector uses all lanes of the
1459 // operand, after replicating its operands only the first lane is used.
1460 // Before replicating, it will have only a single operand.
1461 return getNumOperands() > 1;
1463 return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
1465 // WidePtrAdd supports scalar and vector base addresses.
1466 return false;
1469 return Op == getOperand(0);
1470 };
1471 llvm_unreachable("switch should return");
1472}
1473
1475 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1477 return vputils::onlyFirstPartUsed(this);
1478
1479 switch (getOpcode()) {
1480 default:
1481 return false;
1482 case Instruction::FCmp:
1483 case Instruction::ICmp:
1484 case Instruction::Select:
1485 return vputils::onlyFirstPartUsed(this);
1490 return true;
1491 };
1492 llvm_unreachable("switch should return");
1493}
1494
1495#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1497 VPSlotTracker SlotTracker(getParent()->getPlan());
1499}
1500
1502 VPSlotTracker &SlotTracker) const {
1503 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1504
1505 if (hasResult()) {
1507 O << " = ";
1508 }
1509
1510 switch (getOpcode()) {
1511 case VPInstruction::Not:
1512 O << "not";
1513 break;
1515 O << "active lane mask";
1516 break;
1518 O << "EXPLICIT-VECTOR-LENGTH";
1519 break;
1521 O << "first-order splice";
1522 break;
1524 O << "branch-on-cond";
1525 break;
1527 O << "branch-on-two-conds";
1528 break;
1530 O << "TC > VF ? TC - VF : 0";
1531 break;
1533 O << "VF * Part +";
1534 break;
1536 O << "branch-on-count";
1537 break;
1539 O << "broadcast";
1540 break;
1542 O << "buildstructvector";
1543 break;
1545 O << "buildvector";
1546 break;
1548 O << "exiting-iv-value";
1549 break;
1551 O << "masked-cond";
1552 break;
1554 O << "extract-lane";
1555 break;
1557 O << "extract-last-lane";
1558 break;
1560 O << "extract-last-part";
1561 break;
1563 O << "extract-penultimate-element";
1564 break;
1566 O << "compute-reduction-result";
1567 break;
1569 O << "logical-and";
1570 break;
1572 O << "logical-or";
1573 break;
1575 O << "ptradd";
1576 break;
1578 O << "wide-ptradd";
1579 break;
1581 O << "any-of";
1582 break;
1584 O << "first-active-lane";
1585 break;
1587 O << "last-active-lane";
1588 break;
1590 O << "reduction-start-vector";
1591 break;
1593 O << "resume-for-epilogue";
1594 break;
1596 O << "reverse";
1597 break;
1599 O << "unpack";
1600 break;
1602 O << "extract-last-active";
1603 break;
1604 default:
1606 }
1607
1608 printFlags(O);
1610}
1611#endif
1612
1614 State.setDebugLocFrom(getDebugLoc());
1615 if (isScalarCast()) {
1616 Value *Op = State.get(getOperand(0), VPLane(0));
1617 Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
1618 Op, ResultTy);
1619 State.set(this, Cast, VPLane(0));
1620 return;
1621 }
1622 switch (getOpcode()) {
1624 Value *StepVector =
1625 State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF));
1626 State.set(this, StepVector);
1627 break;
1628 }
1629 case VPInstruction::VScale: {
1630 Value *VScale = State.Builder.CreateVScale(ResultTy);
1631 State.set(this, VScale, true);
1632 break;
1633 }
1634
1635 default:
1636 llvm_unreachable("opcode not implemented yet");
1637 }
1638}
1639
1640#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1642 VPSlotTracker &SlotTracker) const {
1643 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1645 O << " = ";
1646
1647 switch (getOpcode()) {
1649 O << "wide-iv-step ";
1651 break;
1653 O << "step-vector " << *ResultTy;
1654 break;
1656 O << "vscale " << *ResultTy;
1657 break;
1658 case Instruction::Load:
1659 O << "load ";
1661 break;
1662 default:
1663 assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
1666 O << " to " << *ResultTy;
1667 }
1668}
1669#endif
1670
1672 State.setDebugLocFrom(getDebugLoc());
1673 PHINode *NewPhi = State.Builder.CreatePHI(
1674 State.TypeAnalysis.inferScalarType(this), 2, getName());
1675 unsigned NumIncoming = getNumIncoming();
1676 // Detect header phis: the parent block dominates its second incoming block
1677 // (the latch). Those IR incoming values have not been generated yet and need
1678 // to be added after they have been executed.
1679 if (NumIncoming == 2 &&
1680 State.VPDT.dominates(getParent(), getIncomingBlock(1))) {
1681 NumIncoming = 1;
1682 }
1683 for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
1684 Value *IncV = State.get(getIncomingValue(Idx), VPLane(0));
1685 BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(getIncomingBlock(Idx));
1686 NewPhi->addIncoming(IncV, PredBB);
1687 }
1688 State.set(this, NewPhi, VPLane(0));
1689}
1690
1691#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1692void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent,
1693 VPSlotTracker &SlotTracker) const {
1694 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1696 O << " = phi";
1697 printFlags(O);
1699}
1700#endif
1701
1702VPIRInstruction *VPIRInstruction ::create(Instruction &I) {
1703 if (auto *Phi = dyn_cast<PHINode>(&I))
1704 return new VPIRPhi(*Phi);
1705 return new VPIRInstruction(I);
1706}
1707
1709 assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
1710 "PHINodes must be handled by VPIRPhi");
1711 // Advance the insert point after the wrapped IR instruction. This allows
1712 // interleaving VPIRInstructions and other recipes.
1713 State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
1714}
1715
1717 VPCostContext &Ctx) const {
1718 // The recipe wraps an existing IR instruction on the border of VPlan's scope,
1719 // hence it does not contribute to the cost-modeling for the VPlan.
1720 return 0;
1721}
1722
1723#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1725 VPSlotTracker &SlotTracker) const {
1726 O << Indent << "IR " << I;
1727}
1728#endif
1729
1731 PHINode *Phi = &getIRPhi();
1732 for (const auto &[Idx, Op] : enumerate(operands())) {
1733 VPValue *ExitValue = Op;
1734 auto Lane = vputils::isSingleScalar(ExitValue)
1736 : VPLane::getLastLaneForVF(State.VF);
1737 VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
1738 auto *PredVPBB = Pred->getExitingBasicBlock();
1739 BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
1740 // Set insertion point in PredBB in case an extract needs to be generated.
1741 // TODO: Model extracts explicitly.
1742 State.Builder.SetInsertPoint(PredBB->getTerminator());
1743 Value *V = State.get(ExitValue, VPLane(Lane));
1744 // If there is no existing block for PredBB in the phi, add a new incoming
1745 // value. Otherwise update the existing incoming value for PredBB.
1746 if (Phi->getBasicBlockIndex(PredBB) == -1)
1747 Phi->addIncoming(V, PredBB);
1748 else
1749 Phi->setIncomingValueForBlock(PredBB, V);
1750 }
1751
1752 // Advance the insert point after the wrapped IR instruction. This allows
1753 // interleaving VPIRInstructions and other recipes.
1754 State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator()));
1755}
1756
1758 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1759 assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
1760 "Number of phi operands must match number of predecessors");
1761 unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock);
1762 R->removeOperand(Position);
1763}
1764
1765VPValue *
1767 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1768 return getIncomingValue(R->getParent()->getIndexForPredecessor(VPBB));
1769}
1770
1772 VPValue *V) const {
1773 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1774 R->setOperand(R->getParent()->getIndexForPredecessor(VPBB), V);
1775}
1776
1777#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1779 VPSlotTracker &SlotTracker) const {
1780 interleaveComma(enumerate(getAsRecipe()->operands()), O,
1781 [this, &O, &SlotTracker](auto Op) {
1782 O << "[ ";
1783 Op.value()->printAsOperand(O, SlotTracker);
1784 O << ", ";
1785 getIncomingBlock(Op.index())->printAsOperand(O);
1786 O << " ]";
1787 });
1788}
1789#endif
1790
1791#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1793 VPSlotTracker &SlotTracker) const {
1795
1796 if (getNumOperands() != 0) {
1797 O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
1799 [&O, &SlotTracker](auto Op) {
1800 std::get<0>(Op)->printAsOperand(O, SlotTracker);
1801 O << " from ";
1802 std::get<1>(Op)->printAsOperand(O);
1803 });
1804 O << ")";
1805 }
1806}
1807#endif
1808
1810 for (const auto &[Kind, Node] : Metadata)
1811 I.setMetadata(Kind, Node);
1812}
1813
1815 SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection;
1816 for (const auto &[KindA, MDA] : Metadata) {
1817 for (const auto &[KindB, MDB] : Other.Metadata) {
1818 if (KindA == KindB && MDA == MDB) {
1819 MetadataIntersection.emplace_back(KindA, MDA);
1820 break;
1821 }
1822 }
1823 }
1824 Metadata = std::move(MetadataIntersection);
1825}
1826
1827#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1829 const Module *M = SlotTracker.getModule();
1830 if (Metadata.empty() || !M)
1831 return;
1832
1833 ArrayRef<StringRef> MDNames = SlotTracker.getMDNames();
1834 O << " (";
1835 interleaveComma(Metadata, O, [&](const auto &KindNodePair) {
1836 auto [Kind, Node] = KindNodePair;
1837 assert(Kind < MDNames.size() && !MDNames[Kind].empty() &&
1838 "Unexpected unnamed metadata kind");
1839 O << "!" << MDNames[Kind] << " ";
1840 Node->printAsOperand(O, M);
1841 });
1842 O << ")";
1843}
1844#endif
1845
1847 assert(State.VF.isVector() && "not widening");
1848 assert(Variant != nullptr && "Can't create vector function.");
1849
1850 FunctionType *VFTy = Variant->getFunctionType();
1851 // Add return type if intrinsic is overloaded on it.
1853 for (const auto &I : enumerate(args())) {
1854 Value *Arg;
1855 // Some vectorized function variants may also take a scalar argument,
1856 // e.g. linear parameters for pointers. This needs to be the scalar value
1857 // from the start of the respective part when interleaving.
1858 if (!VFTy->getParamType(I.index())->isVectorTy())
1859 Arg = State.get(I.value(), VPLane(0));
1860 else
1861 Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
1862 Args.push_back(Arg);
1863 }
1864
1867 if (CI)
1868 CI->getOperandBundlesAsDefs(OpBundles);
1869
1870 CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
1871 applyFlags(*V);
1872 applyMetadata(*V);
1873 V->setCallingConv(Variant->getCallingConv());
1874
1875 if (!V->getType()->isVoidTy())
1876 State.set(this, V);
1877}
1878
1880 VPCostContext &Ctx) const {
1881 return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
1882 Variant->getFunctionType()->params(),
1883 Ctx.CostKind);
1884}
1885
1886#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1888 VPSlotTracker &SlotTracker) const {
1889 O << Indent << "WIDEN-CALL ";
1890
1891 Function *CalledFn = getCalledScalarFunction();
1892 if (CalledFn->getReturnType()->isVoidTy())
1893 O << "void ";
1894 else {
1896 O << " = ";
1897 }
1898
1899 O << "call";
1900 printFlags(O);
1901 O << " @" << CalledFn->getName() << "(";
1902 interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
1903 Op->printAsOperand(O, SlotTracker);
1904 });
1905 O << ")";
1906
1907 O << " (using library function";
1908 if (Variant->hasName())
1909 O << ": " << Variant->getName();
1910 O << ")";
1911}
1912#endif
1913
1915 assert(State.VF.isVector() && "not widening");
1916
1917 SmallVector<Type *, 2> TysForDecl;
1918 // Add return type if intrinsic is overloaded on it.
1919 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1,
1920 State.TTI)) {
1921 Type *RetTy = toVectorizedTy(getResultType(), State.VF);
1922 ArrayRef<Type *> ContainedTys = getContainedTypes(RetTy);
1923 for (auto [Idx, Ty] : enumerate(ContainedTys)) {
1925 Idx, State.TTI))
1926 TysForDecl.push_back(Ty);
1927 }
1928 }
1930 for (const auto &I : enumerate(operands())) {
1931 // Some intrinsics have a scalar argument - don't replace it with a
1932 // vector.
1933 Value *Arg;
1934 if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
1935 State.TTI))
1936 Arg = State.get(I.value(), VPLane(0));
1937 else
1938 Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
1939 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
1940 State.TTI))
1941 TysForDecl.push_back(Arg->getType());
1942 Args.push_back(Arg);
1943 }
1944
1945 // Use vector version of the intrinsic.
1946 Module *M = State.Builder.GetInsertBlock()->getModule();
1947 Function *VectorF =
1948 Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
1949 assert(VectorF &&
1950 "Can't retrieve vector intrinsic or vector-predication intrinsics.");
1951
1954 if (CI)
1955 CI->getOperandBundlesAsDefs(OpBundles);
1956
1957 CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
1958
1959 applyFlags(*V);
1960 applyMetadata(*V);
1961
1962 if (!V->getType()->isVoidTy())
1963 State.set(this, V);
1964}
1965
1966/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R.
1969 const VPRecipeWithIRFlags &R,
1970 ElementCount VF,
1971 VPCostContext &Ctx) {
1972 Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
1973 // Skip the reverse operation cost for the mask.
1974 // FIXME: Remove this once redundant mask reverse operations can be eliminated
1975 // by VPlanTransforms::cse before cost computation.
1976 if (ID == Intrinsic::experimental_vp_reverse && ScalarRetTy->isIntegerTy(1))
1977 return InstructionCost(0);
1978
1979 // Some backends analyze intrinsic arguments to determine cost. Use the
1980 // underlying value for the operand if it has one. Otherwise try to use the
1981 // operand of the underlying call instruction, if there is one. Otherwise
1982 // clear Arguments.
1983 // TODO: Rework TTI interface to be independent of concrete IR values.
1985 for (const auto &[Idx, Op] : enumerate(Operands)) {
1986 auto *V = Op->getUnderlyingValue();
1987 if (!V) {
1988 if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) {
1989 Arguments.push_back(UI->getArgOperand(Idx));
1990 continue;
1991 }
1992 Arguments.clear();
1993 break;
1994 }
1995 Arguments.push_back(V);
1996 }
1997
1998 Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
1999 SmallVector<Type *> ParamTys =
2000 map_to_vector(Operands, [&](const VPValue *Op) {
2001 return toVectorTy(Ctx.Types.inferScalarType(Op), VF);
2002 });
2003
2004 // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
2005 IntrinsicCostAttributes CostAttrs(
2006 ID, RetTy, Arguments, ParamTys, R.getFastMathFlags(),
2007 dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()),
2009 return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
2010}
2011
2013 VPCostContext &Ctx) const {
2015 return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx);
2016}
2017
2019 return Intrinsic::getBaseName(VectorIntrinsicID);
2020}
2021
2023 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
2024 return all_of(enumerate(operands()), [this, &Op](const auto &X) {
2025 auto [Idx, V] = X;
2027 Idx, nullptr);
2028 });
2029}
2030
2031#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2033 VPSlotTracker &SlotTracker) const {
2034 O << Indent << "WIDEN-INTRINSIC ";
2035 if (ResultTy->isVoidTy()) {
2036 O << "void ";
2037 } else {
2039 O << " = ";
2040 }
2041
2042 O << "call";
2043 printFlags(O);
2044 O << getIntrinsicName() << "(";
2045
2047 Op->printAsOperand(O, SlotTracker);
2048 });
2049 O << ")";
2050}
2051#endif
2052
2054 IRBuilderBase &Builder = State.Builder;
2055
2056 Value *Address = State.get(getOperand(0));
2057 Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
2058 VectorType *VTy = cast<VectorType>(Address->getType());
2059
2060 // The histogram intrinsic requires a mask even if the recipe doesn't;
2061 // if the mask operand was omitted then all lanes should be executed and
2062 // we just need to synthesize an all-true mask.
2063 Value *Mask = nullptr;
2064 if (VPValue *VPMask = getMask())
2065 Mask = State.get(VPMask);
2066 else
2067 Mask =
2068 Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));
2069
2070 // If this is a subtract, we want to invert the increment amount. We may
2071 // add a separate intrinsic in future, but for now we'll try this.
2072 if (Opcode == Instruction::Sub)
2073 IncAmt = Builder.CreateNeg(IncAmt);
2074 else
2075 assert(Opcode == Instruction::Add && "only add or sub supported for now");
2076
2077 State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
2078 {VTy, IncAmt->getType()},
2079 {Address, IncAmt, Mask});
2080}
2081
2083 VPCostContext &Ctx) const {
2084 // FIXME: Take the gather and scatter into account as well. For now we're
2085 // generating the same cost as the fallback path, but we'll likely
2086 // need to create a new TTI method for determining the cost, including
2087 // whether we can use base + vec-of-smaller-indices or just
2088 // vec-of-pointers.
2089 assert(VF.isVector() && "Invalid VF for histogram cost");
2090 Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
2091 VPValue *IncAmt = getOperand(1);
2092 Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
2093 VectorType *VTy = VectorType::get(IncTy, VF);
2094
2095 // Assume that a non-constant update value (or a constant != 1) requires
2096 // a multiply, and add that into the cost.
2097 InstructionCost MulCost =
2098 Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
2099 if (match(IncAmt, m_One()))
2100 MulCost = TTI::TCC_Free;
2101
2102 // Find the cost of the histogram operation itself.
2103 Type *PtrTy = VectorType::get(AddressTy, VF);
2104 Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
2105 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
2106 Type::getVoidTy(Ctx.LLVMCtx),
2107 {PtrTy, IncTy, MaskTy});
2108
2109 // Add the costs together with the add/sub operation.
2110 return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
2111 Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
2112}
2113
2114#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2116 VPSlotTracker &SlotTracker) const {
2117 O << Indent << "WIDEN-HISTOGRAM buckets: ";
2119
2120 if (Opcode == Instruction::Sub)
2121 O << ", dec: ";
2122 else {
2123 assert(Opcode == Instruction::Add);
2124 O << ", inc: ";
2125 }
2127
2128 if (VPValue *Mask = getMask()) {
2129 O << ", mask: ";
2130 Mask->printAsOperand(O, SlotTracker);
2131 }
2132}
2133#endif
2134
2135VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
2136 AllowReassoc = FMF.allowReassoc();
2137 NoNaNs = FMF.noNaNs();
2138 NoInfs = FMF.noInfs();
2139 NoSignedZeros = FMF.noSignedZeros();
2140 AllowReciprocal = FMF.allowReciprocal();
2141 AllowContract = FMF.allowContract();
2142 ApproxFunc = FMF.approxFunc();
2143}
2144
2146 switch (Opcode) {
2147 case Instruction::Add:
2148 case Instruction::Sub:
2149 case Instruction::Mul:
2150 case Instruction::Shl:
2152 return WrapFlagsTy(false, false);
2153 case Instruction::Trunc:
2154 return TruncFlagsTy(false, false);
2155 case Instruction::Or:
2156 return DisjointFlagsTy(false);
2157 case Instruction::AShr:
2158 case Instruction::LShr:
2159 case Instruction::UDiv:
2160 case Instruction::SDiv:
2161 return ExactFlagsTy(false);
2162 case Instruction::GetElementPtr:
2165 return GEPNoWrapFlags::none();
2166 case Instruction::ZExt:
2167 case Instruction::UIToFP:
2168 return NonNegFlagsTy(false);
2169 case Instruction::FAdd:
2170 case Instruction::FSub:
2171 case Instruction::FMul:
2172 case Instruction::FDiv:
2173 case Instruction::FRem:
2174 case Instruction::FNeg:
2175 case Instruction::FPExt:
2176 case Instruction::FPTrunc:
2177 return FastMathFlags();
2178 case Instruction::ICmp:
2179 case Instruction::FCmp:
2181 llvm_unreachable("opcode requires explicit flags");
2182 default:
2183 return VPIRFlags();
2184 }
2185}
2186
2187#if !defined(NDEBUG)
2188bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
2189 switch (OpType) {
2190 case OperationType::OverflowingBinOp:
2191 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
2192 Opcode == Instruction::Mul || Opcode == Instruction::Shl ||
2193 Opcode == VPInstruction::VPInstruction::CanonicalIVIncrementForPart;
2194 case OperationType::Trunc:
2195 return Opcode == Instruction::Trunc;
2196 case OperationType::DisjointOp:
2197 return Opcode == Instruction::Or;
2198 case OperationType::PossiblyExactOp:
2199 return Opcode == Instruction::AShr || Opcode == Instruction::LShr ||
2200 Opcode == Instruction::UDiv || Opcode == Instruction::SDiv;
2201 case OperationType::GEPOp:
2202 return Opcode == Instruction::GetElementPtr ||
2203 Opcode == VPInstruction::PtrAdd ||
2204 Opcode == VPInstruction::WidePtrAdd;
2205 case OperationType::FPMathOp:
2206 return Opcode == Instruction::Call || Opcode == Instruction::FAdd ||
2207 Opcode == Instruction::FMul || Opcode == Instruction::FSub ||
2208 Opcode == Instruction::FNeg || Opcode == Instruction::FDiv ||
2209 Opcode == Instruction::FRem || Opcode == Instruction::FPExt ||
2210 Opcode == Instruction::FPTrunc || Opcode == Instruction::PHI ||
2211 Opcode == Instruction::Select ||
2212 Opcode == VPInstruction::WideIVStep ||
2214 case OperationType::FCmp:
2215 return Opcode == Instruction::FCmp;
2216 case OperationType::NonNegOp:
2217 return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP;
2218 case OperationType::Cmp:
2219 return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
2220 case OperationType::ReductionOp:
2222 case OperationType::Other:
2223 return true;
2224 }
2225 llvm_unreachable("Unknown OperationType enum");
2226}
2227
2228bool VPIRFlags::hasRequiredFlagsForOpcode(unsigned Opcode) const {
2229 // Handle opcodes without default flags.
2230 if (Opcode == Instruction::ICmp)
2231 return OpType == OperationType::Cmp;
2232 if (Opcode == Instruction::FCmp)
2233 return OpType == OperationType::FCmp;
2235 return OpType == OperationType::ReductionOp;
2236
2237 OperationType Required = getDefaultFlags(Opcode).OpType;
2238 return Required == OperationType::Other || Required == OpType;
2239}
2240#endif
2241
2242#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2244 switch (OpType) {
2245 case OperationType::Cmp:
2247 break;
2248 case OperationType::FCmp:
2251 break;
2252 case OperationType::DisjointOp:
2253 if (DisjointFlags.IsDisjoint)
2254 O << " disjoint";
2255 break;
2256 case OperationType::PossiblyExactOp:
2257 if (ExactFlags.IsExact)
2258 O << " exact";
2259 break;
2260 case OperationType::OverflowingBinOp:
2261 if (WrapFlags.HasNUW)
2262 O << " nuw";
2263 if (WrapFlags.HasNSW)
2264 O << " nsw";
2265 break;
2266 case OperationType::Trunc:
2267 if (TruncFlags.HasNUW)
2268 O << " nuw";
2269 if (TruncFlags.HasNSW)
2270 O << " nsw";
2271 break;
2272 case OperationType::FPMathOp:
2274 break;
2275 case OperationType::GEPOp: {
2277 if (Flags.isInBounds())
2278 O << " inbounds";
2279 else if (Flags.hasNoUnsignedSignedWrap())
2280 O << " nusw";
2281 if (Flags.hasNoUnsignedWrap())
2282 O << " nuw";
2283 break;
2284 }
2285 case OperationType::NonNegOp:
2286 if (NonNegFlags.NonNeg)
2287 O << " nneg";
2288 break;
2289 case OperationType::ReductionOp: {
2290 RecurKind RK = getRecurKind();
2291 O << " (";
2292 switch (RK) {
2293 case RecurKind::AnyOf:
2294 O << "any-of";
2295 break;
2297 O << "find-last";
2298 break;
2299 case RecurKind::SMax:
2300 O << "smax";
2301 break;
2302 case RecurKind::SMin:
2303 O << "smin";
2304 break;
2305 case RecurKind::UMax:
2306 O << "umax";
2307 break;
2308 case RecurKind::UMin:
2309 O << "umin";
2310 break;
2311 case RecurKind::FMinNum:
2312 O << "fminnum";
2313 break;
2314 case RecurKind::FMaxNum:
2315 O << "fmaxnum";
2316 break;
2318 O << "fminimum";
2319 break;
2321 O << "fmaximum";
2322 break;
2324 O << "fminimumnum";
2325 break;
2327 O << "fmaximumnum";
2328 break;
2329 default:
2331 break;
2332 }
2333 if (isReductionInLoop())
2334 O << ", in-loop";
2335 if (isReductionOrdered())
2336 O << ", ordered";
2337 O << ")";
2339 break;
2340 }
2341 case OperationType::Other:
2342 break;
2343 }
2344 O << " ";
2345}
2346#endif
2347
2349 auto &Builder = State.Builder;
2350 switch (Opcode) {
2351 case Instruction::Call:
2352 case Instruction::UncondBr:
2353 case Instruction::CondBr:
2354 case Instruction::PHI:
2355 case Instruction::GetElementPtr:
2356 llvm_unreachable("This instruction is handled by a different recipe.");
2357 case Instruction::UDiv:
2358 case Instruction::SDiv:
2359 case Instruction::SRem:
2360 case Instruction::URem:
2361 case Instruction::Add:
2362 case Instruction::FAdd:
2363 case Instruction::Sub:
2364 case Instruction::FSub:
2365 case Instruction::FNeg:
2366 case Instruction::Mul:
2367 case Instruction::FMul:
2368 case Instruction::FDiv:
2369 case Instruction::FRem:
2370 case Instruction::Shl:
2371 case Instruction::LShr:
2372 case Instruction::AShr:
2373 case Instruction::And:
2374 case Instruction::Or:
2375 case Instruction::Xor: {
2376 // Just widen unops and binops.
2378 for (VPValue *VPOp : operands())
2379 Ops.push_back(State.get(VPOp));
2380
2381 Value *V = Builder.CreateNAryOp(Opcode, Ops);
2382
2383 if (auto *VecOp = dyn_cast<Instruction>(V)) {
2384 applyFlags(*VecOp);
2385 applyMetadata(*VecOp);
2386 }
2387
2388 // Use this vector value for all users of the original instruction.
2389 State.set(this, V);
2390 break;
2391 }
2392 case Instruction::ExtractValue: {
2393 assert(getNumOperands() == 2 && "expected single level extractvalue");
2394 Value *Op = State.get(getOperand(0));
2395 Value *Extract = Builder.CreateExtractValue(
2396 Op, cast<VPConstantInt>(getOperand(1))->getZExtValue());
2397 State.set(this, Extract);
2398 break;
2399 }
2400 case Instruction::Freeze: {
2401 Value *Op = State.get(getOperand(0));
2402 Value *Freeze = Builder.CreateFreeze(Op);
2403 State.set(this, Freeze);
2404 break;
2405 }
2406 case Instruction::ICmp:
2407 case Instruction::FCmp: {
2408 // Widen compares. Generate vector compares.
2409 bool FCmp = Opcode == Instruction::FCmp;
2410 Value *A = State.get(getOperand(0));
2411 Value *B = State.get(getOperand(1));
2412 Value *C = nullptr;
2413 if (FCmp) {
2414 C = Builder.CreateFCmp(getPredicate(), A, B);
2415 } else {
2416 C = Builder.CreateICmp(getPredicate(), A, B);
2417 }
2418 if (auto *I = dyn_cast<Instruction>(C)) {
2419 applyFlags(*I);
2420 applyMetadata(*I);
2421 }
2422 State.set(this, C);
2423 break;
2424 }
2425 case Instruction::Select: {
2426 VPValue *CondOp = getOperand(0);
2427 Value *Cond = State.get(CondOp, vputils::isSingleScalar(CondOp));
2428 Value *Op0 = State.get(getOperand(1));
2429 Value *Op1 = State.get(getOperand(2));
2430 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
2431 State.set(this, Sel);
2432 if (auto *I = dyn_cast<Instruction>(Sel)) {
2434 applyFlags(*I);
2435 applyMetadata(*I);
2436 }
2437 break;
2438 }
2439 default:
2440 // This instruction is not vectorized by simple widening.
2441 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
2442 << Instruction::getOpcodeName(Opcode));
2443 llvm_unreachable("Unhandled instruction!");
2444 } // end of switch.
2445
2446#if !defined(NDEBUG)
2447 // Verify that VPlan type inference results agree with the type of the
2448 // generated values.
2449 assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
2450 State.get(this)->getType() &&
2451 "inferred type and type from generated instructions do not match");
2452#endif
2453}
2454
2456 VPCostContext &Ctx) const {
2457 switch (Opcode) {
2458 case Instruction::UDiv:
2459 case Instruction::SDiv:
2460 case Instruction::SRem:
2461 case Instruction::URem:
2462 // If the div/rem operation isn't safe to speculate and requires
2463 // predication, then the only way we can even create a vplan is to insert
2464 // a select on the second input operand to ensure we use the value of 1
2465 // for the inactive lanes. The select will be costed separately.
2466 case Instruction::FNeg:
2467 case Instruction::Add:
2468 case Instruction::FAdd:
2469 case Instruction::Sub:
2470 case Instruction::FSub:
2471 case Instruction::Mul:
2472 case Instruction::FMul:
2473 case Instruction::FDiv:
2474 case Instruction::FRem:
2475 case Instruction::Shl:
2476 case Instruction::LShr:
2477 case Instruction::AShr:
2478 case Instruction::And:
2479 case Instruction::Or:
2480 case Instruction::Xor:
2481 case Instruction::Freeze:
2482 case Instruction::ExtractValue:
2483 case Instruction::ICmp:
2484 case Instruction::FCmp:
2485 case Instruction::Select:
2486 return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
2487 default:
2488 llvm_unreachable("Unsupported opcode for instruction");
2489 }
2490}
2491
2492#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2494 VPSlotTracker &SlotTracker) const {
2495 O << Indent << "WIDEN ";
2497 O << " = " << Instruction::getOpcodeName(Opcode);
2498 printFlags(O);
2500}
2501#endif
2502
2504 auto &Builder = State.Builder;
2505 /// Vectorize casts.
2506 assert(State.VF.isVector() && "Not vectorizing?");
2507 Type *DestTy = VectorType::get(getResultType(), State.VF);
2508 VPValue *Op = getOperand(0);
2509 Value *A = State.get(Op);
2510 Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
2511 State.set(this, Cast);
2512 if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
2513 applyFlags(*CastOp);
2514 applyMetadata(*CastOp);
2515 }
2516}
2517
2519 VPCostContext &Ctx) const {
2520 // TODO: In some cases, VPWidenCastRecipes are created but not considered in
2521 // the legacy cost model, including truncates/extends when evaluating a
2522 // reduction in a smaller type.
2523 if (!getUnderlyingValue())
2524 return 0;
2525 return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
2526}
2527
2528#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2530 VPSlotTracker &SlotTracker) const {
2531 O << Indent << "WIDEN-CAST ";
2533 O << " = " << Instruction::getOpcodeName(Opcode);
2534 printFlags(O);
2536 O << " to " << *getResultType();
2537}
2538#endif
2539
2541 VPCostContext &Ctx) const {
2542 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
2543}
2544
2545#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2547 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
2548 O << Indent;
2550 O << " = WIDEN-INDUCTION";
2551 printFlags(O);
2553
2554 if (auto *TI = getTruncInst())
2555 O << " (truncated to " << *TI->getType() << ")";
2556}
2557#endif
2558
2560 // The step may be defined by a recipe in the preheader (e.g. if it requires
2561 // SCEV expansion), but for the canonical induction the step is required to be
2562 // 1, which is represented as live-in.
2563 return match(getStartValue(), m_ZeroInt()) &&
2564 match(getStepValue(), m_One()) &&
2565 getScalarType() == getRegion()->getCanonicalIVType();
2566}
2567
2568#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2570 VPSlotTracker &SlotTracker) const {
2571 O << Indent;
2573 O << " = DERIVED-IV ";
2574 getStartValue()->printAsOperand(O, SlotTracker);
2575 O << " + ";
2576 getOperand(1)->printAsOperand(O, SlotTracker);
2577 O << " * ";
2578 getStepValue()->printAsOperand(O, SlotTracker);
2579}
2580#endif
2581
2583 // Fast-math-flags propagate from the original induction instruction.
2584 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
2585 State.Builder.setFastMathFlags(getFastMathFlags());
2586
2587 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
2588 /// variable on which to base the steps, \p Step is the size of the step.
2589
2590 Value *BaseIV = State.get(getOperand(0), VPLane(0));
2591 Value *Step = State.get(getStepValue(), VPLane(0));
2592 IRBuilderBase &Builder = State.Builder;
2593
2594 // Ensure step has the same type as that of scalar IV.
2595 Type *BaseIVTy = BaseIV->getType()->getScalarType();
2596 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
2597
2598 // We build scalar steps for both integer and floating-point induction
2599 // variables. Here, we determine the kind of arithmetic we will perform.
2602 if (BaseIVTy->isIntegerTy()) {
2603 AddOp = Instruction::Add;
2604 MulOp = Instruction::Mul;
2605 } else {
2606 AddOp = InductionOpcode;
2607 MulOp = Instruction::FMul;
2608 }
2609
2610 // Determine the number of scalars we need to generate.
2611 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
2612 // Compute the scalar steps and save the results in State.
2613
2614 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2615 assert(!State.Lane && "replicate regions must be dissolved before ::execute");
2616 Value *StartIdx0 = getStartIndex() ? State.get(getStartIndex(), true)
2617 : Constant::getNullValue(BaseIVTy);
2618
2619 for (unsigned Lane = 0; Lane < EndLane; ++Lane) {
2620 // It is okay if the induction variable type cannot hold the lane number,
2621 // we expect truncation in this case.
2622 Constant *LaneValue =
2623 BaseIVTy->isIntegerTy()
2624 ? ConstantInt::get(BaseIVTy, Lane, /*IsSigned=*/false,
2625 /*ImplicitTrunc=*/true)
2626 : ConstantFP::get(BaseIVTy, Lane);
2627 Value *StartIdx = Builder.CreateBinOp(AddOp, StartIdx0, LaneValue);
2628 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2629 "Expected StartIdx to be folded to a constant when VF is not "
2630 "scalable");
2631 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2632 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
2633 State.set(this, Add, VPLane(Lane));
2634 }
2635}
2636
2637#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2639 VPSlotTracker &SlotTracker) const {
2640 O << Indent;
2642 O << " = SCALAR-STEPS ";
2644}
2645#endif
2646
2648 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
2650}
2651
2653 assert(State.VF.isVector() && "not widening");
2654 // Construct a vector GEP by widening the operands of the scalar GEP as
2655 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
2656 // results in a vector of pointers when at least one operand of the GEP
2657 // is vector-typed. Thus, to keep the representation compact, we only use
2658 // vector-typed operands for loop-varying values.
2659
2660 bool AllOperandsAreInvariant = all_of(operands(), [](VPValue *Op) {
2661 return Op->isDefinedOutsideLoopRegions();
2662 });
2663 if (AllOperandsAreInvariant) {
2664 // If we are vectorizing, but the GEP has only loop-invariant operands,
2665 // the GEP we build (by only using vector-typed operands for
2666 // loop-varying values) would be a scalar pointer. Thus, to ensure we
2667 // produce a vector of pointers, we need to either arbitrarily pick an
2668 // operand to broadcast, or broadcast a clone of the original GEP.
2669 // Here, we broadcast a clone of the original.
2670
2672 for (unsigned I = 0, E = getNumOperands(); I != E; I++)
2673 Ops.push_back(State.get(getOperand(I), VPLane(0)));
2674
2675 auto *NewGEP =
2676 State.Builder.CreateGEP(getSourceElementType(), Ops[0], drop_begin(Ops),
2677 "", getGEPNoWrapFlags());
2678 Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
2679 State.set(this, Splat);
2680 return;
2681 }
2682
2683 // If the GEP has at least one loop-varying operand, we are sure to
2684 // produce a vector of pointers unless VF is scalar.
2685 // The pointer operand of the new GEP. If it's loop-invariant, we
2686 // won't broadcast it.
2687 auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant());
2688
2689 // Collect all the indices for the new GEP. If any index is
2690 // loop-invariant, we won't broadcast it.
2692 for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
2693 VPValue *Operand = getOperand(I);
2694 Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1)));
2695 }
2696
2697 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
2698 // but it should be a vector, otherwise.
2699 auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices,
2700 "", getGEPNoWrapFlags());
2701 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
2702 "NewGEP is not a pointer vector");
2703 State.set(this, NewGEP);
2704}
2705
2706#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2708 VPSlotTracker &SlotTracker) const {
2709 O << Indent << "WIDEN-GEP ";
2710 O << (isPointerLoopInvariant() ? "Inv" : "Var");
2711 for (size_t I = 0; I < getNumOperands() - 1; ++I)
2712 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
2713
2714 O << " ";
2716 O << " = getelementptr";
2717 printFlags(O);
2719}
2720#endif
2721
2723 assert(!getOffset() && "Unexpected offset operand");
2724 VPBuilder Builder(this);
2725 VPlan &Plan = *getParent()->getPlan();
2726 VPValue *VFVal = getVFValue();
2727 VPTypeAnalysis TypeInfo(Plan);
2728 const DataLayout &DL = Plan.getDataLayout();
2729 Type *IndexTy = DL.getIndexType(TypeInfo.inferScalarType(this));
2730 VPValue *Stride =
2731 Plan.getConstantInt(IndexTy, getStride(), /*IsSigned=*/true);
2732 Type *VFTy = TypeInfo.inferScalarType(VFVal);
2733 VPValue *VF = Builder.createScalarZExtOrTrunc(VFVal, IndexTy, VFTy,
2735
2736 // Offset for Part0 = Offset0 = Stride * (VF - 1).
2737 VPInstruction *VFMinusOne =
2738 Builder.createSub(VF, Plan.getConstantInt(IndexTy, 1u),
2739 DebugLoc::getUnknown(), "", {true, true});
2740 VPInstruction *Offset0 =
2741 Builder.createOverflowingOp(Instruction::Mul, {VFMinusOne, Stride});
2742
2743 // Offset for PartN = Offset0 + Part * Stride * VF.
2744 VPValue *PartxStride =
2745 Plan.getConstantInt(IndexTy, Part * getStride(), /*IsSigned=*/true);
2746 VPValue *Offset = Builder.createAdd(
2747 Offset0,
2748 Builder.createOverflowingOp(Instruction::Mul, {PartxStride, VF}));
2750}
2751
2753 auto &Builder = State.Builder;
2754 assert(getOffset() && "Expected prior materialization of offset");
2755 Value *Ptr = State.get(getPointer(), true);
2756 Value *Offset = State.get(getOffset(), true);
2757 Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Offset, "",
2759 State.set(this, ResultPtr, /*IsScalar*/ true);
2760}
2761
2762#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2764 VPSlotTracker &SlotTracker) const {
2765 O << Indent;
2767 O << " = vector-end-pointer";
2768 printFlags(O);
2770}
2771#endif
2772
2774 auto &Builder = State.Builder;
2775 assert(getOffset() &&
2776 "Expected prior simplification of recipe without offset");
2777 Value *Ptr = State.get(getOperand(0), VPLane(0));
2778 Value *Offset = State.get(getOffset(), true);
2779 Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Offset, "",
2781 State.set(this, ResultPtr, /*IsScalar*/ true);
2782}
2783
2784#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2786 VPSlotTracker &SlotTracker) const {
2787 O << Indent;
2789 O << " = vector-pointer";
2790 printFlags(O);
2792}
2793#endif
2794
2796 VPCostContext &Ctx) const {
2797 // A blend will be expanded to a select VPInstruction, which will generate a
2798 // scalar select if only the first lane is used.
2800 VF = ElementCount::getFixed(1);
2801
2802 Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
2803 Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
2804 return (getNumIncomingValues() - 1) *
2805 Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
2806 CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
2807}
2808
2809#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2811 VPSlotTracker &SlotTracker) const {
2812 O << Indent << "BLEND ";
2814 O << " =";
2815 printFlags(O);
2816 if (getNumIncomingValues() == 1) {
2817 // Not a User of any mask: not really blending, this is a
2818 // single-predecessor phi.
2819 getIncomingValue(0)->printAsOperand(O, SlotTracker);
2820 } else {
2821 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
2822 if (I != 0)
2823 O << " ";
2824 getIncomingValue(I)->printAsOperand(O, SlotTracker);
2825 if (I == 0 && isNormalized())
2826 continue;
2827 O << "/";
2828 getMask(I)->printAsOperand(O, SlotTracker);
2829 }
2830 }
2831}
2832#endif
2833
2835 assert(!State.Lane && "Reduction being replicated.");
2838 "In-loop AnyOf reductions aren't currently supported");
2839 // Propagate the fast-math flags carried by the underlying instruction.
2840 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
2841 State.Builder.setFastMathFlags(getFastMathFlags());
2842 Value *NewVecOp = State.get(getVecOp());
2843 if (VPValue *Cond = getCondOp()) {
2844 Value *NewCond = State.get(Cond, State.VF.isScalar());
2845 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
2846 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
2847
2848 Value *Start = getRecurrenceIdentity(Kind, ElementTy, getFastMathFlags());
2849 if (State.VF.isVector())
2850 Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
2851
2852 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
2853 NewVecOp = Select;
2854 }
2855 Value *NewRed;
2856 Value *NextInChain;
2857 if (isOrdered()) {
2858 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2859 if (State.VF.isVector())
2860 NewRed =
2861 createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain);
2862 else
2863 NewRed = State.Builder.CreateBinOp(
2865 PrevInChain, NewVecOp);
2866 PrevInChain = NewRed;
2867 NextInChain = NewRed;
2868 } else if (isPartialReduction()) {
2869 assert((Kind == RecurKind::Add || Kind == RecurKind::FAdd) &&
2870 "Unexpected partial reduction kind");
2871 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ false);
2872 NewRed = State.Builder.CreateIntrinsic(
2873 PrevInChain->getType(),
2874 Kind == RecurKind::Add ? Intrinsic::vector_partial_reduce_add
2875 : Intrinsic::vector_partial_reduce_fadd,
2876 {PrevInChain, NewVecOp}, State.Builder.getFastMathFlags(),
2877 "partial.reduce");
2878 PrevInChain = NewRed;
2879 NextInChain = NewRed;
2880 } else {
2881 assert(isInLoop() &&
2882 "The reduction must either be ordered, partial or in-loop");
2883 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2884 NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind);
2886 NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain);
2887 else
2888 NextInChain = State.Builder.CreateBinOp(
2890 PrevInChain, NewRed);
2891 }
2892 State.set(this, NextInChain, /*IsScalar*/ !isPartialReduction());
2893}
2894
2896 assert(!State.Lane && "Reduction being replicated.");
2897
2898 auto &Builder = State.Builder;
2899 // Propagate the fast-math flags carried by the underlying instruction.
2900 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
2901 Builder.setFastMathFlags(getFastMathFlags());
2902
2904 Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
2905 Value *VecOp = State.get(getVecOp());
2906 Value *EVL = State.get(getEVL(), VPLane(0));
2907
2908 Value *Mask;
2909 if (VPValue *CondOp = getCondOp())
2910 Mask = State.get(CondOp);
2911 else
2912 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2913
2914 Value *NewRed;
2915 if (isOrdered()) {
2916 NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL);
2917 } else {
2918 NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL);
2920 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
2921 else
2922 NewRed = Builder.CreateBinOp(
2924 Prev);
2925 }
2926 State.set(this, NewRed, /*IsScalar*/ true);
2927}
2928
2930 VPCostContext &Ctx) const {
2931 RecurKind RdxKind = getRecurrenceKind();
2932 Type *ElementTy = Ctx.Types.inferScalarType(this);
2933 auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
2934 unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
2936 std::optional<FastMathFlags> OptionalFMF =
2937 ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
2938
2939 if (isPartialReduction()) {
2940 InstructionCost CondCost = 0;
2941 if (isConditional()) {
2943 auto *CondTy = cast<VectorType>(
2944 toVectorTy(Ctx.Types.inferScalarType(getCondOp()), VF));
2945 CondCost = Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VectorTy,
2946 CondTy, Pred, Ctx.CostKind);
2947 }
2948 return CondCost + Ctx.TTI.getPartialReductionCost(
2949 Opcode, ElementTy, ElementTy, ElementTy, VF,
2950 TTI::PR_None, TTI::PR_None, {}, Ctx.CostKind,
2951 OptionalFMF);
2952 }
2953
2954 // TODO: Support any-of reductions.
2955 assert(
2957 ForceTargetInstructionCost.getNumOccurrences() > 0) &&
2958 "Any-of reduction not implemented in VPlan-based cost model currently.");
2959
2960 // Note that TTI should model the cost of moving result to the scalar register
2961 // and the BinOp cost in the getMinMaxReductionCost().
2964 return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
2965 }
2966
2967 // Note that TTI should model the cost of moving result to the scalar register
2968 // and the BinOp cost in the getArithmeticReductionCost().
2969 return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
2970 Ctx.CostKind);
2971}
2972
2973VPExpressionRecipe::VPExpressionRecipe(
2974 ExpressionTypes ExpressionType,
2975 ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
2976 : VPSingleDefRecipe(VPRecipeBase::VPExpressionSC, {}, {}),
2977 ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) {
2978 assert(!ExpressionRecipes.empty() && "Nothing to combine?");
2979 assert(
2980 none_of(ExpressionRecipes,
2981 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
2982 "expression cannot contain recipes with side-effects");
2983
2984 // Maintain a copy of the expression recipes as a set of users.
2985 SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
2986 for (auto *R : ExpressionRecipes)
2987 ExpressionRecipesAsSetOfUsers.insert(R);
2988
2989 // Recipes in the expression, except the last one, must only be used by
2990 // (other) recipes inside the expression. If there are other users, external
2991 // to the expression, use a clone of the recipe for external users.
2992 for (VPSingleDefRecipe *R : reverse(ExpressionRecipes)) {
2993 if (R != ExpressionRecipes.back() &&
2994 any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
2995 return !ExpressionRecipesAsSetOfUsers.contains(U);
2996 })) {
2997 // There are users outside of the expression. Clone the recipe and use the
2998 // clone those external users.
2999 VPSingleDefRecipe *CopyForExtUsers = R->clone();
3000 R->replaceUsesWithIf(CopyForExtUsers, [&ExpressionRecipesAsSetOfUsers](
3001 VPUser &U, unsigned) {
3002 return !ExpressionRecipesAsSetOfUsers.contains(&U);
3003 });
3004 CopyForExtUsers->insertBefore(R);
3005 }
3006 if (R->getParent())
3007 R->removeFromParent();
3008 }
3009
3010 // Internalize all external operands to the expression recipes. To do so,
3011 // create new temporary VPValues for all operands defined by a recipe outside
3012 // the expression. The original operands are added as operands of the
3013 // VPExpressionRecipe itself.
3014 for (auto *R : ExpressionRecipes) {
3015 for (const auto &[Idx, Op] : enumerate(R->operands())) {
3016 auto *Def = Op->getDefiningRecipe();
3017 if (Def && ExpressionRecipesAsSetOfUsers.contains(Def))
3018 continue;
3019 addOperand(Op);
3020 LiveInPlaceholders.push_back(new VPSymbolicValue(nullptr));
3021 }
3022 }
3023
3024 // Replace each external operand with the first one created for it in
3025 // LiveInPlaceholders.
3026 for (auto *R : ExpressionRecipes)
3027 for (auto const &[LiveIn, Tmp] : zip(operands(), LiveInPlaceholders))
3028 R->replaceUsesOfWith(LiveIn, Tmp);
3029}
3030
3032 for (auto *R : ExpressionRecipes)
3033 // Since the list could contain duplicates, make sure the recipe hasn't
3034 // already been inserted.
3035 if (!R->getParent())
3036 R->insertBefore(this);
3037
3038 for (const auto &[Idx, Op] : enumerate(operands()))
3039 LiveInPlaceholders[Idx]->replaceAllUsesWith(Op);
3040
3041 replaceAllUsesWith(ExpressionRecipes.back());
3042 ExpressionRecipes.clear();
3043}
3044
3046 VPCostContext &Ctx) const {
3047 Type *RedTy = Ctx.Types.inferScalarType(this);
3048 auto *SrcVecTy = cast<VectorType>(
3049 toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
3050 unsigned Opcode = RecurrenceDescriptor::getOpcode(
3051 cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
3052 switch (ExpressionType) {
3053 case ExpressionTypes::ExtendedReduction: {
3054 unsigned Opcode = RecurrenceDescriptor::getOpcode(
3055 cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
3056 auto *ExtR = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3057 auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());
3058
3059 if (RedR->isPartialReduction())
3060 return Ctx.TTI.getPartialReductionCost(
3061 Opcode, Ctx.Types.inferScalarType(getOperand(0)), nullptr, RedTy, VF,
3063 TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind,
3064 RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
3065 : std::nullopt);
3066 else if (!RedTy->isFloatingPointTy())
3067 // TTI::getExtendedReductionCost only supports integer types.
3068 return Ctx.TTI.getExtendedReductionCost(
3069 Opcode, ExtR->getOpcode() == Instruction::ZExt, RedTy, SrcVecTy,
3070 std::nullopt, Ctx.CostKind);
3071 else
3073 }
3074 case ExpressionTypes::MulAccReduction:
3075 return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
3076 Ctx.CostKind);
3077
3078 case ExpressionTypes::ExtNegatedMulAccReduction:
3079 assert(Opcode == Instruction::Add && "Unexpected opcode");
3080 Opcode = Instruction::Sub;
3081 [[fallthrough]];
3082 case ExpressionTypes::ExtMulAccReduction: {
3083 auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());
3084 if (RedR->isPartialReduction()) {
3085 auto *Ext0R = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3086 auto *Ext1R = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3087 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
3088 return Ctx.TTI.getPartialReductionCost(
3089 Opcode, Ctx.Types.inferScalarType(getOperand(0)),
3090 Ctx.Types.inferScalarType(getOperand(1)), RedTy, VF,
3092 Ext0R->getOpcode()),
3094 Ext1R->getOpcode()),
3095 Mul->getOpcode(), Ctx.CostKind,
3096 RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
3097 : std::nullopt);
3098 }
3099 return Ctx.TTI.getMulAccReductionCost(
3100 cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
3101 Instruction::ZExt,
3102 Opcode, RedTy, SrcVecTy, Ctx.CostKind);
3103 }
3104 }
3105 llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
3106}
3107
3109 return any_of(ExpressionRecipes, [](VPSingleDefRecipe *R) {
3110 return R->mayReadFromMemory() || R->mayWriteToMemory();
3111 });
3112}
3113
3115 assert(
3116 none_of(ExpressionRecipes,
3117 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
3118 "expression cannot contain recipes with side-effects");
3119 return false;
3120}
3121
3123 // Cannot use vputils::isSingleScalar(), because all external operands
3124 // of the expression will be live-ins while bundled.
3125 auto *RR = dyn_cast<VPReductionRecipe>(ExpressionRecipes.back());
3126 return RR && !RR->isPartialReduction();
3127}
3128
3129#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3130
3132 VPSlotTracker &SlotTracker) const {
3133 O << Indent << "EXPRESSION ";
3135 O << " = ";
3136 auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
3137 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3138
3139 switch (ExpressionType) {
3140 case ExpressionTypes::ExtendedReduction: {
3142 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3143 O << Instruction::getOpcodeName(Opcode) << " (";
3145 Red->printFlags(O);
3146
3147 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3148 O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3149 << *Ext0->getResultType();
3150 if (Red->isConditional()) {
3151 O << ", ";
3152 Red->getCondOp()->printAsOperand(O, SlotTracker);
3153 }
3154 O << ")";
3155 break;
3156 }
3157 case ExpressionTypes::ExtNegatedMulAccReduction: {
3159 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3161 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
3162 << " (sub (0, mul";
3163 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
3164 Mul->printFlags(O);
3165 O << "(";
3167 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3168 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3169 << *Ext0->getResultType() << "), (";
3171 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3172 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3173 << *Ext1->getResultType() << ")";
3174 if (Red->isConditional()) {
3175 O << ", ";
3176 Red->getCondOp()->printAsOperand(O, SlotTracker);
3177 }
3178 O << "))";
3179 break;
3180 }
3181 case ExpressionTypes::MulAccReduction:
3182 case ExpressionTypes::ExtMulAccReduction: {
3184 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3186 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
3187 << " (";
3188 O << "mul";
3189 bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
3190 auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
3191 : ExpressionRecipes[0]);
3192 Mul->printFlags(O);
3193 if (IsExtended)
3194 O << "(";
3196 if (IsExtended) {
3197 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3198 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3199 << *Ext0->getResultType() << "), (";
3200 } else {
3201 O << ", ";
3202 }
3204 if (IsExtended) {
3205 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3206 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3207 << *Ext1->getResultType() << ")";
3208 }
3209 if (Red->isConditional()) {
3210 O << ", ";
3211 Red->getCondOp()->printAsOperand(O, SlotTracker);
3212 }
3213 O << ")";
3214 break;
3215 }
3216 }
3217}
3218
3220 VPSlotTracker &SlotTracker) const {
3221 if (isPartialReduction())
3222 O << Indent << "PARTIAL-REDUCE ";
3223 else
3224 O << Indent << "REDUCE ";
3226 O << " = ";
3228 O << " +";
3229 printFlags(O);
3230 O << " reduce."
3233 << " (";
3235 if (isConditional()) {
3236 O << ", ";
3238 }
3239 O << ")";
3240}
3241
3243 VPSlotTracker &SlotTracker) const {
3244 O << Indent << "REDUCE ";
3246 O << " = ";
3248 O << " +";
3249 printFlags(O);
3250 O << " vp.reduce."
3253 << " (";
3255 O << ", ";
3257 if (isConditional()) {
3258 O << ", ";
3260 }
3261 O << ")";
3262}
3263
3264#endif
3265
3266/// A helper function to scalarize a single Instruction in the innermost loop.
3267/// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue
3268/// operands from \p RepRecipe instead of \p Instr's operands.
3269static void scalarizeInstruction(const Instruction *Instr,
3270 VPReplicateRecipe *RepRecipe,
3271 const VPLane &Lane, VPTransformState &State) {
3272 assert((!Instr->getType()->isAggregateType() ||
3273 canVectorizeTy(Instr->getType())) &&
3274 "Expected vectorizable or non-aggregate type.");
3275
3276 // Does this instruction return a value ?
3277 bool IsVoidRetTy = Instr->getType()->isVoidTy();
3278
3279 Instruction *Cloned = Instr->clone();
3280 if (!IsVoidRetTy) {
3281 Cloned->setName(Instr->getName() + ".cloned");
3282 Type *ResultTy = State.TypeAnalysis.inferScalarType(RepRecipe);
3283 // The operands of the replicate recipe may have been narrowed, resulting in
3284 // a narrower result type. Update the type of the cloned instruction to the
3285 // correct type.
3286 if (ResultTy != Cloned->getType())
3287 Cloned->mutateType(ResultTy);
3288 }
3289
3290 RepRecipe->applyFlags(*Cloned);
3291 RepRecipe->applyMetadata(*Cloned);
3292
3293 if (RepRecipe->hasPredicate())
3294 cast<CmpInst>(Cloned)->setPredicate(RepRecipe->getPredicate());
3295
3296 if (auto DL = RepRecipe->getDebugLoc())
3297 State.setDebugLocFrom(DL);
3298
3299 // Replace the operands of the cloned instructions with their scalar
3300 // equivalents in the new loop.
3301 for (const auto &I : enumerate(RepRecipe->operands())) {
3302 auto InputLane = Lane;
3303 VPValue *Operand = I.value();
3304 if (vputils::isSingleScalar(Operand))
3305 InputLane = VPLane::getFirstLane();
3306 Cloned->setOperand(I.index(), State.get(Operand, InputLane));
3307 }
3308
3309 // Place the cloned scalar in the new loop.
3310 State.Builder.Insert(Cloned);
3311
3312 State.set(RepRecipe, Cloned, Lane);
3313
3314 // If we just cloned a new assumption, add it the assumption cache.
3315 if (auto *II = dyn_cast<AssumeInst>(Cloned))
3316 State.AC->registerAssumption(II);
3317
3318 assert(
3319 (RepRecipe->getRegion() ||
3320 !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
3321 all_of(RepRecipe->operands(),
3322 [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
3323 "Expected a recipe is either within a region or all of its operands "
3324 "are defined outside the vectorized region.");
3325}
3326
3328 assert(!State.Lane && "replicate regions must be dissolved before ::execute");
3329 assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
3330 "must have already been unrolled");
3332 scalarizeInstruction(UI, this, VPLane(0), State);
3333}
3334
3335/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
3336/// which the legacy cost model computes a SCEV expression when computing the
3337/// address cost. Computing SCEVs for VPValues is incomplete and returns
3338/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In
3339/// those cases we fall back to the legacy cost model. Otherwise return nullptr.
3340static const SCEV *getAddressAccessSCEV(const VPValue *Ptr,
3342 const Loop *L) {
3343 const SCEV *Addr = vputils::getSCEVExprForVPValue(Ptr, PSE, L);
3344 if (isa<SCEVCouldNotCompute>(Addr))
3345 return Addr;
3346
3347 return vputils::isAddressSCEVForCost(Addr, *PSE.getSE(), L) ? Addr : nullptr;
3348}
3349
3350/// Return true if \p R is a predicated load/store with a loop-invariant address
3351/// only masked by the header mask.
3353 const SCEV *PtrSCEV,
3354 VPCostContext &Ctx) {
3355 const VPRegionBlock *ParentRegion = R.getRegion();
3356 if (!ParentRegion || !ParentRegion->isReplicator() || !PtrSCEV ||
3357 !Ctx.PSE.getSE()->isLoopInvariant(PtrSCEV, Ctx.L))
3358 return false;
3359 auto *BOM =
3361 return vputils::isHeaderMask(BOM->getOperand(0), *ParentRegion->getPlan());
3362}
3363
3365 VPCostContext &Ctx) const {
3367 // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
3368 // transform, avoid computing their cost multiple times for now.
3369 Ctx.SkipCostComputation.insert(UI);
3370
3371 if (VF.isScalable() && !isSingleScalar())
3373
3374 switch (UI->getOpcode()) {
3375 case Instruction::Alloca:
3376 if (VF.isScalable())
3378 return Ctx.TTI.getArithmeticInstrCost(
3379 Instruction::Mul, Ctx.Types.inferScalarType(this), Ctx.CostKind);
3380 case Instruction::GetElementPtr:
3381 // We mark this instruction as zero-cost because the cost of GEPs in
3382 // vectorized code depends on whether the corresponding memory instruction
3383 // is scalarized or not. Therefore, we handle GEPs with the memory
3384 // instruction cost.
3385 return 0;
3386 case Instruction::Call: {
3387 auto *CalledFn =
3389
3392 for (const VPValue *ArgOp : ArgOps)
3393 Tys.push_back(Ctx.Types.inferScalarType(ArgOp));
3394
3395 if (CalledFn->isIntrinsic() &&
3396 VPCostContext::isFreeScalarIntrinsic(CalledFn->getIntrinsicID())) {
3397 assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
3398 ElementCount::getFixed(1), Ctx) == 0 &&
3399 "scalarizing intrinsic should be free");
3400 return InstructionCost(0);
3401 }
3402
3403 Type *ResultTy = Ctx.Types.inferScalarType(this);
3404 InstructionCost ScalarCallCost =
3405 Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
3406 if (isSingleScalar()) {
3407 if (CalledFn->isIntrinsic())
3408 ScalarCallCost = std::min(
3409 ScalarCallCost,
3410 getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
3411 ElementCount::getFixed(1), Ctx));
3412 return ScalarCallCost;
3413 }
3414
3415 return ScalarCallCost * VF.getFixedValue() +
3416 Ctx.getScalarizationOverhead(ResultTy, ArgOps, VF);
3417 }
3418 case Instruction::Add:
3419 case Instruction::Sub:
3420 case Instruction::FAdd:
3421 case Instruction::FSub:
3422 case Instruction::Mul:
3423 case Instruction::FMul:
3424 case Instruction::FDiv:
3425 case Instruction::FRem:
3426 case Instruction::Shl:
3427 case Instruction::LShr:
3428 case Instruction::AShr:
3429 case Instruction::And:
3430 case Instruction::Or:
3431 case Instruction::Xor:
3432 case Instruction::ICmp:
3433 case Instruction::FCmp:
3435 Ctx) *
3436 (isSingleScalar() ? 1 : VF.getFixedValue());
3437 case Instruction::SDiv:
3438 case Instruction::UDiv:
3439 case Instruction::SRem:
3440 case Instruction::URem: {
3441 InstructionCost ScalarCost =
3443 if (isSingleScalar())
3444 return ScalarCost;
3445
3446 // If any of the operands is from a different replicate region and has its
3447 // cost skipped, it may have been forced to scalar. Fall back to legacy cost
3448 // model to avoid cost mis-match.
3449 if (any_of(operands(), [&Ctx, VF](VPValue *Op) {
3450 auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Op);
3451 if (!PredR)
3452 return false;
3453 return Ctx.skipCostComputation(
3455 PredR->getOperand(0)->getUnderlyingValue()),
3456 VF.isVector());
3457 }))
3458 break;
3459
3460 ScalarCost = ScalarCost * VF.getFixedValue() +
3461 Ctx.getScalarizationOverhead(Ctx.Types.inferScalarType(this),
3462 to_vector(operands()), VF);
3463 // If the recipe is not predicated (i.e. not in a replicate region), return
3464 // the scalar cost. Otherwise handle predicated cost.
3465 if (!getRegion()->isReplicator())
3466 return ScalarCost;
3467
3468 // Account for the phi nodes that we will create.
3469 ScalarCost += VF.getFixedValue() *
3470 Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
3471 // Scale the cost by the probability of executing the predicated blocks.
3472 // This assumes the predicated block for each vector lane is equally
3473 // likely.
3474 ScalarCost /= Ctx.getPredBlockCostDivisor(UI->getParent());
3475 return ScalarCost;
3476 }
3477 case Instruction::Load:
3478 case Instruction::Store: {
3479 bool IsLoad = UI->getOpcode() == Instruction::Load;
3480 const VPValue *PtrOp = getOperand(!IsLoad);
3481 const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.PSE, Ctx.L);
3483 break;
3484
3485 Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
3486 Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
3487 const Align Alignment = getLoadStoreAlignment(UI);
3488 unsigned AS = cast<PointerType>(ScalarPtrTy)->getAddressSpace();
3490 bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
3491 bool UsedByLoadStoreAddress =
3492 !PreferVectorizedAddressing && vputils::isUsedByLoadStoreAddress(this);
3493 InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
3494 UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo,
3495 UsedByLoadStoreAddress ? UI : nullptr);
3496
3497 // Check if this is a predicated load/store with a loop-invariant address
3498 // only masked by the header mask. If so, return the uniform mem op cost.
3499 if (isPredicatedUniformMemOpAfterTailFolding(*this, PtrSCEV, Ctx)) {
3500 InstructionCost UniformCost =
3501 ScalarMemOpCost +
3502 Ctx.TTI.getAddressComputationCost(ScalarPtrTy, /*SE=*/nullptr,
3503 /*Ptr=*/nullptr, Ctx.CostKind);
3504 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
3505 if (IsLoad) {
3506 return UniformCost +
3507 Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast,
3508 VectorTy, VectorTy, {}, Ctx.CostKind);
3509 }
3510
3511 VPValue *StoredVal = getOperand(0);
3512 if (!StoredVal->isDefinedOutsideLoopRegions())
3513 UniformCost += Ctx.TTI.getIndexedVectorInstrCostFromEnd(
3514 Instruction::ExtractElement, VectorTy, Ctx.CostKind, 0);
3515 return UniformCost;
3516 }
3517
3518 Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
3519 InstructionCost ScalarCost =
3520 ScalarMemOpCost +
3521 Ctx.TTI.getAddressComputationCost(
3522 PtrTy, UsedByLoadStoreAddress ? nullptr : Ctx.PSE.getSE(), PtrSCEV,
3523 Ctx.CostKind);
3524 if (isSingleScalar())
3525 return ScalarCost;
3526
3527 SmallVector<const VPValue *> OpsToScalarize;
3528 Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
3529 // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
3530 // don't assign scalarization overhead in general, if the target prefers
3531 // vectorized addressing or the loaded value is used as part of an address
3532 // of another load or store.
3533 if (!UsedByLoadStoreAddress) {
3534 bool EfficientVectorLoadStore =
3535 Ctx.TTI.supportsEfficientVectorElementLoadStore();
3536 if (!(IsLoad && !PreferVectorizedAddressing) &&
3537 !(!IsLoad && EfficientVectorLoadStore))
3538 append_range(OpsToScalarize, operands());
3539
3540 if (!EfficientVectorLoadStore)
3541 ResultTy = Ctx.Types.inferScalarType(this);
3542 }
3543
3547 (ScalarCost * VF.getFixedValue()) +
3548 Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, VIC, true);
3549
3550 const VPRegionBlock *ParentRegion = getRegion();
3551 if (ParentRegion && ParentRegion->isReplicator()) {
3552 if (!PtrSCEV)
3553 break;
3554 Cost /= Ctx.getPredBlockCostDivisor(UI->getParent());
3555 Cost += Ctx.TTI.getCFInstrCost(Instruction::CondBr, Ctx.CostKind);
3556
3557 auto *VecI1Ty = VectorType::get(
3558 IntegerType::getInt1Ty(Ctx.L->getHeader()->getContext()), VF);
3559 Cost += Ctx.TTI.getScalarizationOverhead(
3560 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
3561 /*Insert=*/false, /*Extract=*/true, Ctx.CostKind);
3562
3563 if (Ctx.useEmulatedMaskMemRefHack(this, VF)) {
3564 // Artificially setting to a high enough value to practically disable
3565 // vectorization with such operations.
3566 return 3000000;
3567 }
3568 }
3569 return Cost;
3570 }
3571 case Instruction::SExt:
3572 case Instruction::ZExt:
3573 case Instruction::FPToUI:
3574 case Instruction::FPToSI:
3575 case Instruction::FPExt:
3576 case Instruction::PtrToInt:
3577 case Instruction::PtrToAddr:
3578 case Instruction::IntToPtr:
3579 case Instruction::SIToFP:
3580 case Instruction::UIToFP:
3581 case Instruction::Trunc:
3582 case Instruction::FPTrunc:
3583 case Instruction::Select:
3584 case Instruction::AddrSpaceCast: {
3586 Ctx) *
3587 (isSingleScalar() ? 1 : VF.getFixedValue());
3588 }
3589 case Instruction::ExtractValue:
3590 case Instruction::InsertValue:
3591 return Ctx.TTI.getInsertExtractValueCost(getOpcode(), Ctx.CostKind);
3592 }
3593
3594 return Ctx.getLegacyCost(UI, VF);
3595}
3596
3597#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3599 VPSlotTracker &SlotTracker) const {
3600 O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");
3601
3602 if (!getUnderlyingInstr()->getType()->isVoidTy()) {
3604 O << " = ";
3605 }
3606 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
3607 O << "call";
3608 printFlags(O);
3609 O << "@" << CB->getCalledFunction()->getName() << "(";
3611 O, [&O, &SlotTracker](VPValue *Op) {
3612 Op->printAsOperand(O, SlotTracker);
3613 });
3614 O << ")";
3615 } else {
3617 printFlags(O);
3619 }
3620
3621 // Find if the recipe is used by a widened recipe via an intervening
3622 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
3623 if (any_of(users(), [](const VPUser *U) {
3624 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
3625 return !vputils::onlyScalarValuesUsed(PredR);
3626 return false;
3627 }))
3628 O << " (S->V)";
3629}
3630#endif
3631
3633 llvm_unreachable("recipe must be removed when dissolving replicate region");
3634}
3635
3637 VPCostContext &Ctx) const {
3638 // The legacy cost model doesn't assign costs to branches for individual
3639 // replicate regions. Match the current behavior in the VPlan cost model for
3640 // now.
3641 return 0;
3642}
3643
3645 llvm_unreachable("recipe must be removed when dissolving replicate region");
3646}
3647
3648#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3650 VPSlotTracker &SlotTracker) const {
3651 O << Indent << "PHI-PREDICATED-INSTRUCTION ";
3653 O << " = ";
3655}
3656#endif
3657
3659 VPCostContext &Ctx) const {
3661 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
3662 ->getAddressSpace();
3663 unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
3664 ? Instruction::Load
3665 : Instruction::Store;
3666
3667 if (!Consecutive) {
3668 // TODO: Using the original IR may not be accurate.
3669 // Currently, ARM will use the underlying IR to calculate gather/scatter
3670 // instruction cost.
3671 [[maybe_unused]] auto IsReverseMask = [this]() {
3672 VPValue *Mask = getMask();
3673 if (!Mask)
3674 return false;
3675
3678
3679 return match(Mask, m_Reverse(m_VPValue()));
3680 };
3681 assert(!IsReverseMask() &&
3682 "Inconsecutive memory access should not have reverse order");
3684 Type *PtrTy = Ptr->getType();
3685
3686 // If the address value is uniform across all lanes, then the address can be
3687 // calculated with scalar type and broadcast.
3689 PtrTy = toVectorTy(PtrTy, VF);
3690
3691 unsigned IID = isa<VPWidenLoadRecipe>(this) ? Intrinsic::masked_gather
3692 : isa<VPWidenStoreRecipe>(this) ? Intrinsic::masked_scatter
3693 : isa<VPWidenLoadEVLRecipe>(this) ? Intrinsic::vp_gather
3694 : Intrinsic::vp_scatter;
3695 return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
3696 Ctx.CostKind) +
3697 Ctx.TTI.getMemIntrinsicInstrCost(
3699 &Ingredient),
3700 Ctx.CostKind);
3701 }
3702
3704 if (IsMasked) {
3705 unsigned IID = isa<VPWidenLoadRecipe>(this) ? Intrinsic::masked_load
3706 : Intrinsic::masked_store;
3707 Cost += Ctx.TTI.getMemIntrinsicInstrCost(
3708 MemIntrinsicCostAttributes(IID, Ty, Alignment, AS), Ctx.CostKind);
3709 } else {
3710 TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
3712 : getOperand(1));
3713 Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
3714 OpInfo, &Ingredient);
3715 }
3716 return Cost;
3717}
3718
3720 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
3721 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
3722 bool CreateGather = !isConsecutive();
3723
3724 auto &Builder = State.Builder;
3725 Value *Mask = nullptr;
3726 if (auto *VPMask = getMask())
3727 Mask = State.get(VPMask);
3728
3729 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
3730 Value *NewLI;
3731 if (CreateGather) {
3732 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
3733 "wide.masked.gather");
3734 } else if (Mask) {
3735 NewLI =
3736 Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
3737 PoisonValue::get(DataTy), "wide.masked.load");
3738 } else {
3739 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
3740 }
3742 State.set(this, NewLI);
3743}
3744
3745#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3747 VPSlotTracker &SlotTracker) const {
3748 O << Indent << "WIDEN ";
3750 O << " = load ";
3752}
3753#endif
3754
3756 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
3757 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
3758 bool CreateGather = !isConsecutive();
3759
3760 auto &Builder = State.Builder;
3761 CallInst *NewLI;
3762 Value *EVL = State.get(getEVL(), VPLane(0));
3763 Value *Addr = State.get(getAddr(), !CreateGather);
3764 Value *Mask = nullptr;
3765 if (VPValue *VPMask = getMask())
3766 Mask = State.get(VPMask);
3767 else
3768 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
3769
3770 if (CreateGather) {
3771 NewLI =
3772 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
3773 nullptr, "wide.masked.gather");
3774 } else {
3775 NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
3776 {Addr, Mask, EVL}, nullptr, "vp.op.load");
3777 }
3778 NewLI->addParamAttr(
3780 applyMetadata(*NewLI);
3781 Instruction *Res = NewLI;
3782 State.set(this, Res);
3783}
3784
3786 VPCostContext &Ctx) const {
3787 if (!Consecutive || IsMasked)
3788 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
3789
3790 // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost()
3791 // here because the EVL recipes using EVL to replace the tail mask. But in the
3792 // legacy model, it will always calculate the cost of mask.
3793 // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we
3794 // don't need to compare to the legacy cost model.
3796 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
3797 ->getAddressSpace();
3798 return Ctx.TTI.getMemIntrinsicInstrCost(
3799 MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS),
3800 Ctx.CostKind);
3801}
3802
3803#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3805 VPSlotTracker &SlotTracker) const {
3806 O << Indent << "WIDEN ";
3808 O << " = vp.load ";
3810}
3811#endif
3812
3814 VPValue *StoredVPValue = getStoredValue();
3815 bool CreateScatter = !isConsecutive();
3816
3817 auto &Builder = State.Builder;
3818
3819 Value *Mask = nullptr;
3820 if (auto *VPMask = getMask())
3821 Mask = State.get(VPMask);
3822
3823 Value *StoredVal = State.get(StoredVPValue);
3824 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
3825 Instruction *NewSI = nullptr;
3826 if (CreateScatter)
3827 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
3828 else if (Mask)
3829 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
3830 else
3831 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
3832 applyMetadata(*NewSI);
3833}
3834
3835#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3837 VPSlotTracker &SlotTracker) const {
3838 O << Indent << "WIDEN store ";
3840}
3841#endif
3842
3844 VPValue *StoredValue = getStoredValue();
3845 bool CreateScatter = !isConsecutive();
3846
3847 auto &Builder = State.Builder;
3848
3849 CallInst *NewSI = nullptr;
3850 Value *StoredVal = State.get(StoredValue);
3851 Value *EVL = State.get(getEVL(), VPLane(0));
3852 Value *Mask = nullptr;
3853 if (VPValue *VPMask = getMask())
3854 Mask = State.get(VPMask);
3855 else
3856 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
3857
3858 Value *Addr = State.get(getAddr(), !CreateScatter);
3859 if (CreateScatter) {
3860 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
3861 Intrinsic::vp_scatter,
3862 {StoredVal, Addr, Mask, EVL});
3863 } else {
3864 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
3865 Intrinsic::vp_store,
3866 {StoredVal, Addr, Mask, EVL});
3867 }
3868 NewSI->addParamAttr(
3870 applyMetadata(*NewSI);
3871}
3872
3874 VPCostContext &Ctx) const {
3875 if (!Consecutive || IsMasked)
3876 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
3877
3878 // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost()
3879 // here because the EVL recipes using EVL to replace the tail mask. But in the
3880 // legacy model, it will always calculate the cost of mask.
3881 // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we
3882 // don't need to compare to the legacy cost model.
3884 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
3885 ->getAddressSpace();
3886 return Ctx.TTI.getMemIntrinsicInstrCost(
3887 MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS),
3888 Ctx.CostKind);
3889}
3890
3891#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3893 VPSlotTracker &SlotTracker) const {
3894 O << Indent << "WIDEN vp.store ";
3896}
3897#endif
3898
3900 VectorType *DstVTy, const DataLayout &DL) {
3901 // Verify that V is a vector type with same number of elements as DstVTy.
3902 auto VF = DstVTy->getElementCount();
3903 auto *SrcVecTy = cast<VectorType>(V->getType());
3904 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
3905 Type *SrcElemTy = SrcVecTy->getElementType();
3906 Type *DstElemTy = DstVTy->getElementType();
3907 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3908 "Vector elements must have same size");
3909
3910 // Do a direct cast if element types are castable.
3911 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3912 return Builder.CreateBitOrPointerCast(V, DstVTy);
3913 }
3914 // V cannot be directly casted to desired vector type.
3915 // May happen when V is a floating point vector but DstVTy is a vector of
3916 // pointers or vice-versa. Handle this using a two-step bitcast using an
3917 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
3918 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3919 "Only one type should be a pointer type");
3920 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3921 "Only one type should be a floating point type");
3922 Type *IntTy =
3923 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3924 auto *VecIntTy = VectorType::get(IntTy, VF);
3925 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3926 return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
3927}
3928
3929/// Return a vector containing interleaved elements from multiple
3930/// smaller input vectors.
3932 const Twine &Name) {
3933 unsigned Factor = Vals.size();
3934 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
3935
3936 VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
3937#ifndef NDEBUG
3938 for (Value *Val : Vals)
3939 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
3940#endif
3941
3942 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
3943 // must use intrinsics to interleave.
3944 if (VecTy->isScalableTy()) {
3945 assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
3946 return Builder.CreateVectorInterleave(Vals, Name);
3947 }
3948
3949 // Fixed length. Start by concatenating all vectors into a wide vector.
3950 Value *WideVec = concatenateVectors(Builder, Vals);
3951
3952 // Interleave the elements into the wide vector.
3953 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
3954 return Builder.CreateShuffleVector(
3955 WideVec, createInterleaveMask(NumElts, Factor), Name);
3956}
3957
3958// Try to vectorize the interleave group that \p Instr belongs to.
3959//
3960// E.g. Translate following interleaved load group (factor = 3):
3961// for (i = 0; i < N; i+=3) {
3962// R = Pic[i]; // Member of index 0
3963// G = Pic[i+1]; // Member of index 1
3964// B = Pic[i+2]; // Member of index 2
3965// ... // do something to R, G, B
3966// }
3967// To:
3968// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
3969// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
3970// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
3971// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
3972//
3973// Or translate following interleaved store group (factor = 3):
3974// for (i = 0; i < N; i+=3) {
3975// ... do something to R, G, B
3976// Pic[i] = R; // Member of index 0
3977// Pic[i+1] = G; // Member of index 1
3978// Pic[i+2] = B; // Member of index 2
3979// }
3980// To:
3981// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
3982// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
3983// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
3984// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
3985// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
3987 assert(!State.Lane && "Interleave group being replicated.");
3988 assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
3989 "Masking gaps for scalable vectors is not yet supported.");
3991 Instruction *Instr = Group->getInsertPos();
3992
3993 // Prepare for the vector type of the interleaved load/store.
3994 Type *ScalarTy = getLoadStoreType(Instr);
3995 unsigned InterleaveFactor = Group->getFactor();
3996 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
3997
3998 VPValue *BlockInMask = getMask();
3999 VPValue *Addr = getAddr();
4000 Value *ResAddr = State.get(Addr, VPLane(0));
4001
4002 auto CreateGroupMask = [&BlockInMask, &State,
4003 &InterleaveFactor](Value *MaskForGaps) -> Value * {
4004 if (State.VF.isScalable()) {
4005 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
4006 assert(InterleaveFactor <= 8 &&
4007 "Unsupported deinterleave factor for scalable vectors");
4008 auto *ResBlockInMask = State.get(BlockInMask);
4009 SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
4010 return interleaveVectors(State.Builder, Ops, "interleaved.mask");
4011 }
4012
4013 if (!BlockInMask)
4014 return MaskForGaps;
4015
4016 Value *ResBlockInMask = State.get(BlockInMask);
4017 Value *ShuffledMask = State.Builder.CreateShuffleVector(
4018 ResBlockInMask,
4019 createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
4020 "interleaved.mask");
4021 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
4022 ShuffledMask, MaskForGaps)
4023 : ShuffledMask;
4024 };
4025
4026 const DataLayout &DL = Instr->getDataLayout();
4027 // Vectorize the interleaved load group.
4028 if (isa<LoadInst>(Instr)) {
4029 Value *MaskForGaps = nullptr;
4030 if (needsMaskForGaps()) {
4031 MaskForGaps =
4032 createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
4033 assert(MaskForGaps && "Mask for Gaps is required but it is null");
4034 }
4035
4036 Instruction *NewLoad;
4037 if (BlockInMask || MaskForGaps) {
4038 Value *GroupMask = CreateGroupMask(MaskForGaps);
4039 Value *PoisonVec = PoisonValue::get(VecTy);
4040 NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
4041 Group->getAlign(), GroupMask,
4042 PoisonVec, "wide.masked.vec");
4043 } else
4044 NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
4045 Group->getAlign(), "wide.vec");
4046 applyMetadata(*NewLoad);
4047 // TODO: Also manage existing metadata using VPIRMetadata.
4048 Group->addMetadata(NewLoad);
4049
4051 if (VecTy->isScalableTy()) {
4052 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4053 // so must use intrinsics to deinterleave.
4054 assert(InterleaveFactor <= 8 &&
4055 "Unsupported deinterleave factor for scalable vectors");
4056 NewLoad = State.Builder.CreateIntrinsic(
4057 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
4058 NewLoad->getType(), NewLoad,
4059 /*FMFSource=*/nullptr, "strided.vec");
4060 }
4061
4062 auto CreateStridedVector = [&InterleaveFactor, &State,
4063 &NewLoad](unsigned Index) -> Value * {
4064 assert(Index < InterleaveFactor && "Illegal group index");
4065 if (State.VF.isScalable())
4066 return State.Builder.CreateExtractValue(NewLoad, Index);
4067
4068 // For fixed length VF, use shuffle to extract the sub-vectors from the
4069 // wide load.
4070 auto StrideMask =
4071 createStrideMask(Index, InterleaveFactor, State.VF.getFixedValue());
4072 return State.Builder.CreateShuffleVector(NewLoad, StrideMask,
4073 "strided.vec");
4074 };
4075
4076 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4077 Instruction *Member = Group->getMember(I);
4078
4079 // Skip the gaps in the group.
4080 if (!Member)
4081 continue;
4082
4083 Value *StridedVec = CreateStridedVector(I);
4084
4085 // If this member has different type, cast the result type.
4086 if (Member->getType() != ScalarTy) {
4087 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
4088 StridedVec =
4089 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
4090 }
4091
4092 if (Group->isReverse())
4093 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
4094
4095 State.set(VPDefs[J], StridedVec);
4096 ++J;
4097 }
4098 return;
4099 }
4100
4101 // The sub vector type for current instruction.
4102 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4103
4104 // Vectorize the interleaved store group.
4105 Value *MaskForGaps =
4106 createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
4107 assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
4108 "Mismatch between NeedsMaskForGaps and MaskForGaps");
4109 ArrayRef<VPValue *> StoredValues = getStoredValues();
4110 // Collect the stored vector from each member.
4111 SmallVector<Value *, 4> StoredVecs;
4112 unsigned StoredIdx = 0;
4113 for (unsigned i = 0; i < InterleaveFactor; i++) {
4114 assert((Group->getMember(i) || MaskForGaps) &&
4115 "Fail to get a member from an interleaved store group");
4116 Instruction *Member = Group->getMember(i);
4117
4118 // Skip the gaps in the group.
4119 if (!Member) {
4120 Value *Undef = PoisonValue::get(SubVT);
4121 StoredVecs.push_back(Undef);
4122 continue;
4123 }
4124
4125 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4126 ++StoredIdx;
4127
4128 if (Group->isReverse())
4129 StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
4130
4131 // If this member has different type, cast it to a unified type.
4132
4133 if (StoredVec->getType() != SubVT)
4134 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4135
4136 StoredVecs.push_back(StoredVec);
4137 }
4138
4139 // Interleave all the smaller vectors into one wider vector.
4140 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4141 Instruction *NewStoreInstr;
4142 if (BlockInMask || MaskForGaps) {
4143 Value *GroupMask = CreateGroupMask(MaskForGaps);
4144 NewStoreInstr = State.Builder.CreateMaskedStore(
4145 IVec, ResAddr, Group->getAlign(), GroupMask);
4146 } else
4147 NewStoreInstr =
4148 State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
4149
4150 applyMetadata(*NewStoreInstr);
4151 // TODO: Also manage existing metadata using VPIRMetadata.
4152 Group->addMetadata(NewStoreInstr);
4153}
4154
4155#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4157 VPSlotTracker &SlotTracker) const {
4159 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4160 IG->getInsertPos()->printAsOperand(O, false);
4161 O << ", ";
4163 VPValue *Mask = getMask();
4164 if (Mask) {
4165 O << ", ";
4166 Mask->printAsOperand(O, SlotTracker);
4167 }
4168
4169 unsigned OpIdx = 0;
4170 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4171 if (!IG->getMember(i))
4172 continue;
4173 if (getNumStoreOperands() > 0) {
4174 O << "\n" << Indent << " store ";
4176 O << " to index " << i;
4177 } else {
4178 O << "\n" << Indent << " ";
4180 O << " = load from index " << i;
4181 }
4182 ++OpIdx;
4183 }
4184}
4185#endif
4186
4188 assert(!State.Lane && "Interleave group being replicated.");
4189 assert(State.VF.isScalable() &&
4190 "Only support scalable VF for EVL tail-folding.");
4192 "Masking gaps for scalable vectors is not yet supported.");
4194 Instruction *Instr = Group->getInsertPos();
4195
4196 // Prepare for the vector type of the interleaved load/store.
4197 Type *ScalarTy = getLoadStoreType(Instr);
4198 unsigned InterleaveFactor = Group->getFactor();
4199 assert(InterleaveFactor <= 8 &&
4200 "Unsupported deinterleave/interleave factor for scalable vectors");
4201 ElementCount WideVF = State.VF * InterleaveFactor;
4202 auto *VecTy = VectorType::get(ScalarTy, WideVF);
4203
4204 VPValue *Addr = getAddr();
4205 Value *ResAddr = State.get(Addr, VPLane(0));
4206 Value *EVL = State.get(getEVL(), VPLane(0));
4207 Value *InterleaveEVL = State.Builder.CreateMul(
4208 EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl",
4209 /* NUW= */ true, /* NSW= */ true);
4210 LLVMContext &Ctx = State.Builder.getContext();
4211
4212 Value *GroupMask = nullptr;
4213 if (VPValue *BlockInMask = getMask()) {
4214 SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask));
4215 GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask");
4216 } else {
4217 GroupMask =
4218 State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
4219 }
4220
4221 // Vectorize the interleaved load group.
4222 if (isa<LoadInst>(Instr)) {
4223 CallInst *NewLoad = State.Builder.CreateIntrinsic(
4224 VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr,
4225 "wide.vp.load");
4226 NewLoad->addParamAttr(0,
4227 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4228
4229 applyMetadata(*NewLoad);
4230 // TODO: Also manage existing metadata using VPIRMetadata.
4231 Group->addMetadata(NewLoad);
4232
4233 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4234 // so must use intrinsics to deinterleave.
4235 NewLoad = State.Builder.CreateIntrinsic(
4236 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
4237 NewLoad->getType(), NewLoad,
4238 /*FMFSource=*/nullptr, "strided.vec");
4239
4240 const DataLayout &DL = Instr->getDataLayout();
4241 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4242 Instruction *Member = Group->getMember(I);
4243 // Skip the gaps in the group.
4244 if (!Member)
4245 continue;
4246
4247 Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
4248 // If this member has different type, cast the result type.
4249 if (Member->getType() != ScalarTy) {
4250 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
4251 StridedVec =
4252 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
4253 }
4254
4255 State.set(getVPValue(J), StridedVec);
4256 ++J;
4257 }
4258 return;
4259 } // End for interleaved load.
4260
4261 // The sub vector type for current instruction.
4262 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4263 // Vectorize the interleaved store group.
4264 ArrayRef<VPValue *> StoredValues = getStoredValues();
4265 // Collect the stored vector from each member.
4266 SmallVector<Value *, 4> StoredVecs;
4267 const DataLayout &DL = Instr->getDataLayout();
4268 for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
4269 Instruction *Member = Group->getMember(I);
4270 // Skip the gaps in the group.
4271 if (!Member) {
4272 StoredVecs.push_back(PoisonValue::get(SubVT));
4273 continue;
4274 }
4275
4276 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4277 // If this member has different type, cast it to a unified type.
4278 if (StoredVec->getType() != SubVT)
4279 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4280
4281 StoredVecs.push_back(StoredVec);
4282 ++StoredIdx;
4283 }
4284
4285 // Interleave all the smaller vectors into one wider vector.
4286 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4287 CallInst *NewStore =
4288 State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store,
4289 {IVec, ResAddr, GroupMask, InterleaveEVL});
4290 NewStore->addParamAttr(1,
4291 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4292
4293 applyMetadata(*NewStore);
4294 // TODO: Also manage existing metadata using VPIRMetadata.
4295 Group->addMetadata(NewStore);
4296}
4297
4298#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4300 VPSlotTracker &SlotTracker) const {
4302 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4303 IG->getInsertPos()->printAsOperand(O, false);
4304 O << ", ";
4306 O << ", ";
4308 if (VPValue *Mask = getMask()) {
4309 O << ", ";
4310 Mask->printAsOperand(O, SlotTracker);
4311 }
4312
4313 unsigned OpIdx = 0;
4314 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4315 if (!IG->getMember(i))
4316 continue;
4317 if (getNumStoreOperands() > 0) {
4318 O << "\n" << Indent << " vp.store ";
4320 O << " to index " << i;
4321 } else {
4322 O << "\n" << Indent << " ";
4324 O << " = vp.load from index " << i;
4325 }
4326 ++OpIdx;
4327 }
4328}
4329#endif
4330
4332 VPCostContext &Ctx) const {
4333 Instruction *InsertPos = getInsertPos();
4334 // Find the VPValue index of the interleave group. We need to skip gaps.
4335 unsigned InsertPosIdx = 0;
4336 for (unsigned Idx = 0; IG->getFactor(); ++Idx)
4337 if (auto *Member = IG->getMember(Idx)) {
4338 if (Member == InsertPos)
4339 break;
4340 InsertPosIdx++;
4341 }
4342 Type *ValTy = Ctx.Types.inferScalarType(
4343 getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
4344 : getStoredValues()[InsertPosIdx]);
4345 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
4346 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
4347 ->getAddressSpace();
4348
4349 unsigned InterleaveFactor = IG->getFactor();
4350 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
4351
4352 // Holds the indices of existing members in the interleaved group.
4354 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
4355 if (IG->getMember(IF))
4356 Indices.push_back(IF);
4357
4358 // Calculate the cost of the whole interleaved group.
4359 InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
4360 InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
4361 IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
4362
4363 if (!IG->isReverse())
4364 return Cost;
4365
4366 return Cost + IG->getNumMembers() *
4367 Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
4368 VectorTy, VectorTy, {}, Ctx.CostKind,
4369 0);
4370}
4371
4373 return vputils::onlyScalarValuesUsed(this) &&
4374 (!IsScalable || vputils::onlyFirstLaneUsed(this));
4375}
4376
4377#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4379 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4380 assert((getNumOperands() == 3 || getNumOperands() == 5) &&
4381 "unexpected number of operands");
4382 O << Indent << "EMIT ";
4384 O << " = WIDEN-POINTER-INDUCTION ";
4386 O << ", ";
4388 O << ", ";
4390 if (getNumOperands() == 5) {
4391 O << ", ";
4393 O << ", ";
4395 }
4396}
4397
4399 VPSlotTracker &SlotTracker) const {
4400 O << Indent << "EMIT ";
4402 O << " = EXPAND SCEV " << *Expr;
4403}
4404#endif
4405
4407 Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
4408 Type *STy = CanonicalIV->getType();
4409 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
4410 ElementCount VF = State.VF;
4411 Value *VStart = VF.isScalar()
4412 ? CanonicalIV
4413 : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
4414 Value *VStep = Builder.CreateElementCount(
4415 STy, VF.multiplyCoefficientBy(getUnrollPart(*this)));
4416 if (VF.isVector()) {
4417 VStep = Builder.CreateVectorSplat(VF, VStep);
4418 VStep =
4419 Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
4420 }
4421 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
4422 State.set(this, CanonicalVectorIV);
4423}
4424
4425#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4427 VPSlotTracker &SlotTracker) const {
4428 O << Indent << "EMIT ";
4430 O << " = WIDEN-CANONICAL-INDUCTION ";
4432}
4433#endif
4434
4436 auto &Builder = State.Builder;
4437 // Create a vector from the initial value.
4438 auto *VectorInit = getStartValue()->getLiveInIRValue();
4439
4440 Type *VecTy = State.VF.isScalar()
4441 ? VectorInit->getType()
4442 : VectorType::get(VectorInit->getType(), State.VF);
4443
4444 BasicBlock *VectorPH =
4445 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4446 if (State.VF.isVector()) {
4447 auto *IdxTy = Builder.getInt32Ty();
4448 auto *One = ConstantInt::get(IdxTy, 1);
4449 IRBuilder<>::InsertPointGuard Guard(Builder);
4450 Builder.SetInsertPoint(VectorPH->getTerminator());
4451 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
4452 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4453 VectorInit = Builder.CreateInsertElement(
4454 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
4455 }
4456
4457 // Create a phi node for the new recurrence.
4458 PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
4459 Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
4460 Phi->addIncoming(VectorInit, VectorPH);
4461 State.set(this, Phi);
4462}
4463
4466 VPCostContext &Ctx) const {
4467 if (VF.isScalar())
4468 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
4469
4470 return 0;
4471}
4472
4473#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4475 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4476 O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
4478 O << " = phi ";
4480}
4481#endif
4482
4484 // Reductions do not have to start at zero. They can start with
4485 // any loop invariant values.
4486 VPValue *StartVPV = getStartValue();
4487
4488 // In order to support recurrences we need to be able to vectorize Phi nodes.
4489 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4490 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4491 // this value when we vectorize all of the instructions that use the PHI.
4492 BasicBlock *VectorPH =
4493 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4494 bool ScalarPHI = State.VF.isScalar() || isInLoop();
4495 Value *StartV = State.get(StartVPV, ScalarPHI);
4496 Type *VecTy = StartV->getType();
4497
4498 BasicBlock *HeaderBB = State.CFG.PrevBB;
4499 assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
4500 "recipe must be in the vector loop header");
4501 auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
4502 Phi->insertBefore(HeaderBB->getFirstInsertionPt());
4503 State.set(this, Phi, isInLoop());
4504
4505 Phi->addIncoming(StartV, VectorPH);
4506}
4507
4508#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4510 VPSlotTracker &SlotTracker) const {
4511 O << Indent << "WIDEN-REDUCTION-PHI ";
4512
4514 O << " = phi";
4515 printFlags(O);
4517 if (getVFScaleFactor() > 1)
4518 O << " (VF scaled by 1/" << getVFScaleFactor() << ")";
4519}
4520#endif
4521
4523 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
4524 return vputils::onlyFirstLaneUsed(this);
4525}
4526
4528 Value *Op0 = State.get(getOperand(0));
4529 Type *VecTy = Op0->getType();
4530 Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
4531 State.set(this, VecPhi);
4532}
4533
4535 VPCostContext &Ctx) const {
4536 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
4537}
4538
4539#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4541 VPSlotTracker &SlotTracker) const {
4542 O << Indent << "WIDEN-PHI ";
4543
4545 O << " = phi ";
4547}
4548#endif
4549
4551 BasicBlock *VectorPH =
4552 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4553 Value *StartMask = State.get(getOperand(0));
4554 PHINode *Phi =
4555 State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
4556 Phi->addIncoming(StartMask, VectorPH);
4557 State.set(this, Phi);
4558}
4559
4560#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4562 VPSlotTracker &SlotTracker) const {
4563 O << Indent << "ACTIVE-LANE-MASK-PHI ";
4564
4566 O << " = phi ";
4568}
4569#endif
4570
4571#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4573 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4574 O << Indent << "CURRENT-ITERATION-PHI ";
4575
4577 O << " = phi ";
4579}
4580#endif
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static MCDisassembler::DecodeStatus addOperand(MCInst &Inst, const MCOperand &Opnd)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Value * getPointer(Value *Ptr)
iv users
Definition IVUsers.cpp:48
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file provides a LoopVectorizationPlanner class.
static const SCEV * getAddressAccessSCEV(Value *Ptr, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets the address access SCEV for Ptr, if it should be used for cost modeling according to isAddressSC...
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
static bool isOrdered(const Instruction *I)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This file defines less commonly used SmallVector utilities.
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:119
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file contains the declarations of different VPlan-related auxiliary helpers.
static bool isPredicatedUniformMemOpAfterTailFolding(const VPReplicateRecipe &R, const SCEV *PtrSCEV, VPCostContext &Ctx)
Return true if R is a predicated load/store with a loop-invariant address only masked by the header m...
static Value * interleaveVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vals, const Twine &Name)
Return a vector containing interleaved elements from multiple smaller input vectors.
static InstructionCost getCostForIntrinsics(Intrinsic::ID ID, ArrayRef< const VPValue * > Operands, const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx)
Compute the cost for the intrinsic ID with Operands, produced by R.
static Value * createBitOrPointerCast(IRBuilderBase &Builder, Value *V, VectorType *DstVTy, const DataLayout &DL)
SmallVector< Value *, 2 > VectorParts
static void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPLane &Lane, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
static unsigned getCalledFnOperandIndex(const VPInstruction &VPI)
For call VPInstructions, return the operand index of the called function.
This file contains the declarations of the Vectorization Plan base classes:
void printAsOperand(OutputBuffer &OB, Prec P=Prec::Default, bool StrictlyWorse=false) const
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:986
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
static LLVM_ABI StringRef getPredicateName(Predicate P)
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getUnknown()
Definition DebugLoc.h:161
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
LLVM_ABI void print(raw_ostream &O) const
Print fast-math flags to O.
Definition Operator.cpp:283
void setAllowContract(bool B=true)
Definition FMF.h:93
bool noSignedZeros() const
Definition FMF.h:70
bool noInfs() const
Definition FMF.h:69
void setAllowReciprocal(bool B=true)
Definition FMF.h:90
bool allowReciprocal() const
Definition FMF.h:71
void setNoSignedZeros(bool B=true)
Definition FMF.h:87
bool allowReassoc() const
Flag queries.
Definition FMF.h:67
bool approxFunc() const
Definition FMF.h:73
void setNoNaNs(bool B=true)
Definition FMF.h:81
void setAllowReassoc(bool B=true)
Flag setters.
Definition FMF.h:78
bool noNaNs() const
Definition FMF.h:68
void setApproxFunc(bool B=true)
Definition FMF.h:96
void setNoInfs(bool B=true)
Definition FMF.h:84
bool allowContract() const
Definition FMF.h:72
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
bool willReturn() const
Determine if the function will return.
Definition Function.h:669
bool doesNotThrow() const
Determine if the function cannot unwind.
Definition Function.h:602
bool doesNotAccessMemory() const
Determine if the function does not access memory.
Definition Function.cpp:867
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:216
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags none()
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2627
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:571
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2681
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2615
LLVM_ABI Value * CreateVectorSpliceRight(Value *V1, Value *V2, Value *Offset, const Twine &Name="")
Create a vector.splice.right intrinsic call, or a shufflevector that produces the same result if the ...
CondBrInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition IRBuilder.h:1238
LLVM_ABI Value * CreateSelectFMF(Value *C, Value *True, Value *False, FMFSource FMFSource, const Twine &Name="", Instruction *MDFrom=nullptr)
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2674
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2693
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:586
Value * CreatePtrAdd(Value *Ptr, Value *Offset, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:2091
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:352
LLVM_ABI Value * CreateVectorReverse(Value *V, const Twine &Name="")
Return a vector value that contains the vector V reversed.
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2378
LLVM_ABI CallInst * CreateOrReduce(Value *Src)
Create a vector int OR reduction intrinsic of the source vector.
Value * CreateLogicalAnd(Value *Cond1, Value *Cond2, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.h:1782
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:529
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2508
Value * CreateNot(Value *V, const Twine &Name="")
Definition IRBuilder.h:1866
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2374
Value * CreateCountTrailingZeroElems(Type *ResTy, Value *Mask, bool ZeroIsPoison=true, const Twine &Name="")
Create a call to llvm.experimental_cttz_elts.
Definition IRBuilder.h:1176
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1461
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2120
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1444
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition IRBuilder.h:514
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1753
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2386
Value * CreateLogicalOr(Value *Cond1, Value *Cond2, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.h:1790
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2484
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1614
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1478
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2858
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isUnaryOp() const
The group of interleaved loads/stores sharing the same stride and close to each other.
uint32_t getFactor() const
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
bool isReverse() const
InstTy * getInsertPos() const
void addMetadata(InstTy *NewInst) const
Add metadata (e.g.
Align getAlign() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Information for memory intrinsic cost model.
Root of the metadata hierarchy.
Definition Metadata.h:64
LLVM_ABI void print(raw_ostream &OS, const Module *M=nullptr, bool IsForDebug=false) const
Print.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
This class represents the LLVM 'select' instruction.
This class provides computation of slot numbers for LLVM Assembly writing.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
@ Load
The value being inserted comes from a load (InsertElement only).
@ Store
The extracted value is stored (ExtractElement only).
LLVM_ABI InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, const Value *Op0=nullptr, const Value *Op1=nullptr, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
@ TCC_Free
Expected to fold away in lowering.
LLVM_ABI InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:290
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:65
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:286
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
value_op_iterator value_op_end()
Definition User.h:288
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
value_op_iterator value_op_begin()
Definition User.h:285
void execute(VPTransformState &State) override
Generate the active lane mask phi of the vector loop.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4146
RecipeListTy & getRecipeList()
Returns a reference to the list of recipes.
Definition VPlan.h:4199
iterator end()
Definition VPlan.h:4183
const VPRecipeBase & front() const
Definition VPlan.h:4193
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition VPlan.h:4212
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenMemoryRecipe.
VPValue * getIncomingValue(unsigned Idx) const
Return incoming value number Idx.
Definition VPlan.h:2810
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2805
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2801
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:97
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:225
VPlan * getPlan()
Definition VPlan.cpp:178
void printAsOperand(raw_ostream &OS, bool PrintType=false) const
Definition VPlan.h:367
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:183
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPBranchOnMaskRecipe.
void execute(VPTransformState &State) override
Generate the extraction of the appropriate bit from the block mask and the conditional branch.
VPlan-based builder utility analogous to IRBuilder.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:504
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:477
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:489
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:499
VPIRValue * getStartValue() const
Definition VPlan.h:3938
VPValue * getStepValue() const
Definition VPlan.h:3940
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void decompose()
Insert the recipes of the expression back into the VPlan, directly before the current recipe.
bool isSingleScalar() const
Returns true if the result of this VPExpressionRecipe is a single-scalar.
bool mayHaveSideEffects() const
Returns true if this expression contains recipes that may have side effects.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Compute the cost of this recipe either using a recipe's specialized implementation or using the legac...
bool mayReadOrWriteMemory() const
Returns true if this expression contains recipes that may read from or write to memory.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this header phi recipe.
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2330
void execute(VPTransformState &State) override
Produce a vectorized histogram operation.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPHistogramRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getMask() const
Return the mask operand if one was provided, or a null pointer if all lanes should be executed uncond...
Definition VPlan.h:2073
Class to record and manage LLVM IR flags.
Definition VPlan.h:691
FastMathFlagsTy FMFs
Definition VPlan.h:779
ReductionFlagsTy ReductionFlags
Definition VPlan.h:781
LLVM_ABI_FOR_TEST bool hasRequiredFlagsForOpcode(unsigned Opcode) const
Returns true if Opcode has its required flags set.
LLVM_ABI_FOR_TEST bool flagsValidForOpcode(unsigned Opcode) const
Returns true if the set flags are valid for Opcode.
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
WrapFlagsTy WrapFlags
Definition VPlan.h:773
void printFlags(raw_ostream &O) const
bool hasFastMathFlags() const
Returns true if the recipe has fast-math flags.
Definition VPlan.h:996
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
bool isReductionOrdered() const
Definition VPlan.h:1060
TruncFlagsTy TruncFlags
Definition VPlan.h:774
CmpInst::Predicate getPredicate() const
Definition VPlan.h:968
ExactFlagsTy ExactFlags
Definition VPlan.h:776
void intersectFlags(const VPIRFlags &Other)
Only keep flags also present in Other.
uint8_t GEPFlagsStorage
Definition VPlan.h:777
GEPNoWrapFlags getGEPNoWrapFlags() const
Definition VPlan.h:986
bool hasPredicate() const
Returns true if the recipe has a comparison predicate.
Definition VPlan.h:991
DisjointFlagsTy DisjointFlags
Definition VPlan.h:775
FCmpFlagsTy FCmpFlags
Definition VPlan.h:780
NonNegFlagsTy NonNegFlags
Definition VPlan.h:778
bool isReductionInLoop() const
Definition VPlan.h:1066
void applyFlags(Instruction &I) const
Apply the IR flags to I.
Definition VPlan.h:925
uint8_t CmpPredStorage
Definition VPlan.h:772
RecurKind getRecurKind() const
Definition VPlan.h:1054
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPIRInstruction.
VPIRInstruction(Instruction &I)
VPIRInstruction::create() should be used to create VPIRInstructions, as subclasses may need to be cre...
Definition VPlan.h:1690
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
VPIRMetadata()=default
void print(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print metadata with node IDs.
void applyMetadata(Instruction &I) const
Add all metadata to I.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the instruction.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1226
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPInstruction.
bool doesGeneratePerAllLanes() const
Returns true if this VPInstruction generates scalar values for all lanes.
@ ExtractLastActive
Extracts the last active lane from a set of vectors.
Definition VPlan.h:1332
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1323
@ ExitingIVValue
Compute the exiting value of a wide induction after vectorization, that is the value of the last lane...
Definition VPlan.h:1339
@ WideIVStep
Scale the first operand (vector step) by the second operand (scalar-step).
Definition VPlan.h:1313
@ ResumeForEpilogue
Explicit user for the resume phi of the canonical induction in the main VPlan, used by the epilogue v...
Definition VPlan.h:1326
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1266
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1317
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1261
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1258
@ VScale
Returns the value for vscale.
Definition VPlan.h:1335
@ CanonicalIVIncrementForPart
Definition VPlan.h:1242
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1269
bool hasResult() const
Definition VPlan.h:1417
bool opcodeMayReadOrWriteFromMemory() const
Returns true if the underlying opcode may read from or write to memory.
LLVM_DUMP_METHOD void dump() const
Print the VPInstruction to dbgs() (for debugging).
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the VPInstruction to O.
StringRef getName() const
Returns the symbolic name assigned to the VPInstruction.
Definition VPlan.h:1497
unsigned getOpcode() const
Definition VPlan.h:1401
VPInstruction(unsigned Opcode, ArrayRef< VPValue * > Operands, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
bool isVectorToScalar() const
Returns true if this VPInstruction produces a scalar value from a vector, e.g.
bool isSingleScalar() const
Returns true if this VPInstruction's operands are single scalars and the result is also a single scal...
unsigned getNumOperandsForOpcode() const
Return the number of operands determined by the opcode of the VPInstruction, excluding mask.
bool isMasked() const
Returns true if the VPInstruction has a mask operand.
Definition VPlan.h:1442
void execute(VPTransformState &State) override
Generate the instruction.
bool usesFirstPartOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first part of operand Op.
bool needsMaskForGaps() const
Return true if the access needs a mask because of the gaps.
Definition VPlan.h:2914
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this recipe.
Instruction * getInsertPos() const
Definition VPlan.h:2918
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2916
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2908
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2937
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:2902
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:3011
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:3024
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:2974
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
static VPLane getLastLaneForVF(const ElementCount &VF)
static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset)
static VPLane getFirstLane()
virtual const VPRecipeBase * getAsRecipe() const =0
Return a VPRecipeBase* to the current object.
VPValue * getIncomingValueForBlock(const VPBasicBlock *VPBB) const
Returns the incoming value for VPBB. VPBB must be an incoming block.
virtual unsigned getNumIncoming() const
Returns the number of incoming values, also number of incoming blocks.
Definition VPlan.h:1604
void removeIncomingValueFor(VPBlockBase *IncomingBlock) const
Removes the incoming value for IncomingBlock, which must be a predecessor.
const VPBasicBlock * getIncomingBlock(unsigned Idx) const
Returns the incoming block with index Idx.
Definition VPlan.h:4290
detail::zippy< llvm::detail::zip_first, VPUser::const_operand_range, const_incoming_blocks_range > incoming_values_and_blocks() const
Returns an iterator range over pairs of incoming values and corresponding incoming blocks.
Definition VPlan.h:1629
VPValue * getIncomingValue(unsigned Idx) const
Returns the incoming VPValue with index Idx.
Definition VPlan.h:1589
void printPhiOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the recipe.
void setIncomingValueForBlock(const VPBasicBlock *VPBB, VPValue *V) const
Sets the incoming value for VPBB to V.
void execute(VPTransformState &State) override
Generates phi nodes for live-outs (from a replicate region) as needed to retain SSA form.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:405
bool mayReadFromMemory() const
Returns true if the recipe may read from memory.
bool mayHaveSideEffects() const
Returns true if the recipe may have side-effects.
virtual void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const =0
Each concrete VPRecipe prints itself, without printing common information, like debug info or metadat...
VPRegionBlock * getRegion()
Definition VPlan.h:4491
LLVM_ABI_FOR_TEST void dump() const
Dump the recipe to stderr (for debugging).
Definition VPlan.cpp:117
bool isPhi() const
Returns true for PHI-like recipes.
bool mayWriteToMemory() const
Returns true if the recipe may write to memory.
virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const
Compute the cost of this recipe either using a recipe's specialized implementation or using the legac...
VPBasicBlock * getParent()
Definition VPlan.h:479
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:557
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
bool isSafeToSpeculativelyExecute() const
Return true if we can safely execute this recipe unconditionally even if it is masked originally.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this recipe, taking into account if the cost computation should be skipped and the...
bool isScalarCast() const
Return true if the recipe is a scalar cast.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const
Print the recipe, delegating to printRecipe().
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
unsigned getVPRecipeID() const
Definition VPlan.h:525
void moveAfter(VPRecipeBase *MovePos)
Unlink this recipe from its current VPBasicBlock and insert it into the VPBasicBlock that MovePos liv...
VPRecipeBase(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:469
friend class VPValue
Definition VPlanValue.h:310
void execute(VPTransformState &State) override
Generate the reduction in the loop.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:3172
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2725
bool isInLoop() const
Returns true if the phi is part of an in-loop reduction.
Definition VPlan.h:2749
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool isConditional() const
Return true if the in-loop reduction is conditional.
Definition VPlan.h:3114
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of VPReductionRecipe.
VPValue * getVecOp() const
The VPValue of the vector value to be reduced.
Definition VPlan.h:3125
VPValue * getCondOp() const
The VPValue of the condition for the block.
Definition VPlan.h:3127
RecurKind getRecurrenceKind() const
Return the recurrence kind for the in-loop reduction.
Definition VPlan.h:3110
bool isPartialReduction() const
Returns true if the reduction outputs a vector with a scaled down VF.
Definition VPlan.h:3116
VPValue * getChainOp() const
The VPValue of the scalar Chain being accumulated.
Definition VPlan.h:3123
bool isInLoop() const
Returns true if the reduction is in-loop.
Definition VPlan.h:3118
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the reduction in the loop.
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4356
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4432
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3194
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool isSingleScalar() const
Definition VPlan.h:3235
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPReplicateRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getOpcode() const
Definition VPlan.h:3259
VPValue * getStepValue() const
Definition VPlan.h:4010
VPValue * getStartIndex() const
Return the StartIndex, or null if known to be zero, valid only after unrolling.
Definition VPlan.h:4018
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the scalarized versions of the phi node as needed by their users.
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:609
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:676
LLVM_ABI_FOR_TEST LLVM_DUMP_METHOD void dump() const
Print this VPSingleDefRecipe to dbgs() (for debugging).
VPSingleDefRecipe(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:611
This class can be used to assign names to VPValues.
An analysis for type-inference for VPValues.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
Helper to access the operand that contains the unroll part for this recipe after unrolling.
Definition VPlan.h:1159
VPValue * getUnrollPartOperand(const VPUser &U) const
Return the VPValue operand containing the unroll part or null if there is no such operand.
unsigned getUnrollPart(const VPUser &U) const
Return the unroll part.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:335
void printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the operands to O.
Definition VPlan.cpp:1526
operand_range operands()
Definition VPlanValue.h:403
unsigned getNumOperands() const
Definition VPlanValue.h:373
operand_iterator op_begin()
Definition VPlanValue.h:399
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:374
virtual bool usesFirstLaneOnly(const VPValue *Op) const
Returns true if the VPUser only uses the first lane of operand Op.
Definition VPlanValue.h:418
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:49
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:138
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1477
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:128
void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const
Definition VPlan.cpp:1522
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:74
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:202
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1480
VPValue * getVFValue() const
Definition VPlan.h:2171
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getSourceElementType() const
Definition VPlan.h:2168
int64_t getStride() const
Definition VPlan.h:2169
void materializeOffset(unsigned Part=0)
Adds the offset operand to the recipe.
Type * getSourceElementType() const
Definition VPlan.h:2240
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
operand_range args()
Definition VPlan.h:2028
Function * getCalledScalarFunction() const
Definition VPlan.h:2024
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCallRecipe.
void execute(VPTransformState &State) override
Produce a widened version of the call instruction.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate a canonical vector induction variable of the vector loop, with start = {<Part*VF,...
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Instruction::CastOps getOpcode() const
Definition VPlan.h:1874
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getResultType() const
Returns the result type of the cast.
Definition VPlan.h:1877
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Produce widened copies of the cast.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCastRecipe.
void execute(VPTransformState &State) override
Generate the gep nodes.
Type * getSourceElementType() const
Definition VPlan.h:2125
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2393
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2396
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2494
TruncInst * getTruncInst()
Returns the first defined value as TruncInst, if it is one or nullptr otherwise.
Definition VPlan.h:2509
Type * getScalarType() const
Returns the scalar type of the induction.
Definition VPlan.h:2518
bool isCanonical() const
Returns true if the induction is canonical, i.e.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Intrinsic::ID getVectorIntrinsicID() const
Return the ID of the intrinsic.
Definition VPlan.h:1959
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
StringRef getIntrinsicName() const
Return to name of the intrinsic as string.
LLVM_ABI_FOR_TEST bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the VPUser only uses the first lane of operand Op.
Type * getResultType() const
Return the scalar return type of the intrinsic.
Definition VPlan.h:1962
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Produce a widened version of the vector intrinsic.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this vector intrinsic.
bool IsMasked
Whether the memory access is masked.
Definition VPlan.h:3504
bool isConsecutive() const
Return whether the loaded-from / stored-to addresses are consecutive.
Definition VPlan.h:3539
Instruction & Ingredient
Definition VPlan.h:3495
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenMemoryRecipe.
bool Consecutive
Whether the accessed addresses are consecutive.
Definition VPlan.h:3501
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3549
Align Alignment
Alignment information for this memory access.
Definition VPlan.h:3498
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:3542
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenPHIRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool onlyScalarsGenerated(bool IsScalable)
Returns true if only scalar values will be generated.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
void execute(VPTransformState &State) override
Produce a widened instruction using the opcode and operands of the recipe, processing State....
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getOpcode() const
Definition VPlan.h:1817
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition VPlan.h:4504
const DataLayout & getDataLayout() const
Definition VPlan.h:4709
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1067
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4811
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:393
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
void mutateType(Type *Ty)
Mutate the type of this Value to be of the specified type.
Definition Value.h:816
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
iterator erase(iterator where)
Definition ilist.h:204
pointer remove(iterator &IT)
Definition ilist.h:188
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor)
Returns the corresponding llvm.vector.deinterleaveN intrinsic for factor N.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI StringRef getBaseName(ID id)
Return the LLVM name for an intrinsic, without encoded types for overloading, such as "llvm....
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
auto m_Cmp()
Matches any compare instruction and ignore it.
bool match(Val *V, const Pattern &P)
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::Or, true > m_c_LogicalOr(const LHS &L, const RHS &R)
Matches L || R with LHS and RHS in either order.
specific_intval< 1 > m_False()
specific_intval< 1 > m_True()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isAddressSCEVForCost(const SCEV *Addr, ScalarEvolution &SE, const Loop *L)
Returns true if Addr is an address SCEV that can be passed to TTI::getAddressComputationCost,...
bool onlyFirstPartUsed(const VPValue *Def)
Returns true if only the first part of Def is used.
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isUsedByLoadStoreAddress(const VPValue *V)
Returns true if V is used as part of the address of another load or store.
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
@ Offset
Definition DWP.cpp:557
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:830
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
InstructionCost Cost
@ Undef
Value of the register doesn't matter.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
auto map_to_vector(ContainerTy &&C, FuncTy &&F)
Map a range to a SmallVector with element types deduced from the mapping.
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2312
auto cast_or_null(const Y &Val)
Definition Casting.h:714
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI Value * createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, Value *Right)
Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI Constant * createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Type * toVectorizedTy(Type *Ty, ElementCount EC)
A helper for converting to vectorized types.
cl::opt< unsigned > ForceTargetInstructionCost
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
LLVM_ABI bool isVectorIntrinsicWithStructReturnOverloadAtField(Intrinsic::ID ID, int RetIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic that returns a struct is overloaded at the struct elem...
@ Other
Any other memory.
Definition ModRef.h:68
bool canVectorizeTy(Type *Ty)
Returns true if Ty is a valid vector element type, void, or an unpacked literal struct where all elem...
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FMinimumNum
FP min with llvm.minimumnum semantics.
@ FMinimum
FP min with llvm.minimum semantics.
@ FMaxNum
FP max with llvm.maxnum semantics including NaNs.
@ Mul
Product of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ FindLast
FindLast reduction with select(cmp(),x,y) where x and y.
@ FMaximum
FP max with llvm.maximum semantics.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMinNum
FP min with llvm.minnum semantics including NaNs.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
@ FMaximumNum
FP max with llvm.maximumnum semantics.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about an recurrence kind, return the identity for the @llvm.vector....
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
LLVM_ABI Value * createOrderedReduction(IRBuilderBase &B, RecurKind RdxKind, Value *Src, Value *Start)
Create an ordered reduction intrinsic using the given recurrence kind RdxKind.
ArrayRef< Type * > getContainedTypes(Type *const &Ty)
Returns the types contained in Ty.
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Struct to hold various analysis needed for cost computations.
LLVMContext & LLVMCtx
static bool isFreeScalarIntrinsic(Intrinsic::ID ID)
Returns true if ID is a pseudo intrinsic that is dropped via scalarization rather than widened.
Definition VPlan.cpp:1947
TargetTransformInfo::TargetCostKind CostKind
VPTypeAnalysis Types
const TargetTransformInfo & TTI
void execute(VPTransformState &State) override
Generate the phi nodes.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this first-order recurrence phi recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
An overlay for VPIRInstructions wrapping PHI nodes enabling convenient use cast/dyn_cast/isa and exec...
Definition VPlan.h:1748
PHINode & getIRPhi()
Definition VPlan.h:1761
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void execute(VPTransformState &State) override
Generate the instruction.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
A pure-virtual common base class for recipes defining a single VPValue and using IR flags.
Definition VPlan.h:1113
InstructionCost getCostForRecipeWithOpcode(unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const
Compute the cost for this recipe for VF, using Opcode and Ctx.
VPRecipeWithIRFlags(const unsigned char SC, ArrayRef< VPValue * > Operands, const VPIRFlags &Flags, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:1114
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:280
SmallDenseMap< const VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
struct llvm::VPTransformState::CFGState CFG
Value * get(const VPValue *Def, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def if IsScalar is false, otherwise return the gen...
Definition VPlan.cpp:280
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
ElementCount VF
The chosen Vectorization Factor of the loop being vectorized.
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Generate the wide load or gather.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenLoadEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3625
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate a wide load or gather.
VPValue * getStoredValue() const
Return the address accessed by this recipe.
Definition VPlan.h:3708
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Generate the wide store or scatter.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenStoreEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3711
void execute(VPTransformState &State) override
Generate a wide store or scatter.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition VPlan.h:3671