LLVM 23.0.0git
VPlanRecipes.cpp
Go to the documentation of this file.
1//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains implementations for different VPlan recipes.
11///
12//===----------------------------------------------------------------------===//
13
15#include "VPlan.h"
16#include "VPlanHelpers.h"
17#include "VPlanPatternMatch.h"
18#include "VPlanUtils.h"
19#include "llvm/ADT/STLExtras.h"
22#include "llvm/ADT/Twine.h"
27#include "llvm/IR/BasicBlock.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/Instruction.h"
31#include "llvm/IR/Intrinsics.h"
32#include "llvm/IR/Type.h"
33#include "llvm/IR/Value.h"
36#include "llvm/Support/Debug.h"
40#include <cassert>
41
42using namespace llvm;
43using namespace llvm::VPlanPatternMatch;
44
46
47#define LV_NAME "loop-vectorize"
48#define DEBUG_TYPE LV_NAME
49
// Body of a recipe query answering "may this recipe write to memory?".
// NOTE(review): the signature (original line 50) is missing from this listing;
// presumably VPRecipeBase::mayWriteToMemory() const -- confirm against upstream.
// Dispatches on the recipe ID; recipe kinds not listed below conservatively
// answer true (see the default case).
51 switch (getVPRecipeID()) {
52 case VPExpressionSC:
53 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
54 case VPInstructionSC: {
55 auto *VPI = cast<VPInstruction>(this);
56 // Loads read from memory but don't write to memory.
57 if (VPI->getOpcode() == Instruction::Load)
58 return false;
59 return VPI->opcodeMayReadOrWriteFromMemory();
60 }
61 case VPInterleaveEVLSC:
62 case VPInterleaveSC:
// An interleave recipe writes only if it has at least one store operand.
63 return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
64 case VPWidenStoreEVLSC:
65 case VPWidenStoreSC:
66 return true;
67 case VPReplicateSC:
// Defer to the underlying IR instruction; cast<> requires it to be present.
68 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
69 ->mayWriteToMemory();
70 case VPWidenCallSC:
// A widened call may write unless the called scalar function only reads.
71 return !cast<VPWidenCallRecipe>(this)
72 ->getCalledScalarFunction()
73 ->onlyReadsMemory();
74 case VPWidenMemIntrinsicSC:
75 case VPWidenIntrinsicSC:
76 return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
77 case VPActiveLaneMaskPHISC:
78 case VPCurrentIterationPHISC:
79 case VPBranchOnMaskSC:
80 case VPDerivedIVSC:
81 case VPFirstOrderRecurrencePHISC:
82 case VPReductionPHISC:
83 case VPScalarIVStepsSC:
84 case VPPredInstPHISC:
85 return false;
86 case VPBlendSC:
87 case VPReductionEVLSC:
88 case VPReductionSC:
89 case VPVectorPointerSC:
90 case VPWidenCanonicalIVSC:
91 case VPWidenCastSC:
92 case VPWidenGEPSC:
93 case VPWidenIntOrFpInductionSC:
94 case VPWidenLoadEVLSC:
95 case VPWidenLoadSC:
96 case VPWidenPHISC:
97 case VPWidenPointerInductionSC:
98 case VPWidenSC: {
// These kinds never write; in asserts builds, cross-check against the
// underlying IR instruction when there is one.
99 const Instruction *I =
100 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
101 (void)I;
102 assert((!I || !I->mayWriteToMemory()) &&
103 "underlying instruction may write to memory");
104 return false;
105 }
106 default:
107 return true;
108 }
109}
110
// Body of a recipe query answering "may this recipe read from memory?".
// NOTE(review): the signature (original line 111) is missing from this listing;
// presumably VPRecipeBase::mayReadFromMemory() const -- confirm against upstream.
// Dispatches on the recipe ID; unlisted recipe kinds conservatively answer true.
112 switch (getVPRecipeID()) {
113 case VPExpressionSC:
114 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
115 case VPInstructionSC:
116 return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
117 case VPWidenLoadEVLSC:
118 case VPWidenLoadSC:
119 return true;
120 case VPReplicateSC:
// Defer to the underlying IR instruction; cast<> requires it to be present.
121 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
122 ->mayReadFromMemory();
123 case VPWidenCallSC:
// A widened call may read unless the called scalar function only writes.
124 return !cast<VPWidenCallRecipe>(this)
125 ->getCalledScalarFunction()
126 ->onlyWritesMemory();
127 case VPWidenMemIntrinsicSC:
128 case VPWidenIntrinsicSC:
129 return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
130 case VPBranchOnMaskSC:
131 case VPDerivedIVSC:
132 case VPCurrentIterationPHISC:
133 case VPFirstOrderRecurrencePHISC:
134 case VPReductionPHISC:
135 case VPPredInstPHISC:
136 case VPScalarIVStepsSC:
137 case VPWidenStoreEVLSC:
138 case VPWidenStoreSC:
139 return false;
140 case VPBlendSC:
141 case VPReductionEVLSC:
142 case VPReductionSC:
143 case VPVectorPointerSC:
144 case VPWidenCanonicalIVSC:
145 case VPWidenCastSC:
146 case VPWidenGEPSC:
147 case VPWidenIntOrFpInductionSC:
148 case VPWidenPHISC:
149 case VPWidenPointerInductionSC:
150 case VPWidenSC: {
// These kinds never read; in asserts builds, cross-check against the
// underlying IR instruction when there is one.
151 const Instruction *I =
152 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
153 (void)I;
154 assert((!I || !I->mayReadFromMemory()) &&
155 "underlying instruction may read from memory");
156 return false;
157 }
158 default:
// Interleave recipes have no dedicated case here, so they land in this
// conservative default even when they only store (hence the FIXME).
159 // FIXME: Return false if the recipe represents an interleaved store.
160 return true;
161 }
162}
163
// Body of a recipe query answering "may this recipe have side effects?".
// NOTE(review): the signature (original line 164) is missing from this listing;
// presumably VPRecipeBase::mayHaveSideEffects() const -- confirm against
// upstream. Unlisted recipe kinds conservatively answer true.
165 switch (getVPRecipeID()) {
166 case VPExpressionSC:
167 return cast<VPExpressionRecipe>(this)->mayHaveSideEffects();
168 case VPActiveLaneMaskPHISC:
169 case VPDerivedIVSC:
170 case VPCurrentIterationPHISC:
171 case VPFirstOrderRecurrencePHISC:
172 case VPReductionPHISC:
173 case VPPredInstPHISC:
174 case VPVectorEndPointerSC:
175 return false;
176 case VPInstructionSC: {
// Besides memory writes, the three branch opcodes count as side effects.
177 auto *VPI = cast<VPInstruction>(this);
178 return mayWriteToMemory() ||
179 VPI->getOpcode() == VPInstruction::BranchOnCount ||
180 VPI->getOpcode() == VPInstruction::BranchOnCond ||
181 VPI->getOpcode() == VPInstruction::BranchOnTwoConds;
182 }
183 case VPWidenCallSC: {
// A call has side effects if it may write, may throw, or may not return.
184 Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
185 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
186 }
187 case VPWidenMemIntrinsicSC:
188 case VPWidenIntrinsicSC:
189 return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
190 case VPBlendSC:
191 case VPReductionEVLSC:
192 case VPReductionSC:
193 case VPScalarIVStepsSC:
194 case VPVectorPointerSC:
195 case VPWidenCanonicalIVSC:
196 case VPWidenCastSC:
197 case VPWidenGEPSC:
198 case VPWidenIntOrFpInductionSC:
199 case VPWidenPHISC:
200 case VPWidenPointerInductionSC:
201 case VPWidenSC: {
// Side-effect free; in asserts builds, cross-check against the underlying IR
// instruction when there is one.
202 const Instruction *I =
203 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
204 (void)I;
205 assert((!I || !I->mayHaveSideEffects()) &&
206 "underlying instruction has side-effects");
207 return false;
208 }
209 case VPInterleaveEVLSC:
210 case VPInterleaveSC:
211 return mayWriteToMemory();
212 case VPWidenLoadEVLSC:
213 case VPWidenLoadSC:
214 case VPWidenStoreEVLSC:
215 case VPWidenStoreSC:
// NOTE(review): original line 218 (the right-hand side of the == below) is
// missing from this listing, so the assert is incomplete as shown.
216 assert(
217 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
219 "mayHaveSideffects result for ingredient differs from this "
220 "implementation");
221 return mayWriteToMemory();
222 case VPReplicateSC: {
223 auto *R = cast<VPReplicateRecipe>(this);
224 return R->getUnderlyingInstr()->mayHaveSideEffects();
225 }
226 default:
227 return true;
228 }
229}
230
// Body of a boolean recipe predicate whose signature (original line 231) is
// missing from this listing. NOTE(review): the function's name and intended
// meaning are not visible here -- confirm against upstream.
// As written it returns true only for VPInstruction recipes whose opcode is a
// cast, Add, Sub, Mul or GetElementPtr, and false for everything else.
232 switch (getVPRecipeID()) {
233 default:
234 return false;
235 case VPInstructionSC: {
236 unsigned Opcode = cast<VPInstruction>(this)->getOpcode();
237 if (Instruction::isCast(Opcode))
238 return true;
239
240 switch (Opcode) {
241 default:
242 return false;
243 case Instruction::Add:
244 case Instruction::Sub:
245 case Instruction::Mul:
246 case Instruction::GetElementPtr:
247 return true;
248 }
249 }
250 }
251}
252
// Body of an insertion helper taking a recipe position `InsertPos`; the
// signature (original line 253) is missing from this listing.
// NOTE(review): presumably VPRecipeBase::insertBefore(VPRecipeBase *InsertPos).
// Requires that this recipe is unlinked and that InsertPos lives in a block,
// then inserts this recipe at InsertPos's iterator in that block.
254 assert(!Parent && "Recipe already in some VPBasicBlock");
255 assert(InsertPos->getParent() &&
256 "Insertion position not in any VPBasicBlock");
257 InsertPos->getParent()->insert(this, InsertPos->getIterator());
258}
259
// Inserts this (currently unlinked) recipe into \p BB at iterator position I.
// I must be BB.end() or point at a recipe that belongs to BB.
// NOTE(review): original line 261 (the declaration of parameter `I` and the
// opening brace) is missing from this listing.
260void VPRecipeBase::insertBefore(VPBasicBlock &BB,
262 assert(!Parent && "Recipe already in some VPBasicBlock");
263 assert(I == BB.end() || I->getParent() == &BB);
264 BB.insert(this, I);
265}
266
// Body of an insertion helper taking a recipe position `InsertPos`; the
// signature (original line 267) is missing from this listing.
// NOTE(review): presumably VPRecipeBase::insertAfter(VPRecipeBase *InsertPos).
// Requires that this recipe is unlinked and that InsertPos lives in a block,
// then inserts this recipe at the position following InsertPos.
268 assert(!Parent && "Recipe already in some VPBasicBlock");
269 assert(InsertPos->getParent() &&
270 "Insertion position not in any VPBasicBlock");
271 InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
272}
273
// Body of an unlink helper; the signature (original line 274) and the
// statement on original line 276 are missing from this listing.
// NOTE(review): presumably VPRecipeBase::removeFromParent(). What is visible:
// the recipe must currently be in a block, and Parent is cleared afterwards.
275 assert(getParent() && "Recipe not in any VPBasicBlock");
277 Parent = nullptr;
278}
279
// Body of a second unlink helper; the signature (original line 280) and the
// statement on original line 282 are missing from this listing.
// NOTE(review): presumably VPRecipeBase::eraseFromParent() -- confirm against
// upstream. Only the precondition (recipe is in a block) is visible.
281 assert(getParent() && "Recipe not in any VPBasicBlock");
283}
284
// Tail of a move helper; the signature and the preceding statement(s)
// (original lines 285-286) are missing from this listing.
// NOTE(review): presumably VPRecipeBase::moveAfter(VPRecipeBase *InsertPos).
// The visible step re-inserts this recipe after InsertPos.
287 insertAfter(InsertPos);
288}
289
295
// Body of the recipe cost entry point for vectorization factor VF using cost
// context Ctx; the signature (original line 296) is missing from this listing.
// NOTE(review): presumably VPRecipeBase::cost(ElementCount, VPCostContext &).
// Returns 0 when Ctx says the underlying instruction's cost should be skipped,
// otherwise computeCost(VF, Ctx), possibly overridden when the
// ForceTargetInstructionCost option was given on the command line.
297 // Get the underlying instruction for the recipe, if there is one. It is used
298 // to
299 // * decide if cost computation should be skipped for this recipe,
300 // * apply forced target instruction cost.
301 Instruction *UI = nullptr;
302 if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
303 UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
304 else if (auto *IG = dyn_cast<VPInterleaveBase>(this))
305 UI = IG->getInsertPos();
306 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
307 UI = &WidenMem->getIngredient();
308
309 InstructionCost RecipeCost;
310 if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
311 RecipeCost = 0;
312 } else {
313 RecipeCost = computeCost(VF, Ctx);
// The override is applied only to valid costs; invalid costs pass through.
314 if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
315 RecipeCost.isValid()) {
// NOTE(review): original line 317 (the statement guarded by `if (UI)`) is
// missing from this listing. Recipes without an underlying instruction get
// cost 0 under the forced-cost option.
316 if (UI)
318 else
319 RecipeCost = InstructionCost(0);
320 }
321 }
322
323 LLVM_DEBUG({
324 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
325 dump();
326 });
327 return RecipeCost;
328}
329
// Tail of the base-class cost hook; the first signature line (original line
// 330) is missing from this listing. NOTE(review): presumably
// VPRecipeBase::computeCost. The base version must never be reached: it aborts
// via llvm_unreachable, so subclasses are expected to override it.
331 VPCostContext &Ctx) const {
332 llvm_unreachable("subclasses should implement computeCost");
333}
334
// Body of a phi-classification predicate; the signature (original line 335)
// and the remainder of the return expression (original line 337) are missing
// from this listing. NOTE(review): presumably VPRecipeBase::isPhi() const.
// The visible part accepts recipe IDs in the range [VPFirstPHISC, VPLastPHISC];
// the alternative(s) after `||` cannot be seen here.
336 return (getVPRecipeID() >= VPFirstPHISC && getVPRecipeID() <= VPLastPHISC) ||
338}
339
// Body of a flag-intersection routine taking another flags object `Other`; the
// signature (original line 340) is missing from this listing.
// NOTE(review): presumably VPIRFlags::intersectFlags(const VPIRFlags &Other).
// Both sides must have the same OpType. Each poison-generating / fast-math
// style flag is kept only if it is set on both sides. Predicates and
// reduction properties are not intersected; they are asserted to be equal.
341 assert(OpType == Other.OpType && "OpType must match");
342 switch (OpType) {
343 case OperationType::OverflowingBinOp:
344 WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
345 WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
346 break;
347 case OperationType::Trunc:
348 TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
349 TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
350 break;
351 case OperationType::DisjointOp:
352 DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
353 break;
354 case OperationType::PossiblyExactOp:
355 ExactFlags.IsExact &= Other.ExactFlags.IsExact;
356 break;
357 case OperationType::GEPOp:
358 GEPFlagsStorage &= Other.GEPFlagsStorage;
359 break;
360 case OperationType::FPMathOp:
361 case OperationType::FCmp:
// For FCmp the predicate must already agree; it cannot be merged.
362 assert((OpType != OperationType::FCmp ||
363 FCmpFlags.CmpPredStorage == Other.FCmpFlags.CmpPredStorage) &&
364 "Cannot drop CmpPredicate");
// NOTE(review): only NoNaNs and NoInfs are intersected here; the remaining
// fast-math flags are left as they are on this object -- confirm intended.
365 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
366 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
367 break;
368 case OperationType::NonNegOp:
369 NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
370 break;
371 case OperationType::Cmp:
372 assert(CmpPredStorage == Other.CmpPredStorage &&
373 "Cannot drop CmpPredicate");
374 break;
375 case OperationType::ReductionOp:
376 assert(ReductionFlags.Kind == Other.ReductionFlags.Kind &&
377 "Cannot change RecurKind");
378 assert(ReductionFlags.IsOrdered == Other.ReductionFlags.IsOrdered &&
379 "Cannot change IsOrdered");
380 assert(ReductionFlags.IsInLoop == Other.ReductionFlags.IsInLoop &&
381 "Cannot change IsInLoop");
382 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
383 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
384 break;
385 case OperationType::Other:
386 break;
387 }
388}
389
// Body of a getter that converts the recipe's packed fast-math flag storage
// into an llvm::FastMathFlags value; the signature (original line 390) is
// missing from this listing. NOTE(review): presumably
// VPIRFlags::getFastMathFlags() const.
// Valid only for FPMathOp, FCmp, ReductionOp and Other operation types; for
// Other it returns default-constructed (empty) flags.
391 assert((OpType == OperationType::FPMathOp || OpType == OperationType::FCmp ||
392 OpType == OperationType::ReductionOp ||
393 OpType == OperationType::Other) &&
394 "recipe doesn't have fast math flags");
395 if (OpType == OperationType::Other)
396 return FastMathFlags();
// Copy each stored bit into the corresponding FastMathFlags setter.
397 const FastMathFlagsTy &F = getFMFsRef();
398 FastMathFlags Res;
399 Res.setAllowReassoc(F.AllowReassoc);
400 Res.setNoNaNs(F.NoNaNs);
401 Res.setNoInfs(F.NoInfs);
402 Res.setNoSignedZeros(F.NoSignedZeros);
403 Res.setAllowReciprocal(F.AllowReciprocal);
404 Res.setAllowContract(F.AllowContract);
405 Res.setApproxFunc(F.ApproxFunc);
406 return Res;
407}
408
409#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
411
// Prints this recipe to \p O: first the recipe-specific text via printRecipe()
// (using \p Indent and \p SlotTracker), then ", !dbg <loc>" when the recipe
// has a debug location. Compiled only when NDEBUG is unset or LLVM_ENABLE_DUMP
// is defined (see the surrounding #if).
412void VPRecipeBase::print(raw_ostream &O, const Twine &Indent,
413 VPSlotTracker &SlotTracker) const {
414 printRecipe(O, Indent, SlotTracker);
415 if (auto DL = getDebugLoc()) {
416 O << ", !dbg ";
417 DL.print(O);
418 }
419
// NOTE(review): original line 421 (the statement guarded by this `if`) is
// missing from this listing; presumably it prints the recipe's IR metadata.
420 if (auto *Metadata = dyn_cast<VPIRMetadata>(this))
422}
423#endif
424
// Member-initializer list of a constructor whose first line (original line
// 425) is missing from this listing. NOTE(review): presumably
// VPExpandSCEVRecipe's constructor taking the SCEV `Expr`.
// The recipe is created with ID VPExpandSCEVSC, no operands, and the type of
// Expr as its result type; Expr itself is stored in the member.
426 : VPSingleDefRecipe(VPRecipeBase::VPExpandSCEVSC, {}, Expr->getType()),
427 Expr(Expr) {}
428
429/// For call VPInstruction operands, return the operand index of the called
430/// function. The function is either the last operand (for unmasked calls) or
431/// the second-to-last operand (for masked calls).
// NOTE(review): the signature (original line 432) and the first line of the
// assert (original line 437) are missing from this listing. The name
// getCalledFnOperandIndex is taken from the call site in the next helper.
// Operands[NumOps - 1] is read unconditionally, so Operands must be non-empty.
433 unsigned NumOps = Operands.size();
434 auto *LastOp = dyn_cast<VPIRValue>(Operands[NumOps - 1]);
435 if (LastOp && isa<Function>(LastOp->getValue()))
436 return NumOps - 1;
438 "expected function operand");
439 return NumOps - 2;
440}
441
442/// For call VPInstruction operands, return the called function.
// NOTE(review): the signature (original line 443) is missing from this
// listing; the name getCalledFunction is taken from its use further below.
// Both cast<> calls assert that the selected operand is a VPIRValue wrapping
// a Function.
444 unsigned Idx = getCalledFnOperandIndex(Operands);
445 return cast<Function>(cast<VPIRValue>(Operands[Idx])->getValue());
446}
447
// Infers the scalar result type of an instruction from its opcode and operand
// types. NOTE(review): the first signature line (original line 448) and many
// `case` labels are missing from this listing; the name
// computeScalarTypeForInstruction is taken from the call sites below.
// Operands must be non-empty: the type of operand 0 seeds both the LLVMContext
// and the default result type. Operand-type agreement is checked in asserts
// builds only.
449 ArrayRef<VPValue *> Operands) {
450 assert(!Operands.empty() &&
451 "zero-operand VPInstruction opcodes must pass explicit ResultTy");
452 // Assert operand \p Idx (if present and typed) has type \p ExpectedTy.
453 [[maybe_unused]] auto AssertOperandType = [&Operands](unsigned Idx,
454 Type *ExpectedTy) {
455 if (!ExpectedTy || Operands.size() <= Idx)
456 return;
457 [[maybe_unused]] Type *OpTy = Operands[Idx]->getScalarType();
458 assert((!OpTy || OpTy == ExpectedTy) &&
459 "different types inferred for different operands");
460 };
461
462 Type *Op0Ty = Operands[0]->getScalarType();
463 LLVMContext &Ctx = Op0Ty->getContext();
464 switch (Opcode) {
// NOTE(review): original lines 465-467 (further case labels sharing the void
// result below) are missing from this listing.
468 case Instruction::Store:
469 case Instruction::Switch:
470 return Type::getVoidTy(Ctx);
471 case Instruction::ICmp:
472 case Instruction::FCmp:
// Compares (and whatever label was on missing line 473) yield i1.
474 AssertOperandType(1, Op0Ty);
475 return IntegerType::get(Ctx, 1);
// NOTE(review): case labels on original lines 476-478 are missing; the group
// below expects i1 operands and yields i1.
479 assert((!Op0Ty || Op0Ty->isIntegerTy(1)) && "expected bool operand");
480 AssertOperandType(1, Op0Ty);
481 return IntegerType::get(Ctx, 1);
// NOTE(review): case label on original line 482 is missing; it yields i32.
483 return IntegerType::get(Ctx, 32);
484 case Instruction::Select: {
// The result type is that of the second operand (operand 0 is the condition).
485 Type *Op1Ty = Operands[1]->getScalarType();
486 AssertOperandType(2, Op1Ty);
487 return Op1Ty;
488 }
// NOTE(review): case label on original line 489 is missing; per the assert
// message below this is the ExtractLane opcode.
490 assert(Operands.size() >= 2 && "ExtractLane requires a lane operand and "
491 "at least one source vector operand");
492 Type *Op1Ty = Operands[1]->getScalarType();
493 for (unsigned Idx = 2; Idx != Operands.size(); ++Idx)
494 AssertOperandType(Idx, Op1Ty);
495 return Op1Ty;
496 }
497 case Instruction::ExtractValue: {
// Single-index extractvalue: the result is the struct field selected by the
// constant second operand.
498 assert(Operands.size() == 2 && "expected single level extractvalue");
499 auto *StructTy = cast<StructType>(Op0Ty);
500 return StructTy->getTypeAtIndex(
501 cast<VPConstantInt>(Operands[1])->getZExtValue());
502 }
// NOTE(review): original lines 503-506 are missing from this listing.
507 case Instruction::Load:
508 case Instruction::Alloca:
509 llvm_unreachable("type must be passed explicitly");
510 case Instruction::Call:
511 return getCalledFunction(Operands)->getReturnType();
512 default:
513 break;
514 }
515
516 // Opcodes that require all operands to share the same scalar type as the
517 // result.
// NOTE(review): original lines 520-524 (the rest of this condition) are
// missing from this listing.
518 bool AllOperandsSameType =
519 Instruction::isBinaryOp(Opcode) ||
525 Opcode);
526 if (AllOperandsSameType)
527 for (unsigned Idx = 1; Idx != Operands.size(); ++Idx)
528 AssertOperandType(Idx, Op0Ty);
529
// Fallback: the result has the same scalar type as operand 0.
530 return Op0Ty;
531}
532
// Variant of the scalar-type inference that starts from an existing IR
// instruction `I`. NOTE(review): the first signature line (original line 533)
// is missing from this listing, so the exact name and the type of `I` are not
// visible here.
// For casts, ExtractValue, Load and Alloca the type is read from I itself;
// every other opcode is delegated to the opcode/operand-based overload.
534 ArrayRef<VPValue *> Operands) {
535 unsigned Opcode = I->getOpcode();
536 if (Instruction::isCast(Opcode) ||
537 is_contained(ArrayRef<unsigned>({Instruction::ExtractValue,
538 Instruction::Load, Instruction::Alloca}),
539 Opcode))
540 return I->getType();
541 return computeScalarTypeForInstruction(Opcode, Operands);
542}
543
// Constructor fragment. NOTE(review): original lines 544, 547, 553, 555 and
// 557-559 are missing from this listing (first signature line, the base-class
// initializer name, and the conditions of three asserts); presumably this is
// VPInstruction::VPInstruction -- confirm against upstream.
// Visible behaviour: when ResultTy is null the result type is inferred via
// computeScalarTypeForInstruction(Opcode, Operands); the metadata, opcode and
// name (materialised with Name.str()) are stored; the asserts validate flags
// and operand count for the opcode.
545 const VPIRFlags &Flags, const VPIRMetadata &MD,
546 DebugLoc DL, const Twine &Name, Type *ResultTy)
548 VPRecipeBase::VPInstructionSC, Operands,
549 ResultTy ? ResultTy
550 : computeScalarTypeForInstruction(Opcode, Operands),
551 Flags, DL),
552 VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) {
554 "Set flags not supported for the provided opcode");
556 "Opcode requires specific flags to be set");
560 "number of operands does not match opcode");
561}
562
// Body of a helper returning the fixed operand count for `Opcode`, or -1u when
// the count cannot be derived from the opcode alone.
// NOTE(review): the signature (original line 563) and many case labels
// (original lines 571-573, 579-591, 597-605, 609-610, 613, 618-626) are
// missing from this listing; the function name is not visible here.
564 if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
565 return 1;
566
567 if (Instruction::isBinaryOp(Opcode))
568 return 2;
569
570 switch (Opcode) {
574 return 0;
575 case Instruction::Alloca:
576 case Instruction::ExtractValue:
577 case Instruction::Freeze:
578 case Instruction::Load:
592 return 1;
593 case Instruction::ICmp:
594 case Instruction::FCmp:
595 case Instruction::ExtractElement:
596 case Instruction::Store:
606 return 2;
607 case Instruction::InsertElement:
608 case Instruction::Select:
611 return 3;
612 case Instruction::Call:
// NOTE(review): original line 613 (the start of this return expression) is
// missing; only its trailing `1;` survives on the next line.
614 1;
615 case Instruction::GetElementPtr:
616 case Instruction::PHI:
617 case Instruction::Switch:
627 // Cannot determine the number of operands from the opcode.
628 return -1u;
629 }
630 llvm_unreachable("all cases should be handled above");
631}
632
636
// Returns true if this VPInstruction's opcode is one for which a scalar value
// for the first lane can be generated.
// NOTE(review): original lines 638 and 640 (the conditions guarding the two
// early `return true` statements) and case labels on lines 647-655 are missing
// from this listing, so the full set of accepted opcodes is not visible.
637bool VPInstruction::canGenerateScalarForFirstLane() const {
639 return true;
641 return true;
642 switch (Opcode) {
643 case Instruction::Freeze:
644 case Instruction::ICmp:
645 case Instruction::PHI:
646 case Instruction::Select:
656 return true;
657 default:
658 return false;
659 }
660}
661
// Body of a helper mapping a subtracting recurrence kind to the opcode used
// when combining partial results: Sub maps to Add, FSub maps to FAdd. Any
// other kind is a programming error (llvm_unreachable).
// NOTE(review): the signature (original line 662) is missing from this
// listing, so the helper's name and return type are not visible here.
663 if (Kind == RecurKind::Sub)
664 return Instruction::Add;
665 if (Kind == RecurKind::FSub)
666 return Instruction::FAdd;
667 llvm_unreachable("RecurKind should be Sub/FSub.");
668}
669
// Emits LLVM IR for this VPInstruction through State.Builder and returns the
// generated value. Operand values are obtained from \p State, either as full
// vectors, as scalars, or for a specific lane depending on the opcode.
// NOTE(review): this listing has dropped many lines of the function, including
// most `case VPInstruction::...` labels and a few statements. The hedged notes
// below mark the gaps; opcode names given there are inferred from the emitted
// IR or from messages in the code and should be confirmed against upstream.
670Value *VPInstruction::generate(VPTransformState &State) {
671 IRBuilderBase &Builder = State.Builder;
672
// NOTE(review): original line 673 (the `if` opening this block) is missing;
// the block handles opcodes that are cast to Instruction::BinaryOps.
674 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
675 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
676 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
677 auto *Res =
678 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
// The builder may return a non-instruction value; flags apply only to
// instructions.
679 if (auto *I = dyn_cast<Instruction>(Res))
680 applyFlags(*I);
681 return Res;
682 }
683
684 switch (getOpcode()) {
685 case VPInstruction::Not: {
686 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
687 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
688 return Builder.CreateNot(A, Name);
689 }
690 case Instruction::ExtractElement: {
691 assert(State.VF.isVector() && "Only extract elements from vectors");
// A constant index is served directly from the per-lane value; otherwise an
// extractelement is emitted.
692 if (auto *Idx = dyn_cast<VPConstantInt>(getOperand(1)))
693 return State.get(getOperand(0), VPLane(Idx->getZExtValue()));
694 Value *Vec = State.get(getOperand(0));
695 Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
696 return Builder.CreateExtractElement(Vec, Idx, Name);
697 }
698 case Instruction::InsertElement: {
699 assert(State.VF.isVector() && "Can only insert elements into vectors");
700 Value *Vec = State.get(getOperand(0), /*IsScalar=*/false);
701 Value *Elt = State.get(getOperand(1), /*IsScalar=*/true);
702 Value *Idx = State.get(getOperand(2), /*IsScalar=*/true);
703 return Builder.CreateInsertElement(Vec, Elt, Idx, Name);
704 }
705 case Instruction::Freeze: {
// NOTE(review): original line 706 (the definition of `Op`) is missing.
707 return Builder.CreateFreeze(Op, Name);
708 }
709 case Instruction::FCmp:
710 case Instruction::ICmp: {
711 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
712 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
713 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
714 return Builder.CreateCmp(getPredicate(), A, B, Name);
715 }
716 case Instruction::PHI: {
717 llvm_unreachable("should be handled by VPPhi::execute");
718 }
719 case Instruction::Select: {
720 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
// The condition is fetched as a scalar when only the first lane is used or
// when it is a single scalar.
721 Value *Cond =
722 State.get(getOperand(0),
723 OnlyFirstLaneUsed || vputils::isSingleScalar(getOperand(0)));
724 Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
725 Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
726 return Builder.CreateSelectFMF(Cond, Op1, Op2, getFastMathFlagsOrNone(),
727 Name);
728 }
// NOTE(review): case label on original line 729 is missing; this case emits
// the get_active_lane_mask intrinsic (or a plain ULT compare for scalar VF).
730 // Get first lane of vector induction variable.
731 Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
732 // Get the original loop tripcount.
733 Value *ScalarTC = State.get(getOperand(1), VPLane(0));
734
735 // If this part of the active lane mask is scalar, generate the CMP directly
736 // to avoid unnecessary extracts.
737 if (State.VF.isScalar())
738 return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
739 Name);
740
// The mask's element count is VF scaled by the constant third operand.
741 ElementCount EC = State.VF.multiplyCoefficientBy(
742 cast<VPConstantInt>(getOperand(2))->getZExtValue());
743 auto *PredTy = VectorType::get(Builder.getInt1Ty(), EC);
744 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
745 {PredTy, ScalarTC->getType()},
746 {VIVElem0, ScalarTC}, nullptr, Name);
747 }
// NOTE(review): case label on original line 748 is missing; per the assert
// message this counts active lanes (zext the i1 mask, then vector_reduce_add).
749 Value *Op = State.get(getOperand(0));
750 auto *VecTy = cast<VectorType>(Op->getType());
751 assert(VecTy->getScalarSizeInBits() == 1 &&
752 "NumActiveLanes only implemented for i1 vectors");
753
754 Type *Ty = getScalarType();
755 Value *ZExt = Builder.CreateCast(
756 Instruction::ZExt, Op, VectorType::get(Ty, VecTy->getElementCount()));
757 Value *NumActive =
758 Builder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, ZExt);
759 return NumActive;
760 }
// NOTE(review): case label on original line 761 is missing.
762 // Generate code to combine the previous and current values in vector v3.
763 //
764 // vector.ph:
765 // v_init = vector(..., ..., ..., a[-1])
766 // br vector.body
767 //
768 // vector.body
769 // i = phi [0, vector.ph], [i+4, vector.body]
770 // v1 = phi [v_init, vector.ph], [v2, vector.body]
771 // v2 = a[i, i+1, i+2, i+3];
772 // v3 = vector(v1(3), v2(0, 1, 2))
773
774 auto *V1 = State.get(getOperand(0));
// With a non-vector first operand there is nothing to splice.
775 if (!V1->getType()->isVectorTy())
776 return V1;
777 Value *V2 = State.get(getOperand(1));
778 return Builder.CreateVectorSpliceRight(V1, V2, 1, Name);
779 }
// NOTE(review): case label on original line 780 and the definition of `Zero`
// (original line 786) are missing. The case computes ScalarTC - VFxUF when
// ScalarTC > VFxUF (unsigned) and Zero otherwise.
781 Value *ScalarTC = State.get(getOperand(0), VPLane(0));
782 Value *VFxUF = State.get(getOperand(1), VPLane(0));
783 Value *Sub = Builder.CreateSub(ScalarTC, VFxUF);
784 Value *Cmp =
785 Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, VFxUF);
787 return Builder.CreateSelect(Cmp, Sub, Zero);
788 }
// NOTE(review): case label on original line 789 is missing; this case emits
// the experimental_get_vector_length intrinsic from the AVL operand.
790 // TODO: Restructure this code with an explicit remainder loop, vsetvli can
791 // be outside of the main loop.
792 Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
793 // Compute EVL
794 assert(AVL->getType()->isIntegerTy() &&
795 "Requested vector length should be an integer.");
796
797 assert(State.VF.isScalable() && "Expected scalable vector factor.");
798 Value *VFArg = Builder.getInt32(State.VF.getKnownMinValue());
799
800 Value *EVL = Builder.CreateIntrinsic(
801 Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
802 {AVL, VFArg, Builder.getTrue()});
803 return EVL;
804 }
// NOTE(review): case label on original line 805 is missing; this case emits a
// conditional branch on the first-lane condition.
806 Value *Cond = State.get(getOperand(0), VPLane(0));
807 // Replace the temporary unreachable terminator with a new conditional
808 // branch, hooking it up to backward destination for latch blocks now, and
809 // to forward destination(s) later when they are created.
810 // Second successor may be backwards - iff it is already in VPBB2IRBB.
811 VPBasicBlock *SecondVPSucc =
812 cast<VPBasicBlock>(getParent()->getSuccessors()[1]);
813 BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc);
814 BasicBlock *IRBB = State.CFG.VPBB2IRBB[getParent()];
// IRBB is only a placeholder first successor; it is reset to nullptr below.
815 auto *Br = Builder.CreateCondBr(Cond, IRBB, SecondIRSucc);
816 // First successor is always forward, reset it to nullptr.
817 Br->setSuccessor(0, nullptr);
// NOTE(review): original line 818 is missing from this listing.
819 applyMetadata(*Br);
820 return Br;
821 }
// NOTE(review): case label on original line 822 is missing; this case splats
// the scalar operand across VF lanes.
823 return Builder.CreateVectorSplat(
824 State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
825 }
// NOTE(review): case label on original line 826 is missing.
827 // For struct types, we need to build a new 'wide' struct type, where each
828 // element is widened, i.e., we create a struct of vectors.
829 auto *StructTy = cast<StructType>(getOperand(0)->getScalarType());
830 Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF));
// Operand i supplies lane i: each of its fields is inserted into the matching
// field vector of Res.
831 for (const auto &[LaneIndex, Op] : enumerate(operands())) {
832 for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
833 FieldIndex++) {
834 Value *ScalarValue =
835 Builder.CreateExtractValue(State.get(Op, true), FieldIndex);
836 Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex);
837 VectorValue =
838 Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex);
839 Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex);
840 }
841 }
842 return Res;
843 }
// NOTE(review): case label on original line 844 is missing; this case builds
// a fixed vector with one element per operand, starting from poison.
845 auto *ScalarTy = getOperand(0)->getScalarType();
846 auto NumOfElements = ElementCount::getFixed(getNumOperands());
847 Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements));
848 for (const auto &[Idx, Op] : enumerate(operands()))
849 Res = Builder.CreateInsertElement(Res, State.get(Op, true),
850 Builder.getInt32(Idx));
851 return Res;
852 }
// NOTE(review): case label on original line 853 and the statement on original
// line 857 are missing. The case splats operand 1 and inserts operand 0 into
// lane 0; for scalar VF it returns operand 0 unchanged.
854 if (State.VF.isScalar())
855 return State.get(getOperand(0), true);
856 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
858 // If this start vector is scaled then it should produce a vector with fewer
859 // elements than the VF.
860 ElementCount VF = State.VF.divideCoefficientBy(
861 cast<VPConstantInt>(getOperand(2))->getZExtValue());
862 auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true));
863 return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true),
864 Builder.getInt32(0));
865 }
// NOTE(review): case label on original line 866, the assert condition on line
// 870 and the statement on line 880 are missing. The case combines all
// operands into a single reduction result.
867 RecurKind RK = getRecurKind();
868 bool IsOrdered = isReductionOrdered();
869 bool IsInLoop = isReductionInLoop();
871 "FindIV should use min/max reduction kinds");
872
873 // The recipe may have multiple operands to be reduced together.
874 unsigned NumOperandsToReduce = getNumOperands();
875 VectorParts RdxParts(NumOperandsToReduce);
876 for (unsigned Part = 0; Part < NumOperandsToReduce; ++Part)
877 RdxParts[Part] = State.get(getOperand(Part), IsInLoop);
878
879 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
881
882 // Reduce multiple operands into one.
883 Value *ReducedPartRdx = RdxParts[0];
884 if (IsOrdered) {
// Ordered reductions simply take the last part.
885 ReducedPartRdx = RdxParts[NumOperandsToReduce - 1];
886 } else {
887 // Floating-point operations should have some FMF to enable the reduction.
888 for (unsigned Part = 1; Part < NumOperandsToReduce; ++Part) {
889 Value *RdxPart = RdxParts[Part];
// NOTE(review): original line 890 (the `if` selecting the min/max path) and
// lines 895-897 (the start of the `Opcode` definition) are missing.
891 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
892 else {
893 // For sub-recurrences, each part's reduction variable is already
894 // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1)
898 : (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK);
899 ReducedPartRdx =
900 Builder.CreateBinOp(Opcode, RdxPart, ReducedPartRdx, "bin.rdx");
901 }
902 }
903 }
904
905 // Create the reduction after the loop. Note that inloop reductions create
906 // the target reduction in the loop using a Reduction recipe.
907 if (State.VF.isVector() && !IsInLoop) {
908 // TODO: Support in-order reductions based on the recurrence descriptor.
909 // All ops in the reduction inherit fast-math-flags from the recurrence
910 // descriptor.
911 ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK);
912 }
913
914 return ReducedPartRdx;
915 }
// NOTE(review): case label(s) on original lines 916-917, the initializer of
// `Offset` (line 919) and the statement on line 931 are missing. The case
// extracts the lane that is `Offset` positions from the end of the vector.
918 unsigned Offset =
920 Value *Res;
921 if (State.VF.isVector()) {
922 assert(Offset <= State.VF.getKnownMinValue() &&
923 "invalid offset to extract from");
924 // Extract lane VF - Offset from the operand.
925 Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
926 } else {
927 // TODO: Remove ExtractLastLane for scalar VFs.
928 assert(Offset <= 1 && "invalid offset to extract from");
929 Res = State.get(getOperand(0));
930 }
932 Res->setName(Name);
933 return Res;
934 }
// NOTE(review): case label on original line 935 is missing (logical AND).
936 Value *A = State.get(getOperand(0));
937 Value *B = State.get(getOperand(1));
938 return Builder.CreateLogicalAnd(A, B, Name);
939 }
// NOTE(review): case label on original line 940 is missing (logical OR).
941 Value *A = State.get(getOperand(0));
942 Value *B = State.get(getOperand(1));
943 return Builder.CreateLogicalOr(A, B, Name);
944 }
// NOTE(review): case label on original line 945 is missing; per the assert
// message this is PtrAdd, generated for the first lane only.
946 assert((State.VF.isScalar() || vputils::onlyFirstLaneUsed(this)) &&
947 "can only generate first lane for PtrAdd");
948 Value *Ptr = State.get(getOperand(0), VPLane(0));
949 Value *Addend = State.get(getOperand(1), VPLane(0));
950 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
951 }
// NOTE(review): case label on original line 952 and the initializer of `Ptr`
// (line 954) are missing; this is a second pointer-add variant.
953 Value *Ptr =
955 Value *Addend = State.get(getOperand(1));
956 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
957 }
// NOTE(review): case label on original line 958 is missing. The case ORs the
// frozen operands together and, for vector VF, OR-reduces the result.
959 Value *Res = Builder.CreateFreeze(State.get(getOperand(0)));
960 for (VPValue *Op : drop_begin(operands()))
961 Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op)));
962 return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
963 }
// NOTE(review): case label on original line 964 is missing; per the assert
// message this is ExtractLane. Operand 0 is the lane index and operands 1..N
// are the source vectors, each covering RuntimeVF consecutive lanes.
965 assert(getNumOperands() != 2 && "ExtractLane from single source should be "
966 "simplified to ExtractElement.");
967 Value *LaneToExtract = State.get(getOperand(0), true);
968 Type *IdxTy = getOperand(0)->getScalarType();
969 Value *Res = nullptr;
970 Value *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
971
972 for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
// VectorStart is the first global lane covered by source operand Idx.
973 Value *VectorStart =
974 Builder.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Idx - 1));
975 Value *VectorIdx = Idx == 1
976 ? LaneToExtract
977 : Builder.CreateSub(LaneToExtract, VectorStart);
978 Value *Ext = State.VF.isScalar()
979 ? State.get(getOperand(Idx))
980 : Builder.CreateExtractElement(
981 State.get(getOperand(Idx)), VectorIdx);
// Later sources win whenever the requested lane is at or past their start.
982 if (Res) {
983 Value *Cmp = Builder.CreateICmpUGE(LaneToExtract, VectorStart);
984 Res = Builder.CreateSelect(Cmp, Ext, Res);
985 } else {
986 Res = Ext;
987 }
988 }
989 return Res;
990 }
// NOTE(review): case label on original line 991 is missing; this case finds
// the index of the first active mask lane via count-trailing-zero-elements.
992 Type *Ty = this->getScalarType();
993 if (getNumOperands() == 1) {
994 Value *Mask = State.get(getOperand(0));
995 return Builder.CreateCountTrailingZeroElems(Ty, Mask,
996 /*ZeroIsPoison=*/false, Name);
997 }
998 // If there are multiple operands, create a chain of selects to pick the
999 // first operand with an active lane and add the number of lanes of the
1000 // preceding operands.
1001 Value *RuntimeVF = getRuntimeVF(Builder, Ty, State.VF);
1002 unsigned LastOpIdx = getNumOperands() - 1;
1003 Value *Res = nullptr;
// Iterate from the last operand to the first so that earlier operands take
// priority in the final select chain.
1004 for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
// NOTE(review): original line 1011 (the start of the vector-VF arm of this
// conditional expression) is missing from this listing.
1005 Value *TrailingZeros =
1006 State.VF.isScalar()
1007 ? Builder.CreateZExt(
1008 Builder.CreateICmpEQ(State.get(getOperand(Idx)),
1009 Builder.getFalse()),
1010 Ty)
1012 Ty, State.get(getOperand(Idx)),
1013 /*ZeroIsPoison=*/false, Name);
1014 Value *Current = Builder.CreateAdd(
1015 Builder.CreateMul(RuntimeVF, ConstantInt::get(Ty, Idx)),
1016 TrailingZeros);
// TrailingZeros == RuntimeVF means this operand has no active lane.
1017 if (Res) {
1018 Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF);
1019 Res = Builder.CreateSelect(Cmp, Current, Res);
1020 } else {
1021 Res = Current;
1022 }
1023 }
1024
1025 return Res;
1026 }
// NOTE(review): case labels on original lines 1027, 1029 and 1031 are missing
// for the three cases below (scalar pass-through, vector reverse, and
// extract-last-active over (data, mask) operand pairs).
1028 return State.get(getOperand(0), true);
1030 return Builder.CreateVectorReverse(State.get(getOperand(0)), "reverse");
1032 Value *Result = State.get(getOperand(0), /*IsScalar=*/true);
// Operands after the first come in (Data, Mask) pairs.
1033 for (unsigned Idx = 1; Idx < getNumOperands(); Idx += 2) {
1034 Value *Data = State.get(getOperand(Idx));
1035 Value *Mask = State.get(getOperand(Idx + 1));
1036 Type *VTy = Data->getType();
1037
1038 if (State.VF.isScalar())
1039 Result = Builder.CreateSelect(Mask, Data, Result);
1040 else
1041 Result = Builder.CreateIntrinsic(
1042 Intrinsic::experimental_vector_extract_last_active, {VTy},
1043 {Data, Mask, Result});
1044 }
1045
1046 return Result;
1047 }
1048 default:
1049 llvm_unreachable("Unsupported opcode for instruction");
1050 }
1051}
1052
1054 unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
1055 Type *ScalarTy = this->getScalarType();
1056 Type *ResultTy = VF.isVector() ? toVectorTy(ScalarTy, VF) : ScalarTy;
1057 switch (Opcode) {
1058 case Instruction::FNeg:
1059 return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, Ctx.CostKind);
1060 case Instruction::UDiv:
1061 case Instruction::SDiv:
1062 case Instruction::SRem:
1063 case Instruction::URem:
1064 case Instruction::Add:
1065 case Instruction::FAdd:
1066 case Instruction::Sub:
1067 case Instruction::FSub:
1068 case Instruction::Mul:
1069 case Instruction::FMul:
1070 case Instruction::FDiv:
1071 case Instruction::FRem:
1072 case Instruction::Shl:
1073 case Instruction::LShr:
1074 case Instruction::AShr:
1075 case Instruction::And:
1076 case Instruction::Or:
1077 case Instruction::Xor: {
1078 // Certain instructions can be cheaper if they have a constant second
1079 // operand. One example of this are shifts on x86.
1080 VPValue *RHS = getOperand(1);
1081 TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(RHS);
1082
1083 if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
1086
1089 if (CtxI)
1090 Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
1091 return Ctx.TTI.getArithmeticInstrCost(
1092 Opcode, ResultTy, Ctx.CostKind,
1093 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1094 RHSInfo, Operands, CtxI, &Ctx.TLI);
1095 }
1096 case Instruction::Freeze:
1097 // NOTE: The only way to ask for the cost is via getInstructionCost, which
1098 // requires the actual vector instruction. Instead, both here and in the
1099 // LoopVectorizationCostModel::getInstructionCost the costs mirror the
1100 // current behaviour in llvm/Analysis/TargetTransformInfoImpl.h to keep
1101 // them in sync.
1102 return TTI::TCC_Free;
1103 case Instruction::ExtractValue:
1104 return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
1105 Ctx.CostKind);
1106 case Instruction::ICmp:
1107 case Instruction::FCmp: {
1108 Type *ScalarOpTy = getOperand(0)->getScalarType();
1109 Type *OpTy = VF.isVector() ? toVectorTy(ScalarOpTy, VF) : ScalarOpTy;
1111 return Ctx.TTI.getCmpSelInstrCost(
1112 Opcode, OpTy, CmpInst::makeCmpResultType(OpTy), getPredicate(),
1113 Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None},
1114 {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
1115 }
1116 case Instruction::BitCast: {
1117 Type *ScalarTy = this->getScalarType();
1118 if (ScalarTy->isPointerTy())
1119 return 0;
1120 [[fallthrough]];
1121 }
1122 case Instruction::SExt:
1123 case Instruction::ZExt:
1124 case Instruction::FPToUI:
1125 case Instruction::FPToSI:
1126 case Instruction::FPExt:
1127 case Instruction::PtrToInt:
1128 case Instruction::PtrToAddr:
1129 case Instruction::IntToPtr:
1130 case Instruction::SIToFP:
1131 case Instruction::UIToFP:
1132 case Instruction::Trunc:
1133 case Instruction::FPTrunc:
1134 case Instruction::AddrSpaceCast: {
1135 // Computes the CastContextHint from a recipe that may access memory.
1136 auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
1137 if (isa<VPInterleaveBase>(R))
1139 if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R)) {
1140 // Only compute CCH for memory operations, matching the legacy model
1141 // which only considers loads/stores for cast context hints.
1142 auto *UI = cast<Instruction>(ReplicateRecipe->getUnderlyingValue());
1143 if (!isa<LoadInst, StoreInst>(UI))
1145 return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
1147 }
1148 const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
1149 if (WidenMemoryRecipe == nullptr)
1151 if (VF.isScalar())
1153 if (!WidenMemoryRecipe->isConsecutive())
1155 if (WidenMemoryRecipe->isMasked())
1158 };
1159
1160 VPValue *Operand = getOperand(0);
1162 bool IsReverse = false;
1163 // For Trunc/FPTrunc, get the context from the only user.
1164 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
1165 auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * {
1166 if (R->getNumUsers() == 0 || R->hasMoreThanOneUniqueUser())
1167 return nullptr;
1168 return dyn_cast<VPRecipeBase>(*R->user_begin());
1169 };
1170 if (VPRecipeBase *Recipe = GetOnlyUser(this)) {
1171 if (match(Recipe,
1175 Recipe = GetOnlyUser(cast<VPSingleDefRecipe>(Recipe));
1176 IsReverse = true;
1177 }
1178 if (Recipe)
1179 CCH = ComputeCCH(Recipe);
1180 }
1181 }
1182 // For ZExt/SExt/FPExt, get the context from the operand.
1183 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
1184 Opcode == Instruction::FPExt) {
1185 if (auto *Recipe = Operand->getDefiningRecipe()) {
1186 VPValue *ReverseOp;
1187 if (match(Recipe,
1188 m_CombineOr(m_Reverse(m_VPValue(ReverseOp)),
1190 m_VPValue(ReverseOp))))) {
1191 Recipe = ReverseOp->getDefiningRecipe();
1192 IsReverse = true;
1193 }
1194 if (Recipe)
1195 CCH = ComputeCCH(Recipe);
1196 }
1197 }
1198 if (IsReverse && CCH != TTI::CastContextHint::None)
1200
1201 auto *ScalarSrcTy = Operand->getScalarType();
1202 Type *SrcTy = VF.isVector() ? toVectorTy(ScalarSrcTy, VF) : ScalarSrcTy;
1203 // Arm TTI will use the underlying instruction to determine the cost.
1204 return Ctx.TTI.getCastInstrCost(
1205 Opcode, ResultTy, SrcTy, CCH, Ctx.CostKind,
1207 }
1208 case Instruction::Select: {
1210 bool IsScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
1211 Type *ScalarTy = this->getScalarType();
1212
1213 VPValue *Op0, *Op1;
1214 bool IsLogicalAnd =
1215 match(this, m_c_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1)));
1216 bool IsLogicalOr =
1217 match(this, m_c_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
1218 // Also match the inverted forms:
1219 // select x, false, y --> !x & y (still AND)
1220 // select x, y, true --> !x | y (still OR)
1221 IsLogicalAnd |=
1222 match(this, m_Select(m_VPValue(Op0), m_False(), m_VPValue(Op1)));
1223 IsLogicalOr |=
1224 match(this, m_Select(m_VPValue(Op0), m_VPValue(Op1), m_True()));
1225
1226 if (!IsScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
1227 (IsLogicalAnd || IsLogicalOr)) {
1228 // select x, y, false --> x & y
1229 // select x, true, y --> x | y
1230 const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0);
1231 const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1);
1232
1234 if (SI && all_of(operands(),
1235 [](VPValue *Op) { return Op->getUnderlyingValue(); }))
1236 append_range(Operands, SI->operands());
1237 return Ctx.TTI.getArithmeticInstrCost(
1238 IsLogicalOr ? Instruction::Or : Instruction::And, ResultTy,
1239 Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
1240 }
1241
1242 Type *CondTy = getOperand(0)->getScalarType();
1243 if (!IsScalarCond && VF.isVector())
1244 CondTy = VectorType::get(CondTy, VF);
1245
1246 llvm::CmpPredicate Pred;
1247 if (!match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue())))
1248 if (auto *CondIRV = dyn_cast<VPIRValue>(getOperand(0)))
1249 if (auto *Cmp = dyn_cast<CmpInst>(CondIRV->getValue()))
1250 Pred = Cmp->getPredicate();
1251 Type *VectorTy = toVectorTy(this->getScalarType(), VF);
1252 return Ctx.TTI.getCmpSelInstrCost(
1253 Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
1254 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
1255 }
1256 }
1257 llvm_unreachable("called for unsupported opcode");
1258}
1259
1261 VPCostContext &Ctx) const {
1263 if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) {
1264 // TODO: Compute cost for VPInstructions without underlying values once
1265 // the legacy cost model has been retired.
1266 return 0;
1267 }
1268
1270 "Should only generate a vector value or single scalar, not scalars "
1271 "for all lanes.");
1273 getOpcode(),
1275 }
1276
1277 switch (getOpcode()) {
1278 case Instruction::Select: {
1280 match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue()));
1281 auto *CondTy = getOperand(0)->getScalarType();
1282 auto *VecTy = getOperand(1)->getScalarType();
1283 if (!vputils::onlyFirstLaneUsed(this)) {
1284 CondTy = toVectorTy(CondTy, VF);
1285 VecTy = toVectorTy(VecTy, VF);
1286 }
1287 return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred,
1288 Ctx.CostKind);
1289 }
1290 case Instruction::ExtractElement:
1292 if (VF.isScalar()) {
1293 // ExtractLane with VF=1 takes care of handling extracting across multiple
1294 // parts.
1295 return 0;
1296 }
1297
1298 // Add on the cost of extracting the element.
1299 auto *VecTy = toVectorTy(getOperand(0)->getScalarType(), VF);
1300 return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
1301 Ctx.CostKind);
1302 }
1303 case VPInstruction::AnyOf: {
1304 auto *VecTy = toVectorTy(this->getScalarType(), VF);
1305 return Ctx.TTI.getArithmeticReductionCost(
1306 Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
1307 }
1309 Type *Ty = this->getScalarType();
1310 Type *ScalarTy = getOperand(0)->getScalarType();
1311 if (VF.isScalar())
1312 return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
1314 CmpInst::ICMP_EQ, Ctx.CostKind);
1315 // Calculate the cost of determining the lane index.
1316 auto *PredTy = toVectorTy(ScalarTy, VF);
1317 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
1318 {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
1319 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1320 }
1322 Type *Ty = this->getScalarType();
1323 Type *ScalarTy = getOperand(0)->getScalarType();
1324 if (VF.isScalar())
1325 return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
1327 CmpInst::ICMP_EQ, Ctx.CostKind);
1328 // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB.
1329 auto *PredTy = toVectorTy(ScalarTy, VF);
1330 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
1331 {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
1332 InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1333 // Add cost of NOT operation on the predicate.
1334 Cost += Ctx.TTI.getArithmeticInstrCost(
1335 Instruction::Xor, PredTy, Ctx.CostKind,
1336 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1337 {TargetTransformInfo::OK_UniformConstantValue,
1338 TargetTransformInfo::OP_None});
1339 // Add cost of SUB operation on the index.
1340 Cost += Ctx.TTI.getArithmeticInstrCost(Instruction::Sub, Ty, Ctx.CostKind);
1341 return Cost;
1342 }
1344 Type *ScalarTy = this->getScalarType();
1345 Type *VecTy = toVectorTy(ScalarTy, VF);
1346 Type *MaskTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
1348 Intrinsic::experimental_vector_extract_last_active, ScalarTy,
1349 {VecTy, MaskTy, ScalarTy});
1350 return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind);
1351 }
1353 assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
1354 Type *VectorTy = toVectorTy(this->getScalarType(), VF);
1355 return Ctx.TTI.getShuffleCost(
1357 cast<VectorType>(VectorTy), {}, Ctx.CostKind, -1);
1358 }
1360 Type *ArgTy = getOperand(0)->getScalarType();
1361 unsigned Multiplier = cast<VPConstantInt>(getOperand(2))->getZExtValue();
1362 Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
1363 IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
1364 {ArgTy, ArgTy});
1365 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1366 }
1368 Type *Arg0Ty = getOperand(0)->getScalarType();
1369 Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
1370 Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
1371 IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
1372 I32Ty, {Arg0Ty, I32Ty, I1Ty});
1373 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1374 }
1376 assert(VF.isVector() && "Reverse operation must be vector type");
1377 Type *EltTy = this->getScalarType();
1378 // Skip the reverse operation cost for the mask.
1379 // FIXME: Remove this once redundant mask reverse operations can be
1380 // eliminated by VPlanTransforms::cse before cost computation.
1381 if (EltTy->isIntegerTy(1))
1382 return 0;
1383 auto *VectorTy = cast<VectorType>(toVectorTy(EltTy, VF));
1384 return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
1385 VectorTy, /*Mask=*/{}, Ctx.CostKind,
1386 /*Index=*/0);
1387 }
1389 // Add on the cost of extracting the element.
1390 auto *VecTy = toVectorTy(getOperand(0)->getScalarType(), VF);
1391 return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
1392 VecTy, Ctx.CostKind, 0);
1393 }
1394 case VPInstruction::Not: {
1395 Type *ValTy = this->getScalarType();
1396 // InstCombine will fold `xor` to the conditional branch.
1397 if (auto *U = const_cast<VPUser *>(getSingleUser()))
1398 if (match(U, m_BranchOnCond(m_VPValue())))
1399 return 0;
1400 if (!vputils::onlyFirstLaneUsed(this))
1401 ValTy = toVectorTy(ValTy, VF);
1402 return Ctx.TTI.getArithmeticInstrCost(Instruction::Xor, ValTy,
1403 Ctx.CostKind);
1404 }
1406 // If TC <= VF then this is just a branch.
1407 // FIXME: Removing the branch happens in simplifyBranchConditionForVFAndUF
1408 // where it checks TC <= VF * UF, but we don't know UF yet. This means in
1409 // some cases we get a cost that's too high due to counting a cmp that
1410 // later gets removed.
1411 // FIXME: The compare could also be removed if TC = M * vscale,
1412 // VF = N * vscale, and M <= N. Detecting that would require having the
1413 // trip count as a SCEV though.
1416 if (TCConst && TCConst->getValue().ule(VF.getKnownMinValue()))
1417 return 0;
1418 // Otherwise BranchOnCount generates ICmpEQ followed by a branch.
1419 Type *ValTy = getOperand(0)->getScalarType();
1420 return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ValTy,
1422 CmpInst::ICMP_EQ, Ctx.CostKind);
1423 }
1424 case Instruction::FCmp:
1425 case Instruction::ICmp:
1427 getOpcode(),
1430 if (VF == ElementCount::getScalable(1))
1432 [[fallthrough]];
1433 default:
1434 // TODO: Compute the cost of other VPInstructions once the legacy cost model has
1435 // been retired.
1437 "unexpected VPInstruction witht underlying value");
1438 return 0;
1439 }
1440}
1441
1454
1456 switch (getOpcode()) {
1457 case Instruction::Load:
1458 case Instruction::PHI:
1462 return true;
1463 default:
1465 }
1466}
1467
1469#ifndef NDEBUG
1470 Type *Ty = Op->getScalarType();
1471 switch (getOpcode()) {
1475 assert(Ty == getOperand(0)->getScalarType() &&
1476 "types of operand 0 and new operand must match");
1477 break;
1481 assert(Ty == getOperand(0)->getScalarType() &&
1482 "appended operand must match operand 0's scalar type");
1483 break;
1485 assert(Ty == getOperand(1)->getScalarType() &&
1486 "appended operand must match operand 1's scalar type");
1487 break;
1489 // The recipe is constructed with 3 operands (result, data, mask). Extra
1490 // operands beyond that are appended in (data, mask) pairs.
1491 constexpr unsigned NumInitialOperands = 3;
1492 assert(getNumOperands() >= NumInitialOperands &&
1493 "ExtractLastActive must have at least the initial 3 operands");
1494 bool IsMaskSlot = ((getNumOperands() - NumInitialOperands) & 1u) == 1u;
1495 assert((IsMaskSlot ? Ty->isIntegerTy(1)
1496 : Ty == getOperand(1)->getScalarType()) &&
1497 "ExtractLastActive expects alternating data/mask operands "
1498 "matching operand 1's type and i1, respectively");
1499 break;
1500 }
1501 default:
1502 llvm_unreachable("opcode does not support growing the operand list "
1503 "outside of construction");
1504 }
1505#endif
1507}
1508
1510 assert(!isMasked() && "cannot execute masked VPInstruction");
1511 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1513 "Set flags not supported for the provided opcode");
1515 "Opcode requires specific flags to be set");
1516 if (hasFastMathFlags())
1517 State.Builder.setFastMathFlags(getFastMathFlagsOrNone());
1518 Value *GeneratedValue = generate(State);
1519 if (!hasResult())
1520 return;
1521 assert(GeneratedValue && "generate must produce a value");
1522 bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
1525 assert((((GeneratedValue->getType()->isVectorTy() ||
1526 GeneratedValue->getType()->isStructTy()) ==
1527 !GeneratesPerFirstLaneOnly) ||
1528 State.VF.isScalar()) &&
1529 "scalar value but not only first lane defined");
1530 State.set(this, GeneratedValue,
1531 /*IsScalar*/ GeneratesPerFirstLaneOnly);
1533 // FIXME: This is a workaround to enable reliable updates of the scalar loop
1534 // resume phis, when vectorizing the epilogue. Must be removed once epilogue
1535 // vectorization explicitly connects VPlans.
1536 setUnderlyingValue(GeneratedValue);
1537 }
1538}
1539
1543 return false;
1544 switch (getOpcode()) {
1545 case Instruction::ExtractValue:
1546 case Instruction::InsertValue:
1547 case Instruction::GetElementPtr:
1548 case Instruction::ExtractElement:
1549 case Instruction::InsertElement:
1550 case Instruction::Freeze:
1551 case Instruction::FCmp:
1552 case Instruction::ICmp:
1553 case Instruction::Select:
1554 case Instruction::PHI:
1579 case VPInstruction::Not:
1588 return false;
1589 case Instruction::Call:
1592 default:
1593 return true;
1594 }
1595}
1596
1598 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1600 return vputils::onlyFirstLaneUsed(this);
1601
1602 switch (getOpcode()) {
1603 default:
1604 return false;
1605 case Instruction::ExtractElement:
1606 return Op == getOperand(1);
1607 case Instruction::InsertElement:
1608 return Op == getOperand(1) || Op == getOperand(2);
1609 case Instruction::PHI:
1610 return true;
1611 case Instruction::FCmp:
1612 case Instruction::ICmp:
1613 case Instruction::Select:
1614 case Instruction::Or:
1615 case Instruction::Freeze:
1616 case VPInstruction::Not:
1617 // TODO: Cover additional opcodes.
1618 return vputils::onlyFirstLaneUsed(this);
1619 case Instruction::Load:
1629 return true;
1632 // Before replicating by VF, Build(Struct)Vector uses all lanes of the
1633 // operand, after replicating its operands only the first lane is used.
1634 // Before replicating, it will have only a single operand.
1635 return getNumOperands() > 1;
1637 return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
1639 // WidePtrAdd supports scalar and vector base addresses.
1640 return false;
1643 return Op == getOperand(0);
1644 };
1645 llvm_unreachable("switch should return");
1646}
1647
1649 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1651 return vputils::onlyFirstPartUsed(this);
1652
1653 switch (getOpcode()) {
1654 default:
1655 return false;
1656 case Instruction::FCmp:
1657 case Instruction::ICmp:
1658 case Instruction::Select:
1659 return vputils::onlyFirstPartUsed(this);
1664 return true;
1665 };
1666 llvm_unreachable("switch should return");
1667}
1668
1669#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1671 VPSlotTracker SlotTracker(getParent()->getPlan());
1673}
1674
1676 VPSlotTracker &SlotTracker) const {
1677 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1678
1679 if (hasResult()) {
1681 O << " = ";
1682 }
1683
1684 switch (getOpcode()) {
1685 case VPInstruction::Not:
1686 O << "not";
1687 break;
1689 O << "active lane mask";
1690 break;
1692 O << "incoming-alias-mask";
1693 break;
1695 O << "EXPLICIT-VECTOR-LENGTH";
1696 break;
1698 O << "first-order splice";
1699 break;
1701 O << "branch-on-cond";
1702 break;
1704 O << "branch-on-two-conds";
1705 break;
1707 O << "TC > VF ? TC - VF : 0";
1708 break;
1710 O << "VF * Part +";
1711 break;
1713 O << "branch-on-count";
1714 break;
1716 O << "broadcast";
1717 break;
1719 O << "buildstructvector";
1720 break;
1722 O << "buildvector";
1723 break;
1725 O << "exiting-iv-value";
1726 break;
1728 O << "masked-cond";
1729 break;
1731 O << "extract-lane";
1732 break;
1734 O << "extract-last-lane";
1735 break;
1737 O << "extract-last-part";
1738 break;
1740 O << "extract-penultimate-element";
1741 break;
1743 O << "compute-reduction-result";
1744 break;
1746 O << "logical-and";
1747 break;
1749 O << "logical-or";
1750 break;
1752 O << "ptradd";
1753 break;
1755 O << "wide-ptradd";
1756 break;
1758 O << "any-of";
1759 break;
1761 O << "first-active-lane";
1762 break;
1764 O << "last-active-lane";
1765 break;
1767 O << "reduction-start-vector";
1768 break;
1770 O << "resume-for-epilogue";
1771 break;
1773 O << "reverse";
1774 break;
1776 O << "unpack";
1777 break;
1779 O << "extract-last-active";
1780 break;
1782 O << "num-active-lanes";
1783 break;
1784 default:
1786 }
1787
1788 printFlags(O);
1790}
1791#endif
1792
1794 Type *ResultTy = getResultType();
1796 Value *Op = State.get(getOperand(0), VPLane(0));
1797 Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
1798 Op, ResultTy);
1799 if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
1800 applyFlags(*CastOp);
1801 applyMetadata(*CastOp);
1802 }
1803 State.set(this, Cast, VPLane(0));
1804 return;
1805 }
1806 switch (getOpcode()) {
1808 Value *StepVector =
1809 State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF));
1810 State.set(this, StepVector);
1811 break;
1812 }
1813 case VPInstruction::VScale: {
1814 Value *VScale = State.Builder.CreateVScale(ResultTy);
1815 State.set(this, VScale, true);
1816 break;
1817 }
1818
1819 default:
1820 llvm_unreachable("opcode not implemented yet");
1821 }
1822}
1823
1824#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1826 VPSlotTracker &SlotTracker) const {
1827 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1829 O << " = ";
1830
1831 Type *ResultTy = getResultType();
1832 switch (getOpcode()) {
1834 O << "wide-iv-step ";
1836 break;
1838 O << "step-vector " << *ResultTy;
1839 break;
1841 O << "vscale " << *ResultTy;
1842 break;
1843 case Instruction::Load:
1844 O << "load ";
1846 break;
1847 default:
1848 assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
1850 printFlags(O);
1852 O << " to " << *ResultTy;
1853 }
1854}
1855#endif
1856
// Body of the phi recipe's execute step (the signature line is not present in
// this listing). It creates an IR phi of the recipe's scalar type, adds the
// incoming values that can be materialized at this point, and records the new
// phi as this recipe's lane-0 value in State.
1858 PHINode *NewPhi = State.Builder.CreatePHI(getScalarType(), 2, getName());
1859 unsigned NumIncoming = getNumIncoming();
1860 // Detect header phis: the parent block dominates its second incoming block
1861 // (the latch). Those IR incoming values have not been generated yet and need
1862 // to be added after they have been executed.
1863 if (NumIncoming == 2 &&
1864 State.VPDT.dominates(getParent(), getIncomingBlock(1))) {
// Only the first incoming value is wired up below for such header phis.
1865 NumIncoming = 1;
1866 }
// For each handled operand, fetch its lane-0 IR value and pair it with the IR
// block that VPBB2IRBB maps the VPlan incoming block to.
1867 for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
1868 Value *IncV = State.get(getIncomingValue(Idx), VPLane(0));
1869 BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(getIncomingBlock(Idx));
1870 NewPhi->addIncoming(IncV, PredBB);
1871 }
1872 State.set(this, NewPhi, VPLane(0));
1873}
1874
1875#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1876void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent,
1877 VPSlotTracker &SlotTracker) const {
1878 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1880 O << " = phi";
1881 printFlags(O);
1883}
1884#endif
1885
1886VPIRInstruction *VPIRInstruction ::create(Instruction &I) {
1887 if (auto *Phi = dyn_cast<PHINode>(&I))
1888 return new VPIRPhi(*Phi);
1889 return new VPIRInstruction(I);
1890}
1891
// Body of the execute step for a wrapped non-phi IR instruction (the signature
// line is not present in this listing). No new IR is created here: after
// checking that this is not a VPIRPhi and that the recipe has no operands,
// only the builder's insertion point is moved.
1893 assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
1894 "PHINodes must be handled by VPIRPhi");
1895 // Advance the insert point after the wrapped IR instruction. This allows
1896 // interleaving VPIRInstructions and other recipes.
1897 State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
1898}
1899
1901 VPCostContext &Ctx) const {
1902 // The recipe wraps an existing IR instruction on the border of VPlan's scope,
1903 // hence it does not contribute to the cost-modeling for the VPlan.
1904 return 0;
1905}
1906
1907#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1909 VPSlotTracker &SlotTracker) const {
1910 O << Indent << "IR " << I;
1911}
1912#endif
1913
1915 PHINode *Phi = &getIRPhi();
1916 for (const auto &[Idx, Op] : enumerate(operands())) {
1917 VPValue *ExitValue = Op;
1918 auto Lane = vputils::isSingleScalar(ExitValue)
1920 : VPLane::getLastLaneForVF(State.VF);
1921 VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
1922 auto *PredVPBB = Pred->getExitingBasicBlock();
1923 BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
1924 // Set insertion point in PredBB in case an extract needs to be generated.
1925 // TODO: Model extracts explicitly.
1926 State.Builder.SetInsertPoint(PredBB->getTerminator());
1927 Value *V = State.get(ExitValue, VPLane(Lane));
1928 // If there is no existing block for PredBB in the phi, add a new incoming
1929 // value. Otherwise update the existing incoming value for PredBB.
1930 if (Phi->getBasicBlockIndex(PredBB) == -1)
1931 Phi->addIncoming(V, PredBB);
1932 else
1933 Phi->setIncomingValueForBlock(PredBB, V);
1934 }
1935
1936 // Advance the insert point after the wrapped IR instruction. This allows
1937 // interleaving VPIRInstructions and other recipes.
1938 State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator()));
1939}
1940
// Drops the phi operand that corresponds to IncomingBlock (the signature line
// is not present in this listing). The result of getAsRecipe() is const_cast
// because the operand list is mutated below.
1942 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1943 assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
1944 "Number of phi operands must match number of predecessors");
// The operand to remove is addressed by IncomingBlock's index among the parent
// block's predecessors; the assert above checks both lists have equal length.
1945 unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock);
1946 R->removeOperand(Position);
1947}
1948
1949VPValue *
1951 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1952 return getIncomingValue(R->getParent()->getIndexForPredecessor(VPBB));
1953}
1954
1956 VPValue *V) const {
1957 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1958 R->setOperand(R->getParent()->getIndexForPredecessor(VPBB), V);
1959}
1960
1961#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1963 VPSlotTracker &SlotTracker) const {
1964 interleaveComma(enumerate(getAsRecipe()->operands()), O,
1965 [this, &O, &SlotTracker](auto Op) {
1966 O << "[ ";
1967 Op.value()->printAsOperand(O, SlotTracker);
1968 O << ", ";
1969 getIncomingBlock(Op.index())->printAsOperand(O);
1970 O << " ]";
1971 });
1972}
1973#endif
1974
1975#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1977 VPSlotTracker &SlotTracker) const {
1979
1980 if (getNumOperands() != 0) {
1981 O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
1983 [&O, &SlotTracker](auto Op) {
1984 std::get<0>(Op)->printAsOperand(O, SlotTracker);
1985 O << " from ";
1986 std::get<1>(Op)->printAsOperand(O);
1987 });
1988 O << ")";
1989 }
1990}
1991#endif
1992
1994 for (const auto &[Kind, Node] : Metadata)
1995 I.setMetadata(Kind, Node);
1996}
1997
// Narrows Metadata to the entries it has in common with Other.Metadata (the
// signature line is not present in this listing). A (kind, node) pair is kept
// only if Other holds a pair with the same kind and the same MDNode pointer.
1999 SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection;
2000 for (const auto &[KindA, MDA] : Metadata) {
2001 for (const auto &[KindB, MDB] : Other.Metadata) {
2002 if (KindA == KindB && MDA == MDB) {
2003 MetadataIntersection.emplace_back(KindA, MDA);
// Stop at the first match so an entry of Metadata is recorded at most once.
2004 break;
2005 }
2006 }
2007 }
// Replace the stored list with the filtered one; the kept entries appear in
// the order they had in Metadata.
2008 Metadata = std::move(MetadataIntersection);
2009}
2010
2011#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Prints the stored metadata as " (!<kind-name> <node>, ...)" to O (the
// signature line is not present in this listing). Nothing is printed when
// there is no metadata or when the slot tracker provides no module.
2013 const Module *M = SlotTracker.getModule();
2014 if (Metadata.empty() || !M)
2015 return;
2016
// Metadata kind IDs are used as indices into the name table from the slot
// tracker; the assert below requires each kind to have a non-empty name.
2017 ArrayRef<StringRef> MDNames = SlotTracker.getMDNames();
2018 O << " (";
2019 interleaveComma(Metadata, O, [&](const auto &KindNodePair) {
2020 auto [Kind, Node] = KindNodePair;
2021 assert(Kind < MDNames.size() && !MDNames[Kind].empty() &&
2022 "Unexpected unnamed metadata kind");
2023 O << "!" << MDNames[Kind] << " ";
2024 Node->printAsOperand(O, M);
2025 });
2026 O << ")";
2027}
2028#endif
2029
2031 assert(State.VF.isVector() && "not widening");
2032 assert(Variant != nullptr && "Can't create vector function.");
2033
2034 FunctionType *VFTy = Variant->getFunctionType();
2035 // Add return type if intrinsic is overloaded on it.
2037 for (const auto &I : enumerate(args())) {
2038 Value *Arg;
2039 // Some vectorized function variants may also take a scalar argument,
2040 // e.g. linear parameters for pointers. This needs to be the scalar value
2041 // from the start of the respective part when interleaving.
2042 if (!VFTy->getParamType(I.index())->isVectorTy())
2043 Arg = State.get(I.value(), VPLane(0));
2044 else
2045 Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
2046 Args.push_back(Arg);
2047 }
2048
2051 if (CI)
2052 CI->getOperandBundlesAsDefs(OpBundles);
2053
2054 CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
2055 applyFlags(*V);
2056 applyMetadata(*V);
2057 V->setCallingConv(Variant->getCallingConv());
2058
2059 if (!V->getType()->isVoidTy())
2060 State.set(this, V);
2061}
2062
2064 VPCostContext &Ctx) const {
2065 assert(getVectorizedTypeVF(Variant->getReturnType()) == VF &&
2066 "Variant return type must match VF");
2067 return computeCallCost(Variant, Ctx);
2068}
2069
2071 VPCostContext &Ctx) {
// Cost the call purely from the variant's signature: its return type and its
// parameter types, at the cost kind stored in Ctx. nullptr is passed as the
// first argument of getCallInstrCost.
2072 return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
2073 Variant->getFunctionType()->params(),
2074 Ctx.CostKind);
2075}
2076
// Answers whether only a single (non-vector) value of Op is consumed by the
// call (the signature line is not present in this listing). Op must be one of
// the recipe's operands and the vector variant must be set.
2078 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
2079 assert(Variant && "Variant not set");
2080 FunctionType *VFTy = Variant->getFunctionType();
// True iff every argument position that holds Op has an integer,
// floating-point, pointer or byte parameter type in the variant's signature.
// Positions holding other values do not affect the result.
2081 return all_of(enumerate(args()), [VFTy, &Op](const auto &Arg) {
2082 auto [Idx, V] = Arg;
2083 Type *ArgTy = VFTy->getParamType(Idx);
2084 return V != Op || ArgTy->isIntegerTy() || ArgTy->isFloatingPointTy() ||
2085 ArgTy->isPointerTy() || ArgTy->isByteTy();
2086 });
2087}
2088
2089#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2091 VPSlotTracker &SlotTracker) const {
2092 O << Indent << "WIDEN-CALL ";
2093
2094 Function *CalledFn = getCalledScalarFunction();
2095 if (CalledFn->getReturnType()->isVoidTy())
2096 O << "void ";
2097 else {
2099 O << " = ";
2100 }
2101
2102 O << "call";
2103 printFlags(O);
2104 O << " @" << CalledFn->getName() << "(";
2105 interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
2106 Op->printAsOperand(O, SlotTracker);
2107 });
2108 O << ")";
2109
2110 O << " (using library function";
2111 if (Variant->hasName())
2112 O << ": " << Variant->getName();
2113 O << ")";
2114}
2115#endif
2116
2118 assert(State.VF.isVector() && "not widening");
2119
2120 SmallVector<Type *, 2> TysForDecl;
2121 // Add return type if intrinsic is overloaded on it.
2122 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1,
2123 State.TTI)) {
2124 Type *RetTy = toVectorizedTy(getScalarType(), State.VF);
2125 ArrayRef<Type *> ContainedTys = getContainedTypes(RetTy);
2126 for (auto [Idx, Ty] : enumerate(ContainedTys)) {
2128 Idx, State.TTI))
2129 TysForDecl.push_back(Ty);
2130 }
2131 }
2133 for (const auto &I : enumerate(operands())) {
2134 // Some intrinsics have a scalar argument - don't replace it with a
2135 // vector.
2136 Value *Arg;
2137 if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
2138 State.TTI))
2139 Arg = State.get(I.value(), VPLane(0));
2140 else
2141 Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
2142 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
2143 State.TTI))
2144 TysForDecl.push_back(Arg->getType());
2145 Args.push_back(Arg);
2146 }
2147
2148 // Use vector version of the intrinsic.
2149 Module *M = State.Builder.GetInsertBlock()->getModule();
2150 Function *VectorF =
2151 Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
2152 assert(VectorF &&
2153 "Can't retrieve vector intrinsic or vector-predication intrinsics.");
2154
2157 if (CI)
2158 CI->getOperandBundlesAsDefs(OpBundles);
2159
2160 CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
2161
2162 applyFlags(*V);
2163 applyMetadata(*V);
2164
2165 return V;
2166}
2167
2169 CallInst *V = createVectorCall(State);
2170 if (!V->getType()->isVoidTy())
2171 State.set(this, V);
2172}
2173
2176 const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx) {
2177 Type *ScalarRetTy = R.getScalarType();
2178 // Skip the reverse operation cost for the mask.
2179 // FIXME: Remove this once redundant mask reverse operations can be eliminated
2180 // by VPlanTransforms::cse before cost computation.
2181 if (ID == Intrinsic::experimental_vp_reverse && ScalarRetTy->isIntegerTy(1))
2182 return InstructionCost(0);
2183
2184 // Some backends analyze intrinsic arguments to determine cost. Use the
2185 // underlying value for the operand if it has one. Otherwise try to use the
2186 // operand of the underlying call instruction, if there is one. Otherwise
2187 // clear Arguments.
2188 // TODO: Rework TTI interface to be independent of concrete IR values.
2190 for (const auto &[Idx, Op] : enumerate(Operands)) {
2191 auto *V = Op->getUnderlyingValue();
2192 if (!V) {
2193 if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) {
2194 Arguments.push_back(UI->getArgOperand(Idx));
2195 continue;
2196 }
2197 Arguments.clear();
2198 break;
2199 }
2200 Arguments.push_back(V);
2201 }
2202
2203 Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
2204 SmallVector<Type *> ParamTys =
2205 map_to_vector(Operands, [&](const VPValue *Op) {
2206 return toVectorTy(Op->getScalarType(), VF);
2207 });
2208
2209 // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
2210 IntrinsicCostAttributes CostAttrs(
2211 ID, RetTy, Arguments, ParamTys, R.getFastMathFlagsOrNone(),
2212 dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()),
2214 return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
2215}
2216
2218 VPCostContext &Ctx) const {
2220 return computeCallCost(VectorIntrinsicID, ArgOps, *this, VF, Ctx);
2221}
2222
2224 return Intrinsic::getBaseName(VectorIntrinsicID);
2225}
2226
2228 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
2229 return all_of(enumerate(operands()), [this, &Op](const auto &X) {
2230 auto [Idx, V] = X;
2232 Idx, nullptr);
2233 });
2234}
2235
2236#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2238 VPSlotTracker &SlotTracker) const {
2239 O << Indent << "WIDEN-INTRINSIC ";
2240 if (getScalarType()->isVoidTy()) {
2241 O << "void ";
2242 } else {
2244 O << " = ";
2245 }
2246
2247 O << "call";
2248 printFlags(O);
2249 O << getIntrinsicName() << "(";
2250
2252 Op->printAsOperand(O, SlotTracker);
2253 });
2254 O << ")";
2255}
2256#endif
2257
// Emit the vector call, attach the recipe's Alignment as a parameter attribute
// on the call's argument at index 0, and record the call as this recipe's
// value.
2259 CallInst *MemI = createVectorCall(State);
2260 MemI->addParamAttr(
2261 0, Attribute::getWithAlignment(MemI->getContext(), Alignment));
2262 State.set(this, MemI);
2263}
2264
2266 Intrinsic::ID IID, Type *Ty, bool IsMasked, Align Alignment,
2267 VPCostContext &Ctx) {
// Forward to TTI's memory-intrinsic cost query for the given intrinsic, data
// type, masking and alignment. No pointer value is supplied (Ptr is nullptr).
2268 return Ctx.TTI.getMemIntrinsicInstrCost(
2269 MemIntrinsicCostAttributes(IID, Ty, /*Ptr=*/nullptr, IsMasked, Alignment),
2270 Ctx.CostKind);
2271}
2272
2275 VPCostContext &Ctx) const {
2276 Type *Ty = toVectorTy(getScalarType(), VF);
2278 !match(getOperand(2), m_True()), Alignment,
2279 Ctx);
2280}
2281
// Emit a call to the experimental_vector_histogram_add intrinsic using the
// vector of addresses (operand 0), a scalar increment (operand 1) and a mask.
2283 IRBuilderBase &Builder = State.Builder;
2284
2285 Value *Address = State.get(getOperand(0));
2286 Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
2287 VectorType *VTy = cast<VectorType>(Address->getType());
2288
2289 // The histogram intrinsic requires a mask even if the recipe doesn't;
2290 // if the mask operand was omitted then all lanes should be executed and
2291 // we just need to synthesize an all-true mask.
2292 Value *Mask = nullptr;
2293 if (VPValue *VPMask = getMask())
2294 Mask = State.get(VPMask);
2295 else
2296 Mask =
2297 Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));
2298
2299 // If this is a subtract, we want to invert the increment amount. We may
2300 // add a separate intrinsic in future, but for now we'll try this.
2301 if (Opcode == Instruction::Sub)
2302 IncAmt = Builder.CreateNeg(IncAmt);
2303 else
2304 assert(Opcode == Instruction::Add && "only add or sub supported for now");
2305
// The intrinsic is overloaded on the address-vector type and the increment
// type. Only metadata is applied to the call; the visible code does not
// register a result for this recipe in State.
2306 auto *HistogramInst = State.Builder.CreateIntrinsic(
2307 Intrinsic::experimental_vector_histogram_add, {VTy, IncAmt->getType()},
2308 {Address, IncAmt, Mask});
2309 applyMetadata(*HistogramInst);
2310}
2311
2313 VPCostContext &Ctx) const {
// Estimated cost = histogram intrinsic + (optional) multiply + one add/sub,
// with the arithmetic costed on a VF-wide vector of the increment type.
2314 // FIXME: Take the gather and scatter into account as well. For now we're
2315 // generating the same cost as the fallback path, but we'll likely
2316 // need to create a new TTI method for determining the cost, including
2317 // whether we can use base + vec-of-smaller-indices or just
2318 // vec-of-pointers.
2319 assert(VF.isVector() && "Invalid VF for histogram cost");
2320 Type *AddressTy = getOperand(0)->getScalarType();
2321 VPValue *IncAmt = getOperand(1);
2322 Type *IncTy = IncAmt->getScalarType();
2323 VectorType *VTy = VectorType::get(IncTy, VF);
2324
2325 // Assume that a non-constant update value (or a constant != 1) requires
2326 // a multiply, and add that into the cost.
2327 InstructionCost MulCost =
2328 Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
2329 if (match(IncAmt, m_One()))
2330 MulCost = TTI::TCC_Free;
2331
2332 // Find the cost of the histogram operation itself.
// The intrinsic is described as returning void and taking a vector of
// addresses, the scalar increment type, and an i1 mask vector.
2333 Type *PtrTy = VectorType::get(AddressTy, VF);
2334 Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
2335 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
2336 Type::getVoidTy(Ctx.LLVMCtx),
2337 {PtrTy, IncTy, MaskTy});
2338
2339 // Add the costs together with the add/sub operation.
2340 return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
2341 Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
2342}
2343
2344#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2346 VPSlotTracker &SlotTracker) const {
2347 O << Indent << "WIDEN-HISTOGRAM buckets: ";
2349
2350 if (Opcode == Instruction::Sub)
2351 O << ", dec: ";
2352 else {
2353 assert(Opcode == Instruction::Add);
2354 O << ", inc: ";
2355 }
2357
2358 if (VPValue *Mask = getMask()) {
2359 O << ", mask: ";
2360 Mask->printAsOperand(O, SlotTracker);
2361 }
2362}
2363#endif
2364
/// Construct the flag set by copying each individual fast-math flag out of
/// \p FMF through its corresponding query method.
2365VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
2366 AllowReassoc = FMF.allowReassoc();
2367 NoNaNs = FMF.noNaNs();
2368 NoInfs = FMF.noInfs();
2369 NoSignedZeros = FMF.noSignedZeros();
2370 AllowReciprocal = FMF.allowReciprocal();
2371 AllowContract = FMF.allowContract();
2372 ApproxFunc = FMF.approxFunc();
2373}
2374
// Map an opcode to the flags value it gets by default: the flag category that
// fits the opcode, with every individual flag cleared. Compare opcodes have
// no default and are unreachable here; any opcode not listed yields a
// default-constructed VPIRFlags.
2376 switch (Opcode) {
2377 case Instruction::Add:
2378 case Instruction::Sub:
2379 case Instruction::Mul:
2380 case Instruction::Shl:
2382 return WrapFlagsTy(false, false);
2383 case Instruction::Trunc:
2384 return TruncFlagsTy(false, false);
2385 case Instruction::Or:
2386 return DisjointFlagsTy(false);
2387 case Instruction::AShr:
2388 case Instruction::LShr:
2389 case Instruction::UDiv:
2390 case Instruction::SDiv:
2391 return ExactFlagsTy(false);
2392 case Instruction::GetElementPtr:
2395 return GEPNoWrapFlags::none();
2396 case Instruction::ZExt:
2397 case Instruction::UIToFP:
2398 return NonNegFlagsTy(false);
2399 case Instruction::FAdd:
2400 case Instruction::FSub:
2401 case Instruction::FMul:
2402 case Instruction::FDiv:
2403 case Instruction::FRem:
2404 case Instruction::FNeg:
2405 case Instruction::FPExt:
2406 case Instruction::FPTrunc:
2407 return FastMathFlags();
2408 case Instruction::ICmp:
2409 case Instruction::FCmp:
2411 llvm_unreachable("opcode requires explicit flags");
2412 default:
2413 return VPIRFlags();
2414 }
2415}
2416
2417#if !defined(NDEBUG)
/// Return true if the flag category currently stored in OpType may be attached
/// to an operation with opcode \p Opcode. Each category lists the opcodes it
/// accepts; the Other category accepts every opcode.
2418bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
2419 switch (OpType) {
2420 case OperationType::OverflowingBinOp:
// NOTE(review): `VPInstruction::VPInstruction::` goes through the injected
// class name, so it names the same enumerator as a single qualification.
2421 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
2422 Opcode == Instruction::Mul || Opcode == Instruction::Shl ||
2423 Opcode == VPInstruction::VPInstruction::CanonicalIVIncrementForPart;
2424 case OperationType::Trunc:
2425 return Opcode == Instruction::Trunc;
2426 case OperationType::DisjointOp:
2427 return Opcode == Instruction::Or;
2428 case OperationType::PossiblyExactOp:
2429 return Opcode == Instruction::AShr || Opcode == Instruction::LShr ||
2430 Opcode == Instruction::UDiv || Opcode == Instruction::SDiv;
2431 case OperationType::GEPOp:
2432 return Opcode == Instruction::GetElementPtr ||
2433 Opcode == VPInstruction::PtrAdd ||
2434 Opcode == VPInstruction::WidePtrAdd;
2435 case OperationType::FPMathOp:
2436 return Opcode == Instruction::Call || Opcode == Instruction::FAdd ||
2437 Opcode == Instruction::FMul || Opcode == Instruction::FSub ||
2438 Opcode == Instruction::FNeg || Opcode == Instruction::FDiv ||
2439 Opcode == Instruction::FRem || Opcode == Instruction::FPExt ||
2440 Opcode == Instruction::FPTrunc || Opcode == Instruction::PHI ||
2441 Opcode == Instruction::Select || Opcode == Instruction::SIToFP ||
2442 Opcode == Instruction::UIToFP ||
2443 Opcode == VPInstruction::WideIVStep ||
2445 case OperationType::FCmp:
2446 return Opcode == Instruction::FCmp;
2447 case OperationType::NonNegOp:
2448 return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP;
2449 case OperationType::Cmp:
2450 return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
2451 case OperationType::ReductionOp:
2453 case OperationType::Other:
2454 return true;
2455 }
2456 llvm_unreachable("Unknown OperationType enum");
2457}
2458
/// Return true if the stored flag category is the one \p Opcode requires.
/// Opcodes without default flags are checked explicitly first; for all others
/// the category of getDefaultFlags(Opcode) is the requirement.
2459bool VPIRFlags::hasRequiredFlagsForOpcode(unsigned Opcode) const {
2460 // Handle opcodes without default flags.
2461 if (Opcode == Instruction::ICmp)
2462 return OpType == OperationType::Cmp;
2463 if (Opcode == Instruction::FCmp)
2464 return OpType == OperationType::FCmp;
2466 return OpType == OperationType::ReductionOp;
2467
// A default category of `Other` imposes no requirement; any other default
// must match the stored category exactly.
2468 OperationType Required = getDefaultFlags(Opcode).OpType;
2469 return Required == OperationType::Other || Required == OpType;
2470}
2471#endif
2472
2473#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Write a short, lowercase, hyphen-separated name for \p Kind to \p OS.
/// Nothing else (no spacing or newline) is emitted.
2474static void printRecurrenceKind(raw_ostream &OS, const RecurKind &Kind) {
2475 switch (Kind) {
2476 case RecurKind::None:
2477 OS << "none";
2478 break;
2479 case RecurKind::Add:
2480 OS << "add";
2481 break;
2482 case RecurKind::Sub:
2483 OS << "sub";
2484 break;
2486 OS << "add-chain-with-subs";
2487 break;
2488 case RecurKind::Mul:
2489 OS << "mul";
2490 break;
2491 case RecurKind::Or:
2492 OS << "or";
2493 break;
2494 case RecurKind::And:
2495 OS << "and";
2496 break;
2497 case RecurKind::Xor:
2498 OS << "xor";
2499 break;
2500 case RecurKind::SMin:
2501 OS << "smin";
2502 break;
2503 case RecurKind::SMax:
2504 OS << "smax";
2505 break;
2506 case RecurKind::UMin:
2507 OS << "umin";
2508 break;
2509 case RecurKind::UMax:
2510 OS << "umax";
2511 break;
2512 case RecurKind::FAdd:
2513 OS << "fadd";
2514 break;
2516 OS << "fadd-chain-with-subs";
2517 break;
2518 case RecurKind::FSub:
2519 OS << "fsub";
2520 break;
2521 case RecurKind::FMul:
2522 OS << "fmul";
2523 break;
2524 case RecurKind::FMin:
2525 OS << "fmin";
2526 break;
2527 case RecurKind::FMax:
2528 OS << "fmax";
2529 break;
2530 case RecurKind::FMinNum:
2531 OS << "fminnum";
2532 break;
2533 case RecurKind::FMaxNum:
2534 OS << "fmaxnum";
2535 break;
2537 OS << "fminimum";
2538 break;
2540 OS << "fmaximum";
2541 break;
2543 OS << "fminimumnum";
2544 break;
2546 OS << "fmaximumnum";
2547 break;
2548 case RecurKind::FMulAdd:
2549 OS << "fmuladd";
2550 break;
2551 case RecurKind::AnyOf:
2552 OS << "any-of";
2553 break;
2554 case RecurKind::FindIV:
2555 OS << "find-iv";
2556 break;
2558 OS << "find-last";
2559 break;
2560 }
2561}
2562
// Print the flags held for the current flag category to O. Each flag that is
// set is written with a leading space, and a single trailing space is always
// written at the end, even when no flag was printed.
2564 switch (OpType) {
2565 case OperationType::Cmp:
2567 break;
2568 case OperationType::FCmp:
2571 break;
2572 case OperationType::DisjointOp:
2573 if (DisjointFlags.IsDisjoint)
2574 O << " disjoint";
2575 break;
2576 case OperationType::PossiblyExactOp:
2577 if (ExactFlags.IsExact)
2578 O << " exact";
2579 break;
2580 case OperationType::OverflowingBinOp:
2581 if (WrapFlags.HasNUW)
2582 O << " nuw";
2583 if (WrapFlags.HasNSW)
2584 O << " nsw";
2585 break;
2586 case OperationType::Trunc:
2587 if (TruncFlags.HasNUW)
2588 O << " nuw";
2589 if (TruncFlags.HasNSW)
2590 O << " nsw";
2591 break;
2592 case OperationType::FPMathOp:
2594 break;
2595 case OperationType::GEPOp: {
// "inbounds" takes precedence over "nusw" (only one of the two is printed);
// "nuw" is printed independently of both.
2597 if (Flags.isInBounds())
2598 O << " inbounds";
2599 else if (Flags.hasNoUnsignedSignedWrap())
2600 O << " nusw";
2601 if (Flags.hasNoUnsignedWrap())
2602 O << " nuw";
2603 break;
2604 }
2605 case OperationType::NonNegOp:
2606 if (NonNegFlags.NonNeg)
2607 O << " nneg";
2608 break;
2609 case OperationType::ReductionOp: {
// Reduction properties are printed as a parenthesized, comma-separated list.
2610 O << " (";
2612 if (isReductionInLoop())
2613 O << ", in-loop";
2614 if (isReductionOrdered())
2615 O << ", ordered";
2616 O << ")";
2618 break;
2619 }
2620 case OperationType::Other:
2621 break;
2622 }
2623 O << " ";
2624}
2625#endif
2626
// Generate the IR for this recipe's Opcode from its operands' values in State
// and register the result in State. Opcodes that belong to other recipes, and
// any opcode not listed below, are treated as unreachable.
2628 auto &Builder = State.Builder;
2629 switch (Opcode) {
2630 case Instruction::Call:
2631 case Instruction::UncondBr:
2632 case Instruction::CondBr:
2633 case Instruction::PHI:
2634 case Instruction::GetElementPtr:
2635 llvm_unreachable("This instruction is handled by a different recipe.");
2636 case Instruction::UDiv:
2637 case Instruction::SDiv:
2638 case Instruction::SRem:
2639 case Instruction::URem:
2640 case Instruction::Add:
2641 case Instruction::FAdd:
2642 case Instruction::Sub:
2643 case Instruction::FSub:
2644 case Instruction::FNeg:
2645 case Instruction::Mul:
2646 case Instruction::FMul:
2647 case Instruction::FDiv:
2648 case Instruction::FRem:
2649 case Instruction::Shl:
2650 case Instruction::LShr:
2651 case Instruction::AShr:
2652 case Instruction::And:
2653 case Instruction::Or:
2654 case Instruction::Xor: {
2655 // Just widen unops and binops.
2657 for (VPValue *VPOp : operands())
2658 Ops.push_back(State.get(VPOp));
2659
2660 Value *V = Builder.CreateNAryOp(Opcode, Ops);
2661
// Flags and metadata can only be attached when the builder produced an
// Instruction.
2662 if (auto *VecOp = dyn_cast<Instruction>(V)) {
2663 applyFlags(*VecOp);
2664 applyMetadata(*VecOp);
2665 }
2666
2667 // Use this vector value for all users of the original instruction.
2668 State.set(this, V);
2669 break;
2670 }
2671 case Instruction::ExtractValue: {
// Operand 0 is the aggregate; operand 1 must be a constant integer index.
2672 assert(getNumOperands() == 2 && "expected single level extractvalue");
2673 Value *Op = State.get(getOperand(0));
2674 Value *Extract = Builder.CreateExtractValue(
2675 Op, cast<VPConstantInt>(getOperand(1))->getZExtValue());
2676 State.set(this, Extract);
2677 break;
2678 }
2679 case Instruction::Freeze: {
2680 Value *Op = State.get(getOperand(0));
2681 Value *Freeze = Builder.CreateFreeze(Op);
2682 State.set(this, Freeze);
2683 break;
2684 }
2685 case Instruction::ICmp:
2686 case Instruction::FCmp: {
2687 // Widen compares. Generate vector compares.
2688 bool FCmp = Opcode == Instruction::FCmp;
2689 Value *A = State.get(getOperand(0));
2690 Value *B = State.get(getOperand(1));
2691 Value *C = nullptr;
2692 if (FCmp) {
2693 C = Builder.CreateFCmp(getPredicate(), A, B);
2694 } else {
2695 C = Builder.CreateICmp(getPredicate(), A, B);
2696 }
2697 if (auto *I = dyn_cast<Instruction>(C)) {
2698 applyFlags(*I);
2699 applyMetadata(*I);
2700 }
2701 State.set(this, C);
2702 break;
2703 }
2704 case Instruction::Select: {
// The condition is fetched as a scalar when it is a single scalar; the two
// selected values are fetched in their default form.
2705 VPValue *CondOp = getOperand(0);
2706 Value *Cond = State.get(CondOp, vputils::isSingleScalar(CondOp));
2707 Value *Op0 = State.get(getOperand(1));
2708 Value *Op1 = State.get(getOperand(2));
2709 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
2710 State.set(this, Sel);
2711 if (auto *I = dyn_cast<Instruction>(Sel)) {
2713 applyFlags(*I);
2714 applyMetadata(*I);
2715 }
2716 break;
2717 }
2718 default:
2719 // This instruction is not vectorized by simple widening.
2720 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
2721 << Instruction::getOpcodeName(Opcode));
2722 llvm_unreachable("Unhandled instruction!");
2723 } // end of switch.
2724
2725#if !defined(NDEBUG)
2726 // Verify that VPlan type inference results agree with the type of the
2727 // generated values.
2728 assert(VectorType::get(this->getScalarType(), State.VF) ==
2729 State.get(this)->getType() &&
2730 "inferred type and type from generated instructions do not match");
2731#endif
2732}
2733
2735 VPCostContext &Ctx) const {
// Every supported opcode shares one cost path, getCostForRecipeWithOpcode;
// any opcode not listed is treated as unreachable.
2736 switch (Opcode) {
2737 case Instruction::UDiv:
2738 case Instruction::SDiv:
2739 case Instruction::SRem:
2740 case Instruction::URem:
2741 // If the div/rem operation isn't safe to speculate and requires
2742 // predication, then the only way we can even create a vplan is to insert
2743 // a select on the second input operand to ensure we use the value of 1
2744 // for the inactive lanes. The select will be costed separately.
2745 case Instruction::FNeg:
2746 case Instruction::Add:
2747 case Instruction::FAdd:
2748 case Instruction::Sub:
2749 case Instruction::FSub:
2750 case Instruction::Mul:
2751 case Instruction::FMul:
2752 case Instruction::FDiv:
2753 case Instruction::FRem:
2754 case Instruction::Shl:
2755 case Instruction::LShr:
2756 case Instruction::AShr:
2757 case Instruction::And:
2758 case Instruction::Or:
2759 case Instruction::Xor:
2760 case Instruction::Freeze:
2761 case Instruction::ExtractValue:
2762 case Instruction::ICmp:
2763 case Instruction::FCmp:
2764 case Instruction::Select:
2765 return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
2766 default:
2767 llvm_unreachable("Unsupported opcode for instruction");
2768 }
2769}
2770
2771#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2773 VPSlotTracker &SlotTracker) const {
2774 O << Indent << "WIDEN ";
2776 O << " = " << Instruction::getOpcodeName(Opcode);
2777 printFlags(O);
2779}
2780#endif
2781
// Emit a vector cast of operand 0 to a VF-wide vector of this recipe's scalar
// type, record it in State, and attach flags/metadata when the builder
// produced an Instruction.
2783 auto &Builder = State.Builder;
2784 /// Vectorize casts.
2785 assert(State.VF.isVector() && "Not vectorizing?");
2786 Type *DestTy = VectorType::get(getScalarType(), State.VF);
2787 VPValue *Op = getOperand(0);
2788 Value *A = State.get(Op);
2789 Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
2790 State.set(this, Cast);
2791 if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
2792 applyFlags(*CastOp);
2793 applyMetadata(*CastOp);
2794 }
2795}
2796
2801
2802#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2804 VPSlotTracker &SlotTracker) const {
2805 O << Indent << "WIDEN-CAST ";
2807 O << " = " << Instruction::getOpcodeName(Opcode);
2808 printFlags(O);
2810 O << " to " << *getScalarType();
2811}
2812#endif
2813
2815 VPCostContext &Ctx) const {
// Costed as a PHI through TTI's control-flow instruction cost query.
2816 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
2817}
2818
2819#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2821 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
2822 O << Indent;
2824 O << " = WIDEN-INDUCTION";
2825 printFlags(O);
2827
2828 if (auto *TI = getTruncInst())
2829 O << " (truncated to " << *TI->getType() << ")";
2830}
2831#endif
2832
// True only when all three hold: the start value matches integer zero, the
// step matches one, and this recipe's scalar type equals the canonical IV
// type of its region.
2834 // The step may be defined by a recipe in the preheader (e.g. if it requires
2835 // SCEV expansion), but for the canonical induction the step is required to be
2836 // 1, which is represented as live-in.
2837 return match(getStartValue(), m_ZeroInt()) &&
2838 match(getStepValue(), m_One()) &&
2839 getScalarType() == getRegion()->getCanonicalIVType();
2840}
2841
2843 VPCostContext &Ctx) const {
// Estimate the cost of the arithmetic needed to derive the IV: an optional
// index-to-step-type cast, plus whichever of mul / shl / add are expected to
// survive simplification. Induction kinds that are not costed return 0.
2844 // The cost model for this is modelled on expandVPDerivedIV in
2845 // VPlanTransforms.cpp. In order to avoid overly pessimistic costs that can
2846 // negatively affect vectorization it takes into account any expected
2847 // simplifications that happen in simplifyRecipe.
2848 switch (getInductionKind()) {
2849 default:
2850 // TODO: Compute cost for remaining kinds.
2851 break;
2853 // There are currently no tests that expose a path where all lanes are
2854 // used, so it's better to bail out for now.
2855 if (!vputils::onlyFirstLaneUsed(this))
2856 break;
2857
2858 // Start off by assuming we need both mul and add, then refine this.
2859 bool NeedsMul = true, NeedsAdd = true, NeedsShl = false;
2860
2861 // If the start value is zero the add gets folded away.
2862 if (auto *VPV = dyn_cast<VPIRValue>(getStartValue()))
2863 if (auto *StartC = dyn_cast<ConstantInt>(VPV->getValue()))
2864 NeedsAdd = !StartC->isZero();
2865
2866 // For some values of step the arithmetic changes:
2867 // 1. A step of 1 requires no operation.
2868 // 2. A step of -1 requires a negate.
2869 // 3. A power-of-2 step will use a shl, instead of a mul.
2870 Type *StepTy = getStepValue()->getScalarType();
2872 if (auto *VPV = dyn_cast<VPIRValue>(getStepValue())) {
2873 if (auto *StepC = dyn_cast<ConstantInt>(VPV->getValue())) {
2874 if (StepC->isOne())
2875 NeedsMul = false;
2876 else if (StepC->isMinusOne()) {
2877 // This will most likely end up as a negate in simplifyRecipe, and
2878 // the negate will be combined with the add to make a sub.
2879 // NOTE: This is perhaps an invalid assumption that the cost of an
2880 // 'add' is the same as a 'sub'.
2881 NeedsMul = false;
2882 NeedsAdd = true;
2883 } else if (StepC->getValue().isPowerOf2()) {
2884 // This will most likely end up as a shift-left in simplifyRecipe
2885 NeedsMul = false;
2886 NeedsShl = true;
2887 }
2888 }
2889 }
2890
2891 // Add the cost of the conversion from index to step type if the index
2892 // will be used.
2893 Type *IndexTy = getIndex()->getScalarType();
2894 unsigned StepTySize = StepTy->getScalarSizeInBits();
2895 unsigned IndexTySize = IndexTy->getScalarSizeInBits();
2896 if ((NeedsAdd || NeedsMul || NeedsShl) && StepTySize != IndexTySize) {
2897 unsigned CastOpc =
2898 StepTySize < IndexTySize ? Instruction::Trunc : Instruction::SExt;
2899 Cost += Ctx.TTI.getCastInstrCost(
2900 CastOpc, StepTy, IndexTy, TTI::CastContextHint::None, Ctx.CostKind);
2901 }
2902
2903 if (NeedsMul)
2904 Cost += Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, StepTy,
2905 Ctx.CostKind);
// The shift is costed with an arbitrary first operand and a uniform constant
// second operand (the shift amount).
2906 if (NeedsShl)
2907 Cost += Ctx.TTI.getArithmeticInstrCost(
2908 Instruction::Shl, StepTy, Ctx.CostKind,
2909 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
2910 {TargetTransformInfo::OK_UniformConstantValue,
2911 TargetTransformInfo::OP_None});
2912 if (NeedsAdd)
2913 Cost += Ctx.TTI.getArithmeticInstrCost(Instruction::Add, StepTy,
2914 Ctx.CostKind);
2915 return Cost;
2916 }
2917 }
2918
// Reached for every path that broke out of the switch without costing.
2919 return 0;
2920}
2921
2922#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2924 VPSlotTracker &SlotTracker) const {
2925 O << Indent;
2927 O << " = DERIVED-IV ";
2928 getStartValue()->printAsOperand(O, SlotTracker);
2929 O << " + ";
2930 getOperand(1)->printAsOperand(O, SlotTracker);
2931 O << " * ";
2932 getStepValue()->printAsOperand(O, SlotTracker);
2933}
2934#endif
2935
// Generate one scalar value per required lane. Lane L gets
// AddOp(BaseIV, MulOp(AddOp(StartIdx0, L), Step)), stored per lane in State.
2937 // Fast-math-flags propagate from the original induction instruction.
2938 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
2939 State.Builder.setFastMathFlags(getFastMathFlagsOrNone());
2940
2941 /// Compute scalar induction steps. \p BaseIV is the scalar induction
2942 /// variable on which to base the steps, \p Step is the size of the step.
2943
2944 Value *BaseIV = State.get(getOperand(0), VPLane(0));
2945 Value *Step = State.get(getStepValue(), VPLane(0));
2946 IRBuilderBase &Builder = State.Builder;
2947
2948 // Ensure step has the same type as that of scalar IV.
2949 Type *BaseIVTy = BaseIV->getType()->getScalarType();
2950 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
2951
2952 // We build scalar steps for both integer and floating-point induction
2953 // variables. Here, we determine the kind of arithmetic we will perform.
// Integer IVs always use Add/Mul; otherwise the recipe's InductionOpcode is
// the additive op and FMul the multiplicative op.
2956 if (BaseIVTy->isIntegerTy()) {
2957 AddOp = Instruction::Add;
2958 MulOp = Instruction::Mul;
2959 } else {
2960 AddOp = InductionOpcode;
2961 MulOp = Instruction::FMul;
2962 }
2963
2964 // Determine the number of scalars we need to generate.
2965 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
2966 // Compute the scalar steps and save the results in State.
2967
// Lanes are counted up to the known minimum of VF. The starting index is the
// optional start-index operand, or zero of the IV type when it is absent.
2968 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2969 Value *StartIdx0 = getStartIndex() ? State.get(getStartIndex(), true)
2970 : Constant::getNullValue(BaseIVTy);
2971
2972 for (unsigned Lane = 0; Lane < EndLane; ++Lane) {
2973 // It is okay if the induction variable type cannot hold the lane number,
2974 // we expect truncation in this case.
2975 Constant *LaneValue =
2976 BaseIVTy->isIntegerTy()
2977 ? ConstantInt::get(BaseIVTy, Lane, /*IsSigned=*/false,
2978 /*ImplicitTrunc=*/true)
2979 : ConstantFP::get(BaseIVTy, Lane);
2980 Value *StartIdx = Builder.CreateBinOp(AddOp, StartIdx0, LaneValue);
2981 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2982 "Expected StartIdx to be folded to a constant when VF is not "
2983 "scalable");
2984 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2985 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
2986 State.set(this, Add, VPLane(Lane));
2987 }
2988}
2989
2990#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2992 VPSlotTracker &SlotTracker) const {
2993 O << Indent;
2995 O << " = SCALAR-STEPS ";
2997}
2998#endif
2999
3001 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
3003}
3004
// Emit a GEP named "wide.gep". Operands that are single scalars are fetched
// as scalars. The first operand is the base pointer and the remaining ones are
// the indices. The result is stored as a scalar when this recipe itself is a
// single scalar.
3006 assert(State.VF.isVector() && "not widening");
3007 auto Ops = map_to_vector(operands(), [&](VPValue *Op) {
3008 return State.get(Op, vputils::isSingleScalar(Op));
3009 });
3010 auto *GEP =
3011 State.Builder.CreateGEP(getSourceElementType(), Ops.front(),
3012 drop_begin(Ops), "wide.gep", getGEPNoWrapFlags());
3013 State.set(this, GEP, vputils::isSingleScalar(this));
3014}
3015
3016#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3018 VPSlotTracker &SlotTracker) const {
3019 O << Indent << "WIDEN-GEP ";
3021 O << " = getelementptr";
3022 printFlags(O);
3024}
3025#endif
3026
// Build, immediately before this recipe, the VPInstructions that compute the
// offset for `Part`. Must only run while no offset operand exists yet.
3028 assert(!getOffset() && "Unexpected offset operand");
3029 VPBuilder Builder(this);
3030 VPlan &Plan = *getParent()->getPlan();
3031 VPValue *VFVal = getVFValue();
3032 const DataLayout &DL = Plan.getDataLayout();
// All arithmetic is done in the index type the data layout reports for this
// recipe's scalar type; the stride constant and the VF value are both
// brought to that type.
3033 Type *IndexTy = DL.getIndexType(this->getScalarType());
3034 VPValue *Stride =
3035 Plan.getConstantInt(IndexTy, getStride(), /*IsSigned=*/true);
3036 Type *VFTy = VFVal->getScalarType();
3037 VPValue *VF = Builder.createScalarZExtOrTrunc(VFVal, IndexTy, VFTy,
3039
3040 // Offset for Part0 = Offset0 = Stride * (VF - 1).
// NOTE(review): `{true, true}` presumably sets both wrap flags on the
// subtraction; confirm against VPBuilder::createSub.
3041 VPInstruction *VFMinusOne =
3042 Builder.createSub(VF, Plan.getConstantInt(IndexTy, 1u),
3043 DebugLoc::getUnknown(), "", {true, true});
3044 VPInstruction *Offset0 =
3045 Builder.createOverflowingOp(Instruction::Mul, {VFMinusOne, Stride});
3046
3047 // Offset for PartN = Offset0 + Part * Stride * VF.
3048 VPValue *PartxStride =
3049 Plan.getConstantInt(IndexTy, Part * getStride(), /*IsSigned=*/true);
3050 VPValue *Offset = Builder.createAdd(
3051 Offset0,
3052 Builder.createOverflowingOp(Instruction::Mul, {PartxStride, VF}));
3054}
3055
// Emit a scalar GEP from the pointer operand by the previously materialized
// offset operand, and record it as this recipe's scalar value.
3057 auto &Builder = State.Builder;
3058 assert(getOffset() && "Expected prior materialization of offset");
3059 Value *Ptr = State.get(getPointer(), true);
3060 Value *Offset = State.get(getOffset(), true);
3061 Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Offset, "",
3063 State.set(this, ResultPtr, /*IsScalar*/ true);
3064}
3065
3066#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3068 VPSlotTracker &SlotTracker) const {
3069 O << Indent;
3071 O << " = vector-end-pointer";
3072 printFlags(O);
3074}
3075#endif
3076
// Emit a scalar GEP from lane 0 of operand 0 by the VFxPart offset, scaled by
// the stride unless the stride matches one. The result is recorded as a scalar.
3078 assert(getVFxPart() &&
3079 "Expected prior simplification of recipe without VFxPart");
3080
3081 auto &Builder = State.Builder;
3082 Value *Ptr = State.get(getOperand(0), VPLane(0));
3083 Value *Offset = State.get(getVFxPart(), true);
3084 // TODO: Expand to VPInstruction to support constant folding.
3085 if (!match(getStride(), m_One())) {
// The stride is zero-extended or truncated to the offset's type before the
// multiply.
3086 Value *Stride = Builder.CreateZExtOrTrunc(State.get(getStride(), true),
3087 Offset->getType());
3088 Offset = Builder.CreateMul(Offset, Stride);
3089 }
3090 Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Offset, "",
3092 State.set(this, ResultPtr, /*IsScalar*/ true);
3093}
3094
3095#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3097 VPSlotTracker &SlotTracker) const {
3098 O << Indent;
3100 O << " = vector-pointer";
3101 printFlags(O);
3103}
3104#endif
3105
3107 VPCostContext &Ctx) const {
3108 // A blend will be expanded to a select VPInstruction, which will generate a
3109 // scalar select if only the first lane is used.
3111 VF = ElementCount::getFixed(1);
3112
// One select is charged per incoming value beyond the first. Each select
// chooses between values of the result type under an i1 condition of matching
// width, with no specific compare predicate.
3113 Type *ResultTy = toVectorTy(this->getScalarType(), VF);
3114 Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
3115 return (getNumIncomingValues() - 1) *
3116 Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
3117 CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
3118}
3119
3120#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3122 VPSlotTracker &SlotTracker) const {
3123 O << Indent << "BLEND ";
3125 O << " =";
3126 printFlags(O);
3127 if (getNumIncomingValues() == 1) {
3128 // Not a User of any mask: not really blending, this is a
3129 // single-predecessor phi.
3130 getIncomingValue(0)->printAsOperand(O, SlotTracker);
3131 } else {
3132 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
3133 if (I != 0)
3134 O << " ";
3135 getIncomingValue(I)->printAsOperand(O, SlotTracker);
3136 if (I == 0 && isNormalized())
3137 continue;
3138 O << "/";
3139 getMask(I)->printAsOperand(O, SlotTracker);
3140 }
3141 }
3142}
3143#endif
3144
3148 "In-loop AnyOf reductions aren't currently supported");
// Generate the reduction: optionally mask the vector operand with a select,
// then fold it into the chain value. The three forms handled are ordered,
// partial, and (otherwise) in-loop.
3149 // Propagate the fast-math flags carried by the underlying instruction.
3150 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
3151 State.Builder.setFastMathFlags(getFastMathFlagsOrNone());
3152 Value *NewVecOp = State.get(getVecOp());
3153 if (VPValue *Cond = getCondOp()) {
// Lanes where the condition is false take Start instead of the operand value.
// Start is splatted to the operand's element count for vector VFs.
3154 Value *NewCond = State.get(Cond, State.VF.isScalar());
3155 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
3156 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
3157
3158 Value *Start =
3160 if (State.VF.isVector())
3161 Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
3162
3163 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
3164 NewVecOp = Select;
3165 }
3166 Value *NewRed;
3167 Value *NextInChain;
3168 if (isOrdered()) {
// Ordered: the scalar chain value is threaded through the reduction itself.
// For a scalar VF a plain binary op is emitted instead.
3169 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
3170 if (State.VF.isVector())
3171 NewRed =
3172 createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain);
3173 else
3174 NewRed = State.Builder.CreateBinOp(
3176 PrevInChain, NewVecOp);
// NOTE(review): PrevInChain is local to this branch and is not read after
// this assignment in the visible code.
3177 PrevInChain = NewRed;
3178 NextInChain = NewRed;
3179 } else if (isPartialReduction()) {
// Partial: the chain value is fetched in non-scalar form and combined with the
// operand by the partial-reduce intrinsic matching the kind (add or fadd).
3180 assert((Kind == RecurKind::Add || Kind == RecurKind::FAdd) &&
3181 "Unexpected partial reduction kind");
3182 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ false);
3183 NewRed = State.Builder.CreateIntrinsic(
3184 PrevInChain->getType(),
3185 Kind == RecurKind::Add ? Intrinsic::vector_partial_reduce_add
3186 : Intrinsic::vector_partial_reduce_fadd,
3187 {PrevInChain, NewVecOp}, State.Builder.getFastMathFlags(),
3188 "partial.reduce");
3189 PrevInChain = NewRed;
3190 NextInChain = NewRed;
3191 } else {
// In-loop: reduce the vector first, then combine that result with the scalar
// chain value via a min/max op or a binary op.
3192 assert(isInLoop() &&
3193 "The reduction must either be ordered, partial or in-loop");
3194 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
3195 NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind);
3197 NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain);
3198 else
3199 NextInChain = State.Builder.CreateBinOp(
3201 PrevInChain, NewRed);
3202 }
// Partial reductions are recorded as non-scalar; every other form as scalar.
3203 State.set(this, NextInChain, /*IsScalar*/ !isPartialReduction());
3204}
3205
3207
// Generate a reduction that takes a mask and an explicit vector length (EVL),
// both passed to the reduction helpers, and record the scalar result.
3208 auto &Builder = State.Builder;
3209 // Propagate the fast-math flags carried by the underlying instruction.
3210 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
3211 Builder.setFastMathFlags(getFastMathFlagsOrNone());
3212
3214 Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
3215 Value *VecOp = State.get(getVecOp());
3216 Value *EVL = State.get(getEVL(), VPLane(0));
3217
// Without a condition operand, use an all-true mask with VF lanes.
3218 Value *Mask;
3219 if (VPValue *CondOp = getCondOp())
3220 Mask = State.get(CondOp);
3221 else
3222 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
3223
3224 Value *NewRed;
3225 if (isOrdered()) {
3226 NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL);
3227 } else {
// Unordered: reduce the vector, then fold the previous chain value in with a
// min/max op or a binary op.
3228 NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL);
3230 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
3231 else
3232 NewRed = Builder.CreateBinOp(
3234 Prev);
3235 }
3236 State.set(this, NewRed, /*IsScalar*/ true);
3237}
3238
3240 VPCostContext &Ctx) const {
// Cost the reduction on a VF-wide vector of this recipe's scalar type. Partial
// reductions use TTI's partial-reduction cost (plus a select when
// conditional); the other paths use TTI's min/max or arithmetic reduction
// cost.
3241 RecurKind RdxKind = getRecurrenceKind();
3242 Type *ElementTy = this->getScalarType();
3243 auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
3244 unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
// Fast-math flags are only forwarded for floating-point element types.
3246 std::optional<FastMathFlags> OptionalFMF =
3247 ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
3248
3249 if (isPartialReduction()) {
3250 InstructionCost CondCost = 0;
3251 if (isConditional()) {
3253 auto *CondTy =
3255 CondCost = Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VectorTy,
3256 CondTy, Pred, Ctx.CostKind);
3257 }
// The same element type is passed for all three type parameters of the
// partial-reduction query.
3258 return CondCost + Ctx.TTI.getPartialReductionCost(
3259 Opcode, ElementTy, ElementTy, ElementTy, VF,
3260 TTI::PR_None, TTI::PR_None, {}, Ctx.CostKind,
3261 OptionalFMF);
3262 }
3263
3264 // TODO: Support any-of reductions.
3265 assert(
3267 ForceTargetInstructionCost.getNumOccurrences() > 0) &&
3268 "Any-of reduction not implemented in VPlan-based cost model currently.");
3269
3270 // Note that TTI should model the cost of moving result to the scalar register
3271 // and the BinOp cost in the getMinMaxReductionCost().
3274 return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
3275 }
3276
3277 // Note that TTI should model the cost of moving result to the scalar register
3278 // and the BinOp cost in the getArithmeticReductionCost().
3279 return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
3280 Ctx.CostKind);
3281}
3282
3283VPExpressionRecipe::VPExpressionRecipe(
3284 ExpressionTypes ExpressionType,
3285 ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
3286 : VPSingleDefRecipe(VPRecipeBase::VPExpressionSC, {},
3287 cast<VPReductionRecipe>(ExpressionRecipes.back())
3288 ->getChainOp()
3289 ->getScalarType()),
3290 ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) {
3291 assert(!ExpressionRecipes.empty() && "Nothing to combine?");
3292 assert(
3293 none_of(ExpressionRecipes,
3294 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
3295 "expression cannot contain recipes with side-effects");
3296
3297 // Maintain a copy of the expression recipes as a set of users.
3298 SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
3299 for (auto *R : ExpressionRecipes)
3300 ExpressionRecipesAsSetOfUsers.insert(R);
3301
3302 // Recipes in the expression, except the last one, must only be used by
3303 // (other) recipes inside the expression. If there are other users, external
3304 // to the expression, use a clone of the recipe for external users.
3305 for (VPSingleDefRecipe *R : reverse(ExpressionRecipes)) {
3306 if (R != ExpressionRecipes.back() &&
3307 any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
3308 return !ExpressionRecipesAsSetOfUsers.contains(U);
3309 })) {
3310 // There are users outside of the expression. Clone the recipe and use the
3311 // clone those external users.
3312 VPSingleDefRecipe *CopyForExtUsers = R->clone();
3313 R->replaceUsesWithIf(CopyForExtUsers, [&ExpressionRecipesAsSetOfUsers](
3314 VPUser &U, unsigned) {
3315 return !ExpressionRecipesAsSetOfUsers.contains(&U);
3316 });
3317 CopyForExtUsers->insertBefore(R);
3318 }
3319 if (R->getParent())
3320 R->removeFromParent();
3321 }
3322
3323 // Internalize all external operands to the expression recipes. To do so,
3324 // create new temporary VPValues for all operands defined by a recipe outside
3325 // the expression. The original operands are added as operands of the
3326 // VPExpressionRecipe itself.
3327 for (auto *R : ExpressionRecipes) {
3328 for (const auto &[Idx, Op] : enumerate(R->operands())) {
3329 auto *Def = Op->getDefiningRecipe();
3330 if (Def && ExpressionRecipesAsSetOfUsers.contains(Def))
3331 continue;
3332 addOperand(Op);
3333 LiveInPlaceholders.push_back(new VPSymbolicValue(Op->getScalarType()));
3334 }
3335 }
3336
3337 // Replace each external operand with the first one created for it in
3338 // LiveInPlaceholders.
3339 for (auto *R : ExpressionRecipes)
3340 for (auto const &[LiveIn, Tmp] : zip(operands(), LiveInPlaceholders))
3341 R->replaceUsesOfWith(LiveIn, Tmp);
3342}
3343
// NOTE(review): the line that opens this definition is missing from this
// listing; presumably this is VPExpressionRecipe::decompose -- confirm.
// Visible behaviour: re-inserts the bundled recipes before this recipe, swaps
// each placeholder back to the operand at the same index, forwards all uses of
// this recipe to the last bundled recipe and empties the bundle.
3345 for (auto *R : ExpressionRecipes)
3346 // Since the list could contain duplicates, make sure the recipe hasn't
3347 // already been inserted.
3348 if (!R->getParent())
3349 R->insertBefore(this);
3350
// Placeholder i stands for operand i of this recipe.
3351 for (const auto &[Idx, Op] : enumerate(operands()))
3352 LiveInPlaceholders[Idx]->replaceAllUsesWith(Op);
3353
3354 replaceAllUsesWith(ExpressionRecipes.back());
3355 ExpressionRecipes.clear();
3356}
3357
3359 VPCostContext &Ctx) const {
// NOTE(review): this listing omits the first signature line and a few
// statements of this definition (the embedded numbering skips 3358, 3362,
// 3378, 3389, 3416 and 3418); presumably this is
// VPExpressionRecipe::computeCost -- confirm against the complete file.
//
// Visible behaviour: picks a TTI cost hook based on ExpressionType and on
// whether the reduction that ends the bundle is a partial reduction.
3360 Type *RedTy = this->getScalarType();
3361 auto *SrcVecTy =
// Opcode of the reduction recipe that ends the bundle.
3363 unsigned Opcode = RecurrenceDescriptor::getOpcode(
3364 cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
3365 switch (ExpressionType) {
// The negated form switches to the matching subtract opcode and then shares
// the code of the non-negated form.
3366 case ExpressionTypes::NegatedExtendedReduction:
3367 assert((Opcode == Instruction::Add || Opcode == Instruction::FAdd) &&
3368 "Unexpected opcode");
3369 Opcode = Opcode == Instruction::Add ? Instruction::Sub : Instruction::FSub;
3370 [[fallthrough]];
3371 case ExpressionTypes::ExtendedReduction: {
3372 auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());
3373 auto *ExtR = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3374
3375 if (RedR->isPartialReduction())
3376 return Ctx.TTI.getPartialReductionCost(
3377 Opcode, getOperand(0)->getScalarType(), nullptr, RedTy, VF,
3379 TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind,
3380 RedTy->isFloatingPointTy()
3381 ? std::optional{RedR->getFastMathFlagsOrNone()}
3382 : std::nullopt);
3383 else if (!RedTy->isFloatingPointTy())
3384 // TTI::getExtendedReductionCost only supports integer types.
3385 return Ctx.TTI.getExtendedReductionCost(
3386 Opcode, ExtR->getOpcode() == Instruction::ZExt, RedTy, SrcVecTy,
3387 std::nullopt, Ctx.CostKind);
3388 else
// NOTE(review): the statement of this else branch is missing from this listing.
3390 }
3391 case ExpressionTypes::MulAccReduction:
3392 return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
3393 Ctx.CostKind);
3394
// Same opcode adjustment as above, but any opcode other than Add/FAdd is
// treated as unreachable here.
3395 case ExpressionTypes::ExtNegatedMulAccReduction:
3396 switch (Opcode) {
3397 case Instruction::Add:
3398 Opcode = Instruction::Sub;
3399 break;
3400 case Instruction::FAdd:
3401 Opcode = Instruction::FSub;
3402 break;
3403 default:
3404 llvm_unreachable("Unsupported opcode for ExtNegatedMulAccReduction");
3405 }
3406 [[fallthrough]];
3407 case ExpressionTypes::ExtMulAccReduction: {
3408 auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());
// Partial reductions expect the bundle layout [ext0, ext1, mul, ..., reduce].
3409 if (RedR->isPartialReduction()) {
3410 auto *Ext0R = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3411 auto *Ext1R = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3412 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
3413 return Ctx.TTI.getPartialReductionCost(
3414 Opcode, getOperand(0)->getScalarType(),
3415 getOperand(1)->getScalarType(), RedTy, VF,
3417 Ext0R->getOpcode()),
3419 Ext1R->getOpcode()),
3420 Mul->getOpcode(), Ctx.CostKind,
3421 RedTy->isFloatingPointTy()
3422 ? std::optional{RedR->getFastMathFlagsOrNone()}
3423 : std::nullopt);
3424 }
3425 assert(Opcode != Instruction::FSub && "Only integer types are supported");
// The first argument tells TTI whether the first bundled cast is a zext.
3426 return Ctx.TTI.getMulAccReductionCost(
3427 cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
3428 Instruction::ZExt,
3429 Opcode, RedTy, SrcVecTy, Ctx.CostKind);
3430 }
3431 }
3432 llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
3433}
3434
// NOTE(review): the line that opens this definition is missing from this
// listing; presumably VPExpressionRecipe::mayReadOrWriteMemory -- confirm.
// Returns true if any bundled recipe may read from or write to memory.
3436 return any_of(ExpressionRecipes, [](VPSingleDefRecipe *R) {
3437 return R->mayReadFromMemory() || R->mayWriteToMemory();
3438 });
3439}
3440
// NOTE(review): the line that opens this definition is missing from this
// listing; presumably VPExpressionRecipe::mayHaveSideEffects -- confirm.
// Always returns false; the assert checks that no bundled recipe reports side
// effects.
3442 assert(
3443 none_of(ExpressionRecipes,
3444 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
3445 "expression cannot contain recipes with side-effects");
3446 return false;
3447}
3448
// NOTE(review): the line that opens this definition is missing from this
// listing; presumably VPExpressionRecipe::isSingleScalar -- confirm.
// Returns true only when the last bundled recipe is a VPReductionRecipe that
// is not a partial reduction.
3450 auto *RR = dyn_cast<VPReductionRecipe>(ExpressionRecipes.back());
3451 return RR && !RR->isPartialReduction();
3452}
3453
3454#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3455
3457 VPSlotTracker &SlotTracker) const {
// NOTE(review): this listing omits the first signature line and most of the
// statements that print operands (gaps in the embedded numbering, e.g. 3459,
// 3470, 3475, 3485); presumably VPExpressionRecipe::printRecipe -- confirm.
//
// Visible behaviour: writes "EXPRESSION ", then a textual form of the bundled
// computation that depends on ExpressionType.
3458 O << Indent << "EXPRESSION ";
3460 O << " = ";
3461 auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
3462 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
// The reduction start value is the last operand, or the second-to-last one
// when the reduction is conditional.
3463 VPValue *RdxStart =
3464 getOperand(getNumOperands() - (Red->isConditional() ? 2 : 1));
3465
3466 switch (ExpressionType) {
3467 case ExpressionTypes::NegatedExtendedReduction:
3468 case ExpressionTypes::ExtendedReduction: {
3469 bool Negated = ExpressionType == ExpressionTypes::NegatedExtendedReduction;
3471 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3472 O << Instruction::getOpcodeName(Opcode) << " (";
// A negated operand is wrapped in "sub (0, ...)" for Add and "fneg(...)"
// otherwise.
3473 if (Negated)
3474 O << (Opcode == Instruction::Add ? "sub (0, " : "fneg(");
3476 if (Negated)
3477 O << ")";
3478 Red->printFlags(O);
3479
3480 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3481 O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3482 << *Ext0->getScalarType();
3483 if (Red->isConditional()) {
3484 O << ", ";
3486 }
3487 O << ")";
3488 break;
3489 }
3490 case ExpressionTypes::ExtNegatedMulAccReduction: {
3491 RdxStart->printAsOperand(O, SlotTracker);
3492 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3494 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
3495 << " (sub (0, mul";
// Bundle layout used here: [ext0, ext1, mul, ..., reduce].
3496 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
3497 Mul->printFlags(O);
3498 O << "(";
3500 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3501 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3502 << *Ext0->getScalarType() << "), (";
3504 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3505 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3506 << *Ext1->getScalarType() << ")";
3507 if (Red->isConditional()) {
3508 O << ", ";
3510 }
3511 O << "))";
3512 break;
3513 }
3514 case ExpressionTypes::MulAccReduction:
3515 case ExpressionTypes::ExtMulAccReduction: {
3516 RdxStart->printAsOperand(O, SlotTracker);
3517 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3519 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
3520 << " (";
3521 O << "mul";
// The multiply is bundled recipe 2 when both inputs are extended and bundled
// recipe 0 otherwise.
3522 bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
3523 auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
3524 : ExpressionRecipes[0]);
3525 Mul->printFlags(O);
3526 if (IsExtended)
3527 O << "(";
3529 if (IsExtended) {
3530 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3531 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3532 << *Ext0->getScalarType() << "), (";
3533 } else {
3534 O << ", ";
3535 }
3537 if (IsExtended) {
3538 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3539 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3540 << *Ext1->getScalarType() << ")";
3541 }
3542 if (Red->isConditional()) {
3543 O << ", ";
3545 }
3546 O << ")";
3547 break;
3548 }
3549 }
3550}
3551
3553 VPSlotTracker &SlotTracker) const {
// NOTE(review): this listing omits the first signature line and the
// statements that print operands (the embedded numbering skips 3552, 3558,
// 3560, 3564, 3566 and 3569); presumably VPReductionRecipe::printRecipe --
// confirm.
// Visible behaviour: the line starts with "PARTIAL-REDUCE " for partial
// reductions and "REDUCE " otherwise, followed by the flags and a
// "reduce.<...> (...)" form.
3554 if (isPartialReduction())
3555 O << Indent << "PARTIAL-REDUCE ";
3556 else
3557 O << Indent << "REDUCE ";
3559 O << " = ";
3561 O << " +";
3562 printFlags(O);
3563 O << " reduce.";
3565 O << " (";
// A conditional reduction prints one more comma-separated item.
3567 if (isConditional()) {
3568 O << ", ";
3570 }
3571 O << ")";
3572}
3573
3575 VPSlotTracker &SlotTracker) const {
// NOTE(review): this listing omits the first signature line and the
// statements that print operands (the embedded numbering skips 3574, 3577,
// 3579, 3583-3584, 3586, 3588 and 3591); presumably
// VPReductionEVLRecipe::printRecipe -- confirm.
// Visible behaviour: prints "REDUCE ", the flags and a "vp.reduce.<...> (...)"
// form with at least two comma-separated items, plus one more when the
// reduction is conditional.
3576 O << Indent << "REDUCE ";
3578 O << " = ";
3580 O << " +";
3581 printFlags(O);
3582 O << " vp.reduce."
3585 << " (";
3587 O << ", ";
3589 if (isConditional()) {
3590 O << ", ";
3592 }
3593 O << ")";
3594}
3595
3596#endif
3597
// NOTE(review): the line that opens this definition is missing from this
// listing; presumably VPReplicateRecipe::execute -- confirm.
// Visible behaviour: clones the underlying instruction, rewires the clone's
// operands to the scalar values held in State, inserts it through
// State.Builder and records it as this recipe's scalar value.
3599 assert(IsSingleScalar &&
3600 "VPReplicateRecipes must be unrolled before ::execute");
3601 auto *Instr = getUnderlyingInstr();
3602 Instruction *Cloned = Instr->clone();
3603 Type *ResultTy = getScalarType();
3604 if (!ResultTy->isVoidTy()) {
3605 Cloned->setName(Instr->getName() + ".cloned");
3606 // The operands of the replicate recipe may have been narrowed, resulting in
3607 // a narrower result type. Update the type of the cloned instruction to the
3608 // correct type.
3609 if (ResultTy != Cloned->getType())
3610 Cloned->mutateType(ResultTy);
3611 }
3612
// Copy this recipe's flags and metadata onto the clone.
3613 applyFlags(*Cloned);
3614 applyMetadata(*Cloned);
3615
// Recipes that carry a predicate are expected to clone a compare instruction.
3616 if (hasPredicate())
3617 cast<CmpInst>(Cloned)->setPredicate(getPredicate());
3618
3619 // Replace the operands of the cloned instructions with their scalar
3620 // equivalents in the new loop.
3621 for (const auto &[Idx, V] : enumerate(operands()))
3622 Cloned->setOperand(Idx, State.get(V, true));
3623
3624 // Place the cloned scalar in the new loop.
3625 State.Builder.Insert(Cloned);
3626
3627 State.set(this, Cloned, true);
3628
3629 // If we just cloned a new assumption, add it to the assumption cache.
3630 if (auto *II = dyn_cast<AssumeInst>(Cloned))
3631 State.AC->registerAssumption(II);
3632}
3633
3634/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
3635/// which the legacy cost model computes a SCEV expression when computing the
3636/// address cost. Computing SCEVs for VPValues is incomplete and returns
3637/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In
3638/// those cases we fall back to the legacy cost model. Otherwise return nullptr.
// NOTE(review): one parameter line (numbered 3640 in this listing's embedded
// numbering) is missing below; the body reads it as PSE.
3639static const SCEV *getAddressAccessSCEV(const VPValue *Ptr,
3641 const Loop *L) {
3642 const SCEV *Addr = vputils::getSCEVExprForVPValue(Ptr, PSE, L);
// Hand a could-not-compute result back unchanged so the caller can tell it
// apart from the nullptr returned below.
3643 if (isa<SCEVCouldNotCompute>(Addr))
3644 return Addr;
3645
3646 return vputils::isAddressSCEVForCost(Addr, *PSE.getSE(), L) ? Addr : nullptr;
3647}
3648
3650 VPCostContext &Ctx) const {
// NOTE(review): this listing omits the first signature line and a number of
// statements of this definition (gaps in the embedded numbering, e.g. 3651,
// 3657, 3662, 3673, 3675, 3695, 3703, 3743, 3750, 3784-3786, 3825);
// presumably VPReplicateRecipe::computeCost -- confirm against the complete
// file. UI is defined on one of the missing lines.
//
// Visible behaviour: dispatches on the opcode of the underlying instruction.
// Cases that `break` out of the switch end at the legacy-cost-model fallback
// on the last line.
3652 // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
3653 // transform, avoid computing their cost multiple times for now.
3654 Ctx.SkipCostComputation.insert(UI);
3655
3656 if (VF.isScalable() && !isSingleScalar())
3658
3659 switch (UI->getOpcode()) {
3660 case Instruction::Alloca:
3661 if (VF.isScalable())
3663 return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul,
3664 this->getScalarType(), Ctx.CostKind);
3665 case Instruction::GetElementPtr:
3666 // We mark this instruction as zero-cost because the cost of GEPs in
3667 // vectorized code depends on whether the corresponding memory instruction
3668 // is scalarized or not. Therefore, we handle GEPs with the memory
3669 // instruction cost.
3670 return 0;
3671 case Instruction::Call: {
3672 auto *CalledFn =
3674 Type *ResultTy = this->getScalarType();
3676 return computeCallCost(CalledFn, ResultTy, ArgOps, isSingleScalar(), VF,
3677 Ctx);
3678 }
// For the opcodes below, a per-instance cost (computed on a line missing from
// this listing) is multiplied by VF unless the recipe is a single scalar.
3679 case Instruction::Add:
3680 case Instruction::Sub:
3681 case Instruction::FAdd:
3682 case Instruction::FSub:
3683 case Instruction::Mul:
3684 case Instruction::FMul:
3685 case Instruction::FDiv:
3686 case Instruction::FRem:
3687 case Instruction::Shl:
3688 case Instruction::LShr:
3689 case Instruction::AShr:
3690 case Instruction::And:
3691 case Instruction::Or:
3692 case Instruction::Xor:
3693 case Instruction::ICmp:
3694 case Instruction::FCmp:
3696 Ctx) *
3697 (isSingleScalar() ? 1 : VF.getFixedValue());
3698 case Instruction::SDiv:
3699 case Instruction::UDiv:
3700 case Instruction::SRem:
3701 case Instruction::URem: {
3702 InstructionCost ScalarCost =
3704 if (isSingleScalar())
3705 return ScalarCost;
3706
3707 // If any of the operands is from a different replicate region and has its
3708 // cost skipped, it may have been forced to scalar. Fall back to legacy cost
3709 // model to avoid cost mis-match.
3710 if (any_of(operands(), [&Ctx, VF](VPValue *Op) {
3711 auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Op);
3712 if (!PredR)
3713 return false;
3714 return Ctx.skipCostComputation(
3716 PredR->getOperand(0)->getUnderlyingValue()),
3717 VF.isVector());
3718 }))
3719 break;
3720
// VF scalar copies plus the scalarization overhead for the result type and the
// operands.
3721 ScalarCost = ScalarCost * VF.getFixedValue() +
3722 Ctx.getScalarizationOverhead(this->getScalarType(),
3723 to_vector(operands()), VF);
3724 // If the recipe is not predicated (i.e. not in a replicate region), return
3725 // the scalar cost. Otherwise handle predicated cost.
3726 if (!getRegion()->isReplicator())
3727 return ScalarCost;
3728
3729 // Account for the phi nodes that we will create.
3730 ScalarCost += VF.getFixedValue() *
3731 Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
3732 // Scale the cost by the probability of executing the predicated blocks.
3733 // This assumes the predicated block for each vector lane is equally
3734 // likely.
3735 ScalarCost /= Ctx.getPredBlockCostDivisor(UI->getParent());
3736 return ScalarCost;
3737 }
3738 case Instruction::Load:
3739 case Instruction::Store: {
// The address is operand 0 for a load and operand 1 for a store; a store's
// value operand is operand 0.
3740 bool IsLoad = UI->getOpcode() == Instruction::Load;
3741 const VPValue *PtrOp = getOperand(!IsLoad);
3742 const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.PSE, Ctx.L);
// NOTE(review): the condition guarding this break is missing from this listing.
3744 break;
3745
3746 Type *ValTy = (IsLoad ? this : getOperand(0))->getScalarType();
3747 Type *ScalarPtrTy = PtrOp->getScalarType();
3748 const Align Alignment = getLoadStoreAlignment(UI);
3749 unsigned AS = cast<PointerType>(ScalarPtrTy)->getAddressSpace();
3751 bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
3752 bool UsedByLoadStoreAddress =
3753 !PreferVectorizedAddressing && vputils::isUsedByLoadStoreAddress(this);
3754 InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
3755 UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo,
3756 UsedByLoadStoreAddress ? UI : nullptr);
3757
// Cost of one scalar access: the memory operation plus its address
// computation.
3758 Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
3759 InstructionCost ScalarCost =
3760 ScalarMemOpCost +
3761 Ctx.TTI.getAddressComputationCost(
3762 PtrTy, UsedByLoadStoreAddress ? nullptr : Ctx.PSE.getSE(), PtrSCEV,
3763 Ctx.CostKind);
3764 if (isSingleScalar())
3765 return ScalarCost;
3766
3767 SmallVector<const VPValue *> OpsToScalarize;
3768 Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
3769 // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
3770 // don't assign scalarization overhead in general, if the target prefers
3771 // vectorized addressing or the loaded value is used as part of an address
3772 // of another load or store.
3773 if (!UsedByLoadStoreAddress) {
3774 bool EfficientVectorLoadStore =
3775 Ctx.TTI.supportsEfficientVectorElementLoadStore();
3776 if (!(IsLoad && !PreferVectorizedAddressing) &&
3777 !(!IsLoad && EfficientVectorLoadStore))
3778 append_range(OpsToScalarize, operands());
3779
3780 if (!EfficientVectorLoadStore)
3781 ResultTy = this->getScalarType();
3782 }
3783
3787 (ScalarCost * VF.getFixedValue()) +
3788 Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, VIC, true);
3789
// Inside a replicate region the cost is scaled by the predicated-block
// divisor, then a conditional branch and the extraction of the mask lanes are
// added.
3790 const VPRegionBlock *ParentRegion = getRegion();
3791 if (ParentRegion && ParentRegion->isReplicator()) {
3792 if (!PtrSCEV)
3793 break;
3794 Cost /= Ctx.getPredBlockCostDivisor(UI->getParent());
3795 Cost += Ctx.TTI.getCFInstrCost(Instruction::CondBr, Ctx.CostKind);
3796
3797 auto *VecI1Ty = VectorType::get(
3798 IntegerType::getInt1Ty(Ctx.L->getHeader()->getContext()), VF);
3799 Cost += Ctx.TTI.getScalarizationOverhead(
3800 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
3801 /*Insert=*/false, /*Extract=*/true, Ctx.CostKind);
3802
3803 if (Ctx.useEmulatedMaskMemRefHack(this, VF)) {
3804 // Artificially setting to a high enough value to practically disable
3805 // vectorization with such operations.
3806 return 3000000;
3807 }
3808 }
3809 return Cost;
3810 }
// Casts and selects: same scaling as the simple arithmetic opcodes above.
3811 case Instruction::SExt:
3812 case Instruction::ZExt:
3813 case Instruction::FPToUI:
3814 case Instruction::FPToSI:
3815 case Instruction::FPExt:
3816 case Instruction::PtrToInt:
3817 case Instruction::PtrToAddr:
3818 case Instruction::IntToPtr:
3819 case Instruction::SIToFP:
3820 case Instruction::UIToFP:
3821 case Instruction::Trunc:
3822 case Instruction::FPTrunc:
3823 case Instruction::Select:
3824 case Instruction::AddrSpaceCast: {
3826 Ctx) *
3827 (isSingleScalar() ? 1 : VF.getFixedValue());
3828 }
3829 case Instruction::ExtractValue:
3830 case Instruction::InsertValue:
3831 return Ctx.TTI.getInsertExtractValueCost(getOpcode(), Ctx.CostKind);
3832 }
3833
// Reached by unhandled opcodes and by every `break` above.
3834 return Ctx.getLegacyCost(UI, VF);
3835}
3836
3838 Function *CalledFn, Type *ResultTy, ArrayRef<const VPValue *> ArgOps,
3839 bool IsSingleScalar, ElementCount VF, VPCostContext &Ctx) {
// NOTE(review): this listing omits the first signature line and a few
// statements (the embedded numbering skips 3837, 3840, 3846 and 3865);
// presumably VPReplicateRecipe::computeCallCost -- confirm.
//
// Visible behaviour: costs a call to CalledFn with argument values ArgOps and
// result type ResultTy, either once (IsSingleScalar) or replicated VF times.
// Tys, declared on a missing line, is built from the scalar types of ArgOps.
3841 ArgOps, [&](const VPValue *Op) { return Op->getScalarType(); });
3842
3843 Intrinsic::ID IntrinID = CalledFn->getIntrinsicID();
// Cost of the call as an intrinsic; the result for non-intrinsic callees is on
// a line missing from this listing.
3844 auto GetIntrinsicCost = [&] {
3845 if (!IntrinID)
3847 return Ctx.TTI.getIntrinsicInstrCost(
3848 IntrinsicCostAttributes(IntrinID, ResultTy, Tys), Ctx.CostKind);
3849 };
3850
3851 if (IntrinID && VPCostContext::isFreeScalarIntrinsic(IntrinID)) {
3852 assert(GetIntrinsicCost() == 0 && "scalarizing intrinsic should be free");
3853 return 0;
3854 }
3855
3856 InstructionCost ScalarCallCost =
3857 Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
// A single scalar call takes the cheaper of the call cost and the intrinsic
// cost.
3858 if (IsSingleScalar) {
3859 ScalarCallCost = std::min(ScalarCallCost, GetIntrinsicCost());
3860 return ScalarCallCost;
3861 }
3862
3863 // Scalarization overhead is undefined for scalable VFs.
3864 if (VF.isScalable())
3866
// Replicated call: VF scalar calls plus the scalarization overhead for the
// result type and the arguments.
3867 return ScalarCallCost * VF.getFixedValue() +
3868 Ctx.getScalarizationOverhead(ResultTy, ArgOps, VF);
3869}
3870
3871#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3873 VPSlotTracker &SlotTracker) const {
// NOTE(review): this listing omits the first signature line and some printing
// statements (the embedded numbering skips 3872, 3877, 3884, 3890 and 3892);
// presumably VPReplicateRecipe::printRecipe -- confirm.
// Visible behaviour: prints "CLONE " for single-scalar recipes and
// "REPLICATE " otherwise, then the call or instruction being replicated.
3874 O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");
3875
// Only recipes with a non-void result print a "<result> = " prefix.
3876 if (!getScalarType()->isVoidTy()) {
3878 O << " = ";
3879 }
3880 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
3881 O << "call";
3882 printFlags(O);
3883 O << "@" << CB->getCalledFunction()->getName() << "(";
3885 O, [&O, &SlotTracker](VPValue *Op) {
3886 Op->printAsOperand(O, SlotTracker);
3887 });
3888 O << ")";
3889 } else {
3891 printFlags(O);
3893 }
3894
3895 // Find if the recipe is used by a widened recipe via an intervening
3896 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
3897 if (any_of(users(), [](const VPUser *U) {
3898 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
3899 return !vputils::onlyScalarValuesUsed(PredR);
3900 return false;
3901 }))
3902 O << " (S->V)";
3903}
3904#endif
3905
// NOTE(review): the line that opens this definition is missing from this
// listing; presumably VPBranchOnMaskRecipe::execute -- confirm.
// Must never run: per the message, the recipe is expected to be gone once its
// replicate region has been dissolved.
3907 llvm_unreachable("recipe must be removed when dissolving replicate region");
3908}
3909
3911 VPCostContext &Ctx) const {
// NOTE(review): the first signature line is missing from this listing;
// presumably VPBranchOnMaskRecipe::computeCost -- confirm.
// Always reports a cost of zero, for the reason given below.
3912 // The legacy cost model doesn't assign costs to branches for individual
3913 // replicate regions. Match the current behavior in the VPlan cost model for
3914 // now.
3915 return 0;
3916}
3917
// NOTE(review): the line that opens this definition is missing from this
// listing; presumably VPPredInstPHIRecipe::execute -- confirm.
// Must never run: per the message, the recipe is expected to be gone once its
// replicate region has been dissolved.
3919 llvm_unreachable("recipe must be removed when dissolving replicate region");
3920}
3921
3922#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3924 VPSlotTracker &SlotTracker) const {
// NOTE(review): the first signature line and the operand-printing statements
// (3923, 3926 and 3928 in the embedded numbering) are missing from this
// listing; presumably VPPredInstPHIRecipe::printRecipe -- confirm.
// Visible behaviour: prints "PHI-PREDICATED-INSTRUCTION " followed by " = ".
3925 O << Indent << "PHI-PREDICATED-INSTRUCTION ";
3927 O << " = ";
3929}
3930#endif
3931
3933 VPCostContext &Ctx) const {
// NOTE(review): this listing omits the first signature line and several
// statements (gaps in the embedded numbering, e.g. 3932, 3935, 3952-3953,
// 3964, 3974, 3979, 3987); presumably VPWidenMemoryRecipe::computeCost --
// confirm. IsLoad and Cost are defined on missing lines.
//
// Visible behaviour: non-consecutive accesses are costed as a gather/scatter
// intrinsic plus an address computation; consecutive accesses are costed as a
// masked load/store intrinsic when IsMasked is set and as a plain memory
// operation otherwise.
3934 const VPRecipeBase *R = getAsRecipe();
// A load's data type is the recipe's own scalar type; a store's is the scalar
// type of operand 1.
3936 Type *ScalarTy = IsLoad ? cast<VPSingleDefRecipe>(R)->getScalarType()
3937 : R->getOperand(1)->getScalarType();
3938 Type *Ty = toVectorTy(ScalarTy, VF);
3939 unsigned AS =
3940 cast<PointerType>(getAddr()->getScalarType())->getAddressSpace();
3941 unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
3942
3943 if (!Consecutive) {
3944 // TODO: Using the original IR may not be accurate.
3945 // Currently, ARM will use the underlying IR to calculate gather/scatter
3946 // instruction cost.
// Assertion-only helper: true when the mask matches a reverse pattern.
3947 [[maybe_unused]] auto IsReverseMask = [this, R]() {
3948 VPValue *Mask = getMask();
3949 if (!Mask)
3950 return false;
3951
3954
3955 return match(Mask, m_Reverse(m_VPValue()));
3956 };
3957 assert(!IsReverseMask() &&
3958 "Inconsecutive memory access should not have reverse order");
3959 Type *PtrTy = getAddr()->getScalarType();
3960 const Value *Ptr = getAddr()->getUnderlyingValue();
3961
3962 // If the address value is uniform across all lanes, then the address can be
3963 // calculated with scalar type and broadcast.
3965 PtrTy = toVectorTy(PtrTy, VF);
3966
// Gather/scatter intrinsic matching the concrete recipe class.
3967 unsigned IID = isa<VPWidenLoadRecipe>(R) ? Intrinsic::masked_gather
3968 : isa<VPWidenStoreRecipe>(R) ? Intrinsic::masked_scatter
3969 : isa<VPWidenLoadEVLRecipe>(R) ? Intrinsic::vp_gather
3970 : Intrinsic::vp_scatter;
3971 return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
3972 Ctx.CostKind) +
3973 Ctx.TTI.getMemIntrinsicInstrCost(
3975 &Ingredient),
3976 Ctx.CostKind);
3977 }
3978
3980 if (IsMasked) {
3981 unsigned IID = isa<VPWidenLoadRecipe>(R) ? Intrinsic::masked_load
3982 : Intrinsic::masked_store;
3983 Cost += Ctx.TTI.getMemIntrinsicInstrCost(
3984 MemIntrinsicCostAttributes(IID, Ty, Alignment, AS), Ctx.CostKind);
3985 } else {
3986 TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
3988 : R->getOperand(1));
3989 Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
3990 OpInfo, &Ingredient);
3991 }
3992 return Cost;
3993}
3994
// NOTE(review): the line that opens this definition and one statement (3995
// and 4017 in the embedded numbering) are missing from this listing;
// presumably VPWidenLoadRecipe::execute -- confirm.
// Visible behaviour: emits a masked gather for non-consecutive accesses, a
// masked load when a mask is present, and a plain aligned load otherwise, then
// records the loaded vector as this recipe's value.
3996 Type *ScalarDataTy = getScalarType();
3997 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
3998 bool CreateGather = !isConsecutive();
3999
4000 auto &Builder = State.Builder;
4001 Value *Mask = nullptr;
4002 if (auto *VPMask = getMask())
4003 Mask = State.get(VPMask);
4004
// A gather needs a vector of addresses; the other forms use a scalar address.
4005 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
4006 Value *NewLI;
4007 if (CreateGather) {
4008 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
4009 "wide.masked.gather");
4010 } else if (Mask) {
// Masked-off lanes take a poison pass-through value.
4011 NewLI =
4012 Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
4013 PoisonValue::get(DataTy), "wide.masked.load");
4014 } else {
4015 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
4016 }
4018 State.set(this, NewLI);
4019}
4020
4021#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4023 VPSlotTracker &SlotTracker) const {
// NOTE(review): the first signature line and the operand-printing statements
// (4022, 4025 and 4027 in the embedded numbering) are missing from this
// listing; presumably VPWidenLoadRecipe::printRecipe -- confirm.
// Visible behaviour: prints "WIDEN " followed by " = load ".
4024 O << Indent << "WIDEN ";
4026 O << " = load ";
4028}
4029#endif
4030
// NOTE(review): the line that opens this definition and the argument of
// addParamAttr (4031 and 4055 in the embedded numbering) are missing from this
// listing; presumably VPWidenLoadEVLRecipe::execute -- confirm.
// Visible behaviour: emits a vp.gather for non-consecutive accesses and a
// vp.load otherwise, passing the mask and the explicit vector length, then
// records the call as this recipe's value.
4032 Type *ScalarDataTy = getScalarType();
4033 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
4034 bool CreateGather = !isConsecutive();
4035
4036 auto &Builder = State.Builder;
4037 CallInst *NewLI;
4038 Value *EVL = State.get(getEVL(), VPLane(0));
4039 Value *Addr = State.get(getAddr(), !CreateGather);
// The intrinsics always take a mask operand; without one an all-true splat is
// used.
4040 Value *Mask = nullptr;
4041 if (VPValue *VPMask = getMask())
4042 Mask = State.get(VPMask);
4043 else
4044 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
4045
4046 if (CreateGather) {
4047 NewLI =
4048 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
4049 nullptr, "wide.masked.gather");
4050 } else {
4051 NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
4052 {Addr, Mask, EVL}, nullptr, "vp.op.load");
4053 }
4054 NewLI->addParamAttr(
4056 applyMetadata(*NewLI);
4057 Instruction *Res = NewLI;
4058 State.set(this, Res);
4059}
4060
4062 VPCostContext &Ctx) const {
// NOTE(review): the first signature line is missing from this listing;
// presumably VPWidenLoadEVLRecipe::computeCost -- confirm.
// Non-consecutive or masked accesses use the VPWidenMemoryRecipe cost; the
// remaining case is costed as a vp.load intrinsic.
4063 if (!Consecutive || IsMasked)
4064 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
4065
4066 // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost()
4067 // here because the EVL recipes use EVL to replace the tail mask. But in the
4068 // legacy model, it will always calculate the cost of mask.
4069 // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we
4070 // don't need to compare to the legacy cost model.
4071 Type *Ty = toVectorTy(getScalarType(), VF);
4072 unsigned AS =
4073 cast<PointerType>(getAddr()->getScalarType())->getAddressSpace();
4074 return Ctx.TTI.getMemIntrinsicInstrCost(
4075 MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS),
4076 Ctx.CostKind);
4077}
4078
4079#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4081 VPSlotTracker &SlotTracker) const {
// NOTE(review): the first signature line and the operand-printing statements
// (4080, 4083 and 4085 in the embedded numbering) are missing from this
// listing; presumably VPWidenLoadEVLRecipe::printRecipe -- confirm.
// Visible behaviour: prints "WIDEN " followed by " = vp.load ".
4082 O << Indent << "WIDEN ";
4084 O << " = vp.load ";
4086}
4087#endif
4088
// NOTE(review): the line that opens this definition is missing from this
// listing; presumably VPWidenStoreRecipe::execute -- confirm.
// Visible behaviour: emits a masked scatter for non-consecutive accesses, a
// masked store when a mask is present, and a plain aligned store otherwise,
// then applies this recipe's metadata to the new instruction.
4090 VPValue *StoredVPValue = getStoredValue();
4091 bool CreateScatter = !isConsecutive();
4092
4093 auto &Builder = State.Builder;
4094
4095 Value *Mask = nullptr;
4096 if (auto *VPMask = getMask())
4097 Mask = State.get(VPMask);
4098
4099 Value *StoredVal = State.get(StoredVPValue);
// A scatter needs a vector of addresses; the other forms use a scalar address.
4100 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
4101 Instruction *NewSI = nullptr;
4102 if (CreateScatter)
4103 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
4104 else if (Mask)
4105 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
4106 else
4107 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
4108 applyMetadata(*NewSI);
4109}
4110
4111#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4113 VPSlotTracker &SlotTracker) const {
// NOTE(review): the first signature line and the operand-printing statement
// (4112 and 4115 in the embedded numbering) are missing from this listing;
// presumably VPWidenStoreRecipe::printRecipe -- confirm.
// Visible behaviour: prints "WIDEN store ".
4114 O << Indent << "WIDEN store ";
4116}
4117#endif
4118
// NOTE(review): the line that opens this definition and the argument of
// addParamAttr (4119 and 4145 in the embedded numbering) are missing from this
// listing; presumably VPWidenStoreEVLRecipe::execute -- confirm.
// Visible behaviour: emits a vp.scatter for non-consecutive accesses and a
// vp.store otherwise, passing the mask and the explicit vector length.
4120 VPValue *StoredValue = getStoredValue();
4121 bool CreateScatter = !isConsecutive();
4122
4123 auto &Builder = State.Builder;
4124
4125 CallInst *NewSI = nullptr;
4126 Value *StoredVal = State.get(StoredValue);
4127 Value *EVL = State.get(getEVL(), VPLane(0));
// The intrinsics always take a mask operand; without one an all-true splat is
// used.
4128 Value *Mask = nullptr;
4129 if (VPValue *VPMask = getMask())
4130 Mask = State.get(VPMask);
4131 else
4132 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
4133
4134 Value *Addr = State.get(getAddr(), !CreateScatter);
4135 if (CreateScatter) {
4136 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
4137 Intrinsic::vp_scatter,
4138 {StoredVal, Addr, Mask, EVL});
4139 } else {
4140 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
4141 Intrinsic::vp_store,
4142 {StoredVal, Addr, Mask, EVL});
4143 }
4144 NewSI->addParamAttr(
4146 applyMetadata(*NewSI);
4147}
4148
4150 VPCostContext &Ctx) const {
// NOTE(review): the first signature line is missing from this listing;
// presumably VPWidenStoreEVLRecipe::computeCost -- confirm.
// Non-consecutive or masked accesses use the VPWidenMemoryRecipe cost; the
// remaining case is costed as a vp.store intrinsic.
4151 if (!Consecutive || IsMasked)
4152 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
4153
4154 // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost()
4155 // here because the EVL recipes use EVL to replace the tail mask. But in the
4156 // legacy model, it will always calculate the cost of mask.
4157 // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we
4158 // don't need to compare to the legacy cost model.
4159 Type *Ty = toVectorTy(getStoredValue()->getScalarType(), VF);
4160 unsigned AS =
4161 cast<PointerType>(getAddr()->getScalarType())->getAddressSpace();
4162 return Ctx.TTI.getMemIntrinsicInstrCost(
4163 MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS),
4164 Ctx.CostKind);
4165}
4166
4167#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4169 VPSlotTracker &SlotTracker) const {
// NOTE(review): the first signature line and the operand-printing statement
// (4168 and 4171 in the embedded numbering) are missing from this listing;
// presumably VPWidenStoreEVLRecipe::printRecipe -- confirm.
// Visible behaviour: prints "WIDEN vp.store ".
4170 O << Indent << "WIDEN vp.store ";
4172}
4173#endif
4174
4176 VectorType *DstVTy, const DataLayout &DL) {
// NOTE(review): the first signature line (4175 in the embedded numbering) is
// missing from this listing; the body reads two further parameters, Builder
// and V.
//
// Casts the vector V to DstVTy. Both vector types must have the same element
// count and the same element size. A single bit-or-pointer cast is used when
// the element types allow it, otherwise the cast goes through an integer
// vector of the same element size.
4177 // Verify that V is a vector type with same number of elements as DstVTy.
4178 auto VF = DstVTy->getElementCount();
4179 auto *SrcVecTy = cast<VectorType>(V->getType());
4180 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
4181 Type *SrcElemTy = SrcVecTy->getElementType();
4182 Type *DstElemTy = DstVTy->getElementType();
4183 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
4184 "Vector elements must have same size");
4185
4186 // Do a direct cast if element types are castable.
4187 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
4188 return Builder.CreateBitOrPointerCast(V, DstVTy);
4189 }
4190 // V cannot be directly cast to the desired vector type.
4191 // May happen when V is a floating point vector but DstVTy is a vector of
4192 // pointers or vice-versa. Handle this using a two-step bitcast using an
4193 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
4194 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
4195 "Only one type should be a pointer type");
4196 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
4197 "Only one type should be a floating point type");
// The intermediate integer type is as wide as one source element.
4198 Type *IntTy =
4199 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
4200 auto *VecIntTy = VectorType::get(IntTy, VF);
4201 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
4202 return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
4203}
4204
4205/// Return a vector containing interleaved elements from multiple
4206/// smaller input vectors.
// NOTE(review): the first signature line (4207 in the embedded numbering) is
// missing from this listing; the body reads two further parameters, Builder
// and Vals. All input vectors must have the same type and there must be at
// least two of them.
4208 const Twine &Name) {
4209 unsigned Factor = Vals.size();
4210 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
4211
4212 VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
4213#ifndef NDEBUG
4214 for (Value *Val : Vals)
4215 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
4216#endif
4217
4218 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
4219 // must use intrinsics to interleave.
4220 if (VecTy->isScalableTy()) {
4221 assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
4222 return Builder.CreateVectorInterleave(Vals, Name);
4223 }
4224
4225 // Fixed length. Start by concatenating all vectors into a wide vector.
4226 Value *WideVec = concatenateVectors(Builder, Vals);
4227
4228 // Interleave the elements into the wide vector.
4229 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
4230 return Builder.CreateShuffleVector(
4231 WideVec, createInterleaveMask(NumElts, Factor), Name);
4232}
4233
4234// Try to vectorize the interleave group that \p Instr belongs to.
4235//
4236// E.g. Translate following interleaved load group (factor = 3):
4237// for (i = 0; i < N; i+=3) {
4238// R = Pic[i]; // Member of index 0
4239// G = Pic[i+1]; // Member of index 1
4240// B = Pic[i+2]; // Member of index 2
4241// ... // do something to R, G, B
4242// }
4243// To:
4244// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
4245// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
4246// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
4247// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
4248//
4249// Or translate following interleaved store group (factor = 3):
4250// for (i = 0; i < N; i+=3) {
4251// ... do something to R, G, B
4252// Pic[i] = R; // Member of index 0
4253// Pic[i+1] = G; // Member of index 1
4254// Pic[i+2] = B; // Member of index 2
4255// }
4256// To:
4257// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
4258// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
4259// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
4260// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
4261// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
// NOTE(review): the function signature (listing line 4262) and the declaration
// of `Group` (listing line 4265) are missing from this extract; presumably this
// is VPInterleaveRecipe::execute(VPTransformState &State) -- confirm upstream.
// Emits one wide (optionally masked) load or store for the whole interleave
// group and splits/merges the per-member vectors around it.
4263 assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
4264 "Masking gaps for scalable vectors is not yet supported.");
4266 Instruction *Instr = Group->getInsertPos();
4267
4268 // Prepare for the vector type of the interleaved load/store.
4269 Type *ScalarTy = getLoadStoreType(Instr);
4270 unsigned InterleaveFactor = Group->getFactor();
4271 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
4272
4273 VPValue *BlockInMask = getMask();
4274 VPValue *Addr = getAddr();
4275 Value *ResAddr = State.get(Addr, VPLane(0));
4276
// Builds the mask for the wide access. Scalable VF: the block mask is
// interleaved with itself InterleaveFactor times (a gap mask is rejected by
// assertion). Fixed VF: the block mask is replicated per member and ANDed with
// MaskForGaps when one is given; without a block mask, MaskForGaps is returned
// unchanged.
4277 auto CreateGroupMask = [&BlockInMask, &State,
4278 &InterleaveFactor](Value *MaskForGaps) -> Value * {
4279 if (State.VF.isScalable()) {
4280 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
4281 assert(InterleaveFactor <= 8 &&
4282 "Unsupported deinterleave factor for scalable vectors");
4283 auto *ResBlockInMask = State.get(BlockInMask);
4284 SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
4285 return interleaveVectors(State.Builder, Ops, "interleaved.mask");
4286 }
4287
4288 if (!BlockInMask)
4289 return MaskForGaps;
4290
4291 Value *ResBlockInMask = State.get(BlockInMask);
4292 Value *ShuffledMask = State.Builder.CreateShuffleVector(
4293 ResBlockInMask,
4294 createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
4295 "interleaved.mask");
4296 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
4297 ShuffledMask, MaskForGaps)
4298 : ShuffledMask;
4299 };
4300
4301 const DataLayout &DL = Instr->getDataLayout();
4302 // Vectorize the interleaved load group.
4303 if (isa<LoadInst>(Instr)) {
4304 Value *MaskForGaps = nullptr;
4305 if (needsMaskForGaps()) {
4306 MaskForGaps =
4307 createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
4308 assert(MaskForGaps && "Mask for Gaps is required but it is null");
4309 }
4310
// A masked load (with a poison pass-through) is used as soon as either mask
// exists; otherwise a plain aligned load is emitted.
4311 Instruction *NewLoad;
4312 if (BlockInMask || MaskForGaps) {
4313 Value *GroupMask = CreateGroupMask(MaskForGaps);
4314 Value *PoisonVec = PoisonValue::get(VecTy);
4315 NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
4316 Group->getAlign(), GroupMask,
4317 PoisonVec, "wide.masked.vec");
4318 } else
4319 NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
4320 Group->getAlign(), "wide.vec");
4321 applyMetadata(*NewLoad);
4322 // TODO: Also manage existing metadata using VPIRMetadata.
4323 Group->addMetadata(NewLoad);
4324
// NOTE(review): the declaration of `VPDefs` (listing line 4325) is missing
// from this extract.
4326 if (VecTy->isScalableTy()) {
4327 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4328 // so must use intrinsics to deinterleave.
4329 assert(InterleaveFactor <= 8 &&
4330 "Unsupported deinterleave factor for scalable vectors");
4331 NewLoad = State.Builder.CreateIntrinsic(
4332 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
4333 NewLoad->getType(), NewLoad,
4334 /*FMFSource=*/nullptr, "strided.vec");
4335 }
4336
// Extracts the sub-vector for member `Index`: an extractvalue on the
// deinterleave result for scalable VF, a strided shuffle of the wide load
// otherwise.
4337 auto CreateStridedVector = [&InterleaveFactor, &State,
4338 &NewLoad](unsigned Index) -> Value * {
4339 assert(Index < InterleaveFactor && "Illegal group index");
4340 if (State.VF.isScalable())
4341 return State.Builder.CreateExtractValue(NewLoad, Index);
4342
4343 // For fixed length VF, use shuffle to extract the sub-vectors from the
4344 // wide load.
4345 auto StrideMask =
4346 createStrideMask(Index, InterleaveFactor, State.VF.getFixedValue());
4347 return State.Builder.CreateShuffleVector(NewLoad, StrideMask,
4348 "strided.vec");
4349 };
4350
// I walks every group position; J only advances for existing members, so it
// indexes the values defined by this recipe.
4351 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4352 Instruction *Member = Group->getMember(I);
4353
4354 // Skip the gaps in the group.
4355 if (!Member)
4356 continue;
4357
4358 Value *StridedVec = CreateStridedVector(I);
4359
4360 // If this member has different type, cast the result type.
4361 if (Member->getType() != ScalarTy) {
4362 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
4363 StridedVec =
4364 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
4365 }
4366
4367 if (Group->isReverse())
4368 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
4369
4370 State.set(VPDefs[J], StridedVec);
4371 ++J;
4372 }
4373 return;
4374 }
4375
4376 // The sub vector type for current instruction.
4377 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4378
4379 // Vectorize the interleaved store group.
4380 Value *MaskForGaps =
4381 createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
4382 assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
4383 "Mismatch between NeedsMaskForGaps and MaskForGaps");
4384 ArrayRef<VPValue *> StoredValues = getStoredValues();
4385 // Collect the stored vector from each member.
4386 SmallVector<Value *, 4> StoredVecs;
// StoredIdx only advances for existing members; gap positions contribute a
// poison sub-vector so StoredVecs always ends up with InterleaveFactor entries.
4387 unsigned StoredIdx = 0;
4388 for (unsigned i = 0; i < InterleaveFactor; i++) {
4389 assert((Group->getMember(i) || MaskForGaps) &&
4390 "Fail to get a member from an interleaved store group");
4391 Instruction *Member = Group->getMember(i);
4392
4393 // Skip the gaps in the group.
4394 if (!Member) {
4395 Value *Undef = PoisonValue::get(SubVT);
4396 StoredVecs.push_back(Undef);
4397 continue;
4398 }
4399
4400 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4401 ++StoredIdx;
4402
4403 if (Group->isReverse())
4404 StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
4405
4406 // If this member has different type, cast it to a unified type.
4407
4408 if (StoredVec->getType() != SubVT)
4409 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4410
4411 StoredVecs.push_back(StoredVec);
4412 }
4413
4414 // Interleave all the smaller vectors into one wider vector.
4415 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4416 Instruction *NewStoreInstr;
4417 if (BlockInMask || MaskForGaps) {
4418 Value *GroupMask = CreateGroupMask(MaskForGaps);
4419 NewStoreInstr = State.Builder.CreateMaskedStore(
4420 IVec, ResAddr, Group->getAlign(), GroupMask);
4421 } else
4422 NewStoreInstr =
4423 State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
4424
4425 applyMetadata(*NewStoreInstr);
4426 // TODO: Also manage existing metadata using VPIRMetadata.
4427 Group->addMetadata(NewStoreInstr);
4428}
4429
4430#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug printer for an interleave-group recipe. Emits the header with the
// group factor, the mask when one is present, and then one
// "store ... to index i" or "... = load from index i" line per non-gap member.
// NOTE(review): the signature line and the statements at listing lines 4431,
// 4433, 4435, 4448 and 4452 (presumably the operand-printing calls that use
// OpIdx) are missing from this extract.
4432 VPSlotTracker &SlotTracker) const {
4434 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << ", ";
4436 VPValue *Mask = getMask();
4437 if (Mask) {
4438 O << ", ";
4439 Mask->printAsOperand(O, SlotTracker);
4440 }
4441
// OpIdx advances once per existing member; gaps are skipped.
4442 unsigned OpIdx = 0;
4443 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4444 if (!IG->getMember(i))
4445 continue;
4446 if (getNumStoreOperands() > 0) {
4447 O << "\n" << Indent << " store ";
4449 O << " to index " << i;
4450 } else {
4451 O << "\n" << Indent << " ";
4453 O << " = load from index " << i;
4454 }
4455 ++OpIdx;
4456 }
4457}
4458#endif
4459
// NOTE(review): the function signature (listing line 4460), the head of the
// second assertion (4463) and the declaration of `Group` (4465) are missing
// from this extract; presumably this is VPInterleaveEVLRecipe::execute.
// Emits a vp.load or vp.store for the whole interleave group, using an
// explicit vector length scaled by the interleave factor.
4461 assert(State.VF.isScalable() &&
4462 "Only support scalable VF for EVL tail-folding.");
4464 "Masking gaps for scalable vectors is not yet supported.");
4466 Instruction *Instr = Group->getInsertPos();
4467
4468 // Prepare for the vector type of the interleaved load/store.
4469 Type *ScalarTy = getLoadStoreType(Instr);
4470 unsigned InterleaveFactor = Group->getFactor();
4471 assert(InterleaveFactor <= 8 &&
4472 "Unsupported deinterleave/interleave factor for scalable vectors");
4473 ElementCount WideVF = State.VF * InterleaveFactor;
4474 auto *VecTy = VectorType::get(ScalarTy, WideVF);
4475
4476 VPValue *Addr = getAddr();
4477 Value *ResAddr = State.get(Addr, VPLane(0));
4478 Value *EVL = State.get(getEVL(), VPLane(0));
// The wide access covers InterleaveFactor elements per original lane, so the
// EVL is multiplied by the factor (emitted with both nuw and nsw set).
4479 Value *InterleaveEVL = State.Builder.CreateMul(
4480 EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl",
4481 /* NUW= */ true, /* NSW= */ true);
4482 LLVMContext &Ctx = State.Builder.getContext();
4483
// The VP intrinsics always take a mask: interleave the block mask with itself
// when present, otherwise use an all-true splat of the wide width.
4484 Value *GroupMask = nullptr;
4485 if (VPValue *BlockInMask = getMask()) {
4486 SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask));
4487 GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask");
4488 } else {
4489 GroupMask =
4490 State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
4491 }
4492
4493 // Vectorize the interleaved load group.
4494 if (isa<LoadInst>(Instr)) {
4495 CallInst *NewLoad = State.Builder.CreateIntrinsic(
4496 VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr,
4497 "wide.vp.load");
// The group alignment is attached to the pointer argument (argument 0).
4498 NewLoad->addParamAttr(0,
4499 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4500
4501 applyMetadata(*NewLoad);
4502 // TODO: Also manage existing metadata using VPIRMetadata.
4503 Group->addMetadata(NewLoad);
4504
4505 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4506 // so must use intrinsics to deinterleave.
4507 NewLoad = State.Builder.CreateIntrinsic(
4508 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
4509 NewLoad->getType(), NewLoad,
4510 /*FMFSource=*/nullptr, "strided.vec");
4511
4512 const DataLayout &DL = Instr->getDataLayout();
// I walks every group position; J only advances for existing members and
// selects the recipe's defined value to set.
4513 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4514 Instruction *Member = Group->getMember(I);
4515 // Skip the gaps in the group.
4516 if (!Member)
4517 continue;
4518
4519 Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
4520 // If this member has different type, cast the result type.
4521 if (Member->getType() != ScalarTy) {
4522 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
4523 StridedVec =
4524 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
4525 }
4526
4527 State.set(getVPValue(J), StridedVec);
4528 ++J;
4529 }
4530 return;
4531 } // End for interleaved load.
4532
4533 // The sub vector type for current instruction.
4534 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4535 // Vectorize the interleaved store group.
4536 ArrayRef<VPValue *> StoredValues = getStoredValues();
4537 // Collect the stored vector from each member.
4538 SmallVector<Value *, 4> StoredVecs;
4539 const DataLayout &DL = Instr->getDataLayout();
// Gap positions contribute a poison sub-vector; StoredIdx only advances for
// existing members.
4540 for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
4541 Instruction *Member = Group->getMember(I);
4542 // Skip the gaps in the group.
4543 if (!Member) {
4544 StoredVecs.push_back(PoisonValue::get(SubVT));
4545 continue;
4546 }
4547
4548 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4549 // If this member has different type, cast it to a unified type.
4550 if (StoredVec->getType() != SubVT)
4551 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4552
4553 StoredVecs.push_back(StoredVec);
4554 ++StoredIdx;
4555 }
4556
4557 // Interleave all the smaller vectors into one wider vector.
4558 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4559 CallInst *NewStore =
4560 State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store,
4561 {IVec, ResAddr, GroupMask, InterleaveEVL});
// For vp.store the pointer is argument 1 (argument 0 is the stored value).
4562 NewStore->addParamAttr(1,
4563 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4564
4565 applyMetadata(*NewStore);
4566 // TODO: Also manage existing metadata using VPIRMetadata.
4567 Group->addMetadata(NewStore);
4568}
4569
4570#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug printer for the EVL variant of the interleave-group recipe. Emits the
// header with the group factor, the mask when one is present, and then one
// "vp.store ... to index i" or "... = vp.load from index i" line per non-gap
// member.
// NOTE(review): the signature line and the statements at listing lines 4571,
// 4573, 4575, 4577, 4589 and 4593 (presumably the operand-printing calls) are
// missing from this extract.
4572 VPSlotTracker &SlotTracker) const {
4574 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << ", ";
4576 O << ", ";
4578 if (VPValue *Mask = getMask()) {
4579 O << ", ";
4580 Mask->printAsOperand(O, SlotTracker);
4581 }
4582
// OpIdx advances once per existing member; gaps are skipped.
4583 unsigned OpIdx = 0;
4584 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4585 if (!IG->getMember(i))
4586 continue;
4587 if (getNumStoreOperands() > 0) {
4588 O << "\n" << Indent << " vp.store ";
4590 O << " to index " << i;
4591 } else {
4592 O << "\n" << Indent << " ";
4594 O << " = vp.load from index " << i;
4595 }
4596 ++OpIdx;
4597 }
4598}
4599#endif
4600
// Computes the cost of the whole interleave group for vectorization factor VF:
// the target's interleaved-memory-op cost for the wide access plus, for
// reversed groups, one SK_Reverse shuffle per group member.
// NOTE(review): the first line of the signature (listing line 4601) and the
// declaration of `Indices` (listing line 4624) are missing from this extract.
4602 VPCostContext &Ctx) const {
4603 Instruction *InsertPos = getInsertPos();
4604 // Find the VPValue index of the insert position within the group. Gaps do
// not have a VPValue, so only existing members are counted. The scan is
// bounded by the group factor so it terminates even if the insert position
// is not found among the members.
4605 unsigned InsertPosIdx = 0;
4606 for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
4607 if (auto *Member = IG->getMember(Idx)) {
4608 if (Member == InsertPos)
4609 break;
4610 InsertPosIdx++;
4611 }
// Loads define values, stores only consume them: pick the value that
// corresponds to the insert position from whichever list is populated.
4612 const VPValue *ValV = getNumDefinedValues() > 0
4613 ? getVPValue(InsertPosIdx)
4614 : getStoredValues()[InsertPosIdx];
4615 Type *ValTy = ValV->getScalarType();
4616 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
4617 unsigned AS =
4618 cast<PointerType>(getAddr()->getScalarType())->getAddressSpace();
4619
4620 unsigned InterleaveFactor = IG->getFactor();
4621 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
4622
4623 // Holds the indices of existing members in the interleaved group.
4625 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
4626 if (IG->getMember(IF))
4627 Indices.push_back(IF);
4628
4629 // Calculate the cost of the whole interleaved group.
4630 InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
4631 InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
4632 IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
4633
4634 if (!IG->isReverse())
4635 return Cost;
4636
// Reversed groups additionally pay for one vector reverse per member.
4637 return Cost + IG->getNumMembers() *
4638 Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
4639 VectorTy, VectorTy, {}, Ctx.CostKind,
4640 0);
4641}
4642
// Returns true when only scalar values of this recipe are used and, if
// IsScalable is set, additionally only the first lane is used.
// NOTE(review): the signature line (listing line 4643) is missing from this
// extract; IsScalable is presumably a bool parameter -- confirm upstream.
4644 return vputils::onlyScalarValuesUsed(this) &&
4645 (!IsScalable || vputils::onlyFirstLaneUsed(this));
4646}
4647
4648#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug printer: emits "EMIT <result> = WIDEN-POINTER-INDUCTION " followed by
// comma-separated operands; two further operands are printed when the recipe
// has five operands. The recipe must have exactly 3 or 5 operands (asserted).
// NOTE(review): the first signature line and the operand-printing statements
// (listing lines 4649, 4654, 4656, 4658, 4660, 4663, 4665) are missing from
// this extract.
4650 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4651 assert((getNumOperands() == 3 || getNumOperands() == 5) &&
4652 "unexpected number of operands");
4653 O << Indent << "EMIT ";
4655 O << " = WIDEN-POINTER-INDUCTION ";
4657 O << ", ";
4659 O << ", ";
4661 if (getNumOperands() == 5) {
4662 O << ", ";
4664 O << ", ";
4666 }
4667}
4668
// Debug printer: emits "EMIT <result> = EXPAND SCEV " followed by the printed
// form of *Expr.
// NOTE(review): the first signature line (listing line 4669) and the statement
// at listing line 4672 are missing from this extract.
4670 VPSlotTracker &SlotTracker) const {
4671 O << Indent << "EMIT ";
4673 O << " = EXPAND SCEV " << *Expr;
4674}
4675#endif
4676
4677#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug printer: emits "EMIT <result> = WIDEN-CANONICAL-INDUCTION" followed by
// the recipe's flags.
// NOTE(review): the first signature line (listing line 4678) and the
// statements at listing lines 4681 and 4684 are missing from this extract.
4679 VPSlotTracker &SlotTracker) const {
4680 O << Indent << "EMIT ";
4682 O << " = WIDEN-CANONICAL-INDUCTION";
4683 printFlags(O);
4685}
4686#endif
4687
// Creates the "vector.recur" phi for a first-order recurrence and registers it
// as this recipe's value. Only the incoming value from VectorPH is added here.
// NOTE(review): the signature line (listing line 4688) is missing from this
// extract; presumably VPFirstOrderRecurrencePHIRecipe::execute.
4689 auto &Builder = State.Builder;
4690 // Create a vector from the initial value.
4691 auto *VectorInit = getStartValue()->getLiveInIRValue();
4692
4693 Type *VecTy = State.VF.isScalar()
4694 ? VectorInit->getType()
4695 : VectorType::get(VectorInit->getType(), State.VF);
4696
// VectorPH is the IR block mapped from CFG predecessor 0 of this recipe's
// parent block.
4697 BasicBlock *VectorPH =
4698 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4699 if (State.VF.isVector()) {
// For vector VFs the scalar start value is inserted into the last lane
// (index RuntimeVF - 1) of a poison vector. The insertion is emitted before
// VectorPH's terminator; the guard restores the builder's insert point.
4700 auto *IdxTy = Builder.getInt32Ty();
4701 auto *One = ConstantInt::get(IdxTy, 1);
4702 IRBuilder<>::InsertPointGuard Guard(Builder);
4703 Builder.SetInsertPoint(VectorPH->getTerminator());
4704 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
4705 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4706 VectorInit = Builder.CreateInsertElement(
4707 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
4708 }
4709
4710 // Create a phi node for the new recurrence.
4711 PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
4712 Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
4713 Phi->addIncoming(VectorInit, VectorPH);
4714 State.set(this, Phi);
4715}
4716
// Cost of the recurrence phi: the target's control-flow cost of a PHI for a
// scalar VF, and 0 for every other VF.
// NOTE(review): the first signature lines (listing lines 4717-4718) are
// missing from this extract.
4719 VPCostContext &Ctx) const {
4720 if (VF.isScalar())
4721 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
4722
4723 return 0;
4724}
4725
4726#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug printer: emits "FIRST-ORDER-RECURRENCE-PHI <result> = phi " followed
// by the operands.
// NOTE(review): the first signature line (listing line 4727) and the
// statements at listing lines 4730 and 4732 are missing from this extract.
4728 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4729 O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
4731 O << " = phi ";
4733}
4734#endif
4735
// Creates the "vec.phi" node for a reduction in the vector loop header and
// registers it as this recipe's value. Only the start value is added as an
// incoming value here.
// NOTE(review): the signature line (listing line 4736) is missing from this
// extract; presumably VPReductionPHIRecipe::execute.
4737 // Reductions do not have to start at zero. They can start with
4738 // any loop invariant values.
4739 VPValue *StartVPV = getStartValue();
4740
4741 // In order to support recurrences we need to be able to vectorize Phi nodes.
4742 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4743 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4744 // this value when we vectorize all of the instructions that use the PHI.
4745 BasicBlock *VectorPH =
4746 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
// The phi is scalar when the VF is scalar or the reduction is in-loop; the
// start value is fetched in the matching form so the phi type follows it.
4747 bool ScalarPHI = State.VF.isScalar() || isInLoop();
4748 Value *StartV = State.get(StartVPV, ScalarPHI);
4749 Type *VecTy = StartV->getType();
4750
4751 BasicBlock *HeaderBB = State.CFG.PrevBB;
4752 assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
4753 "recipe must be in the vector loop header");
4754 auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
4755 Phi->insertBefore(HeaderBB->getFirstInsertionPt());
4756 State.set(this, Phi, isInLoop());
4757
4758 Phi->addIncoming(StartV, VectorPH);
4759}
4760
4761#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug printer: emits "WIDEN-REDUCTION-PHI <result> = phi (<recurrence kind>)"
// followed by the flags and, when the VF scale factor is greater than 1, a
// " (VF scaled by 1/N)" suffix.
// NOTE(review): the first signature line (listing line 4762) and the
// statements at listing lines 4766 and 4771 are missing from this extract.
4763 VPSlotTracker &SlotTracker) const {
4764 O << Indent << "WIDEN-REDUCTION-PHI ";
4765
4767 O << " = phi (";
4768 printRecurrenceKind(O, Kind);
4769 O << ")";
4770 printFlags(O);
4772 if (getVFScaleFactor() > 1)
4773 O << " (VF scaled by 1/" << getVFScaleFactor() << ")";
4774}
4775#endif
4776
// Op must be one of this recipe's operands (asserted). The answer does not
// depend on which operand is asked about: it is whether only the first lane of
// this recipe itself is used.
// NOTE(review): the signature line (listing line 4777) is missing from this
// extract, so the owning class and exact function name cannot be confirmed
// here.
4778 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
4779 return vputils::onlyFirstLaneUsed(this);
4780}
4781
// Creates a phi whose type is taken from operand 0 and registers it as this
// recipe's value. No incoming values are added in this function.
// NOTE(review): the signature line (listing line 4782) is missing from this
// extract; presumably VPWidenPHIRecipe::execute.
4783 Value *Op0 = State.get(getOperand(0));
4784 Type *VecTy = Op0->getType();
4785 Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
4786 State.set(this, VecPhi);
4787}
4788
// Cost of the phi: the target's control-flow instruction cost of a PHI,
// independent of the vectorization factor.
// NOTE(review): the first signature line (listing line 4789) is missing from
// this extract.
4790 VPCostContext &Ctx) const {
4791 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
4792}
4793
4794#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug printer: emits "WIDEN-PHI <result> = phi " followed by the operands.
// NOTE(review): the first signature line (listing line 4795) and the
// statements at listing lines 4799 and 4801 are missing from this extract.
4796 VPSlotTracker &SlotTracker) const {
4797 O << Indent << "WIDEN-PHI ";
4798
4800 O << " = phi ";
4802}
4803#endif
4804
// Creates the "active.lane.mask" phi with the type of operand 0, adds that
// operand as the incoming value from VectorPH, and registers the phi as this
// recipe's value. No other incoming value is added in this function.
// NOTE(review): the signature line (listing line 4805) is missing from this
// extract; presumably VPActiveLaneMaskPHIRecipe::execute.
4806 BasicBlock *VectorPH =
4807 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4808 Value *StartMask = State.get(getOperand(0));
4809 PHINode *Phi =
4810 State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
4811 Phi->addIncoming(StartMask, VectorPH);
4812 State.set(this, Phi);
4813}
4814
4815#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug printer: emits "ACTIVE-LANE-MASK-PHI <result> = phi " followed by the
// operands.
// NOTE(review): the first signature line (listing line 4816) and the
// statements at listing lines 4820 and 4822 are missing from this extract.
4817 VPSlotTracker &SlotTracker) const {
4818 O << Indent << "ACTIVE-LANE-MASK-PHI ";
4819
4821 O << " = phi ";
4823}
4824#endif
4825
4826#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug printer: emits "CURRENT-ITERATION-PHI <result> = phi " followed by the
// operands.
// NOTE(review): the first signature line (listing line 4827) and the
// statements at listing lines 4831 and 4833 are missing from this extract, so
// the owning class cannot be confirmed here.
4828 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4829 O << Indent << "CURRENT-ITERATION-PHI ";
4830
4832 O << " = phi ";
4834}
4835#endif
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static MCDisassembler::DecodeStatus addOperand(MCInst &Inst, const MCOperand &Opnd)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static void replaceAllUsesWith(Value *Old, Value *New, SmallPtrSet< BasicBlock *, 32 > &FreshBBs, bool IsHuge)
Replace all old uses with new ones, and push the updated BBs into FreshBBs.
Hexagon Common GEP
Value * getPointer(Value *Ptr)
iv users
Definition IVUsers.cpp:48
static constexpr Value * getValue(Ty &ValueOrUse)
static Value * getOpcode(Value &V, Type &Ty, InstrumentationConfig &IConf, InstrumentorIRBuilderTy &IIRB)
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file provides a LoopVectorizationPlanner class.
static const SCEV * getAddressAccessSCEV(Value *Ptr, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets the address access SCEV for Ptr, if it should be used for cost modeling according to isAddressSC...
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
static bool isOrdered(const Instruction *I)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This file defines less commonly used SmallVector utilities.
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:119
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file contains the declarations of different VPlan-related auxiliary helpers.
static Value * interleaveVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vals, const Twine &Name)
Return a vector containing interleaved elements from multiple smaller input vectors.
static Value * createBitOrPointerCast(IRBuilderBase &Builder, Value *V, VectorType *DstVTy, const DataLayout &DL)
static Instruction::BinaryOps getSubRecurOpcode(RecurKind Kind)
SmallVector< Value *, 2 > VectorParts
static void printRecurrenceKind(raw_ostream &OS, const RecurKind &Kind)
static unsigned getCalledFnOperandIndex(ArrayRef< VPValue * > Operands)
For call VPInstruction operands, return the operand index of the called function.
This file contains the declarations of the Vectorization Plan base classes:
void printAsOperand(OutputBuffer &OB, Prec P=Prec::Default, bool StrictlyWorse=false) const
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1157
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:763
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
static LLVM_ABI StringRef getPredicateName(Predicate P)
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getUnknown()
Definition DebugLoc.h:151
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
LLVM_ABI void print(raw_ostream &O) const
Print fast-math flags to O.
Definition Operator.cpp:283
void setAllowContract(bool B=true)
Definition FMF.h:90
bool noSignedZeros() const
Definition FMF.h:67
bool noInfs() const
Definition FMF.h:66
void setAllowReciprocal(bool B=true)
Definition FMF.h:87
bool allowReciprocal() const
Definition FMF.h:68
void setNoSignedZeros(bool B=true)
Definition FMF.h:84
bool allowReassoc() const
Flag queries.
Definition FMF.h:64
bool approxFunc() const
Definition FMF.h:70
void setNoNaNs(bool B=true)
Definition FMF.h:78
void setAllowReassoc(bool B=true)
Flag setters.
Definition FMF.h:75
bool noNaNs() const
Definition FMF.h:65
void setApproxFunc(bool B=true)
Definition FMF.h:93
void setNoInfs(bool B=true)
Definition FMF.h:81
bool allowContract() const
Definition FMF.h:69
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
bool willReturn() const
Determine if the function will return.
Definition Function.h:669
Intrinsic::ID getIntrinsicID() const LLVM_READONLY
getIntrinsicID - This method returns the ID number of the specified function, or Intrinsic::not_intri...
Definition Function.h:246
bool doesNotThrow() const
Determine if the function cannot unwind.
Definition Function.h:602
bool doesNotAccessMemory() const
Determine if the function does not access memory.
Definition Function.cpp:863
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:216
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags none()
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2637
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:571
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2691
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2625
LLVM_ABI Value * CreateVectorSpliceRight(Value *V1, Value *V2, Value *Offset, const Twine &Name="")
Create a vector.splice.right intrinsic call, or a shufflevector that produces the same result if the ...
CondBrInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition IRBuilder.h:1238
LLVM_ABI Value * CreateSelectFMF(Value *C, Value *True, Value *False, FMFSource FMFSource, const Twine &Name="", Instruction *MDFrom=nullptr)
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2684
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2703
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:586
Value * CreatePtrAdd(Value *Ptr, Value *Offset, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:2101
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition IRBuilder.h:2286
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:352
LLVM_ABI Value * CreateVectorReverse(Value *V, const Twine &Name="")
Return a vector value that contains the vector V reversed.
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2388
LLVM_ABI CallInst * CreateOrReduce(Value *Src)
Create a vector int OR reduction intrinsic of the source vector.
Value * CreateLogicalAnd(Value *Cond1, Value *Cond2, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.h:1792
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:529
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2518
Value * CreateNot(Value *V, const Twine &Name="")
Definition IRBuilder.h:1876
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2384
Value * CreateCountTrailingZeroElems(Type *ResTy, Value *Mask, bool ZeroIsPoison=true, const Twine &Name="")
Create a call to llvm.experimental_cttz_elts.
Definition IRBuilder.h:1176
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1461
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2130
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1444
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition IRBuilder.h:514
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1753
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2396
Value * CreateLogicalOr(Value *Cond1, Value *Cond2, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.h:1800
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2494
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1614
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1478
LLVM_ABI Value * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *Op, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
@ IK_IntInduction
Integer induction variable. Step = C.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isUnaryOp() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:350
The group of interleaved loads/stores sharing the same stride and close to each other.
uint32_t getFactor() const
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
bool isReverse() const
InstTy * getInsertPos() const
void addMetadata(InstTy *NewInst) const
Add metadata (e.g.
Align getAlign() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Information for memory intrinsic cost model.
Root of the metadata hierarchy.
Definition Metadata.h:64
LLVM_ABI void print(raw_ostream &OS, const Module *M=nullptr, bool IsForDebug=false) const
Print.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static LLVM_ABI bool isSubRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is for a sub operation.
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
This class represents the LLVM 'select' instruction.
This class provides computation of slot numbers for LLVM Assembly writing.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
@ Load
The value being inserted comes from a load (InsertElement only).
@ Store
The extracted value is stored (ExtractElement only).
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
@ TCC_Free
Expected to fold away in lowering.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Reverse
Reverse the order of the vector.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isByteTy() const
True if this is an instance of ByteType.
Definition Type.h:242
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:288
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:282
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:276
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:306
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:313
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
value_op_iterator value_op_end()
Definition User.h:288
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
value_op_iterator value_op_begin()
Definition User.h:285
void execute(VPTransformState &State) override
Generate the active lane mask phi of the vector loop.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4399
RecipeListTy & getRecipeList()
Returns a reference to the list of recipes.
Definition VPlan.h:4452
iterator end()
Definition VPlan.h:4436
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition VPlan.h:4465
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenMemoryRecipe.
VPValue * getIncomingValue(unsigned Idx) const
Return incoming value number Idx.
Definition VPlan.h:3001
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2996
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2992
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:94
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:222
VPlan * getPlan()
Definition VPlan.cpp:211
void printAsOperand(raw_ostream &OS, bool PrintType=false) const
Definition VPlan.h:364
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPBranchOnMaskRecipe.
void execute(VPTransformState &State) override
Generate the extraction of the appropriate bit from the block mask and the conditional branch.
VPlan-based builder utility analogous to IRBuilder.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:561
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:534
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:546
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:556
InductionDescriptor::InductionKind getInductionKind() const
Definition VPlan.h:4220
VPValue * getIndex() const
Definition VPlan.h:4217
VPIRValue * getStartValue() const
Definition VPlan.h:4216
VPValue * getStepValue() const
Definition VPlan.h:4218
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPDerivedIVRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPExpandSCEVRecipe(const SCEV *Expr)
bool isVectorToScalar() const
Returns true if this VPExpressionRecipe produces a single scalar.
void decompose()
Insert the recipes of the expression back into the VPlan, directly before the current recipe.
bool mayHaveSideEffects() const
Returns true if this expression contains recipes that may have side effects.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Compute the cost of this recipe either using a recipe's specialized implementation or using the legac...
bool mayReadOrWriteMemory() const
Returns true if this expression contains recipes that may read from or write to memory.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this header phi recipe.
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2466
void execute(VPTransformState &State) override
Produce a vectorized histogram operation.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPHistogramRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getMask() const
Return the mask operand if one was provided, or a null pointer if all lanes should be executed uncond...
Definition VPlan.h:2187
Class to record and manage LLVM IR flags.
Definition VPlan.h:695
FastMathFlagsTy FMFs
Definition VPlan.h:783
ReductionFlagsTy ReductionFlags
Definition VPlan.h:785
LLVM_ABI_FOR_TEST bool hasRequiredFlagsForOpcode(unsigned Opcode) const
Returns true if Opcode has its required flags set.
LLVM_ABI_FOR_TEST bool flagsValidForOpcode(unsigned Opcode) const
Returns true if the set flags are valid for Opcode.
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
WrapFlagsTy WrapFlags
Definition VPlan.h:777
void printFlags(raw_ostream &O) const
bool hasFastMathFlags() const
Returns true if the recipe has fast-math flags.
Definition VPlan.h:1000
bool isReductionOrdered() const
Definition VPlan.h:1064
TruncFlagsTy TruncFlags
Definition VPlan.h:778
CmpInst::Predicate getPredicate() const
Definition VPlan.h:972
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlagsOrNone() const
ExactFlagsTy ExactFlags
Definition VPlan.h:780
void intersectFlags(const VPIRFlags &Other)
Only keep flags also present in Other.
uint8_t GEPFlagsStorage
Definition VPlan.h:781
GEPNoWrapFlags getGEPNoWrapFlags() const
Definition VPlan.h:990
bool hasPredicate() const
Returns true if the recipe has a comparison predicate.
Definition VPlan.h:995
DisjointFlagsTy DisjointFlags
Definition VPlan.h:779
FCmpFlagsTy FCmpFlags
Definition VPlan.h:784
NonNegFlagsTy NonNegFlags
Definition VPlan.h:782
bool isReductionInLoop() const
Definition VPlan.h:1070
void applyFlags(Instruction &I) const
Apply the IR flags to I.
Definition VPlan.h:929
uint8_t CmpPredStorage
Definition VPlan.h:776
RecurKind getRecurKind() const
Definition VPlan.h:1058
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPIRInstruction.
VPIRInstruction(Instruction &I)
VPIRInstruction::create() should be used to create VPIRInstructions, as subclasses may need to be cre...
Definition VPlan.h:1721
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
VPIRMetadata()=default
void print(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print metadata with node IDs.
void applyMetadata(Instruction &I) const
Add all metadata to I.
Type * getResultType() const
Definition VPlan.h:1589
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the instruction.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1226
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPInstruction.
VPInstruction(unsigned Opcode, ArrayRef< VPValue * > Operands, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", Type *ResultTy=nullptr)
bool doesGeneratePerAllLanes() const
Returns true if this VPInstruction generates scalar values for all lanes.
@ ExtractLastActive
Extracts the last active lane from a set of vectors.
Definition VPlan.h:1328
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1319
@ ExitingIVValue
Compute the exiting value of a wide induction after vectorization, that is the value of the last lane...
Definition VPlan.h:1332
@ WideIVStep
Scale the first operand (vector step) by the second operand (scalar-step).
Definition VPlan.h:1344
@ ResumeForEpilogue
Explicit user for the resume phi of the canonical induction in the main VPlan, used by the epilogue v...
Definition VPlan.h:1322
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1269
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1315
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1264
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed-width vectors each containing a ...
Definition VPlan.h:1261
@ VScale
Returns the value for vscale.
Definition VPlan.h:1348
@ CanonicalIVIncrementForPart
Definition VPlan.h:1245
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1272
bool hasResult() const
Definition VPlan.h:1438
bool opcodeMayReadOrWriteFromMemory() const
Returns true if the underlying opcode may read from or write to memory.
LLVM_DUMP_METHOD void dump() const
Print the VPInstruction to dbgs() (for debugging).
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the VPInstruction to O.
StringRef getName() const
Returns the symbolic name assigned to the VPInstruction.
Definition VPlan.h:1520
unsigned getOpcode() const
Definition VPlan.h:1417
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
void addOperand(VPValue *Op)
Add Op as operand of this VPInstruction.
bool isVectorToScalar() const
Returns true if this VPInstruction produces a scalar value from a vector, e.g.
bool isSingleScalar() const
Returns true if this VPInstruction's operands are single scalars and the result is also a single scal...
unsigned getNumOperandsForOpcode() const
Return the number of operands determined by the opcode of the VPInstruction, excluding mask.
bool isMasked() const
Returns true if the VPInstruction has a mask operand.
Definition VPlan.h:1463
void execute(VPTransformState &State) override
Generate the instruction.
bool usesFirstPartOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first part of operand Op.
bool needsMaskForGaps() const
Return true if the access needs a mask because of the gaps.
Definition VPlan.h:3106
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this recipe.
Instruction * getInsertPos() const
Definition VPlan.h:3110
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:3108
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3100
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:3129
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:3094
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:3203
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:3216
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:3166
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
static VPLane getLastLaneForVF(const ElementCount &VF)
static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset)
static VPLane getFirstLane()
virtual const VPRecipeBase * getAsRecipe() const =0
Return a VPRecipeBase* to the current object.
VPValue * getIncomingValueForBlock(const VPBasicBlock *VPBB) const
Returns the incoming value for VPBB. VPBB must be an incoming block.
virtual unsigned getNumIncoming() const
Returns the number of incoming values, also number of incoming blocks.
Definition VPlan.h:1625
void removeIncomingValueFor(VPBlockBase *IncomingBlock) const
Removes the incoming value for IncomingBlock, which must be a predecessor.
const VPBasicBlock * getIncomingBlock(unsigned Idx) const
Returns the incoming block with index Idx.
Definition VPlan.h:4543
detail::zippy< llvm::detail::zip_first, VPUser::const_operand_range, const_incoming_blocks_range > incoming_values_and_blocks() const
Returns an iterator range over pairs of incoming values and corresponding incoming blocks.
Definition VPlan.h:1650
VPValue * getIncomingValue(unsigned Idx) const
Returns the incoming VPValue with index Idx.
Definition VPlan.h:1610
void printPhiOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the recipe.
void setIncomingValueForBlock(const VPBasicBlock *VPBB, VPValue *V) const
Sets the incoming value for VPBB to V.
void execute(VPTransformState &State) override
Generates phi nodes for live-outs (from a replicate region) as needed to retain SSA form.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:402
bool mayReadFromMemory() const
Returns true if the recipe may read from memory.
bool mayHaveSideEffects() const
Returns true if the recipe may have side-effects.
virtual void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const =0
Each concrete VPRecipe prints itself, without printing common information, like debug info or metadat...
VPRegionBlock * getRegion()
Definition VPlan.h:4744
LLVM_ABI_FOR_TEST void dump() const
Dump the recipe to stderr (for debugging).
Definition VPlan.cpp:117
bool isPhi() const
Returns true for PHI-like recipes.
bool mayWriteToMemory() const
Returns true if the recipe may write to memory.
virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const
Compute the cost of this recipe either using a recipe's specialized implementation or using the legac...
VPBasicBlock * getParent()
Definition VPlan.h:477
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:555
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
bool isSafeToSpeculativelyExecute() const
Return true if we can safely execute this recipe unconditionally even if it is masked originally.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this recipe, taking into account if the cost computation should be skipped and the...
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const
Print the recipe, delegating to printRecipe().
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
unsigned getVPRecipeID() const
Definition VPlan.h:523
void moveAfter(VPRecipeBase *MovePos)
Unlink this recipe from its current VPBasicBlock and insert it into the VPBasicBlock that MovePos liv...
VPRecipeBase(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:467
Type * getScalarType() const
Returns the scalar type of this VPRecipeValue.
Definition VPlanValue.h:337
friend class VPValue
Definition VPlanValue.h:316
void execute(VPTransformState &State) override
Generate the reduction in the loop.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:3375
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2902
bool isInLoop() const
Returns true if the phi is part of an in-loop reduction.
Definition VPlan.h:2926
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool isConditional() const
Return true if the in-loop reduction is conditional.
Definition VPlan.h:3317
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of VPReductionRecipe.
VPValue * getVecOp() const
The VPValue of the vector value to be reduced.
Definition VPlan.h:3328
VPValue * getCondOp() const
The VPValue of the condition for the block.
Definition VPlan.h:3330
RecurKind getRecurrenceKind() const
Return the recurrence kind for the in-loop reduction.
Definition VPlan.h:3313
bool isPartialReduction() const
Returns true if the reduction outputs a vector with a scaled down VF.
Definition VPlan.h:3319
VPValue * getChainOp() const
The VPValue of the scalar Chain being accumulated.
Definition VPlan.h:3326
bool isInLoop() const
Returns true if the reduction is in-loop.
Definition VPlan.h:3321
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the reduction in the loop.
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4609
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4685
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool isSingleScalar() const
Definition VPlan.h:3453
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPReplicateRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
static Type * computeScalarType(const Instruction *I, ArrayRef< VPValue * > Operands)
Compute the scalar result type for a VPReplicateRecipe wrapping I with Operands (excluding any predic...
static InstructionCost computeCallCost(Function *CalledFn, Type *ResultTy, ArrayRef< const VPValue * > ArgOps, bool IsSingleScalar, ElementCount VF, VPCostContext &Ctx)
Return the cost of scalarizing a call to CalledFn with argument operands ArgOps for a given VF.
unsigned getOpcode() const
Definition VPlan.h:3482
VPValue * getStepValue() const
Definition VPlan.h:4288
VPValue * getStartIndex() const
Return the StartIndex, or null if known to be zero, valid only after unrolling.
Definition VPlan.h:4296
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the scalarized versions of the phi node as needed by their users.
VPSingleDefRecipe is a base class for recipes that model a sequence of one or more output IR that def...
Definition VPlan.h:609
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:680
LLVM_ABI_FOR_TEST LLVM_DUMP_METHOD void dump() const
Print this VPSingleDefRecipe to dbgs() (for debugging).
VPSingleDefRecipe(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:611
This class can be used to assign names to VPValues.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:384
void printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the operands to O.
Definition VPlan.cpp:1527
operand_range operands()
Definition VPlanValue.h:457
unsigned getNumOperands() const
Definition VPlanValue.h:424
operand_iterator op_end()
Definition VPlanValue.h:455
operand_iterator op_begin()
Definition VPlanValue.h:453
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:425
void addOperand(VPValue *Operand)
Definition VPlanValue.h:410
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:50
Type * getScalarType() const
Returns the scalar type of this VPValue, dispatching based on the concrete subclass.
Definition VPlan.cpp:149
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:143
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1478
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:130
void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const
Definition VPlan.cpp:1523
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:75
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:208
VPUser * getSingleUser()
Return the single user of this value, or nullptr if there is not exactly one user.
Definition VPlanValue.h:178
VPValue * getVFValue() const
Definition VPlan.h:2281
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getSourceElementType() const
Definition VPlan.h:2278
int64_t getStride() const
Definition VPlan.h:2279
void materializeOffset(unsigned Part=0)
Adds the offset operand to the recipe.
VPValue * getStride() const
Definition VPlan.h:2355
Type * getSourceElementType() const
Definition VPlan.h:2370
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
VPValue * getVFxPart() const
Definition VPlan.h:2357
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
operand_range args()
Definition VPlan.h:2138
Function * getCalledScalarFunction() const
Definition VPlan.h:2134
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCallRecipe.
void execute(VPTransformState &State) override
Produce a widened version of the call instruction.
static InstructionCost computeCallCost(Function *Variant, VPCostContext &Ctx)
Return the cost of widening a call using the vector function Variant.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Produce widened copies of the cast.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCastRecipe.
void execute(VPTransformState &State) override
Generate the gep nodes.
Type * getSourceElementType() const
Definition VPlan.h:2235
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2558
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2561
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2664
TruncInst * getTruncInst()
Returns the first defined value as TruncInst, if it is one or nullptr otherwise.
Definition VPlan.h:2679
bool isCanonical() const
Returns true if the induction is canonical, i.e.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
CallInst * createVectorCall(VPTransformState &State)
Helper function to produce the widened intrinsic call.
Intrinsic::ID getVectorIntrinsicID() const
Return the ID of the intrinsic.
Definition VPlan.h:2023
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
StringRef getIntrinsicName() const
Return the name of the intrinsic as a string.
static InstructionCost computeCallCost(Intrinsic::ID ID, ArrayRef< const VPValue * > Operands, const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx)
Compute the cost of a vector intrinsic with ID and Operands.
LLVM_ABI_FOR_TEST bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the VPUser only uses the first lane of operand Op.
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Produce a widened version of the vector intrinsic.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this vector intrinsic.
static InstructionCost computeMemIntrinsicCost(Intrinsic::ID IID, Type *Ty, bool IsMasked, Align Alignment, VPCostContext &Ctx)
Helper function for computing the cost of vector memory intrinsic.
void execute(VPTransformState &State) override
Produce a widened version of the vector memory intrinsic.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this vector memory intrinsic.
bool IsMasked
Whether the memory access is masked.
Definition VPlan.h:3749
bool isConsecutive() const
Return whether the loaded-from / stored-to addresses are consecutive.
Definition VPlan.h:3774
Instruction & Ingredient
Definition VPlan.h:3740
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const
Return the cost of this VPWidenMemoryRecipe.
bool Consecutive
Whether the accessed addresses are consecutive.
Definition VPlan.h:3746
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3784
Align Alignment
Alignment information for this memory access.
Definition VPlan.h:3743
virtual VPRecipeBase * getAsRecipe()=0
Return a VPRecipeBase* to the current object.
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:3777
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenPHIRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool onlyScalarsGenerated(bool IsScalable)
Returns true if only scalar values will be generated.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
void execute(VPTransformState &State) override
Produce a widened instruction using the opcode and operands of the recipe, processing State....
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4757
const DataLayout & getDataLayout() const
Definition VPlan.h:4962
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4916
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:5064
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:393
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
void mutateType(Type *Ty)
Mutate the type of this Value to be of the specified type.
Definition Value.h:806
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
iterator erase(iterator where)
Definition ilist.h:204
pointer remove(iterator &IT)
Definition ilist.h:188
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor)
Returns the corresponding llvm.vector.deinterleaveN intrinsic for factor N.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI StringRef getBaseName(ID id)
Return the LLVM name for an intrinsic, without encoded types for overloading, such as "llvm....
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
auto m_Cmp()
Matches any compare instruction and ignore it.
bool match(Val *V, const Pattern &P)
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::Or, true > m_c_LogicalOr(const LHS &L, const RHS &R)
Matches L || R with LHS and RHS in either order.
specific_intval< 1 > m_False()
specific_intval< 1 > m_True()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isAddressSCEVForCost(const SCEV *Addr, ScalarEvolution &SE, const Loop *L)
Returns true if Addr is an address SCEV that can be passed to TTI::getAddressComputationCost,...
bool onlyFirstPartUsed(const VPValue *Def)
Returns true if only the first part of Def is used.
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isUsedByLoadStoreAddress(const VPValue *V)
Returns true if V is used as part of the address of another load or store.
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
@ Offset
Definition DWP.cpp:558
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:830
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
InstructionCost Cost
@ Undef
Value of the register doesn't matter.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto map_to_vector(ContainerTy &&C, FuncTy &&F)
Map a range to a SmallVector with element types deduced from the mapping.
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2312
auto cast_or_null(const Y &Val)
Definition Casting.h:714
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI Value * createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, Value *Right)
Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI Constant * createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
ElementCount getVectorizedTypeVF(Type *Ty)
Returns the number of vector elements for a vectorized type.
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Type * toVectorizedTy(Type *Ty, ElementCount EC)
A helper for converting to vectorized types.
cl::opt< unsigned > ForceTargetInstructionCost
LLVM_ABI Type * computeScalarTypeForInstruction(unsigned Opcode, ArrayRef< VPValue * > Operands)
Compute the scalar result type for an IR Opcode given Operands.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
LLVM_ABI bool isVectorIntrinsicWithStructReturnOverloadAtField(Intrinsic::ID ID, int RetIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic that returns a struct is overloaded at the struct elem...
@ Other
Any other memory.
Definition ModRef.h:68
static const MachineInstrBuilder & addOffset(const MachineInstrBuilder &MIB, int Offset)
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FMinimumNum
FP min with llvm.minimumnum semantics.
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ FMinimum
FP min with llvm.minimum semantics.
@ FMaxNum
FP max with llvm.maxnum semantics including NaNs.
@ Mul
Product of integers.
@ FSub
Subtraction of floats.
@ FAddChainWithSubs
A chain of fadds and fsubs.
@ None
Not a recurrence.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FindLast
FindLast reduction with select(cmp(),x,y) where x and y.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMaximum
FP max with llvm.maximum semantics.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ FMinNum
FP min with llvm.minnum semantics including NaNs.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ FMaximumNum
FP max with llvm.maximumnum semantics.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
LLVM_ABI Value * createOrderedReduction(IRBuilderBase &B, RecurKind RdxKind, Value *Src, Value *Start)
Create an ordered reduction intrinsic using the given recurrence kind RdxKind.
ArrayRef< Type * > getContainedTypes(Type *const &Ty)
Returns the types contained in Ty.
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Struct to hold various analysis needed for cost computations.
static bool isFreeScalarIntrinsic(Intrinsic::ID ID)
Returns true if ID is a pseudo intrinsic that is dropped via scalarization rather than widened.
Definition VPlan.cpp:1946
TargetTransformInfo::TargetCostKind CostKind
void execute(VPTransformState &State) override
Generate the phi nodes.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this first-order recurrence phi recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
An overlay for VPIRInstructions wrapping PHI nodes enabling convenient use of cast/dyn_cast/isa and exec...
Definition VPlan.h:1779
PHINode & getIRPhi()
Definition VPlan.h:1792
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void execute(VPTransformState &State) override
Generate the instruction.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost getCostForRecipeWithOpcode(unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const
Compute the cost for this recipe for VF, using Opcode and Ctx.
VPRecipeWithIRFlags(const unsigned char SC, ArrayRef< VPValue * > Operands, const VPIRFlags &Flags, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:1118
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:286
SmallDenseMap< const VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
struct llvm::VPTransformState::CFGState CFG
Value * get(const VPValue *Def, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def if IsScalar is false, otherwise return the gen...
Definition VPlan.cpp:313
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
ElementCount VF
The chosen Vectorization Factor of the loop being vectorized.
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Generate the wide load or gather.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenLoadEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3869
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate a wide load or gather.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition VPlan.h:3971
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Generate the wide store or scatter.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenStoreEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3974
void execute(VPTransformState &State) override
Generate a wide store or scatter.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition VPlan.h:3919