1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "RISCVTargetTransformInfo.h"
10#include "MCTargetDesc/RISCVMatInt.h"
11#include "llvm/ADT/STLExtras.h"
12#include "llvm/Analysis/TargetTransformInfo.h"
13#include "llvm/CodeGen/BasicTTIImpl.h"
14#include "llvm/CodeGen/CostTable.h"
15#include "llvm/CodeGen/TargetLowering.h"
16#include "llvm/IR/Instructions.h"
17#include <cmath>
18#include <optional>
19using namespace llvm;
20
21#define DEBUG_TYPE "riscvtti"
22
24 "riscv-v-register-bit-width-lmul",
26 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
27 "by autovectorized code. Fractional LMULs are not supported."),
29
31 "riscv-v-slp-max-vf",
33 "Overrides result used for getMaximumVF query which is used "
34 "exclusively by SLP vectorizer."),
36
37InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
38 TTI::TargetCostKind CostKind) {
39 assert(Ty->isIntegerTy() &&
40 "getIntImmCost can only estimate cost of materialising integers");
41
42 // We have a Zero register, so 0 is always free.
43 if (Imm == 0)
44 return TTI::TCC_Free;
45
46 // Otherwise, we check how many instructions it will take to materialise.
47 const DataLayout &DL = getDataLayout();
48 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *getST());
49}
50
51// Look for patterns of shift followed by AND that can be turned into a pair of
52// shifts. We won't need to materialize an immediate for the AND so these can
53// be considered free.
54static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
55 uint64_t Mask = Imm.getZExtValue();
56 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
57 if (!BO || !BO->hasOneUse())
58 return false;
59
60 if (BO->getOpcode() != Instruction::Shl)
61 return false;
62
63 if (!isa<ConstantInt>(BO->getOperand(1)))
64 return false;
65
66 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
67 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
68 // is a mask shifted by c2 bits with c3 leading zeros.
69 if (isShiftedMask_64(Mask)) {
70 unsigned Trailing = llvm::countr_zero(Mask);
71 if (ShAmt == Trailing)
72 return true;
73 }
74
75 return false;
76}
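// Illustrative example (added by the editor, not taken from upstream): for
//   %s = shl i64 %x, 11
//   %a = and i64 %s, 0x7FFFF800   ; shifted mask, countr_zero(mask) == 11
// the AND immediate never needs to be materialized, because the pair can be
// lowered roughly as two shifts (the mask has 33 leading zeros here):
//   slli a0, a0, 44
//   srli a0, a0, 33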
77
78InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
79 const APInt &Imm, Type *Ty,
80 TTI::TargetCostKind CostKind,
81 Instruction *Inst) {
82 assert(Ty->isIntegerTy() &&
83 "getIntImmCost can only estimate cost of materialising integers");
84
85 // We have a Zero register, so 0 is always free.
86 if (Imm == 0)
87 return TTI::TCC_Free;
88
89 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
90 // commutative, in others the immediate comes from a specific argument index.
91 bool Takes12BitImm = false;
92 unsigned ImmArgIdx = ~0U;
93
94 switch (Opcode) {
95 case Instruction::GetElementPtr:
96 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
97 // split up large offsets in GEP into better parts than ConstantHoisting
98 // can.
99 return TTI::TCC_Free;
100 case Instruction::And:
101 // zext.h
102 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
103 return TTI::TCC_Free;
104 // zext.w
105 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
106 return TTI::TCC_Free;
107 // bclri
108 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
109 return TTI::TCC_Free;
110 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
111 canUseShiftPair(Inst, Imm))
112 return TTI::TCC_Free;
113 Takes12BitImm = true;
114 break;
115 case Instruction::Add:
116 Takes12BitImm = true;
117 break;
118 case Instruction::Or:
119 case Instruction::Xor:
120 // bseti/binvi
121 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
122 return TTI::TCC_Free;
123 Takes12BitImm = true;
124 break;
125 case Instruction::Mul:
126 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
127 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
128 return TTI::TCC_Free;
129 // One more or less than a power of 2 can use SLLI+ADD/SUB.
130 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
131 return TTI::TCC_Free;
132 // FIXME: There is no MULI instruction.
133 Takes12BitImm = true;
134 break;
135 case Instruction::Sub:
136 case Instruction::Shl:
137 case Instruction::LShr:
138 case Instruction::AShr:
139 Takes12BitImm = true;
140 ImmArgIdx = 1;
141 break;
142 default:
143 break;
144 }
145
146 if (Takes12BitImm) {
147 // Check immediate is the correct argument...
148 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
149 // ... and fits into the 12-bit immediate.
150 if (Imm.getSignificantBits() <= 64 &&
151 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
152 return TTI::TCC_Free;
153 }
154 }
155
156 // Otherwise, use the full materialisation cost.
157 return getIntImmCost(Imm, Ty, CostKind);
158 }
159
160 // By default, prevent hoisting.
161 return TTI::TCC_Free;
162}
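// Worked example (illustrative, not from the source): "add i64 %x, 2047" fits
// the 12-bit simm field of addi, so the immediate is reported as TCC_Free,
// while "add i64 %x, 4096" fails isLegalAddImmediate and falls back to the
// full materialisation cost returned by getIntImmCost above.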
163
164InstructionCost RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID,
165 unsigned Idx,
166 const APInt &Imm, Type *Ty,
167 TTI::TargetCostKind CostKind) {
168 // Prevent hoisting in unknown cases.
169 return TTI::TCC_Free;
170}
171
172TargetTransformInfo::PopcntSupportKind
173RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
174 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
175 return ST->hasStdExtZbb() ? TTI::PSK_FastHardware : TTI::PSK_Software;
176}
177
178bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
179 // Currently, the ExpandReductions pass can't expand scalable-vector
180 // reductions, but we still request expansion as RVV doesn't support certain
181 // reductions and the SelectionDAG can't legalize them either.
182 switch (II->getIntrinsicID()) {
183 default:
184 return false;
185 // These reductions have no equivalent in RVV
186 case Intrinsic::vector_reduce_mul:
187 case Intrinsic::vector_reduce_fmul:
188 return true;
189 }
190}
191
192std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
193 if (ST->hasVInstructions())
194 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
195 return BaseT::getMaxVScale();
196}
197
198std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
199 if (ST->hasVInstructions())
200 if (unsigned MinVLen = ST->getRealMinVLen();
201 MinVLen >= RISCV::RVVBitsPerBlock)
202 return MinVLen / RISCV::RVVBitsPerBlock;
203 return BaseT::getVScaleForTuning();
204}
205
206TypeSize
207RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
208 unsigned LMUL =
209 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
210 switch (K) {
211 case TargetTransformInfo::RGK_Scalar:
212 return TypeSize::getFixed(ST->getXLen());
213 case TargetTransformInfo::RGK_FixedWidthVector:
214 return TypeSize::getFixed(
215 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
216 case TargetTransformInfo::RGK_ScalableVector:
217 return TypeSize::getScalable(
218 (ST->hasVInstructions() &&
219 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
220 ? LMUL * RISCV::RVVBitsPerBlock
221 : 0);
222 }
223
224 llvm_unreachable("Unsupported register kind");
225}
226
227InstructionCost
228RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
229 // Add a cost of address generation + the cost of the load. The address
230 // is expected to be a PC relative offset to a constant pool entry
231 // using auipc/addi.
232 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
233 /*AddressSpace=*/0, CostKind);
234}
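// Illustrative lowering behind the "2 + load" estimate above (assumed typical
// codegen, not taken from the source):
//   auipc   a0, %pcrel_hi(.LCPI0_0)
//   addi    a0, a0, %pcrel_lo(...)
//   vle32.v v8, (a0)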
235
236static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
237 LLVMContext &C) {
238 assert((DataVT.getScalarSizeInBits() != 8 ||
239 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
240 MVT IndexVT = DataVT.changeTypeToInteger();
241 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
242 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
243 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
244}
245
246InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
247 VectorType *Tp, ArrayRef<int> Mask,
248 TTI::TargetCostKind CostKind,
249 int Index, VectorType *SubTp,
250 ArrayRef<const Value *> Args) {
251 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
252
253 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
254
255 // First, handle cases where having a fixed length vector enables us to
256 // give a more accurate cost than falling back to generic scalable codegen.
257 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
258 if (isa<FixedVectorType>(Tp)) {
259 switch (Kind) {
260 default:
261 break;
262 case TTI::SK_PermuteSingleSrc: {
263 if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
264 MVT EltTp = LT.second.getVectorElementType();
265 // If the size of the element is < ELEN then shuffles of interleaves and
266 // deinterleaves of 2 vectors can be lowered into the following
267 // sequences
268 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
269 // Example sequence:
270 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
271 // vwaddu.vv v10, v8, v9
272 // li a0, -1 (ignored)
273 // vwmaccu.vx v10, a0, v9
274 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
275 return 2 * LT.first * TLI->getLMULCost(LT.second);
276
277 if (Mask[0] == 0 || Mask[0] == 1) {
278 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
279 // Example sequence:
280 // vnsrl.wi v10, v8, 0
281 if (equal(DeinterleaveMask, Mask))
282 return LT.first * TLI->getLMULCost(LT.second);
283 }
284 }
285 }
286 // vrgather + cost of generating the mask constant.
287 // We model this for an unknown mask with a single vrgather.
288 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
289 (LT.second.getScalarSizeInBits() != 8 ||
290 LT.second.getVectorNumElements() <= 256)) {
291 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());
292 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
293 return IndexCost + TLI->getVRGatherVVCost(LT.second);
294 }
295 [[fallthrough]];
296 }
297 case TTI::SK_Transpose:
298 case TTI::SK_PermuteTwoSrc: {
299 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
300 // register for the second vrgather. We model this for an unknown
301 // (shuffle) mask.
302 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
303 (LT.second.getScalarSizeInBits() != 8 ||
304 LT.second.getVectorNumElements() <= 256)) {
305 auto &C = Tp->getContext();
306 auto EC = Tp->getElementCount();
307 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
308 VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
309 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
310 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
311 return 2 * IndexCost + 2 * TLI->getVRGatherVVCost(LT.second) + MaskCost;
312 }
313 [[fallthrough]];
314 }
315 case TTI::SK_Select: {
316 // We are going to permute multiple sources and the result will be in
317 // multiple destinations. Provide an accurate cost only for splits where
318 // the element type remains the same.
319 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
320 LT.second.isFixedLengthVector() &&
321 LT.second.getVectorElementType().getSizeInBits() ==
323 LT.second.getVectorNumElements() <
324 cast<FixedVectorType>(Tp)->getNumElements() &&
325 divideCeil(Mask.size(),
326 cast<FixedVectorType>(Tp)->getNumElements()) ==
327 static_cast<unsigned>(*LT.first.getValue())) {
328 unsigned NumRegs = *LT.first.getValue();
329 unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
330 unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
331 auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);
332
333 InstructionCost Cost = 0;
334 for (unsigned I = 0; I < NumRegs; ++I) {
335 bool IsSingleVector = true;
336 SmallVector<int> SubMask(SubVF, PoisonMaskElem);
337 transform(Mask.slice(I * SubVF,
338 I == NumRegs - 1 ? Mask.size() % SubVF : SubVF),
339 SubMask.begin(), [&](int I) {
340 bool SingleSubVector = I / VF == 0;
341 IsSingleVector &= SingleSubVector;
342 return (SingleSubVector ? 0 : 1) * SubVF + I % VF;
343 });
344 Cost += getShuffleCost(IsSingleVector ? TTI::SK_PermuteSingleSrc
345 : TTI::SK_PermuteTwoSrc,
346 SubVecTy, SubMask, CostKind, 0, nullptr);
347 return Cost;
348 }
349 }
350 break;
351 }
352 }
353 };
354
355 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
356 switch (Kind) {
357 default:
358 // Fallthrough to generic handling.
359 // TODO: Most of these cases will return getInvalid in generic code, and
360 // must be implemented here.
361 break;
362 case TTI::SK_ExtractSubvector:
363 // Example sequence:
364 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
365 // vslidedown.vi v8, v9, 2
366 return LT.first * TLI->getVSlideCost(LT.second);
367 case TTI::SK_InsertSubvector:
368 // Example sequence:
369 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
370 // vslideup.vi v8, v9, 2
371 return LT.first * TLI->getVSlideCost(LT.second);
372 case TTI::SK_Select: {
373 // Example sequence:
374 // li a0, 90
375 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
376 // vmv.s.x v0, a0
377 // vmerge.vvm v8, v9, v8, v0
378 return LT.first * 3 * TLI->getLMULCost(LT.second);
379 }
380 case TTI::SK_Broadcast: {
381 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
382 Instruction::InsertElement);
383 if (LT.second.getScalarSizeInBits() == 1) {
384 if (HasScalar) {
385 // Example sequence:
386 // andi a0, a0, 1
387 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
388 // vmv.v.x v8, a0
389 // vmsne.vi v0, v8, 0
390 return LT.first * TLI->getLMULCost(LT.second) * 3;
391 }
392 // Example sequence:
393 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
394 // vmv.v.i v8, 0
395 // vmerge.vim v8, v8, 1, v0
396 // vmv.x.s a0, v8
397 // andi a0, a0, 1
398 // vmv.v.x v8, a0
399 // vmsne.vi v0, v8, 0
400
401 return LT.first * TLI->getLMULCost(LT.second) * 6;
402 }
403
404 if (HasScalar) {
405 // Example sequence:
406 // vmv.v.x v8, a0
407 return LT.first * TLI->getLMULCost(LT.second);
408 }
409
410 // Example sequence:
411 // vrgather.vi v9, v8, 0
412 return LT.first * TLI->getVRGatherVICost(LT.second);
413 }
414 case TTI::SK_Splice:
415 // vslidedown+vslideup.
416 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
417 // of similar code, but I think we expand through memory.
418 return 2 * LT.first * TLI->getVSlideCost(LT.second);
419 case TTI::SK_Reverse: {
420 // TODO: Cases to improve here:
421 // * Illegal vector types
422 // * i64 on RV32
423 // * i1 vector
424 // At low LMUL, most of the cost is producing the vrgather index register.
425 // At high LMUL, the cost of the vrgather itself will dominate.
426 // Example sequence:
427 // csrr a0, vlenb
428 // srli a0, a0, 3
429 // addi a0, a0, -1
430 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
431 // vid.v v9
432 // vrsub.vx v10, v9, a0
433 // vrgather.vv v9, v8, v10
434 InstructionCost LenCost = 3;
435 if (LT.second.isFixedLengthVector())
436 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
437 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
438 InstructionCost GatherCost = 2 + TLI->getVRGatherVVCost(LT.second);
440 // Mask operations additionally require an extend and a truncate.
440 InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
441 return LT.first * (LenCost + GatherCost + ExtendCost);
442 }
443 }
444 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
445}
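// Worked example for the SK_Reverse case above (illustrative): reversing a
// single-register fixed vector such as <8 x i32> gives LenCost = 0 (7 fits the
// 5-bit vrsub.vi immediate), GatherCost = 2 + getVRGatherVVCost, and
// ExtendCost = 0, so the reported cost is LT.first * (2 + vrgather cost).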
446
447InstructionCost
448RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
449 unsigned AddressSpace,
450 TTI::TargetCostKind CostKind) {
451 if (!isLegalMaskedLoadStore(Src, Alignment) ||
452 CostKind != TTI::TCK_RecipThroughput)
453 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
454 CostKind);
455
456 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
457}
458
459InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
460 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
461 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
462 bool UseMaskForCond, bool UseMaskForGaps) {
463 if (isa<ScalableVectorType>(VecTy))
464 return InstructionCost::getInvalid();
465 auto *FVTy = cast<FixedVectorType>(VecTy);
466 InstructionCost MemCost =
467 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
468 unsigned VF = FVTy->getNumElements() / Factor;
469
470 // The interleaved memory access pass will lower interleaved memory ops (i.e.
471 // a load or store followed by a specific shuffle) to vlseg/vsseg
472 // intrinsics. In those cases we can treat it as if it's just one (legal)
473 // memory op.
474 if (!UseMaskForCond && !UseMaskForGaps &&
475 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
476 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
477 // Need to make sure the type hasn't been scalarized.
478 if (LT.second.isFixedLengthVector()) {
479 auto *LegalFVTy = FixedVectorType::get(FVTy->getElementType(),
480 LT.second.getVectorNumElements());
481 // FIXME: We use the memory op cost of the *legalized* type here because
482 // its getMemoryOpCost returns a really expensive cost for types like
483 // <6 x i8>, which show up when doing interleaves of Factor=3 etc.
484 // Should the memory op cost of these be cheaper?
485 if (TLI->isLegalInterleavedAccessType(LegalFVTy, Factor, Alignment,
486 AddressSpace, DL)) {
487 InstructionCost LegalMemCost = getMemoryOpCost(
488 Opcode, LegalFVTy, Alignment, AddressSpace, CostKind);
489 return LT.first + LegalMemCost;
490 }
491 }
492 }
493
494 // An interleaved load will look like this for Factor=3:
495 // %wide.vec = load <12 x i32>, ptr %3, align 4
496 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
497 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
498 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
499 if (Opcode == Instruction::Load) {
500 InstructionCost Cost = MemCost;
501 for (unsigned Index : Indices) {
502 FixedVectorType *SubVecTy =
503 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
504 auto Mask = createStrideMask(Index, Factor, VF);
505 InstructionCost ShuffleCost =
506 getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask,
507 CostKind, 0, nullptr, {});
508 Cost += ShuffleCost;
509 }
510 return Cost;
511 }
512
513 // TODO: Model for NF > 2
514 // We'll need to enhance getShuffleCost to model shuffles that are just
515 // inserts and extracts into subvectors, since they won't have the full cost
516 // of a vrgather.
517 // An interleaved store for 3 vectors of 4 lanes will look like
518 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
519 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
520 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
521 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
522 // store <12 x i32> %interleaved.vec, ptr %10, align 4
523 if (Factor != 2)
524 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
525 Alignment, AddressSpace, CostKind,
526 UseMaskForCond, UseMaskForGaps);
527
528 assert(Opcode == Instruction::Store && "Opcode must be a store");
529 // For an interleaving store of 2 vectors, we perform one large interleaving
530 // shuffle that goes into the wide store
531 auto Mask = createInterleaveMask(VF, Factor);
532 InstructionCost ShuffleCost =
533 getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask,
534 CostKind, 0, nullptr, {});
535 return MemCost + ShuffleCost;
536}
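// Illustrative factor-2 store example (assumed IR, not from the source):
//   %v = shufflevector <4 x i32> %a, <4 x i32> %b,
//        <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
//   store <8 x i32> %v, ptr %p
// is costed as one wide store plus one interleaving shuffle, per the code above.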
537
538InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
539 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
540 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
541 if (CostKind != TTI::TCK_RecipThroughput)
542 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
543 Alignment, CostKind, I);
544
545 if ((Opcode == Instruction::Load &&
546 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
547 (Opcode == Instruction::Store &&
548 !isLegalMaskedScatter(DataTy, Align(Alignment))))
549 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
550 Alignment, CostKind, I);
551
552 // Cost is proportional to the number of memory operations implied. For
553 // scalable vectors, we use an estimate on that number since we don't
554 // know exactly what VL will be.
555 auto &VTy = *cast<VectorType>(DataTy);
556 InstructionCost MemOpCost =
557 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
558 {TTI::OK_AnyValue, TTI::OP_None}, I);
559 unsigned NumLoads = getEstimatedVLFor(&VTy);
560 return NumLoads * MemOpCost;
561}
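// Illustrative example: a gather of <8 x i32> is modelled as 8 scalar i32
// loads, i.e. NumLoads = 8 times the scalar memory-op cost; for scalable
// types NumLoads is instead the VLMAX estimate from getEstimatedVLFor.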
562
563// Currently, these represent both throughput and codesize costs
564// for the respective intrinsics. The costs in this table are simply
565// instruction counts with the following adjustments made:
566// * One vsetvli is considered free.
567static const CostTblEntry VectorIntrinsicCostTable[]{
568 {Intrinsic::floor, MVT::v2f32, 9},
569 {Intrinsic::floor, MVT::v4f32, 9},
570 {Intrinsic::floor, MVT::v8f32, 9},
571 {Intrinsic::floor, MVT::v16f32, 9},
572 {Intrinsic::floor, MVT::nxv1f32, 9},
573 {Intrinsic::floor, MVT::nxv2f32, 9},
574 {Intrinsic::floor, MVT::nxv4f32, 9},
575 {Intrinsic::floor, MVT::nxv8f32, 9},
576 {Intrinsic::floor, MVT::nxv16f32, 9},
577 {Intrinsic::floor, MVT::v2f64, 9},
578 {Intrinsic::floor, MVT::v4f64, 9},
579 {Intrinsic::floor, MVT::v8f64, 9},
580 {Intrinsic::floor, MVT::v16f64, 9},
581 {Intrinsic::floor, MVT::nxv1f64, 9},
582 {Intrinsic::floor, MVT::nxv2f64, 9},
583 {Intrinsic::floor, MVT::nxv4f64, 9},
584 {Intrinsic::floor, MVT::nxv8f64, 9},
585 {Intrinsic::ceil, MVT::v2f32, 9},
586 {Intrinsic::ceil, MVT::v4f32, 9},
587 {Intrinsic::ceil, MVT::v8f32, 9},
588 {Intrinsic::ceil, MVT::v16f32, 9},
589 {Intrinsic::ceil, MVT::nxv1f32, 9},
590 {Intrinsic::ceil, MVT::nxv2f32, 9},
591 {Intrinsic::ceil, MVT::nxv4f32, 9},
592 {Intrinsic::ceil, MVT::nxv8f32, 9},
593 {Intrinsic::ceil, MVT::nxv16f32, 9},
594 {Intrinsic::ceil, MVT::v2f64, 9},
595 {Intrinsic::ceil, MVT::v4f64, 9},
596 {Intrinsic::ceil, MVT::v8f64, 9},
597 {Intrinsic::ceil, MVT::v16f64, 9},
598 {Intrinsic::ceil, MVT::nxv1f64, 9},
599 {Intrinsic::ceil, MVT::nxv2f64, 9},
600 {Intrinsic::ceil, MVT::nxv4f64, 9},
601 {Intrinsic::ceil, MVT::nxv8f64, 9},
602 {Intrinsic::trunc, MVT::v2f32, 7},
603 {Intrinsic::trunc, MVT::v4f32, 7},
604 {Intrinsic::trunc, MVT::v8f32, 7},
605 {Intrinsic::trunc, MVT::v16f32, 7},
606 {Intrinsic::trunc, MVT::nxv1f32, 7},
607 {Intrinsic::trunc, MVT::nxv2f32, 7},
608 {Intrinsic::trunc, MVT::nxv4f32, 7},
609 {Intrinsic::trunc, MVT::nxv8f32, 7},
610 {Intrinsic::trunc, MVT::nxv16f32, 7},
611 {Intrinsic::trunc, MVT::v2f64, 7},
612 {Intrinsic::trunc, MVT::v4f64, 7},
613 {Intrinsic::trunc, MVT::v8f64, 7},
614 {Intrinsic::trunc, MVT::v16f64, 7},
615 {Intrinsic::trunc, MVT::nxv1f64, 7},
616 {Intrinsic::trunc, MVT::nxv2f64, 7},
617 {Intrinsic::trunc, MVT::nxv4f64, 7},
618 {Intrinsic::trunc, MVT::nxv8f64, 7},
619 {Intrinsic::round, MVT::v2f32, 9},
620 {Intrinsic::round, MVT::v4f32, 9},
621 {Intrinsic::round, MVT::v8f32, 9},
622 {Intrinsic::round, MVT::v16f32, 9},
623 {Intrinsic::round, MVT::nxv1f32, 9},
624 {Intrinsic::round, MVT::nxv2f32, 9},
625 {Intrinsic::round, MVT::nxv4f32, 9},
626 {Intrinsic::round, MVT::nxv8f32, 9},
627 {Intrinsic::round, MVT::nxv16f32, 9},
628 {Intrinsic::round, MVT::v2f64, 9},
629 {Intrinsic::round, MVT::v4f64, 9},
630 {Intrinsic::round, MVT::v8f64, 9},
631 {Intrinsic::round, MVT::v16f64, 9},
632 {Intrinsic::round, MVT::nxv1f64, 9},
633 {Intrinsic::round, MVT::nxv2f64, 9},
634 {Intrinsic::round, MVT::nxv4f64, 9},
635 {Intrinsic::round, MVT::nxv8f64, 9},
636 {Intrinsic::roundeven, MVT::v2f32, 9},
637 {Intrinsic::roundeven, MVT::v4f32, 9},
638 {Intrinsic::roundeven, MVT::v8f32, 9},
639 {Intrinsic::roundeven, MVT::v16f32, 9},
640 {Intrinsic::roundeven, MVT::nxv1f32, 9},
641 {Intrinsic::roundeven, MVT::nxv2f32, 9},
642 {Intrinsic::roundeven, MVT::nxv4f32, 9},
643 {Intrinsic::roundeven, MVT::nxv8f32, 9},
644 {Intrinsic::roundeven, MVT::nxv16f32, 9},
645 {Intrinsic::roundeven, MVT::v2f64, 9},
646 {Intrinsic::roundeven, MVT::v4f64, 9},
647 {Intrinsic::roundeven, MVT::v8f64, 9},
648 {Intrinsic::roundeven, MVT::v16f64, 9},
649 {Intrinsic::roundeven, MVT::nxv1f64, 9},
650 {Intrinsic::roundeven, MVT::nxv2f64, 9},
651 {Intrinsic::roundeven, MVT::nxv4f64, 9},
652 {Intrinsic::roundeven, MVT::nxv8f64, 9},
653 {Intrinsic::rint, MVT::v2f32, 7},
654 {Intrinsic::rint, MVT::v4f32, 7},
655 {Intrinsic::rint, MVT::v8f32, 7},
656 {Intrinsic::rint, MVT::v16f32, 7},
657 {Intrinsic::rint, MVT::nxv1f32, 7},
658 {Intrinsic::rint, MVT::nxv2f32, 7},
659 {Intrinsic::rint, MVT::nxv4f32, 7},
660 {Intrinsic::rint, MVT::nxv8f32, 7},
661 {Intrinsic::rint, MVT::nxv16f32, 7},
662 {Intrinsic::rint, MVT::v2f64, 7},
663 {Intrinsic::rint, MVT::v4f64, 7},
664 {Intrinsic::rint, MVT::v8f64, 7},
665 {Intrinsic::rint, MVT::v16f64, 7},
666 {Intrinsic::rint, MVT::nxv1f64, 7},
667 {Intrinsic::rint, MVT::nxv2f64, 7},
668 {Intrinsic::rint, MVT::nxv4f64, 7},
669 {Intrinsic::rint, MVT::nxv8f64, 7},
670 {Intrinsic::lrint, MVT::v2i32, 1},
671 {Intrinsic::lrint, MVT::v4i32, 1},
672 {Intrinsic::lrint, MVT::v8i32, 1},
673 {Intrinsic::lrint, MVT::v16i32, 1},
674 {Intrinsic::lrint, MVT::nxv1i32, 1},
675 {Intrinsic::lrint, MVT::nxv2i32, 1},
676 {Intrinsic::lrint, MVT::nxv4i32, 1},
677 {Intrinsic::lrint, MVT::nxv8i32, 1},
678 {Intrinsic::lrint, MVT::nxv16i32, 1},
679 {Intrinsic::lrint, MVT::v2i64, 1},
680 {Intrinsic::lrint, MVT::v4i64, 1},
681 {Intrinsic::lrint, MVT::v8i64, 1},
682 {Intrinsic::lrint, MVT::v16i64, 1},
683 {Intrinsic::lrint, MVT::nxv1i64, 1},
684 {Intrinsic::lrint, MVT::nxv2i64, 1},
685 {Intrinsic::lrint, MVT::nxv4i64, 1},
686 {Intrinsic::lrint, MVT::nxv8i64, 1},
687 {Intrinsic::llrint, MVT::v2i64, 1},
688 {Intrinsic::llrint, MVT::v4i64, 1},
689 {Intrinsic::llrint, MVT::v8i64, 1},
690 {Intrinsic::llrint, MVT::v16i64, 1},
691 {Intrinsic::llrint, MVT::nxv1i64, 1},
692 {Intrinsic::llrint, MVT::nxv2i64, 1},
693 {Intrinsic::llrint, MVT::nxv4i64, 1},
694 {Intrinsic::llrint, MVT::nxv8i64, 1},
695 {Intrinsic::nearbyint, MVT::v2f32, 9},
696 {Intrinsic::nearbyint, MVT::v4f32, 9},
697 {Intrinsic::nearbyint, MVT::v8f32, 9},
698 {Intrinsic::nearbyint, MVT::v16f32, 9},
699 {Intrinsic::nearbyint, MVT::nxv1f32, 9},
700 {Intrinsic::nearbyint, MVT::nxv2f32, 9},
701 {Intrinsic::nearbyint, MVT::nxv4f32, 9},
702 {Intrinsic::nearbyint, MVT::nxv8f32, 9},
703 {Intrinsic::nearbyint, MVT::nxv16f32, 9},
704 {Intrinsic::nearbyint, MVT::v2f64, 9},
705 {Intrinsic::nearbyint, MVT::v4f64, 9},
706 {Intrinsic::nearbyint, MVT::v8f64, 9},
707 {Intrinsic::nearbyint, MVT::v16f64, 9},
708 {Intrinsic::nearbyint, MVT::nxv1f64, 9},
709 {Intrinsic::nearbyint, MVT::nxv2f64, 9},
710 {Intrinsic::nearbyint, MVT::nxv4f64, 9},
711 {Intrinsic::nearbyint, MVT::nxv8f64, 9},
712 {Intrinsic::bswap, MVT::v2i16, 3},
713 {Intrinsic::bswap, MVT::v4i16, 3},
714 {Intrinsic::bswap, MVT::v8i16, 3},
715 {Intrinsic::bswap, MVT::v16i16, 3},
716 {Intrinsic::bswap, MVT::nxv1i16, 3},
717 {Intrinsic::bswap, MVT::nxv2i16, 3},
718 {Intrinsic::bswap, MVT::nxv4i16, 3},
719 {Intrinsic::bswap, MVT::nxv8i16, 3},
720 {Intrinsic::bswap, MVT::nxv16i16, 3},
721 {Intrinsic::bswap, MVT::v2i32, 12},
722 {Intrinsic::bswap, MVT::v4i32, 12},
723 {Intrinsic::bswap, MVT::v8i32, 12},
724 {Intrinsic::bswap, MVT::v16i32, 12},
725 {Intrinsic::bswap, MVT::nxv1i32, 12},
726 {Intrinsic::bswap, MVT::nxv2i32, 12},
727 {Intrinsic::bswap, MVT::nxv4i32, 12},
728 {Intrinsic::bswap, MVT::nxv8i32, 12},
729 {Intrinsic::bswap, MVT::nxv16i32, 12},
730 {Intrinsic::bswap, MVT::v2i64, 31},
731 {Intrinsic::bswap, MVT::v4i64, 31},
732 {Intrinsic::bswap, MVT::v8i64, 31},
733 {Intrinsic::bswap, MVT::v16i64, 31},
734 {Intrinsic::bswap, MVT::nxv1i64, 31},
735 {Intrinsic::bswap, MVT::nxv2i64, 31},
736 {Intrinsic::bswap, MVT::nxv4i64, 31},
737 {Intrinsic::bswap, MVT::nxv8i64, 31},
738 {Intrinsic::vp_bswap, MVT::v2i16, 3},
739 {Intrinsic::vp_bswap, MVT::v4i16, 3},
740 {Intrinsic::vp_bswap, MVT::v8i16, 3},
741 {Intrinsic::vp_bswap, MVT::v16i16, 3},
742 {Intrinsic::vp_bswap, MVT::nxv1i16, 3},
743 {Intrinsic::vp_bswap, MVT::nxv2i16, 3},
744 {Intrinsic::vp_bswap, MVT::nxv4i16, 3},
745 {Intrinsic::vp_bswap, MVT::nxv8i16, 3},
746 {Intrinsic::vp_bswap, MVT::nxv16i16, 3},
747 {Intrinsic::vp_bswap, MVT::v2i32, 12},
748 {Intrinsic::vp_bswap, MVT::v4i32, 12},
749 {Intrinsic::vp_bswap, MVT::v8i32, 12},
750 {Intrinsic::vp_bswap, MVT::v16i32, 12},
751 {Intrinsic::vp_bswap, MVT::nxv1i32, 12},
752 {Intrinsic::vp_bswap, MVT::nxv2i32, 12},
753 {Intrinsic::vp_bswap, MVT::nxv4i32, 12},
754 {Intrinsic::vp_bswap, MVT::nxv8i32, 12},
755 {Intrinsic::vp_bswap, MVT::nxv16i32, 12},
756 {Intrinsic::vp_bswap, MVT::v2i64, 31},
757 {Intrinsic::vp_bswap, MVT::v4i64, 31},
758 {Intrinsic::vp_bswap, MVT::v8i64, 31},
759 {Intrinsic::vp_bswap, MVT::v16i64, 31},
760 {Intrinsic::vp_bswap, MVT::nxv1i64, 31},
761 {Intrinsic::vp_bswap, MVT::nxv2i64, 31},
762 {Intrinsic::vp_bswap, MVT::nxv4i64, 31},
763 {Intrinsic::vp_bswap, MVT::nxv8i64, 31},
764 {Intrinsic::vp_fshl, MVT::v2i8, 7},
765 {Intrinsic::vp_fshl, MVT::v4i8, 7},
766 {Intrinsic::vp_fshl, MVT::v8i8, 7},
767 {Intrinsic::vp_fshl, MVT::v16i8, 7},
768 {Intrinsic::vp_fshl, MVT::nxv1i8, 7},
769 {Intrinsic::vp_fshl, MVT::nxv2i8, 7},
770 {Intrinsic::vp_fshl, MVT::nxv4i8, 7},
771 {Intrinsic::vp_fshl, MVT::nxv8i8, 7},
772 {Intrinsic::vp_fshl, MVT::nxv16i8, 7},
773 {Intrinsic::vp_fshl, MVT::nxv32i8, 7},
774 {Intrinsic::vp_fshl, MVT::nxv64i8, 7},
775 {Intrinsic::vp_fshl, MVT::v2i16, 7},
776 {Intrinsic::vp_fshl, MVT::v4i16, 7},
777 {Intrinsic::vp_fshl, MVT::v8i16, 7},
778 {Intrinsic::vp_fshl, MVT::v16i16, 7},
779 {Intrinsic::vp_fshl, MVT::nxv1i16, 7},
780 {Intrinsic::vp_fshl, MVT::nxv2i16, 7},
781 {Intrinsic::vp_fshl, MVT::nxv4i16, 7},
782 {Intrinsic::vp_fshl, MVT::nxv8i16, 7},
783 {Intrinsic::vp_fshl, MVT::nxv16i16, 7},
784 {Intrinsic::vp_fshl, MVT::nxv32i16, 7},
785 {Intrinsic::vp_fshl, MVT::v2i32, 7},
786 {Intrinsic::vp_fshl, MVT::v4i32, 7},
787 {Intrinsic::vp_fshl, MVT::v8i32, 7},
788 {Intrinsic::vp_fshl, MVT::v16i32, 7},
789 {Intrinsic::vp_fshl, MVT::nxv1i32, 7},
790 {Intrinsic::vp_fshl, MVT::nxv2i32, 7},
791 {Intrinsic::vp_fshl, MVT::nxv4i32, 7},
792 {Intrinsic::vp_fshl, MVT::nxv8i32, 7},
793 {Intrinsic::vp_fshl, MVT::nxv16i32, 7},
794 {Intrinsic::vp_fshl, MVT::v2i64, 7},
795 {Intrinsic::vp_fshl, MVT::v4i64, 7},
796 {Intrinsic::vp_fshl, MVT::v8i64, 7},
797 {Intrinsic::vp_fshl, MVT::v16i64, 7},
798 {Intrinsic::vp_fshl, MVT::nxv1i64, 7},
799 {Intrinsic::vp_fshl, MVT::nxv2i64, 7},
800 {Intrinsic::vp_fshl, MVT::nxv4i64, 7},
801 {Intrinsic::vp_fshl, MVT::nxv8i64, 7},
802 {Intrinsic::vp_fshr, MVT::v2i8, 7},
803 {Intrinsic::vp_fshr, MVT::v4i8, 7},
804 {Intrinsic::vp_fshr, MVT::v8i8, 7},
805 {Intrinsic::vp_fshr, MVT::v16i8, 7},
806 {Intrinsic::vp_fshr, MVT::nxv1i8, 7},
807 {Intrinsic::vp_fshr, MVT::nxv2i8, 7},
808 {Intrinsic::vp_fshr, MVT::nxv4i8, 7},
809 {Intrinsic::vp_fshr, MVT::nxv8i8, 7},
810 {Intrinsic::vp_fshr, MVT::nxv16i8, 7},
811 {Intrinsic::vp_fshr, MVT::nxv32i8, 7},
812 {Intrinsic::vp_fshr, MVT::nxv64i8, 7},
813 {Intrinsic::vp_fshr, MVT::v2i16, 7},
814 {Intrinsic::vp_fshr, MVT::v4i16, 7},
815 {Intrinsic::vp_fshr, MVT::v8i16, 7},
816 {Intrinsic::vp_fshr, MVT::v16i16, 7},
817 {Intrinsic::vp_fshr, MVT::nxv1i16, 7},
818 {Intrinsic::vp_fshr, MVT::nxv2i16, 7},
819 {Intrinsic::vp_fshr, MVT::nxv4i16, 7},
820 {Intrinsic::vp_fshr, MVT::nxv8i16, 7},
821 {Intrinsic::vp_fshr, MVT::nxv16i16, 7},
822 {Intrinsic::vp_fshr, MVT::nxv32i16, 7},
823 {Intrinsic::vp_fshr, MVT::v2i32, 7},
824 {Intrinsic::vp_fshr, MVT::v4i32, 7},
825 {Intrinsic::vp_fshr, MVT::v8i32, 7},
826 {Intrinsic::vp_fshr, MVT::v16i32, 7},
827 {Intrinsic::vp_fshr, MVT::nxv1i32, 7},
828 {Intrinsic::vp_fshr, MVT::nxv2i32, 7},
829 {Intrinsic::vp_fshr, MVT::nxv4i32, 7},
830 {Intrinsic::vp_fshr, MVT::nxv8i32, 7},
831 {Intrinsic::vp_fshr, MVT::nxv16i32, 7},
832 {Intrinsic::vp_fshr, MVT::v2i64, 7},
833 {Intrinsic::vp_fshr, MVT::v4i64, 7},
834 {Intrinsic::vp_fshr, MVT::v8i64, 7},
835 {Intrinsic::vp_fshr, MVT::v16i64, 7},
836 {Intrinsic::vp_fshr, MVT::nxv1i64, 7},
837 {Intrinsic::vp_fshr, MVT::nxv2i64, 7},
838 {Intrinsic::vp_fshr, MVT::nxv4i64, 7},
839 {Intrinsic::vp_fshr, MVT::nxv8i64, 7},
840 {Intrinsic::bitreverse, MVT::v2i8, 17},
841 {Intrinsic::bitreverse, MVT::v4i8, 17},
842 {Intrinsic::bitreverse, MVT::v8i8, 17},
843 {Intrinsic::bitreverse, MVT::v16i8, 17},
844 {Intrinsic::bitreverse, MVT::nxv1i8, 17},
845 {Intrinsic::bitreverse, MVT::nxv2i8, 17},
846 {Intrinsic::bitreverse, MVT::nxv4i8, 17},
847 {Intrinsic::bitreverse, MVT::nxv8i8, 17},
848 {Intrinsic::bitreverse, MVT::nxv16i8, 17},
849 {Intrinsic::bitreverse, MVT::v2i16, 24},
850 {Intrinsic::bitreverse, MVT::v4i16, 24},
851 {Intrinsic::bitreverse, MVT::v8i16, 24},
852 {Intrinsic::bitreverse, MVT::v16i16, 24},
853 {Intrinsic::bitreverse, MVT::nxv1i16, 24},
854 {Intrinsic::bitreverse, MVT::nxv2i16, 24},
855 {Intrinsic::bitreverse, MVT::nxv4i16, 24},
856 {Intrinsic::bitreverse, MVT::nxv8i16, 24},
857 {Intrinsic::bitreverse, MVT::nxv16i16, 24},
858 {Intrinsic::bitreverse, MVT::v2i32, 33},
859 {Intrinsic::bitreverse, MVT::v4i32, 33},
860 {Intrinsic::bitreverse, MVT::v8i32, 33},
861 {Intrinsic::bitreverse, MVT::v16i32, 33},
862 {Intrinsic::bitreverse, MVT::nxv1i32, 33},
863 {Intrinsic::bitreverse, MVT::nxv2i32, 33},
864 {Intrinsic::bitreverse, MVT::nxv4i32, 33},
865 {Intrinsic::bitreverse, MVT::nxv8i32, 33},
866 {Intrinsic::bitreverse, MVT::nxv16i32, 33},
867 {Intrinsic::bitreverse, MVT::v2i64, 52},
868 {Intrinsic::bitreverse, MVT::v4i64, 52},
869 {Intrinsic::bitreverse, MVT::v8i64, 52},
870 {Intrinsic::bitreverse, MVT::v16i64, 52},
871 {Intrinsic::bitreverse, MVT::nxv1i64, 52},
872 {Intrinsic::bitreverse, MVT::nxv2i64, 52},
873 {Intrinsic::bitreverse, MVT::nxv4i64, 52},
874 {Intrinsic::bitreverse, MVT::nxv8i64, 52},
875 {Intrinsic::vp_bitreverse, MVT::v2i8, 17},
876 {Intrinsic::vp_bitreverse, MVT::v4i8, 17},
877 {Intrinsic::vp_bitreverse, MVT::v8i8, 17},
878 {Intrinsic::vp_bitreverse, MVT::v16i8, 17},
879 {Intrinsic::vp_bitreverse, MVT::nxv1i8, 17},
880 {Intrinsic::vp_bitreverse, MVT::nxv2i8, 17},
881 {Intrinsic::vp_bitreverse, MVT::nxv4i8, 17},
882 {Intrinsic::vp_bitreverse, MVT::nxv8i8, 17},
883 {Intrinsic::vp_bitreverse, MVT::nxv16i8, 17},
884 {Intrinsic::vp_bitreverse, MVT::v2i16, 24},
885 {Intrinsic::vp_bitreverse, MVT::v4i16, 24},
886 {Intrinsic::vp_bitreverse, MVT::v8i16, 24},
887 {Intrinsic::vp_bitreverse, MVT::v16i16, 24},
888 {Intrinsic::vp_bitreverse, MVT::nxv1i16, 24},
889 {Intrinsic::vp_bitreverse, MVT::nxv2i16, 24},
890 {Intrinsic::vp_bitreverse, MVT::nxv4i16, 24},
891 {Intrinsic::vp_bitreverse, MVT::nxv8i16, 24},
892 {Intrinsic::vp_bitreverse, MVT::nxv16i16, 24},
893 {Intrinsic::vp_bitreverse, MVT::v2i32, 33},
894 {Intrinsic::vp_bitreverse, MVT::v4i32, 33},
895 {Intrinsic::vp_bitreverse, MVT::v8i32, 33},
896 {Intrinsic::vp_bitreverse, MVT::v16i32, 33},
897 {Intrinsic::vp_bitreverse, MVT::nxv1i32, 33},
898 {Intrinsic::vp_bitreverse, MVT::nxv2i32, 33},
899 {Intrinsic::vp_bitreverse, MVT::nxv4i32, 33},
900 {Intrinsic::vp_bitreverse, MVT::nxv8i32, 33},
901 {Intrinsic::vp_bitreverse, MVT::nxv16i32, 33},
902 {Intrinsic::vp_bitreverse, MVT::v2i64, 52},
903 {Intrinsic::vp_bitreverse, MVT::v4i64, 52},
904 {Intrinsic::vp_bitreverse, MVT::v8i64, 52},
905 {Intrinsic::vp_bitreverse, MVT::v16i64, 52},
906 {Intrinsic::vp_bitreverse, MVT::nxv1i64, 52},
907 {Intrinsic::vp_bitreverse, MVT::nxv2i64, 52},
908 {Intrinsic::vp_bitreverse, MVT::nxv4i64, 52},
909 {Intrinsic::vp_bitreverse, MVT::nxv8i64, 52},
910 {Intrinsic::ctpop, MVT::v2i8, 12},
911 {Intrinsic::ctpop, MVT::v4i8, 12},
912 {Intrinsic::ctpop, MVT::v8i8, 12},
913 {Intrinsic::ctpop, MVT::v16i8, 12},
914 {Intrinsic::ctpop, MVT::nxv1i8, 12},
915 {Intrinsic::ctpop, MVT::nxv2i8, 12},
916 {Intrinsic::ctpop, MVT::nxv4i8, 12},
917 {Intrinsic::ctpop, MVT::nxv8i8, 12},
918 {Intrinsic::ctpop, MVT::nxv16i8, 12},
919 {Intrinsic::ctpop, MVT::v2i16, 19},
920 {Intrinsic::ctpop, MVT::v4i16, 19},
921 {Intrinsic::ctpop, MVT::v8i16, 19},
922 {Intrinsic::ctpop, MVT::v16i16, 19},
923 {Intrinsic::ctpop, MVT::nxv1i16, 19},
924 {Intrinsic::ctpop, MVT::nxv2i16, 19},
925 {Intrinsic::ctpop, MVT::nxv4i16, 19},
926 {Intrinsic::ctpop, MVT::nxv8i16, 19},
927 {Intrinsic::ctpop, MVT::nxv16i16, 19},
928 {Intrinsic::ctpop, MVT::v2i32, 20},
929 {Intrinsic::ctpop, MVT::v4i32, 20},
930 {Intrinsic::ctpop, MVT::v8i32, 20},
931 {Intrinsic::ctpop, MVT::v16i32, 20},
932 {Intrinsic::ctpop, MVT::nxv1i32, 20},
933 {Intrinsic::ctpop, MVT::nxv2i32, 20},
934 {Intrinsic::ctpop, MVT::nxv4i32, 20},
935 {Intrinsic::ctpop, MVT::nxv8i32, 20},
936 {Intrinsic::ctpop, MVT::nxv16i32, 20},
937 {Intrinsic::ctpop, MVT::v2i64, 21},
938 {Intrinsic::ctpop, MVT::v4i64, 21},
939 {Intrinsic::ctpop, MVT::v8i64, 21},
940 {Intrinsic::ctpop, MVT::v16i64, 21},
941 {Intrinsic::ctpop, MVT::nxv1i64, 21},
942 {Intrinsic::ctpop, MVT::nxv2i64, 21},
943 {Intrinsic::ctpop, MVT::nxv4i64, 21},
944 {Intrinsic::ctpop, MVT::nxv8i64, 21},
945 {Intrinsic::vp_ctpop, MVT::v2i8, 12},
946 {Intrinsic::vp_ctpop, MVT::v4i8, 12},
947 {Intrinsic::vp_ctpop, MVT::v8i8, 12},
948 {Intrinsic::vp_ctpop, MVT::v16i8, 12},
949 {Intrinsic::vp_ctpop, MVT::nxv1i8, 12},
950 {Intrinsic::vp_ctpop, MVT::nxv2i8, 12},
951 {Intrinsic::vp_ctpop, MVT::nxv4i8, 12},
952 {Intrinsic::vp_ctpop, MVT::nxv8i8, 12},
953 {Intrinsic::vp_ctpop, MVT::nxv16i8, 12},
954 {Intrinsic::vp_ctpop, MVT::v2i16, 19},
955 {Intrinsic::vp_ctpop, MVT::v4i16, 19},
956 {Intrinsic::vp_ctpop, MVT::v8i16, 19},
957 {Intrinsic::vp_ctpop, MVT::v16i16, 19},
958 {Intrinsic::vp_ctpop, MVT::nxv1i16, 19},
959 {Intrinsic::vp_ctpop, MVT::nxv2i16, 19},
960 {Intrinsic::vp_ctpop, MVT::nxv4i16, 19},
961 {Intrinsic::vp_ctpop, MVT::nxv8i16, 19},
962 {Intrinsic::vp_ctpop, MVT::nxv16i16, 19},
963 {Intrinsic::vp_ctpop, MVT::v2i32, 20},
964 {Intrinsic::vp_ctpop, MVT::v4i32, 20},
965 {Intrinsic::vp_ctpop, MVT::v8i32, 20},
966 {Intrinsic::vp_ctpop, MVT::v16i32, 20},
967 {Intrinsic::vp_ctpop, MVT::nxv1i32, 20},
968 {Intrinsic::vp_ctpop, MVT::nxv2i32, 20},
969 {Intrinsic::vp_ctpop, MVT::nxv4i32, 20},
970 {Intrinsic::vp_ctpop, MVT::nxv8i32, 20},
971 {Intrinsic::vp_ctpop, MVT::nxv16i32, 20},
972 {Intrinsic::vp_ctpop, MVT::v2i64, 21},
973 {Intrinsic::vp_ctpop, MVT::v4i64, 21},
974 {Intrinsic::vp_ctpop, MVT::v8i64, 21},
975 {Intrinsic::vp_ctpop, MVT::v16i64, 21},
976 {Intrinsic::vp_ctpop, MVT::nxv1i64, 21},
977 {Intrinsic::vp_ctpop, MVT::nxv2i64, 21},
978 {Intrinsic::vp_ctpop, MVT::nxv4i64, 21},
979 {Intrinsic::vp_ctpop, MVT::nxv8i64, 21},
980 {Intrinsic::vp_ctlz, MVT::v2i8, 19},
981 {Intrinsic::vp_ctlz, MVT::v4i8, 19},
982 {Intrinsic::vp_ctlz, MVT::v8i8, 19},
983 {Intrinsic::vp_ctlz, MVT::v16i8, 19},
984 {Intrinsic::vp_ctlz, MVT::nxv1i8, 19},
985 {Intrinsic::vp_ctlz, MVT::nxv2i8, 19},
986 {Intrinsic::vp_ctlz, MVT::nxv4i8, 19},
987 {Intrinsic::vp_ctlz, MVT::nxv8i8, 19},
988 {Intrinsic::vp_ctlz, MVT::nxv16i8, 19},
989 {Intrinsic::vp_ctlz, MVT::nxv32i8, 19},
990 {Intrinsic::vp_ctlz, MVT::nxv64i8, 19},
991 {Intrinsic::vp_ctlz, MVT::v2i16, 28},
992 {Intrinsic::vp_ctlz, MVT::v4i16, 28},
993 {Intrinsic::vp_ctlz, MVT::v8i16, 28},
994 {Intrinsic::vp_ctlz, MVT::v16i16, 28},
995 {Intrinsic::vp_ctlz, MVT::nxv1i16, 28},
996 {Intrinsic::vp_ctlz, MVT::nxv2i16, 28},
997 {Intrinsic::vp_ctlz, MVT::nxv4i16, 28},
998 {Intrinsic::vp_ctlz, MVT::nxv8i16, 28},
999 {Intrinsic::vp_ctlz, MVT::nxv16i16, 28},
1000 {Intrinsic::vp_ctlz, MVT::nxv32i16, 28},
1001 {Intrinsic::vp_ctlz, MVT::v2i32, 31},
1002 {Intrinsic::vp_ctlz, MVT::v4i32, 31},
1003 {Intrinsic::vp_ctlz, MVT::v8i32, 31},
1004 {Intrinsic::vp_ctlz, MVT::v16i32, 31},
1005 {Intrinsic::vp_ctlz, MVT::nxv1i32, 31},
1006 {Intrinsic::vp_ctlz, MVT::nxv2i32, 31},
1007 {Intrinsic::vp_ctlz, MVT::nxv4i32, 31},
1008 {Intrinsic::vp_ctlz, MVT::nxv8i32, 31},
1009 {Intrinsic::vp_ctlz, MVT::nxv16i32, 31},
1010 {Intrinsic::vp_ctlz, MVT::v2i64, 35},
1011 {Intrinsic::vp_ctlz, MVT::v4i64, 35},
1012 {Intrinsic::vp_ctlz, MVT::v8i64, 35},
1013 {Intrinsic::vp_ctlz, MVT::v16i64, 35},
1014 {Intrinsic::vp_ctlz, MVT::nxv1i64, 35},
1015 {Intrinsic::vp_ctlz, MVT::nxv2i64, 35},
1016 {Intrinsic::vp_ctlz, MVT::nxv4i64, 35},
1017 {Intrinsic::vp_ctlz, MVT::nxv8i64, 35},
1018 {Intrinsic::vp_cttz, MVT::v2i8, 16},
1019 {Intrinsic::vp_cttz, MVT::v4i8, 16},
1020 {Intrinsic::vp_cttz, MVT::v8i8, 16},
1021 {Intrinsic::vp_cttz, MVT::v16i8, 16},
1022 {Intrinsic::vp_cttz, MVT::nxv1i8, 16},
1023 {Intrinsic::vp_cttz, MVT::nxv2i8, 16},
1024 {Intrinsic::vp_cttz, MVT::nxv4i8, 16},
1025 {Intrinsic::vp_cttz, MVT::nxv8i8, 16},
1026 {Intrinsic::vp_cttz, MVT::nxv16i8, 16},
1027 {Intrinsic::vp_cttz, MVT::nxv32i8, 16},
1028 {Intrinsic::vp_cttz, MVT::nxv64i8, 16},
1029 {Intrinsic::vp_cttz, MVT::v2i16, 23},
1030 {Intrinsic::vp_cttz, MVT::v4i16, 23},
1031 {Intrinsic::vp_cttz, MVT::v8i16, 23},
1032 {Intrinsic::vp_cttz, MVT::v16i16, 23},
1033 {Intrinsic::vp_cttz, MVT::nxv1i16, 23},
1034 {Intrinsic::vp_cttz, MVT::nxv2i16, 23},
1035 {Intrinsic::vp_cttz, MVT::nxv4i16, 23},
1036 {Intrinsic::vp_cttz, MVT::nxv8i16, 23},
1037 {Intrinsic::vp_cttz, MVT::nxv16i16, 23},
1038 {Intrinsic::vp_cttz, MVT::nxv32i16, 23},
1039 {Intrinsic::vp_cttz, MVT::v2i32, 24},
1040 {Intrinsic::vp_cttz, MVT::v4i32, 24},
1041 {Intrinsic::vp_cttz, MVT::v8i32, 24},
1042 {Intrinsic::vp_cttz, MVT::v16i32, 24},
1043 {Intrinsic::vp_cttz, MVT::nxv1i32, 24},
1044 {Intrinsic::vp_cttz, MVT::nxv2i32, 24},
1045 {Intrinsic::vp_cttz, MVT::nxv4i32, 24},
1046 {Intrinsic::vp_cttz, MVT::nxv8i32, 24},
1047 {Intrinsic::vp_cttz, MVT::nxv16i32, 24},
1048 {Intrinsic::vp_cttz, MVT::v2i64, 25},
1049 {Intrinsic::vp_cttz, MVT::v4i64, 25},
1050 {Intrinsic::vp_cttz, MVT::v8i64, 25},
1051 {Intrinsic::vp_cttz, MVT::v16i64, 25},
1052 {Intrinsic::vp_cttz, MVT::nxv1i64, 25},
1053 {Intrinsic::vp_cttz, MVT::nxv2i64, 25},
1054 {Intrinsic::vp_cttz, MVT::nxv4i64, 25},
1055 {Intrinsic::vp_cttz, MVT::nxv8i64, 25},
1056};
1057
1058static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
1059 switch (ID) {
1060#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
1061 case Intrinsic::VPID: \
1062 return ISD::VPSD;
1063#include "llvm/IR/VPIntrinsics.def"
1064#undef HELPER_MAP_VPID_TO_VPSD
1065 }
1066 return ISD::DELETED_NODE;
1067}
1068
1069InstructionCost
1070RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1071 TTI::TargetCostKind CostKind) {
1072 auto *RetTy = ICA.getReturnType();
1073 switch (ICA.getID()) {
1074 case Intrinsic::ceil:
1075 case Intrinsic::floor:
1076 case Intrinsic::trunc:
1077 case Intrinsic::rint:
1078 case Intrinsic::lrint:
1079 case Intrinsic::llrint:
1080 case Intrinsic::round:
1081 case Intrinsic::roundeven: {
1082 // These all use the same code.
1083 auto LT = getTypeLegalizationCost(RetTy);
1084 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1085 return LT.first * 8;
1086 break;
1087 }
1088 case Intrinsic::umin:
1089 case Intrinsic::umax:
1090 case Intrinsic::smin:
1091 case Intrinsic::smax: {
1092 auto LT = getTypeLegalizationCost(RetTy);
1093 if ((ST->hasVInstructions() && LT.second.isVector()) ||
1094 (LT.second.isScalarInteger() && ST->hasStdExtZbb()))
1095 return LT.first;
1096 break;
1097 }
1098 case Intrinsic::sadd_sat:
1099 case Intrinsic::ssub_sat:
1100 case Intrinsic::uadd_sat:
1101 case Intrinsic::usub_sat:
1102 case Intrinsic::fabs:
1103 case Intrinsic::sqrt: {
1104 auto LT = getTypeLegalizationCost(RetTy);
1105 if (ST->hasVInstructions() && LT.second.isVector())
1106 return LT.first;
1107 break;
1108 }
1109 case Intrinsic::ctpop: {
1110 auto LT = getTypeLegalizationCost(RetTy);
1111 if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector())
1112 return LT.first;
1113 break;
1114 }
1115 case Intrinsic::abs: {
1116 auto LT = getTypeLegalizationCost(RetTy);
1117 if (ST->hasVInstructions() && LT.second.isVector()) {
1118 // vrsub.vi v10, v8, 0
1119 // vmax.vv v8, v8, v10
1120 return LT.first * 2;
1121 }
1122 break;
1123 }
1124 // TODO: add more intrinsics
1125 case Intrinsic::experimental_stepvector: {
1126 unsigned Cost = 1; // vid
1127 auto LT = getTypeLegalizationCost(RetTy);
1128 return Cost + (LT.first - 1);
1129 }
1130 case Intrinsic::vp_rint: {
1131 // RISC-V target uses at least 5 instructions to lower rounding intrinsics.
1132 unsigned Cost = 5;
1133 auto LT = getTypeLegalizationCost(RetTy);
1134 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
1135 return Cost * LT.first;
1136 break;
1137 }
1138 case Intrinsic::vp_nearbyint: {
1139 // One more read and one write of fflags than vp_rint.
1140 unsigned Cost = 7;
1141 auto LT = getTypeLegalizationCost(RetTy);
1142 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
1143 return Cost * LT.first;
1144 break;
1145 }
1146 case Intrinsic::vp_ceil:
1147 case Intrinsic::vp_floor:
1148 case Intrinsic::vp_round:
1149 case Intrinsic::vp_roundeven:
1150 case Intrinsic::vp_roundtozero: {
1151 // Rounding with static rounding mode needs two more instructions to
1152 // swap/write FRM than vp_rint.
1153 unsigned Cost = 7;
1154 auto LT = getTypeLegalizationCost(RetTy);
1155 unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
1156 if (TLI->isOperationCustom(VPISD, LT.second))
1157 return Cost * LT.first;
1158 break;
1159 }
1160 }
1161
1162 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1163 auto LT = getTypeLegalizationCost(RetTy);
1164 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1165 ICA.getID(), LT.second))
1166 return LT.first * Entry->Cost;
1167 }
1168
1169 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1170}
1171
1172InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1173 Type *Src,
1174 TTI::CastContextHint CCH,
1175 TTI::TargetCostKind CostKind,
1176 const Instruction *I) {
1177 if (isa<VectorType>(Dst) && isa<VectorType>(Src)) {
1178 // FIXME: Need to compute legalizing cost for illegal types.
1179 if (!isTypeLegal(Src) || !isTypeLegal(Dst))
1180 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1181
1182 // Skip if element size of Dst or Src is bigger than ELEN.
1183 if (Src->getScalarSizeInBits() > ST->getELen() ||
1184 Dst->getScalarSizeInBits() > ST->getELen())
1185 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1186
1187 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1188 assert(ISD && "Invalid opcode");
1189
1190 // FIXME: Need to consider vsetvli and lmul.
1191 int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
1192 (int)Log2_32(Src->getScalarSizeInBits());
1193 switch (ISD) {
1194 case ISD::SIGN_EXTEND:
1195 case ISD::ZERO_EXTEND:
1196 if (Src->getScalarSizeInBits() == 1) {
1197 // We do not use vsext/vzext to extend from mask vector.
1198 // Instead we use the following instructions to extend from mask vector:
1199 // vmv.v.i v8, 0
1200 // vmerge.vim v8, v8, -1, v0
1201 return 2;
1202 }
1203 return 1;
1204 case ISD::TRUNCATE:
1205 if (Dst->getScalarSizeInBits() == 1) {
1206 // We do not use several vncvt instructions to truncate to a mask vector,
1207 // so we cannot use PowDiff to calculate it.
1208 // Instead we use the following instructions to truncate to mask vector:
1209 // vand.vi v8, v8, 1
1210 // vmsne.vi v0, v8, 0
1211 return 2;
1212 }
1213 [[fallthrough]];
1214 case ISD::FP_EXTEND:
1215 case ISD::FP_ROUND:
1216 // Counts of narrow/widen instructions.
1217 return std::abs(PowDiff);
1218 case ISD::FP_TO_SINT:
1219 case ISD::FP_TO_UINT:
1220 case ISD::SINT_TO_FP:
1221 case ISD::UINT_TO_FP:
1222 if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
1223 // The cost of converting from or to a mask vector is different from other
1224 // cases, so we cannot use PowDiff to calculate it.
1225 // For mask vector to fp, we should use the following instructions:
1226 // vmv.v.i v8, 0
1227 // vmerge.vim v8, v8, -1, v0
1228 // vfcvt.f.x.v v8, v8
1229
1230 // And for fp vector to mask, we use:
1231 // vfncvt.rtz.x.f.w v9, v8
1232 // vand.vi v8, v9, 1
1233 // vmsne.vi v0, v8, 0
1234 return 3;
1235 }
1236 if (std::abs(PowDiff) <= 1)
1237 return 1;
1238 // Backend could lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
1239 // so it only needs two conversions.
1240 if (Src->isIntOrIntVectorTy())
1241 return 2;
1242 // Counts of narrow/widen instructions.
1243 return std::abs(PowDiff);
1244 }
1245 }
1246 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1247}
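// Worked examples for the cast costs above (illustrative): zext <4 x i8> to
// <4 x i32> is a single vzext.vf4, so cost 1; trunc <4 x i32> to <4 x i1>
// costs 2 (vand.vi + vmsne.vi); sitofp <4 x i8> to <4 x double> costs 2
// because the backend widens with v[sz]ext before a single vfcvt.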
1248
1249unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
1250 if (isa<ScalableVectorType>(Ty)) {
1251 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1252 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1253 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1254 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1255 }
1256 return cast<FixedVectorType>(Ty)->getNumElements();
1257}
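// Illustrative example: for <vscale x 4 x i32> with getVScaleForTuning() == 2
// (tuning for VLEN=128), VectorBits = 128, EltSize = 32 and MinSize = 128,
// so the estimated VL is computeVLMAX(128, 32, 128) = 8 lanes; for a fixed
// <8 x i32> the estimate is simply 8.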
1258
1259InstructionCost
1260RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1261 FastMathFlags FMF,
1262 TTI::TargetCostKind CostKind) {
1263 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1264 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1265
1266 // Skip if scalar size of Ty is bigger than ELEN.
1267 if (Ty->getScalarSizeInBits() > ST->getELen())
1268 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1269
1270 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1271 if (Ty->getElementType()->isIntegerTy(1))
1272 // vcpop sequences, see vreduction-mask.ll. umax, smin actually only
1273 // cost 2, but we don't have enough info here so we slightly over cost.
1274 return (LT.first - 1) + 3;
1275
1276 // An IR reduction is composed of two vmv and one rvv reduction instruction.
1277 InstructionCost BaseCost = 2;
1278
1279 if (CostKind == TTI::TCK_CodeSize)
1280 return (LT.first - 1) + BaseCost;
1281
1282 unsigned VL = getEstimatedVLFor(Ty);
1283 return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
1284}
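// Worked example (illustrative): an smax reduction of <8 x i32> that
// legalizes into a single register has LT.first == 1, so the cost is
// BaseCost (2) + Log2_32_Ceil(8) == 5; an i1 reduction instead uses the
// fixed vcpop-based estimate of (LT.first - 1) + 3.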
1285
1286InstructionCost
1287RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
1288 std::optional<FastMathFlags> FMF,
1289 TTI::TargetCostKind CostKind) {
1290 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1291 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1292
1293 // Skip if scalar size of Ty is bigger than ELEN.
1294 if (Ty->getScalarSizeInBits() > ST->getELen())
1295 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1296
1297 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1298 assert(ISD && "Invalid opcode");
1299
1300 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1301 ISD != ISD::FADD)
1302 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1303
1304 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1305 if (Ty->getElementType()->isIntegerTy(1))
1306 // vcpop sequences, see vreduction-mask.ll
1307 return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);
1308
1309 // An IR reduction is composed of two vmv and one rvv reduction instruction.
1310 InstructionCost BaseCost = 2;
1311
1312 if (CostKind == TTI::TCK_CodeSize)
1313 return (LT.first - 1) + BaseCost;
1314
1315 unsigned VL = getEstimatedVLFor(Ty);
1316 if (TTI::requiresOrderedReduction(FMF))
1317 return (LT.first - 1) + BaseCost + VL;
1318 return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
1319}
1320
1321InstructionCost RISCVTTIImpl::getExtendedReductionCost(
1322 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1323 FastMathFlags FMF, TTI::TargetCostKind CostKind) {
1324 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1325 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1326 FMF, CostKind);
1327
1328 // Skip if scalar size of ResTy is bigger than ELEN.
1329 if (ResTy->getScalarSizeInBits() > ST->getELen())
1330 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1331 FMF, CostKind);
1332
1333 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1334 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1335 FMF, CostKind);
1336
1337 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1338
1339 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1340 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1341 FMF, CostKind);
1342
1343 return (LT.first - 1) +
1344 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1345}
1346
1347InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
1348 TTI::OperandValueInfo OpInfo,
1349 TTI::TargetCostKind CostKind) {
1350 assert(OpInfo.isConstant() && "non constant operand?");
1351 if (!isa<VectorType>(Ty))
1352 // FIXME: We need to account for immediate materialization here, but doing
1353 // a decent job requires more knowledge about the immediate than we
1354 // currently have here.
1355 return 0;
1356
1357 if (OpInfo.isUniform())
1358 // vmv.x.i, vmv.v.x, or vfmv.v.f
1359 // We ignore the cost of the scalar constant materialization to be consistent
1360 // with how we treat scalar constants themselves just above.
1361 return 1;
1362
1363 return getConstantPoolLoadCost(Ty, CostKind);
1364}
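// Illustrative example: storing the splat constant <4 x i32> <i32 7, i32 7,
// i32 7, i32 7> adds 1 for the vmv.v.x, while storing a non-uniform constant
// such as <i32 1, i32 2, i32 3, i32 4> adds the constant-pool load cost
// computed above.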
1365
1366
1367InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1368 MaybeAlign Alignment,
1369 unsigned AddressSpace,
1370 TTI::TargetCostKind CostKind,
1371 TTI::OperandValueInfo OpInfo,
1372 const Instruction *I) {
1373 EVT VT = TLI->getValueType(DL, Src, true);
1374 // Type legalization can't handle structs
1375 if (VT == MVT::Other)
1376 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1377 CostKind, OpInfo, I);
1378
1379 InstructionCost Cost = 0;
1380 if (Opcode == Instruction::Store && OpInfo.isConstant())
1381 Cost += getStoreImmCost(Src, OpInfo, CostKind);
1382 InstructionCost BaseCost =
1383 BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1384 CostKind, OpInfo, I);
1385 // Assume memory ops cost scale with the number of vector registers
1386 // possibly accessed by the instruction. Note that BasicTTI already
1387 // handles the LT.first term for us.
1388 if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1389 LT.second.isVector())
1390 BaseCost *= TLI->getLMULCost(LT.second);
1391 return Cost + BaseCost;
1392
1393}
1394
1395InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
1396 Type *CondTy,
1397 CmpInst::Predicate VecPred,
1398 TTI::TargetCostKind CostKind,
1399 const Instruction *I) {
1400 if (CostKind != TTI::TCK_RecipThroughput)
1401 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1402 I);
1403
1404 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1405 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1406 I);
1407
1408 // Skip if scalar size of ValTy is bigger than ELEN.
1409 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
1410 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1411 I);
1412
1413 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1414 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1415 if (CondTy->isVectorTy()) {
1416 if (ValTy->getScalarSizeInBits() == 1) {
1417 // vmandn.mm v8, v8, v9
1418 // vmand.mm v9, v0, v9
1419 // vmor.mm v0, v9, v8
1420 return LT.first * 3;
1421 }
1422 // vselect and max/min are supported natively.
1423 return LT.first * 1;
1424 }
1425
1426 if (ValTy->getScalarSizeInBits() == 1) {
1427 // vmv.v.x v9, a0
1428 // vmsne.vi v9, v9, 0
1429 // vmandn.mm v8, v8, v9
1430 // vmand.mm v9, v0, v9
1431 // vmor.mm v0, v9, v8
1432 return LT.first * 5;
1433 }
1434
1435 // vmv.v.x v10, a0
1436 // vmsne.vi v0, v10, 0
1437 // vmerge.vvm v8, v9, v8, v0
1438 return LT.first * 3;
1439 }
1440
1441 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1442 ValTy->isVectorTy()) {
1443 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1444
1445 // Support natively.
1446 if (CmpInst::isIntPredicate(VecPred))
1447 return LT.first * 1;
1448
1449 // If we do not support the input floating point vector type, use the base
1450 // one which will calculate as:
1451 // ScalarizeCost + Num * Cost for fixed vector,
1452 // InvalidCost for scalable vector.
1453 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
1454 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
1455 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
1456 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1457 I);
1458 switch (VecPred) {
1459 // Support natively.
1460 case CmpInst::FCMP_OEQ:
1461 case CmpInst::FCMP_OGT:
1462 case CmpInst::FCMP_OGE:
1463 case CmpInst::FCMP_OLT:
1464 case CmpInst::FCMP_OLE:
1465 case CmpInst::FCMP_UNE:
1466 return LT.first * 1;
1467 // TODO: Other comparisons?
1468 default:
1469 break;
1470 }
1471 }
1472
1473 // TODO: Add cost for scalar type.
1474
1475 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1476}
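// Worked examples (illustrative): a vector select of <8 x i32> with a vector
// i1 condition costs LT.first (vmerge.vvm); with a scalar condition it costs
// LT.first * 3 (vmv.v.x + vmsne.vi + vmerge.vvm); an fcmp oeq on a supported
// FP vector type costs LT.first.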
1477
1478InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
1479 TTI::TargetCostKind CostKind,
1480 const Instruction *I) {
1481 if (CostKind != TTI::TCK_RecipThroughput)
1482 return Opcode == Instruction::PHI ? 0 : 1;
1483 // Branches are assumed to be predicted.
1484 return 0;
1485}
1486
1487InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1488 TTI::TargetCostKind CostKind,
1489 unsigned Index, Value *Op0,
1490 Value *Op1) {
1491 assert(Val->isVectorTy() && "This must be a vector type");
1492
1493 if (Opcode != Instruction::ExtractElement &&
1494 Opcode != Instruction::InsertElement)
1495 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1496
1497 // Legalize the type.
1498 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
1499
1500 // This type is legalized to a scalar type.
1501 if (!LT.second.isVector()) {
1502 auto *FixedVecTy = cast<FixedVectorType>(Val);
1503 // If Index is a known constant, cost is zero.
1504 if (Index != -1U)
1505 return 0;
1506 // Extract/InsertElement with non-constant index is very costly when
1507 // scalarized; estimate cost of loads/stores sequence via the stack:
1508 // ExtractElement cost: store vector to stack, load scalar;
1509 // InsertElement cost: store vector to stack, store scalar, load vector.
1510 Type *ElemTy = FixedVecTy->getElementType();
1511 auto NumElems = FixedVecTy->getNumElements();
1512 auto Align = DL.getPrefTypeAlign(ElemTy);
1513 InstructionCost LoadCost =
1514 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
1515 InstructionCost StoreCost =
1516 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
1517 return Opcode == Instruction::ExtractElement
1518 ? StoreCost * NumElems + LoadCost
1519 : (StoreCost + LoadCost) * NumElems + StoreCost;
1520 }
1521
1522 // For unsupported scalable vector.
1523 if (LT.second.isScalableVector() && !LT.first.isValid())
1524 return LT.first;
1525
1526 if (!isTypeLegal(Val))
1527 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1528
1529 // Mask vector extract/insert is expanded via e8.
1530 if (Val->getScalarSizeInBits() == 1) {
1531 VectorType *WideTy =
1532 VectorType::get(IntegerType::get(Val->getContext(), 8),
1533 cast<VectorType>(Val)->getElementCount());
1534 if (Opcode == Instruction::ExtractElement) {
1535 InstructionCost ExtendCost
1536 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1537 TTI::CastContextHint::None, CostKind);
1538 InstructionCost ExtractCost
1539 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1540 return ExtendCost + ExtractCost;
1541 }
1542 InstructionCost ExtendCost
1543 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1544 TTI::CastContextHint::None, CostKind);
1545 InstructionCost InsertCost
1546 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1547 InstructionCost TruncCost
1548 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
1549 TTI::CastContextHint::None, CostKind);
1550 return ExtendCost + InsertCost + TruncCost;
1551 }
1552
1553
1554 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
1555 // and vslideup + vmv.s.x to insert element to vector.
1556 unsigned BaseCost = 1;
1557 // For insertelement we additionally need to add 1 to the index as the AVL input of vslideup.
1558 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
1559
1560 if (Index != -1U) {
1561 // The type may be split. For fixed-width vectors we can normalize the
1562 // index to the new type.
1563 if (LT.second.isFixedLengthVector()) {
1564 unsigned Width = LT.second.getVectorNumElements();
1565 Index = Index % Width;
1566 }
1567
1568 // We could extract/insert the first element without vslidedown/vslideup.
1569 if (Index == 0)
1570 SlideCost = 0;
1571 else if (Opcode == Instruction::InsertElement)
1572 SlideCost = 1; // With a constant index, we do not need to use addi.
1573 }
1574
1575   // Extracting or inserting an i64 element on a target with XLEN=32 needs more instructions.
1576 if (Val->getScalarType()->isIntegerTy() &&
1577 ST->getXLen() < Val->getScalarSizeInBits()) {
1578 // For extractelement, we need the following instructions:
1579   // vsetivli zero, 1, e64, m1, ta, mu (not counted)
1580 // vslidedown.vx v8, v8, a0
1581 // vmv.x.s a0, v8
1582 // li a1, 32
1583 // vsrl.vx v8, v8, a1
1584 // vmv.x.s a1, v8
1585
1586 // For insertelement, we need the following instructions:
1587   // vsetivli zero, 2, e32, m4, ta, mu (not counted)
1588 // vmv.v.i v12, 0
1589 // vslide1up.vx v16, v12, a1
1590 // vslide1up.vx v12, v16, a0
1591 // addi a0, a2, 1
1592   // vsetvli zero, a0, e64, m4, tu, mu (not counted)
1593 // vslideup.vx v8, v12, a2
1594
1595 // TODO: should we count these special vsetvlis?
1596 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
1597 }
1598 return BaseCost + SlideCost;
1599}
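// Rough per-operation totals implied by the model above, assuming a legal RVV
// type whose element fits in XLEN:
//   extractelement %v, 0      -> 1  (vmv.x.s only, no slide)
//   extractelement %v, 3      -> 2  (vslidedown.vi + vmv.x.s)
//   insertelement  %v, %s, %i -> 3  (vmv.s.x + addi + vslideup.vx)
// The i64-on-RV32 case above raises the base cost to model the extra element
// splitting.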
1600
1601 InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
1602     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1603     TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1604     ArrayRef<const Value *> Args, const Instruction *CxtI) {
1605
1606   // TODO: Handle more cost kinds.
1607   if (CostKind != TTI::TCK_RecipThroughput)
1608     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1609 Args, CxtI);
1610
1611 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1612 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1613 Args, CxtI);
1614
1615 // Skip if scalar size of Ty is bigger than ELEN.
1616 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
1617 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1618 Args, CxtI);
1619
1620 // Legalize the type.
1621 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1622
1623 // TODO: Handle scalar type.
1624 if (!LT.second.isVector())
1625 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1626 Args, CxtI);
1627
1628
1629 auto getConstantMatCost =
1630 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
1631 if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
1632 // Two sub-cases:
1633 // * Has a 5 bit immediate operand which can be splatted.
1634 // * Has a larger immediate which must be materialized in scalar register
1635 // We return 0 for both as we currently ignore the cost of materializing
1636 // scalar constants in GPRs.
1637 return 0;
1638
1639 return getConstantPoolLoadCost(Ty, CostKind);
1640 };
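// Illustrative: a uniform constant operand of an instruction that can take a
// splatted scalar (e.g. the RHS of an add that can become vadd.vx/vadd.vi) is
// treated as free here, whether it fits a 5-bit immediate or needs a scalar
// materialization; any other constant vector operand is costed as a
// constant-pool load.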
1641
1642 // Add the cost of materializing any constant vectors required.
1643 InstructionCost ConstantMatCost = 0;
1644 if (Op1Info.isConstant())
1645 ConstantMatCost += getConstantMatCost(0, Op1Info);
1646 if (Op2Info.isConstant())
1647 ConstantMatCost += getConstantMatCost(1, Op2Info);
1648
1649 switch (TLI->InstructionOpcodeToISD(Opcode)) {
1650 case ISD::ADD:
1651 case ISD::SUB:
1652 case ISD::AND:
1653 case ISD::OR:
1654 case ISD::XOR:
1655 case ISD::SHL:
1656 case ISD::SRL:
1657 case ISD::SRA:
1658 case ISD::MUL:
1659 case ISD::MULHS:
1660 case ISD::MULHU:
1661 case ISD::FADD:
1662 case ISD::FSUB:
1663 case ISD::FMUL:
1664 case ISD::FNEG: {
1665 return ConstantMatCost + TLI->getLMULCost(LT.second) * LT.first * 1;
1666 }
1667 default:
1668 return ConstantMatCost +
1669 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1670 Args, CxtI);
1671 }
1672}
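// Illustrative: under this model a vector add/fadd on a type that legalizes to
// a single LMUL=1 register group costs getLMULCost(M1) * 1, and splitting into
// LT.first register groups scales the cost linearly; a constant operand that
// cannot stay scalar additionally pays the constant-pool load above.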
1673
1674// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
1675 InstructionCost RISCVTTIImpl::getPointersChainCost(
1676     ArrayRef<const Value *> Ptrs, const Value *Base,
1677     const TTI::PointersChainInfo &Info, Type *AccessTy,
1678     TTI::TargetCostKind CostKind) {
1679   InstructionCost Cost = TTI::TCC_Free;
1680   // In the basic model we take into account GEP instructions only
1681   // (although here can come alloca instruction, a value, constants and/or
1682   // constant expressions, PHIs, bitcasts ... whatever is allowed to be used as
1683   // a pointer). Typically, if Base is not a GEP instruction and all the
1684   // pointers are relative to the same base address, all the rest are
1685   // either GEP instructions, PHIs, bitcasts or constants. When we have the
1686   // same base, we just calculate the cost of each non-Base GEP as an ADD
1687   // operation if any of its indices is a non-constant.
1688   // If there are no known dependencies between the pointers, the cost is
1689   // calculated as the sum of the costs of the GEP instructions.
1690 for (auto [I, V] : enumerate(Ptrs)) {
1691 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
1692 if (!GEP)
1693 continue;
1694 if (Info.isSameBase() && V != Base) {
1695 if (GEP->hasAllConstantIndices())
1696 continue;
1697 // If the chain is unit-stride and BaseReg + stride*i is a legal
1698 // addressing mode, then presume the base GEP is sitting around in a
1699 // register somewhere and check if we can fold the offset relative to
1700 // it.
1701 unsigned Stride = DL.getTypeStoreSize(AccessTy);
1702 if (Info.isUnitStride() &&
1703 isLegalAddressingMode(AccessTy,
1704 /* BaseGV */ nullptr,
1705 /* BaseOffset */ Stride * I,
1706 /* HasBaseReg */ true,
1707 /* Scale */ 0,
1708 GEP->getType()->getPointerAddressSpace()))
1709 continue;
1710 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
1711 {TTI::OK_AnyValue, TTI::OP_None},
1712 {TTI::OK_AnyValue, TTI::OP_None},
1713 std::nullopt);
1714 } else {
1715 SmallVector<const Value *> Indices(GEP->indices());
1716 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
1717 Indices, AccessTy, CostKind);
1718 }
1719 }
1720 return Cost;
1721}
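// Illustrative: for a same-base, unit-stride chain over a 4-byte access type, a
// non-base GEP at position I is considered folded (free) when BaseReg + 4*I is
// a legal addressing mode (on RISC-V, roughly a signed 12-bit offset);
// otherwise it is charged as one integer ADD. GEPs with all-constant indices
// are always free, and chains without a common base sum the individual GEP
// costs.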
1722
1723 void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1724                                            TTI::UnrollingPreferences &UP,
1725                                            OptimizationRemarkEmitter *ORE) {
1726   // TODO: More tuning on benchmarks and metrics with changes as needed
1727 // would apply to all settings below to enable performance.
1728
1729
1730 if (ST->enableDefaultUnroll())
1731 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
1732
1733   // Enable upper-bound unrolling universally, not dependent upon the
1734   // conditions below.
1735 UP.UpperBound = true;
1736
1737 // Disable loop unrolling for Oz and Os.
1738   UP.OptSizeThreshold = 0;
1739   UP.PartialOptSizeThreshold = 0;
1740   if (L->getHeader()->getParent()->hasOptSize())
1741 return;
1742
1743 SmallVector<BasicBlock *, 4> ExitingBlocks;
1744 L->getExitingBlocks(ExitingBlocks);
1745 LLVM_DEBUG(dbgs() << "Loop has:\n"
1746 << "Blocks: " << L->getNumBlocks() << "\n"
1747 << "Exit blocks: " << ExitingBlocks.size() << "\n");
1748
1749   // Allow at most one exit in addition to the latch. This acts as an early
1750   // exit, as it mirrors the profitability calculation of the runtime unroller.
1751 if (ExitingBlocks.size() > 2)
1752 return;
1753
1754 // Limit the CFG of the loop body for targets with a branch predictor.
1755 // Allowing 4 blocks permits if-then-else diamonds in the body.
1756 if (L->getNumBlocks() > 4)
1757 return;
1758
1759 // Don't unroll vectorized loops, including the remainder loop
1760 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
1761 return;
1762
1763 // Scan the loop: don't unroll loops with calls as this could prevent
1764   // inlining.
1765   InstructionCost Cost = 0;
1766   for (auto *BB : L->getBlocks()) {
1767 for (auto &I : *BB) {
1768 // Initial setting - Don't unroll loops containing vectorized
1769 // instructions.
1770 if (I.getType()->isVectorTy())
1771 return;
1772
1773 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1774 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1775 if (!isLoweredToCall(F))
1776 continue;
1777 }
1778 return;
1779 }
1780
1781       SmallVector<const Value *> Operands(I.operand_values());
1782       Cost += getInstructionCost(&I, Operands,
1783                                  TargetTransformInfo::TCK_SizeAndLatency);
1784     }
1785 }
1786
1787 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1788
1789 UP.Partial = true;
1790 UP.Runtime = true;
1791 UP.UnrollRemainder = true;
1792   UP.UnrollAndJam = true;
1793   UP.UnrollAndJamInnerLoopThreshold = 60;
1794
1795   // Forcing the unrolling of small loops can be very useful because of the
1796   // branch-taken cost of the backedge.
1797 if (Cost < 12)
1798 UP.Force = true;
1799}
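// Net effect (illustrative): on subtargets that opt out of default unrolling,
// a loop with at most 4 blocks, at most one extra exit besides the latch, no
// calls that lower to real calls, and no vector code gets runtime/partial
// unrolling enabled, and is force-unrolled when its size-and-latency cost is
// below 12.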
1800
1801 void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1802                                          TTI::PeelingPreferences &PP) {
1803   BaseT::getPeelingPreferences(L, SE, PP);
1804 }
1805
1806 unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
1807   TypeSize Size = DL.getTypeSizeInBits(Ty);
1808   if (Ty->isVectorTy()) {
1809 if (Size.isScalable() && ST->hasVInstructions())
1810 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
1811
1812     if (ST->useRVVForFixedLengthVectors())
1813       return divideCeil(Size, ST->getRealMinVLen());
1814 }
1815
1816 return BaseT::getRegUsageForType(Ty);
1817}
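// Example (illustrative): with the V extension, a <vscale x 4 x i32> value has
// a known-minimum size of 128 bits and is counted as ceil(128 / 64) = 2 vector
// register blocks (i.e. LMUL=2); with a minimum VLEN of 128, a fixed <8 x i32>
// (256 bits) likewise counts as 2 registers.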
1818
1819unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1820 if (SLPMaxVF.getNumOccurrences())
1821 return SLPMaxVF;
1822
1823   // Return how many elements can fit in getRegisterBitWidth. This is the
1824 // same routine as used in LoopVectorizer. We should probably be
1825 // accounting for whether we actually have instructions with the right
1826 // lane type, but we don't have enough information to do that without
1827 // some additional plumbing which hasn't been justified yet.
1828   TypeSize RegWidth =
1829       getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
1830 // If no vector registers, or absurd element widths, disable
1831 // vectorization by returning 1.
1832 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
1833}
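// Example (illustrative): if the fixed-width vector register query reports 128
// bits and ElemWidth is 32, SLP is offered a maximum VF of 4; a zero register
// width degenerates to 1 (vectorization disabled), and -riscv-v-slp-max-vf
// overrides the computation entirely when set.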
1834
1835 bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
1836                                  const TargetTransformInfo::LSRCost &C2) {
1837   // RISC-V-specific here: the instruction count has first priority.
1838 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
1839 C1.NumIVMuls, C1.NumBaseAdds,
1840 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
1841 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
1842 C2.NumIVMuls, C2.NumBaseAdds,
1843 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
1844}