RISCVTargetTransformInfo.cpp (LLVM 17.0.0git)
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
17#include <cmath>
18#include <optional>
19using namespace llvm;
20
21#define DEBUG_TYPE "riscvtti"
22
24 "riscv-v-register-bit-width-lmul",
26 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
27 "by autovectorized code. Fractional LMULs are not supported."),
29
31 "riscv-v-slp-max-vf",
33 "Result used for getMaximumVF query which is used exclusively by "
34 "SLP vectorizer. Defaults to 1 which disables SLP."),
36
37InstructionCost RISCVTTIImpl::getLMULCost(MVT VT) {
38 // TODO: Here we assume the reciprocal throughput is 1 for LMUL_1; it is
39 // implementation-defined.
40 if (!VT.isVector())
42 unsigned Cost;
43 if (VT.isScalableVector()) {
44 unsigned LMul;
45 bool Fractional;
46 std::tie(LMul, Fractional) =
48 Cost = Fractional ? 1 : LMul;
49 } else {
51 }
52 return Cost;
53}
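// As a rough illustration of the heuristic above: a scalable type such as
// MVT::nxv8i32 occupies 256 known-min bits, i.e. an LMUL=4 register group, so
// its estimated reciprocal throughput is 4, while fractional LMULs (mf2, mf4,
// mf8) are all costed as 1.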
54
57 assert(Ty->isIntegerTy() &&
58 "getIntImmCost can only estimate cost of materialising integers");
59
60 // We have a Zero register, so 0 is always free.
61 if (Imm == 0)
62 return TTI::TCC_Free;
63
64 // Otherwise, we check how many instructions it will take to materialise.
65 const DataLayout &DL = getDataLayout();
67 getST()->getFeatureBits());
68}
69
70// Look for patterns of a shift followed by an AND that can be turned into a
71// pair of shifts. We won't need to materialize an immediate for the AND, so
72// these can be considered free.
73static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
74 uint64_t Mask = Imm.getZExtValue();
75 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
76 if (!BO || !BO->hasOneUse())
77 return false;
78
79 if (BO->getOpcode() != Instruction::Shl)
80 return false;
81
82 if (!isa<ConstantInt>(BO->getOperand(1)))
83 return false;
84
85 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
86 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
87 // is a mask shifted by c2 bits with c3 leading zeros.
88 if (isShiftedMask_64(Mask)) {
89 unsigned Trailing = llvm::countr_zero(Mask);
90 if (ShAmt == Trailing)
91 return true;
92 }
93
94 return false;
95}
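// Illustrative example (assuming RV64): for (and (shl x, 8), 0xff00), the mask
// 0xff00 is a shifted mask whose 8 trailing zeros equal the shift amount, so
// instead of materializing 0xff00 the backend can emit a shift pair such as:
//   slli a0, a0, 56
//   srli a0, a0, 48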
96
98 const APInt &Imm, Type *Ty,
100 Instruction *Inst) {
101 assert(Ty->isIntegerTy() &&
102 "getIntImmCost can only estimate cost of materialising integers");
103
104 // We have a Zero register, so 0 is always free.
105 if (Imm == 0)
106 return TTI::TCC_Free;
107
108 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
109 // commutative; for others the immediate comes from a specific argument index.
110 bool Takes12BitImm = false;
111 unsigned ImmArgIdx = ~0U;
112
113 switch (Opcode) {
114 case Instruction::GetElementPtr:
115 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
116 // split up large offsets in GEP into better parts than ConstantHoisting
117 // can.
118 return TTI::TCC_Free;
119 case Instruction::And:
120 // zext.h
121 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
122 return TTI::TCC_Free;
123 // zext.w
124 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
125 return TTI::TCC_Free;
126 // bclri
127 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
128 return TTI::TCC_Free;
129 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
130 canUseShiftPair(Inst, Imm))
131 return TTI::TCC_Free;
132 Takes12BitImm = true;
133 break;
134 case Instruction::Add:
135 Takes12BitImm = true;
136 break;
137 case Instruction::Or:
138 case Instruction::Xor:
139 // bseti/binvi
140 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
141 return TTI::TCC_Free;
142 Takes12BitImm = true;
143 break;
144 case Instruction::Mul:
145 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
146 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
147 return TTI::TCC_Free;
148 // FIXME: There is no MULI instruction.
149 Takes12BitImm = true;
150 break;
151 case Instruction::Sub:
152 case Instruction::Shl:
153 case Instruction::LShr:
154 case Instruction::AShr:
155 Takes12BitImm = true;
156 ImmArgIdx = 1;
157 break;
158 default:
159 break;
160 }
161
162 if (Takes12BitImm) {
163 // Check immediate is the correct argument...
164 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
165 // ... and fits into the 12-bit immediate.
166 if (Imm.getSignificantBits() <= 64 &&
167 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
168 return TTI::TCC_Free;
169 }
170 }
171
172 // Otherwise, use the full materialisation cost.
173 return getIntImmCost(Imm, Ty, CostKind);
174 }
175
176 // By default, prevent hoisting.
177 return TTI::TCC_Free;
178}
179
182 const APInt &Imm, Type *Ty,
184 // Prevent hoisting in unknown cases.
185 return TTI::TCC_Free;
186}
187
190 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
191 return ST->hasStdExtZbb() ? TTI::PSK_FastHardware : TTI::PSK_Software;
192}
193
195 // Currently, the ExpandReductions pass can't expand scalable-vector
196 // reductions, but we still request expansion as RVV doesn't support certain
197 // reductions and the SelectionDAG can't legalize them either.
198 switch (II->getIntrinsicID()) {
199 default:
200 return false;
201 // These reductions have no equivalent in RVV
202 case Intrinsic::vector_reduce_mul:
203 case Intrinsic::vector_reduce_fmul:
204 return true;
205 }
206}
207
208std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
209 if (ST->hasVInstructions())
211 return BaseT::getMaxVScale();
212}
213
214std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
215 if (ST->hasVInstructions())
216 if (unsigned MinVLen = ST->getRealMinVLen();
217 MinVLen >= RISCV::RVVBitsPerBlock)
218 return MinVLen / RISCV::RVVBitsPerBlock;
220}
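// For example, under -mattr=+v,+zvl256b the guaranteed minimum VLEN is 256,
// so the vscale value used for tuning is 256 / RVVBitsPerBlock = 4.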
221
224 unsigned LMUL =
225 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
226 switch (K) {
228 return TypeSize::getFixed(ST->getXLen());
230 return TypeSize::getFixed(
231 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
234 (ST->hasVInstructions() &&
237 : 0);
238 }
239
240 llvm_unreachable("Unsupported register kind");
241}
242
244RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
245 // Add a cost of address generation + the cost of the load. The address
246 // is expected to be a PC relative offset to a constant pool entry
247 // using auipc/addi.
248 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
249 /*AddressSpace=*/0, CostKind);
250}
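// For instance, loading a constant vector from the constant pool is expected
// to look roughly like
//   auipc a0, %pcrel_hi(.LCPI0_0)
//   addi  a0, a0, %pcrel_lo(...)
// followed by the vector load itself, which is where the "2 +" term comes from.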
251
253 LLVMContext &C) {
254 assert((DataVT.getScalarSizeInBits() != 8 ||
255 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
256 MVT IndexVT = DataVT.changeTypeToInteger();
257 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
258 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
259 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
260}
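// For example, for v4i64 data on RV32 the natural v4i64 index type is wider
// than XLEN, so a v4i16 index vector is used instead (the gather can then be
// lowered with vrgatherei16 rather than vrgather.vv).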
261
262
264 VectorType *Tp, ArrayRef<int> Mask,
266 int Index, VectorType *SubTp,
268 Kind = improveShuffleKindFromMask(Kind, Mask);
269
270 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
271
272 // First, handle cases where having a fixed length vector enables us to
273 // give a more accurate cost than falling back to generic scalable codegen.
274 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
275 if (isa<FixedVectorType>(Tp)) {
276 switch (Kind) {
277 default:
278 break;
280 if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
281 MVT EltTp = LT.second.getVectorElementType();
282 // If the size of the element is < ELEN then shuffles of interleaves and
283 // deinterleaves of 2 vectors can be lowered into the following
284 // sequences
285 if (EltTp.getScalarSizeInBits() < ST->getELEN()) {
286 // Example sequence:
287 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
288 // vwaddu.vv v10, v8, v9
289 // li a0, -1 (ignored)
290 // vwmaccu.vx v10, a0, v9
291 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
292 return 2 * LT.first * getLMULCost(LT.second);
293
294 if (Mask[0] == 0 || Mask[0] == 1) {
295 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
296 // Example sequence:
297 // vnsrl.wi v10, v8, 0
298 if (equal(DeinterleaveMask, Mask))
299 return LT.first * getLMULCost(LT.second);
300 }
301 }
302
303 // vrgather + cost of generating the mask constant.
304 // We model this for an unknown mask with a single vrgather.
305 if (LT.first == 1 &&
306 (LT.second.getScalarSizeInBits() != 8 ||
307 LT.second.getVectorNumElements() <= 256)) {
308 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());
309 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
310 return IndexCost + getLMULCost(LT.second);
311 }
312 }
313 break;
314 }
317 if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
318 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
319 // register for the second vrgather. We model this for an unknown
320 // (shuffle) mask.
321 if (LT.first == 1 &&
322 (LT.second.getScalarSizeInBits() != 8 ||
323 LT.second.getVectorNumElements() <= 256)) {
324 auto &C = Tp->getContext();
325 auto EC = Tp->getElementCount();
326 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
328 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
329 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
330 return 2 * IndexCost + 2 * getLMULCost(LT.second) + MaskCost;
331 }
332 }
333 break;
334 }
335 }
336 };
337
338 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
339 switch (Kind) {
340 default:
341 // Fallthrough to generic handling.
342 // TODO: Most of these cases will return getInvalid in generic code, and
343 // must be implemented here.
344 break;
346 // Example sequence:
347 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
348 // vslidedown.vi v8, v9, 2
349 return LT.first * getLMULCost(LT.second);
351 // Example sequence:
352 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
353 // vslideup.vi v8, v9, 2
354 return LT.first * getLMULCost(LT.second);
355 case TTI::SK_Select: {
356 // Example sequence:
357 // li a0, 90
358 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
359 // vmv.s.x v0, a0
360 // vmerge.vvm v8, v9, v8, v0
361 return LT.first * 3 * getLMULCost(LT.second);
362 }
363 case TTI::SK_Broadcast: {
364 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
365 Instruction::InsertElement);
366 if (LT.second.getScalarSizeInBits() == 1) {
367 if (HasScalar) {
368 // Example sequence:
369 // andi a0, a0, 1
370 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
371 // vmv.v.x v8, a0
372 // vmsne.vi v0, v8, 0
373 return LT.first * getLMULCost(LT.second) * 3;
374 }
375 // Example sequence:
376 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
377 // vmv.v.i v8, 0
378 // vmerge.vim v8, v8, 1, v0
379 // vmv.x.s a0, v8
380 // andi a0, a0, 1
381 // vmv.v.x v8, a0
382 // vmsne.vi v0, v8, 0
383
384 return LT.first * getLMULCost(LT.second) * 6;
385 }
386
387 if (HasScalar) {
388 // Example sequence:
389 // vmv.v.x v8, a0
390 return LT.first * getLMULCost(LT.second);
391 }
392
393 // Example sequence:
394 // vrgather.vi v9, v8, 0
395 // TODO: vrgather could be slower than vmv.v.x. It is
396 // implementation-dependent.
397 return LT.first * getLMULCost(LT.second);
398 }
399 case TTI::SK_Splice:
400 // vslidedown+vslideup.
401 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
402 // of similar code, but I think we expand through memory.
403 return 2 * LT.first * getLMULCost(LT.second);
404 case TTI::SK_Reverse: {
405 // TODO: Cases to improve here:
406 // * LMUL > 1
407 // * i64 on RV32
408 // * i1 vector
409
410 // Most of the cost here is producing the vrgather index register
411 // Example sequence:
412 // csrr a0, vlenb
413 // srli a0, a0, 3
414 // addi a0, a0, -1
415 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
416 // vid.v v9
417 // vrsub.vx v10, v9, a0
418 // vrgather.vv v9, v8, v10
419 unsigned LenCost = 3;
420 if (LT.second.isFixedLengthVector())
421 // vrsub.vi has a 5-bit immediate field; otherwise an li suffices.
422 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
423 if (Tp->getElementType()->isIntegerTy(1))
424 // Mask operations additionally require an extend and a truncate.
425 return LT.first * (LenCost + 6);
426 return LT.first * (LenCost + 3);
427 }
428 }
429 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
430}
431
433RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
434 unsigned AddressSpace,
436 if (!isLegalMaskedLoadStore(Src, Alignment) ||
438 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
439 CostKind);
440
441 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
442}
443
445 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
446 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
447 bool UseMaskForCond, bool UseMaskForGaps) {
448 auto *FVTy = cast<FixedVectorType>(VecTy);
449 InstructionCost MemCost =
450 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
451 unsigned VF = FVTy->getNumElements() / Factor;
452
453 // The interleaved memory access pass will lower interleaved memory ops (i.e.,
454 // a load or store combined with a specific shuffle) to vlseg/vsseg
455 // intrinsics. In those cases we can treat the access as if it were just one
456 // (legal) memory op.
457 if (!UseMaskForCond && !UseMaskForGaps &&
458 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
459 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
460 // Need to make sure the type hasn't been scalarized.
461 if (LT.second.isFixedLengthVector()) {
462 auto *LegalFVTy = FixedVectorType::get(FVTy->getElementType(),
463 LT.second.getVectorNumElements());
464 // FIXME: We use the memory op cost of the *legalized* type here because
465 // getMemoryOpCost returns a really expensive cost for types like
466 // <6 x i8>, which show up when doing interleaves of Factor=3 etc.
467 // Should the memory op cost of these be cheaper?
468 if (TLI->isLegalInterleavedAccessType(LegalFVTy, Factor, DL)) {
469 InstructionCost LegalMemCost = getMemoryOpCost(
470 Opcode, LegalFVTy, Alignment, AddressSpace, CostKind);
471 return LT.first + LegalMemCost;
472 }
473 }
474 }
475
476 // An interleaved load will look like this for Factor=3:
477 // %wide.vec = load <12 x i32>, ptr %3, align 4
478 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
479 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
480 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
481 if (Opcode == Instruction::Load) {
482 InstructionCost Cost = MemCost;
483 for (unsigned Index : Indices) {
484 FixedVectorType *SubVecTy =
485 FixedVectorType::get(FVTy->getElementType(), VF);
486 auto Mask = createStrideMask(Index, Factor, VF);
487 InstructionCost ShuffleCost =
489 CostKind, 0, nullptr, {});
490 Cost += ShuffleCost;
491 }
492 return Cost;
493 }
494
495 // TODO: Model for NF > 2
496 // We'll need to enhance getShuffleCost to model shuffles that are just
497 // inserts and extracts into subvectors, since they won't have the full cost
498 // of a vrgather.
499 // An interleaved store for 3 vectors of 4 lanes will look like
500 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
501 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
502 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
503 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
504 // store <12 x i32> %interleaved.vec, ptr %10, align 4
505 if (Factor != 2)
506 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
507 Alignment, AddressSpace, CostKind,
508 UseMaskForCond, UseMaskForGaps);
509
510 assert(Opcode == Instruction::Store && "Opcode must be a store");
511 // For an interleaving store of 2 vectors, we perform one large interleaving
512 // shuffle that goes into the wide store
513 auto Mask = createInterleaveMask(VF, Factor);
514 InstructionCost ShuffleCost =
516 CostKind, 0, nullptr, {});
517 return MemCost + ShuffleCost;
518}
519
521 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
522 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
524 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
525 Alignment, CostKind, I);
526
527 if ((Opcode == Instruction::Load &&
528 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
529 (Opcode == Instruction::Store &&
530 !isLegalMaskedScatter(DataTy, Align(Alignment))))
531 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
532 Alignment, CostKind, I);
533
534 // Cost is proportional to the number of memory operations implied. For
535 // scalable vectors, we use an estimate of that number since we don't
536 // know exactly what VL will be.
537 auto &VTy = *cast<VectorType>(DataTy);
538 InstructionCost MemOpCost =
539 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
540 {TTI::OK_AnyValue, TTI::OP_None}, I);
541 unsigned NumLoads = getEstimatedVLFor(&VTy);
542 return NumLoads * MemOpCost;
543}
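// For example, a gather from <8 x i32> is roughly modeled as 8 scalar i32
// loads, since each lane may touch an unrelated address; for scalable types
// the estimated VL plays the role of the element count.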
544
545// Currently, these represent both throughput and codesize costs
546// for the respective intrinsics. The costs in this table are simply
547// instruction counts with the following adjustments made:
548// * One vsetvli is considered free.
550 {Intrinsic::floor, MVT::v2f32, 9},
551 {Intrinsic::floor, MVT::v4f32, 9},
552 {Intrinsic::floor, MVT::v8f32, 9},
553 {Intrinsic::floor, MVT::v16f32, 9},
554 {Intrinsic::floor, MVT::nxv1f32, 9},
555 {Intrinsic::floor, MVT::nxv2f32, 9},
556 {Intrinsic::floor, MVT::nxv4f32, 9},
557 {Intrinsic::floor, MVT::nxv8f32, 9},
558 {Intrinsic::floor, MVT::nxv16f32, 9},
559 {Intrinsic::floor, MVT::v2f64, 9},
560 {Intrinsic::floor, MVT::v4f64, 9},
561 {Intrinsic::floor, MVT::v8f64, 9},
562 {Intrinsic::floor, MVT::v16f64, 9},
563 {Intrinsic::floor, MVT::nxv1f64, 9},
564 {Intrinsic::floor, MVT::nxv2f64, 9},
565 {Intrinsic::floor, MVT::nxv4f64, 9},
566 {Intrinsic::floor, MVT::nxv8f64, 9},
567 {Intrinsic::ceil, MVT::v2f32, 9},
568 {Intrinsic::ceil, MVT::v4f32, 9},
569 {Intrinsic::ceil, MVT::v8f32, 9},
570 {Intrinsic::ceil, MVT::v16f32, 9},
571 {Intrinsic::ceil, MVT::nxv1f32, 9},
572 {Intrinsic::ceil, MVT::nxv2f32, 9},
573 {Intrinsic::ceil, MVT::nxv4f32, 9},
574 {Intrinsic::ceil, MVT::nxv8f32, 9},
575 {Intrinsic::ceil, MVT::nxv16f32, 9},
576 {Intrinsic::ceil, MVT::v2f64, 9},
577 {Intrinsic::ceil, MVT::v4f64, 9},
578 {Intrinsic::ceil, MVT::v8f64, 9},
579 {Intrinsic::ceil, MVT::v16f64, 9},
580 {Intrinsic::ceil, MVT::nxv1f64, 9},
581 {Intrinsic::ceil, MVT::nxv2f64, 9},
582 {Intrinsic::ceil, MVT::nxv4f64, 9},
583 {Intrinsic::ceil, MVT::nxv8f64, 9},
584 {Intrinsic::trunc, MVT::v2f32, 7},
585 {Intrinsic::trunc, MVT::v4f32, 7},
586 {Intrinsic::trunc, MVT::v8f32, 7},
587 {Intrinsic::trunc, MVT::v16f32, 7},
588 {Intrinsic::trunc, MVT::nxv1f32, 7},
589 {Intrinsic::trunc, MVT::nxv2f32, 7},
590 {Intrinsic::trunc, MVT::nxv4f32, 7},
591 {Intrinsic::trunc, MVT::nxv8f32, 7},
592 {Intrinsic::trunc, MVT::nxv16f32, 7},
593 {Intrinsic::trunc, MVT::v2f64, 7},
594 {Intrinsic::trunc, MVT::v4f64, 7},
595 {Intrinsic::trunc, MVT::v8f64, 7},
596 {Intrinsic::trunc, MVT::v16f64, 7},
597 {Intrinsic::trunc, MVT::nxv1f64, 7},
598 {Intrinsic::trunc, MVT::nxv2f64, 7},
599 {Intrinsic::trunc, MVT::nxv4f64, 7},
600 {Intrinsic::trunc, MVT::nxv8f64, 7},
601 {Intrinsic::round, MVT::v2f32, 9},
602 {Intrinsic::round, MVT::v4f32, 9},
603 {Intrinsic::round, MVT::v8f32, 9},
604 {Intrinsic::round, MVT::v16f32, 9},
605 {Intrinsic::round, MVT::nxv1f32, 9},
606 {Intrinsic::round, MVT::nxv2f32, 9},
607 {Intrinsic::round, MVT::nxv4f32, 9},
608 {Intrinsic::round, MVT::nxv8f32, 9},
609 {Intrinsic::round, MVT::nxv16f32, 9},
610 {Intrinsic::round, MVT::v2f64, 9},
611 {Intrinsic::round, MVT::v4f64, 9},
612 {Intrinsic::round, MVT::v8f64, 9},
613 {Intrinsic::round, MVT::v16f64, 9},
614 {Intrinsic::round, MVT::nxv1f64, 9},
615 {Intrinsic::round, MVT::nxv2f64, 9},
616 {Intrinsic::round, MVT::nxv4f64, 9},
617 {Intrinsic::round, MVT::nxv8f64, 9},
618 {Intrinsic::roundeven, MVT::v2f32, 9},
619 {Intrinsic::roundeven, MVT::v4f32, 9},
620 {Intrinsic::roundeven, MVT::v8f32, 9},
621 {Intrinsic::roundeven, MVT::v16f32, 9},
622 {Intrinsic::roundeven, MVT::nxv1f32, 9},
623 {Intrinsic::roundeven, MVT::nxv2f32, 9},
624 {Intrinsic::roundeven, MVT::nxv4f32, 9},
625 {Intrinsic::roundeven, MVT::nxv8f32, 9},
626 {Intrinsic::roundeven, MVT::nxv16f32, 9},
627 {Intrinsic::roundeven, MVT::v2f64, 9},
628 {Intrinsic::roundeven, MVT::v4f64, 9},
629 {Intrinsic::roundeven, MVT::v8f64, 9},
630 {Intrinsic::roundeven, MVT::v16f64, 9},
631 {Intrinsic::roundeven, MVT::nxv1f64, 9},
632 {Intrinsic::roundeven, MVT::nxv2f64, 9},
633 {Intrinsic::roundeven, MVT::nxv4f64, 9},
634 {Intrinsic::roundeven, MVT::nxv8f64, 9},
635 {Intrinsic::rint, MVT::v2f32, 7},
636 {Intrinsic::rint, MVT::v4f32, 7},
637 {Intrinsic::rint, MVT::v8f32, 7},
638 {Intrinsic::rint, MVT::v16f32, 7},
639 {Intrinsic::rint, MVT::nxv1f32, 7},
640 {Intrinsic::rint, MVT::nxv2f32, 7},
641 {Intrinsic::rint, MVT::nxv4f32, 7},
642 {Intrinsic::rint, MVT::nxv8f32, 7},
643 {Intrinsic::rint, MVT::nxv16f32, 7},
644 {Intrinsic::rint, MVT::v2f64, 7},
645 {Intrinsic::rint, MVT::v4f64, 7},
646 {Intrinsic::rint, MVT::v8f64, 7},
647 {Intrinsic::rint, MVT::v16f64, 7},
648 {Intrinsic::rint, MVT::nxv1f64, 7},
649 {Intrinsic::rint, MVT::nxv2f64, 7},
650 {Intrinsic::rint, MVT::nxv4f64, 7},
651 {Intrinsic::rint, MVT::nxv8f64, 7},
652 {Intrinsic::nearbyint, MVT::v2f32, 9},
653 {Intrinsic::nearbyint, MVT::v4f32, 9},
654 {Intrinsic::nearbyint, MVT::v8f32, 9},
655 {Intrinsic::nearbyint, MVT::v16f32, 9},
656 {Intrinsic::nearbyint, MVT::nxv1f32, 9},
657 {Intrinsic::nearbyint, MVT::nxv2f32, 9},
658 {Intrinsic::nearbyint, MVT::nxv4f32, 9},
659 {Intrinsic::nearbyint, MVT::nxv8f32, 9},
660 {Intrinsic::nearbyint, MVT::nxv16f32, 9},
661 {Intrinsic::nearbyint, MVT::v2f64, 9},
662 {Intrinsic::nearbyint, MVT::v4f64, 9},
663 {Intrinsic::nearbyint, MVT::v8f64, 9},
664 {Intrinsic::nearbyint, MVT::v16f64, 9},
665 {Intrinsic::nearbyint, MVT::nxv1f64, 9},
666 {Intrinsic::nearbyint, MVT::nxv2f64, 9},
667 {Intrinsic::nearbyint, MVT::nxv4f64, 9},
668 {Intrinsic::nearbyint, MVT::nxv8f64, 9},
669 {Intrinsic::bswap, MVT::v2i16, 3},
670 {Intrinsic::bswap, MVT::v4i16, 3},
671 {Intrinsic::bswap, MVT::v8i16, 3},
672 {Intrinsic::bswap, MVT::v16i16, 3},
673 {Intrinsic::bswap, MVT::nxv1i16, 3},
674 {Intrinsic::bswap, MVT::nxv2i16, 3},
675 {Intrinsic::bswap, MVT::nxv4i16, 3},
676 {Intrinsic::bswap, MVT::nxv8i16, 3},
677 {Intrinsic::bswap, MVT::nxv16i16, 3},
678 {Intrinsic::bswap, MVT::v2i32, 12},
679 {Intrinsic::bswap, MVT::v4i32, 12},
680 {Intrinsic::bswap, MVT::v8i32, 12},
681 {Intrinsic::bswap, MVT::v16i32, 12},
682 {Intrinsic::bswap, MVT::nxv1i32, 12},
683 {Intrinsic::bswap, MVT::nxv2i32, 12},
684 {Intrinsic::bswap, MVT::nxv4i32, 12},
685 {Intrinsic::bswap, MVT::nxv8i32, 12},
686 {Intrinsic::bswap, MVT::nxv16i32, 12},
687 {Intrinsic::bswap, MVT::v2i64, 31},
688 {Intrinsic::bswap, MVT::v4i64, 31},
689 {Intrinsic::bswap, MVT::v8i64, 31},
690 {Intrinsic::bswap, MVT::v16i64, 31},
691 {Intrinsic::bswap, MVT::nxv1i64, 31},
692 {Intrinsic::bswap, MVT::nxv2i64, 31},
693 {Intrinsic::bswap, MVT::nxv4i64, 31},
694 {Intrinsic::bswap, MVT::nxv8i64, 31},
695 {Intrinsic::vp_bswap, MVT::v2i16, 3},
696 {Intrinsic::vp_bswap, MVT::v4i16, 3},
697 {Intrinsic::vp_bswap, MVT::v8i16, 3},
698 {Intrinsic::vp_bswap, MVT::v16i16, 3},
699 {Intrinsic::vp_bswap, MVT::nxv1i16, 3},
700 {Intrinsic::vp_bswap, MVT::nxv2i16, 3},
701 {Intrinsic::vp_bswap, MVT::nxv4i16, 3},
702 {Intrinsic::vp_bswap, MVT::nxv8i16, 3},
703 {Intrinsic::vp_bswap, MVT::nxv16i16, 3},
704 {Intrinsic::vp_bswap, MVT::v2i32, 12},
705 {Intrinsic::vp_bswap, MVT::v4i32, 12},
706 {Intrinsic::vp_bswap, MVT::v8i32, 12},
707 {Intrinsic::vp_bswap, MVT::v16i32, 12},
708 {Intrinsic::vp_bswap, MVT::nxv1i32, 12},
709 {Intrinsic::vp_bswap, MVT::nxv2i32, 12},
710 {Intrinsic::vp_bswap, MVT::nxv4i32, 12},
711 {Intrinsic::vp_bswap, MVT::nxv8i32, 12},
712 {Intrinsic::vp_bswap, MVT::nxv16i32, 12},
713 {Intrinsic::vp_bswap, MVT::v2i64, 31},
714 {Intrinsic::vp_bswap, MVT::v4i64, 31},
715 {Intrinsic::vp_bswap, MVT::v8i64, 31},
716 {Intrinsic::vp_bswap, MVT::v16i64, 31},
717 {Intrinsic::vp_bswap, MVT::nxv1i64, 31},
718 {Intrinsic::vp_bswap, MVT::nxv2i64, 31},
719 {Intrinsic::vp_bswap, MVT::nxv4i64, 31},
720 {Intrinsic::vp_bswap, MVT::nxv8i64, 31},
721 {Intrinsic::vp_fshl, MVT::v2i8, 7},
722 {Intrinsic::vp_fshl, MVT::v4i8, 7},
723 {Intrinsic::vp_fshl, MVT::v8i8, 7},
724 {Intrinsic::vp_fshl, MVT::v16i8, 7},
725 {Intrinsic::vp_fshl, MVT::nxv1i8, 7},
726 {Intrinsic::vp_fshl, MVT::nxv2i8, 7},
727 {Intrinsic::vp_fshl, MVT::nxv4i8, 7},
728 {Intrinsic::vp_fshl, MVT::nxv8i8, 7},
729 {Intrinsic::vp_fshl, MVT::nxv16i8, 7},
730 {Intrinsic::vp_fshl, MVT::nxv32i8, 7},
731 {Intrinsic::vp_fshl, MVT::nxv64i8, 7},
732 {Intrinsic::vp_fshl, MVT::v2i16, 7},
733 {Intrinsic::vp_fshl, MVT::v4i16, 7},
734 {Intrinsic::vp_fshl, MVT::v8i16, 7},
735 {Intrinsic::vp_fshl, MVT::v16i16, 7},
736 {Intrinsic::vp_fshl, MVT::nxv1i16, 7},
737 {Intrinsic::vp_fshl, MVT::nxv2i16, 7},
738 {Intrinsic::vp_fshl, MVT::nxv4i16, 7},
739 {Intrinsic::vp_fshl, MVT::nxv8i16, 7},
740 {Intrinsic::vp_fshl, MVT::nxv16i16, 7},
741 {Intrinsic::vp_fshl, MVT::nxv32i16, 7},
742 {Intrinsic::vp_fshl, MVT::v2i32, 7},
743 {Intrinsic::vp_fshl, MVT::v4i32, 7},
744 {Intrinsic::vp_fshl, MVT::v8i32, 7},
745 {Intrinsic::vp_fshl, MVT::v16i32, 7},
746 {Intrinsic::vp_fshl, MVT::nxv1i32, 7},
747 {Intrinsic::vp_fshl, MVT::nxv2i32, 7},
748 {Intrinsic::vp_fshl, MVT::nxv4i32, 7},
749 {Intrinsic::vp_fshl, MVT::nxv8i32, 7},
750 {Intrinsic::vp_fshl, MVT::nxv16i32, 7},
751 {Intrinsic::vp_fshl, MVT::v2i64, 7},
752 {Intrinsic::vp_fshl, MVT::v4i64, 7},
753 {Intrinsic::vp_fshl, MVT::v8i64, 7},
754 {Intrinsic::vp_fshl, MVT::v16i64, 7},
755 {Intrinsic::vp_fshl, MVT::nxv1i64, 7},
756 {Intrinsic::vp_fshl, MVT::nxv2i64, 7},
757 {Intrinsic::vp_fshl, MVT::nxv4i64, 7},
758 {Intrinsic::vp_fshl, MVT::nxv8i64, 7},
759 {Intrinsic::vp_fshr, MVT::v2i8, 7},
760 {Intrinsic::vp_fshr, MVT::v4i8, 7},
761 {Intrinsic::vp_fshr, MVT::v8i8, 7},
762 {Intrinsic::vp_fshr, MVT::v16i8, 7},
763 {Intrinsic::vp_fshr, MVT::nxv1i8, 7},
764 {Intrinsic::vp_fshr, MVT::nxv2i8, 7},
765 {Intrinsic::vp_fshr, MVT::nxv4i8, 7},
766 {Intrinsic::vp_fshr, MVT::nxv8i8, 7},
767 {Intrinsic::vp_fshr, MVT::nxv16i8, 7},
768 {Intrinsic::vp_fshr, MVT::nxv32i8, 7},
769 {Intrinsic::vp_fshr, MVT::nxv64i8, 7},
770 {Intrinsic::vp_fshr, MVT::v2i16, 7},
771 {Intrinsic::vp_fshr, MVT::v4i16, 7},
772 {Intrinsic::vp_fshr, MVT::v8i16, 7},
773 {Intrinsic::vp_fshr, MVT::v16i16, 7},
774 {Intrinsic::vp_fshr, MVT::nxv1i16, 7},
775 {Intrinsic::vp_fshr, MVT::nxv2i16, 7},
776 {Intrinsic::vp_fshr, MVT::nxv4i16, 7},
777 {Intrinsic::vp_fshr, MVT::nxv8i16, 7},
778 {Intrinsic::vp_fshr, MVT::nxv16i16, 7},
779 {Intrinsic::vp_fshr, MVT::nxv32i16, 7},
780 {Intrinsic::vp_fshr, MVT::v2i32, 7},
781 {Intrinsic::vp_fshr, MVT::v4i32, 7},
782 {Intrinsic::vp_fshr, MVT::v8i32, 7},
783 {Intrinsic::vp_fshr, MVT::v16i32, 7},
784 {Intrinsic::vp_fshr, MVT::nxv1i32, 7},
785 {Intrinsic::vp_fshr, MVT::nxv2i32, 7},
786 {Intrinsic::vp_fshr, MVT::nxv4i32, 7},
787 {Intrinsic::vp_fshr, MVT::nxv8i32, 7},
788 {Intrinsic::vp_fshr, MVT::nxv16i32, 7},
789 {Intrinsic::vp_fshr, MVT::v2i64, 7},
790 {Intrinsic::vp_fshr, MVT::v4i64, 7},
791 {Intrinsic::vp_fshr, MVT::v8i64, 7},
792 {Intrinsic::vp_fshr, MVT::v16i64, 7},
793 {Intrinsic::vp_fshr, MVT::nxv1i64, 7},
794 {Intrinsic::vp_fshr, MVT::nxv2i64, 7},
795 {Intrinsic::vp_fshr, MVT::nxv4i64, 7},
796 {Intrinsic::vp_fshr, MVT::nxv8i64, 7},
797 {Intrinsic::bitreverse, MVT::v2i8, 17},
798 {Intrinsic::bitreverse, MVT::v4i8, 17},
799 {Intrinsic::bitreverse, MVT::v8i8, 17},
800 {Intrinsic::bitreverse, MVT::v16i8, 17},
801 {Intrinsic::bitreverse, MVT::nxv1i8, 17},
802 {Intrinsic::bitreverse, MVT::nxv2i8, 17},
803 {Intrinsic::bitreverse, MVT::nxv4i8, 17},
804 {Intrinsic::bitreverse, MVT::nxv8i8, 17},
805 {Intrinsic::bitreverse, MVT::nxv16i8, 17},
806 {Intrinsic::bitreverse, MVT::v2i16, 24},
807 {Intrinsic::bitreverse, MVT::v4i16, 24},
808 {Intrinsic::bitreverse, MVT::v8i16, 24},
809 {Intrinsic::bitreverse, MVT::v16i16, 24},
810 {Intrinsic::bitreverse, MVT::nxv1i16, 24},
811 {Intrinsic::bitreverse, MVT::nxv2i16, 24},
812 {Intrinsic::bitreverse, MVT::nxv4i16, 24},
813 {Intrinsic::bitreverse, MVT::nxv8i16, 24},
814 {Intrinsic::bitreverse, MVT::nxv16i16, 24},
815 {Intrinsic::bitreverse, MVT::v2i32, 33},
816 {Intrinsic::bitreverse, MVT::v4i32, 33},
817 {Intrinsic::bitreverse, MVT::v8i32, 33},
818 {Intrinsic::bitreverse, MVT::v16i32, 33},
819 {Intrinsic::bitreverse, MVT::nxv1i32, 33},
820 {Intrinsic::bitreverse, MVT::nxv2i32, 33},
821 {Intrinsic::bitreverse, MVT::nxv4i32, 33},
822 {Intrinsic::bitreverse, MVT::nxv8i32, 33},
823 {Intrinsic::bitreverse, MVT::nxv16i32, 33},
824 {Intrinsic::bitreverse, MVT::v2i64, 52},
825 {Intrinsic::bitreverse, MVT::v4i64, 52},
826 {Intrinsic::bitreverse, MVT::v8i64, 52},
827 {Intrinsic::bitreverse, MVT::v16i64, 52},
828 {Intrinsic::bitreverse, MVT::nxv1i64, 52},
829 {Intrinsic::bitreverse, MVT::nxv2i64, 52},
830 {Intrinsic::bitreverse, MVT::nxv4i64, 52},
831 {Intrinsic::bitreverse, MVT::nxv8i64, 52},
832 {Intrinsic::vp_bitreverse, MVT::v2i8, 17},
833 {Intrinsic::vp_bitreverse, MVT::v4i8, 17},
834 {Intrinsic::vp_bitreverse, MVT::v8i8, 17},
835 {Intrinsic::vp_bitreverse, MVT::v16i8, 17},
836 {Intrinsic::vp_bitreverse, MVT::nxv1i8, 17},
837 {Intrinsic::vp_bitreverse, MVT::nxv2i8, 17},
838 {Intrinsic::vp_bitreverse, MVT::nxv4i8, 17},
839 {Intrinsic::vp_bitreverse, MVT::nxv8i8, 17},
840 {Intrinsic::vp_bitreverse, MVT::nxv16i8, 17},
841 {Intrinsic::vp_bitreverse, MVT::v2i16, 24},
842 {Intrinsic::vp_bitreverse, MVT::v4i16, 24},
843 {Intrinsic::vp_bitreverse, MVT::v8i16, 24},
844 {Intrinsic::vp_bitreverse, MVT::v16i16, 24},
845 {Intrinsic::vp_bitreverse, MVT::nxv1i16, 24},
846 {Intrinsic::vp_bitreverse, MVT::nxv2i16, 24},
847 {Intrinsic::vp_bitreverse, MVT::nxv4i16, 24},
848 {Intrinsic::vp_bitreverse, MVT::nxv8i16, 24},
849 {Intrinsic::vp_bitreverse, MVT::nxv16i16, 24},
850 {Intrinsic::vp_bitreverse, MVT::v2i32, 33},
851 {Intrinsic::vp_bitreverse, MVT::v4i32, 33},
852 {Intrinsic::vp_bitreverse, MVT::v8i32, 33},
853 {Intrinsic::vp_bitreverse, MVT::v16i32, 33},
854 {Intrinsic::vp_bitreverse, MVT::nxv1i32, 33},
855 {Intrinsic::vp_bitreverse, MVT::nxv2i32, 33},
856 {Intrinsic::vp_bitreverse, MVT::nxv4i32, 33},
857 {Intrinsic::vp_bitreverse, MVT::nxv8i32, 33},
858 {Intrinsic::vp_bitreverse, MVT::nxv16i32, 33},
859 {Intrinsic::vp_bitreverse, MVT::v2i64, 52},
860 {Intrinsic::vp_bitreverse, MVT::v4i64, 52},
861 {Intrinsic::vp_bitreverse, MVT::v8i64, 52},
862 {Intrinsic::vp_bitreverse, MVT::v16i64, 52},
863 {Intrinsic::vp_bitreverse, MVT::nxv1i64, 52},
864 {Intrinsic::vp_bitreverse, MVT::nxv2i64, 52},
865 {Intrinsic::vp_bitreverse, MVT::nxv4i64, 52},
866 {Intrinsic::vp_bitreverse, MVT::nxv8i64, 52},
867 {Intrinsic::ctpop, MVT::v2i8, 12},
868 {Intrinsic::ctpop, MVT::v4i8, 12},
869 {Intrinsic::ctpop, MVT::v8i8, 12},
870 {Intrinsic::ctpop, MVT::v16i8, 12},
871 {Intrinsic::ctpop, MVT::nxv1i8, 12},
872 {Intrinsic::ctpop, MVT::nxv2i8, 12},
873 {Intrinsic::ctpop, MVT::nxv4i8, 12},
874 {Intrinsic::ctpop, MVT::nxv8i8, 12},
875 {Intrinsic::ctpop, MVT::nxv16i8, 12},
876 {Intrinsic::ctpop, MVT::v2i16, 19},
877 {Intrinsic::ctpop, MVT::v4i16, 19},
878 {Intrinsic::ctpop, MVT::v8i16, 19},
879 {Intrinsic::ctpop, MVT::v16i16, 19},
880 {Intrinsic::ctpop, MVT::nxv1i16, 19},
881 {Intrinsic::ctpop, MVT::nxv2i16, 19},
882 {Intrinsic::ctpop, MVT::nxv4i16, 19},
883 {Intrinsic::ctpop, MVT::nxv8i16, 19},
884 {Intrinsic::ctpop, MVT::nxv16i16, 19},
885 {Intrinsic::ctpop, MVT::v2i32, 20},
886 {Intrinsic::ctpop, MVT::v4i32, 20},
887 {Intrinsic::ctpop, MVT::v8i32, 20},
888 {Intrinsic::ctpop, MVT::v16i32, 20},
889 {Intrinsic::ctpop, MVT::nxv1i32, 20},
890 {Intrinsic::ctpop, MVT::nxv2i32, 20},
891 {Intrinsic::ctpop, MVT::nxv4i32, 20},
892 {Intrinsic::ctpop, MVT::nxv8i32, 20},
893 {Intrinsic::ctpop, MVT::nxv16i32, 20},
894 {Intrinsic::ctpop, MVT::v2i64, 21},
895 {Intrinsic::ctpop, MVT::v4i64, 21},
896 {Intrinsic::ctpop, MVT::v8i64, 21},
897 {Intrinsic::ctpop, MVT::v16i64, 21},
898 {Intrinsic::ctpop, MVT::nxv1i64, 21},
899 {Intrinsic::ctpop, MVT::nxv2i64, 21},
900 {Intrinsic::ctpop, MVT::nxv4i64, 21},
901 {Intrinsic::ctpop, MVT::nxv8i64, 21},
902 {Intrinsic::vp_ctpop, MVT::v2i8, 12},
903 {Intrinsic::vp_ctpop, MVT::v4i8, 12},
904 {Intrinsic::vp_ctpop, MVT::v8i8, 12},
905 {Intrinsic::vp_ctpop, MVT::v16i8, 12},
906 {Intrinsic::vp_ctpop, MVT::nxv1i8, 12},
907 {Intrinsic::vp_ctpop, MVT::nxv2i8, 12},
908 {Intrinsic::vp_ctpop, MVT::nxv4i8, 12},
909 {Intrinsic::vp_ctpop, MVT::nxv8i8, 12},
910 {Intrinsic::vp_ctpop, MVT::nxv16i8, 12},
911 {Intrinsic::vp_ctpop, MVT::v2i16, 19},
912 {Intrinsic::vp_ctpop, MVT::v4i16, 19},
913 {Intrinsic::vp_ctpop, MVT::v8i16, 19},
914 {Intrinsic::vp_ctpop, MVT::v16i16, 19},
915 {Intrinsic::vp_ctpop, MVT::nxv1i16, 19},
916 {Intrinsic::vp_ctpop, MVT::nxv2i16, 19},
917 {Intrinsic::vp_ctpop, MVT::nxv4i16, 19},
918 {Intrinsic::vp_ctpop, MVT::nxv8i16, 19},
919 {Intrinsic::vp_ctpop, MVT::nxv16i16, 19},
920 {Intrinsic::vp_ctpop, MVT::v2i32, 20},
921 {Intrinsic::vp_ctpop, MVT::v4i32, 20},
922 {Intrinsic::vp_ctpop, MVT::v8i32, 20},
923 {Intrinsic::vp_ctpop, MVT::v16i32, 20},
924 {Intrinsic::vp_ctpop, MVT::nxv1i32, 20},
925 {Intrinsic::vp_ctpop, MVT::nxv2i32, 20},
926 {Intrinsic::vp_ctpop, MVT::nxv4i32, 20},
927 {Intrinsic::vp_ctpop, MVT::nxv8i32, 20},
928 {Intrinsic::vp_ctpop, MVT::nxv16i32, 20},
929 {Intrinsic::vp_ctpop, MVT::v2i64, 21},
930 {Intrinsic::vp_ctpop, MVT::v4i64, 21},
931 {Intrinsic::vp_ctpop, MVT::v8i64, 21},
932 {Intrinsic::vp_ctpop, MVT::v16i64, 21},
933 {Intrinsic::vp_ctpop, MVT::nxv1i64, 21},
934 {Intrinsic::vp_ctpop, MVT::nxv2i64, 21},
935 {Intrinsic::vp_ctpop, MVT::nxv4i64, 21},
936 {Intrinsic::vp_ctpop, MVT::nxv8i64, 21},
937 {Intrinsic::vp_ctlz, MVT::v2i8, 19},
938 {Intrinsic::vp_ctlz, MVT::v4i8, 19},
939 {Intrinsic::vp_ctlz, MVT::v8i8, 19},
940 {Intrinsic::vp_ctlz, MVT::v16i8, 19},
941 {Intrinsic::vp_ctlz, MVT::nxv1i8, 19},
942 {Intrinsic::vp_ctlz, MVT::nxv2i8, 19},
943 {Intrinsic::vp_ctlz, MVT::nxv4i8, 19},
944 {Intrinsic::vp_ctlz, MVT::nxv8i8, 19},
945 {Intrinsic::vp_ctlz, MVT::nxv16i8, 19},
946 {Intrinsic::vp_ctlz, MVT::nxv32i8, 19},
947 {Intrinsic::vp_ctlz, MVT::nxv64i8, 19},
948 {Intrinsic::vp_ctlz, MVT::v2i16, 28},
949 {Intrinsic::vp_ctlz, MVT::v4i16, 28},
950 {Intrinsic::vp_ctlz, MVT::v8i16, 28},
951 {Intrinsic::vp_ctlz, MVT::v16i16, 28},
952 {Intrinsic::vp_ctlz, MVT::nxv1i16, 28},
953 {Intrinsic::vp_ctlz, MVT::nxv2i16, 28},
954 {Intrinsic::vp_ctlz, MVT::nxv4i16, 28},
955 {Intrinsic::vp_ctlz, MVT::nxv8i16, 28},
956 {Intrinsic::vp_ctlz, MVT::nxv16i16, 28},
957 {Intrinsic::vp_ctlz, MVT::nxv32i16, 28},
958 {Intrinsic::vp_ctlz, MVT::v2i32, 31},
959 {Intrinsic::vp_ctlz, MVT::v4i32, 31},
960 {Intrinsic::vp_ctlz, MVT::v8i32, 31},
961 {Intrinsic::vp_ctlz, MVT::v16i32, 31},
962 {Intrinsic::vp_ctlz, MVT::nxv1i32, 31},
963 {Intrinsic::vp_ctlz, MVT::nxv2i32, 31},
964 {Intrinsic::vp_ctlz, MVT::nxv4i32, 31},
965 {Intrinsic::vp_ctlz, MVT::nxv8i32, 31},
966 {Intrinsic::vp_ctlz, MVT::nxv16i32, 31},
967 {Intrinsic::vp_ctlz, MVT::v2i64, 35},
968 {Intrinsic::vp_ctlz, MVT::v4i64, 35},
969 {Intrinsic::vp_ctlz, MVT::v8i64, 35},
970 {Intrinsic::vp_ctlz, MVT::v16i64, 35},
971 {Intrinsic::vp_ctlz, MVT::nxv1i64, 35},
972 {Intrinsic::vp_ctlz, MVT::nxv2i64, 35},
973 {Intrinsic::vp_ctlz, MVT::nxv4i64, 35},
974 {Intrinsic::vp_ctlz, MVT::nxv8i64, 35},
975 {Intrinsic::vp_cttz, MVT::v2i8, 16},
976 {Intrinsic::vp_cttz, MVT::v4i8, 16},
977 {Intrinsic::vp_cttz, MVT::v8i8, 16},
978 {Intrinsic::vp_cttz, MVT::v16i8, 16},
979 {Intrinsic::vp_cttz, MVT::nxv1i8, 16},
980 {Intrinsic::vp_cttz, MVT::nxv2i8, 16},
981 {Intrinsic::vp_cttz, MVT::nxv4i8, 16},
982 {Intrinsic::vp_cttz, MVT::nxv8i8, 16},
983 {Intrinsic::vp_cttz, MVT::nxv16i8, 16},
984 {Intrinsic::vp_cttz, MVT::nxv32i8, 16},
985 {Intrinsic::vp_cttz, MVT::nxv64i8, 16},
986 {Intrinsic::vp_cttz, MVT::v2i16, 23},
987 {Intrinsic::vp_cttz, MVT::v4i16, 23},
988 {Intrinsic::vp_cttz, MVT::v8i16, 23},
989 {Intrinsic::vp_cttz, MVT::v16i16, 23},
990 {Intrinsic::vp_cttz, MVT::nxv1i16, 23},
991 {Intrinsic::vp_cttz, MVT::nxv2i16, 23},
992 {Intrinsic::vp_cttz, MVT::nxv4i16, 23},
993 {Intrinsic::vp_cttz, MVT::nxv8i16, 23},
994 {Intrinsic::vp_cttz, MVT::nxv16i16, 23},
995 {Intrinsic::vp_cttz, MVT::nxv32i16, 23},
996 {Intrinsic::vp_cttz, MVT::v2i32, 24},
997 {Intrinsic::vp_cttz, MVT::v4i32, 24},
998 {Intrinsic::vp_cttz, MVT::v8i32, 24},
999 {Intrinsic::vp_cttz, MVT::v16i32, 24},
1000 {Intrinsic::vp_cttz, MVT::nxv1i32, 24},
1001 {Intrinsic::vp_cttz, MVT::nxv2i32, 24},
1002 {Intrinsic::vp_cttz, MVT::nxv4i32, 24},
1003 {Intrinsic::vp_cttz, MVT::nxv8i32, 24},
1004 {Intrinsic::vp_cttz, MVT::nxv16i32, 24},
1005 {Intrinsic::vp_cttz, MVT::v2i64, 25},
1006 {Intrinsic::vp_cttz, MVT::v4i64, 25},
1007 {Intrinsic::vp_cttz, MVT::v8i64, 25},
1008 {Intrinsic::vp_cttz, MVT::v16i64, 25},
1009 {Intrinsic::vp_cttz, MVT::nxv1i64, 25},
1010 {Intrinsic::vp_cttz, MVT::nxv2i64, 25},
1011 {Intrinsic::vp_cttz, MVT::nxv4i64, 25},
1012 {Intrinsic::vp_cttz, MVT::nxv8i64, 25},
1013};
1014
1016 switch (ID) {
1017#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
1018 case Intrinsic::VPID: \
1019 return ISD::VPSD;
1020#include "llvm/IR/VPIntrinsics.def"
1021#undef HELPER_MAP_VPID_TO_VPSD
1022 }
1023 return ISD::DELETED_NODE;
1024}
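// For example, this is expected to map Intrinsic::vp_ceil to ISD::VP_FCEIL;
// any VP intrinsic without a corresponding VP SDNode falls through and
// returns ISD::DELETED_NODE.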
1025
1029 auto *RetTy = ICA.getReturnType();
1030 switch (ICA.getID()) {
1031 case Intrinsic::ceil:
1032 case Intrinsic::floor:
1033 case Intrinsic::trunc:
1034 case Intrinsic::rint:
1035 case Intrinsic::round:
1036 case Intrinsic::roundeven: {
1037 // These all use the same code.
1038 auto LT = getTypeLegalizationCost(RetTy);
1039 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1040 return LT.first * 8;
1041 break;
1042 }
1043 case Intrinsic::umin:
1044 case Intrinsic::umax:
1045 case Intrinsic::smin:
1046 case Intrinsic::smax: {
1047 auto LT = getTypeLegalizationCost(RetTy);
1048 if ((ST->hasVInstructions() && LT.second.isVector()) ||
1049 (LT.second.isScalarInteger() && ST->hasStdExtZbb()))
1050 return LT.first;
1051 break;
1052 }
1053 case Intrinsic::sadd_sat:
1054 case Intrinsic::ssub_sat:
1055 case Intrinsic::uadd_sat:
1056 case Intrinsic::usub_sat:
1057 case Intrinsic::fabs:
1058 case Intrinsic::sqrt: {
1059 auto LT = getTypeLegalizationCost(RetTy);
1060 if (ST->hasVInstructions() && LT.second.isVector())
1061 return LT.first;
1062 break;
1063 }
1064 case Intrinsic::abs: {
1065 auto LT = getTypeLegalizationCost(RetTy);
1066 if (ST->hasVInstructions() && LT.second.isVector()) {
1067 // vrsub.vi v10, v8, 0
1068 // vmax.vv v8, v8, v10
1069 return LT.first * 2;
1070 }
1071 break;
1072 }
1073 // TODO: add more intrinsics
1074 case Intrinsic::experimental_stepvector: {
1075 unsigned Cost = 1; // vid
1076 auto LT = getTypeLegalizationCost(RetTy);
1077 return Cost + (LT.first - 1);
1078 }
1079 case Intrinsic::vp_rint: {
1080 // The RISC-V target uses at least 5 instructions to lower rounding intrinsics.
1081 unsigned Cost = 5;
1082 auto LT = getTypeLegalizationCost(RetTy);
1083 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
1084 return Cost * LT.first;
1085 break;
1086 }
1087 case Intrinsic::vp_nearbyint: {
1088 // One more read and one more write of fflags than vp_rint.
1089 unsigned Cost = 7;
1090 auto LT = getTypeLegalizationCost(RetTy);
1091 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
1092 return Cost * LT.first;
1093 break;
1094 }
1095 case Intrinsic::vp_ceil:
1096 case Intrinsic::vp_floor:
1097 case Intrinsic::vp_round:
1098 case Intrinsic::vp_roundeven:
1099 case Intrinsic::vp_roundtozero: {
1100 // Rounding with a static rounding mode needs two more instructions than
1101 // vp_rint to swap/write FRM.
1102 unsigned Cost = 7;
1103 auto LT = getTypeLegalizationCost(RetTy);
1104 unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
1105 if (TLI->isOperationCustom(VPISD, LT.second))
1106 return Cost * LT.first;
1107 break;
1108 }
1109 }
1110
1111 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1112 auto LT = getTypeLegalizationCost(RetTy);
1113 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1114 ICA.getID(), LT.second))
1115 return LT.first * Entry->Cost;
1116 }
1117
1119}
1120
1122 Type *Src,
1125 const Instruction *I) {
1126 if (isa<VectorType>(Dst) && isa<VectorType>(Src)) {
1127 // FIXME: Need to compute legalizing cost for illegal types.
1128 if (!isTypeLegal(Src) || !isTypeLegal(Dst))
1129 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1130
1131 // Skip if element size of Dst or Src is bigger than ELEN.
1132 if (Src->getScalarSizeInBits() > ST->getELEN() ||
1133 Dst->getScalarSizeInBits() > ST->getELEN())
1134 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1135
1136 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1137 assert(ISD && "Invalid opcode");
1138
1139 // FIXME: Need to consider vsetvli and lmul.
1140 int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
1141 (int)Log2_32(Src->getScalarSizeInBits());
1142 switch (ISD) {
1143 case ISD::SIGN_EXTEND:
1144 case ISD::ZERO_EXTEND:
1145 if (Src->getScalarSizeInBits() == 1) {
1146 // We do not use vsext/vzext to extend from a mask vector.
1147 // Instead we use the following instructions to extend from a mask vector:
1148 // vmv.v.i v8, 0
1149 // vmerge.vim v8, v8, -1, v0
1150 return 2;
1151 }
1152 return 1;
1153 case ISD::TRUNCATE:
1154 if (Dst->getScalarSizeInBits() == 1) {
1155 // We do not use a series of vncvt instructions to truncate to a mask
1156 // vector, so we cannot use PowDiff to calculate the cost.
1157 // Instead we use the following instructions to truncate to a mask vector:
1158 // vand.vi v8, v8, 1
1159 // vmsne.vi v0, v8, 0
1160 return 2;
1161 }
1162 [[fallthrough]];
1163 case ISD::FP_EXTEND:
1164 case ISD::FP_ROUND:
1165 // Counts of narrow/widen instructions.
1166 return std::abs(PowDiff);
1167 case ISD::FP_TO_SINT:
1168 case ISD::FP_TO_UINT:
1169 case ISD::SINT_TO_FP:
1170 case ISD::UINT_TO_FP:
1171 if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
1172 // The cost of converting from or to a mask vector is different from the
1173 // other cases, so we cannot use PowDiff to calculate it.
1174 // For mask vector to fp, we use the following instructions:
1175 // vmv.v.i v8, 0
1176 // vmerge.vim v8, v8, -1, v0
1177 // vfcvt.f.x.v v8, v8
1178
1179 // And for fp vector to mask, we use:
1180 // vfncvt.rtz.x.f.w v9, v8
1181 // vand.vi v8, v9, 1
1182 // vmsne.vi v0, v8, 0
1183 return 3;
1184 }
1185 if (std::abs(PowDiff) <= 1)
1186 return 1;
1187 // The backend can lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
1188 // so it only needs two conversions.
1189 if (Src->isIntOrIntVectorTy())
1190 return 2;
1191 // Counts of narrow/widen instructions.
1192 return std::abs(PowDiff);
1193 }
1194 }
1195 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1196}
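// A couple of worked examples of the model above: sext <4 x i8> to <4 x i32>
// is a single vsext.vf4, so SIGN_EXTEND returns 1; fptosi <4 x double> to
// <4 x i8> has |PowDiff| = 3 and is costed as 3 narrowing steps.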
1197
1198unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
1199 if (isa<ScalableVectorType>(Ty)) {
1200 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1201 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1202 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1203 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1204 }
1205 return cast<FixedVectorType>(Ty)->getNumElements();
1206}
1207
1210 bool IsUnsigned, FastMathFlags FMF,
1212 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1213 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, FMF, CostKind);
1214
1215 // Skip if scalar size of Ty is bigger than ELEN.
1216 if (Ty->getScalarSizeInBits() > ST->getELEN())
1217 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, FMF, CostKind);
1218
1219 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1220 if (Ty->getElementType()->isIntegerTy(1))
1221 // vcpop sequences, see vreduction-mask.ll. umax and smin actually only
1222 // cost 2, but we don't have enough info here, so we slightly over-cost.
1223 return (LT.first - 1) + 3;
1224
1225 // An IR reduction is composed of two vmv instructions and one RVV reduction instruction.
1226 InstructionCost BaseCost = 2;
1227
1229 return (LT.first - 1) + BaseCost;
1230
1231 unsigned VL = getEstimatedVLFor(Ty);
1232 return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
1233}
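// For example, an smax reduction of <16 x i32> that legalizes in one step is
// typically costed as BaseCost (2) + Log2_32_Ceil(16) = 6, while an i1 vector
// reduction uses the flat vcpop-based estimate of 3 per the comment above.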
1234
1237 std::optional<FastMathFlags> FMF,
1239 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1240 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1241
1242 // Skip if scalar size of Ty is bigger than ELEN.
1243 if (Ty->getScalarSizeInBits() > ST->getELEN())
1244 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1245
1246 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1247 assert(ISD && "Invalid opcode");
1248
1249 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1250 ISD != ISD::FADD)
1251 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1252
1253 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1254 if (Ty->getElementType()->isIntegerTy(1))
1255 // vcpop sequences, see vreduction-mask.ll
1256 return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);
1257
1258 // An IR reduction is composed of two vmv instructions and one RVV reduction instruction.
1259 InstructionCost BaseCost = 2;
1260
1262 return (LT.first - 1) + BaseCost;
1263
1264 unsigned VL = getEstimatedVLFor(Ty);
1266 return (LT.first - 1) + BaseCost + VL;
1267 return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
1268}
1269
1271 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1273 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1274 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1275 FMF, CostKind);
1276
1277 // Skip if scalar size of ResTy is bigger than ELEN.
1278 if (ResTy->getScalarSizeInBits() > ST->getELEN())
1279 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1280 FMF, CostKind);
1281
1282 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1283 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1284 FMF, CostKind);
1285
1286 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1287
1288 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1289 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1290 FMF, CostKind);
1291
1292 return (LT.first - 1) +
1293 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1294}
1295
1297 TTI::OperandValueInfo OpInfo,
1299 assert(OpInfo.isConstant() && "non constant operand?");
1300 if (!isa<VectorType>(Ty))
1301 // FIXME: We need to account for immediate materialization here, but doing
1302 // a decent job requires more knowledge about the immediate than we
1303 // currently have here.
1304 return 0;
1305
1306 if (OpInfo.isUniform())
1307 // vmv.x.i, vmv.v.x, or vfmv.v.f
1308 // We ignore the cost of the scalar constant materialization to be consistent
1309 // with how we treat scalar constants themselves just above.
1310 return 1;
1311
1312 return getConstantPoolLoadCost(Ty, CostKind);
1313}
1314
1315
1317 MaybeAlign Alignment,
1318 unsigned AddressSpace,
1320 TTI::OperandValueInfo OpInfo,
1321 const Instruction *I) {
1322 EVT VT = TLI->getValueType(DL, Src, true);
1323 // Type legalization can't handle structs
1324 if (VT == MVT::Other)
1325 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1326 CostKind, OpInfo, I);
1327
1329 if (Opcode == Instruction::Store && OpInfo.isConstant())
1330 Cost += getStoreImmCost(Src, OpInfo, CostKind);
1331 InstructionCost BaseCost =
1332 BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1333 CostKind, OpInfo, I);
1334 // Assume memory op costs scale with the number of vector registers
1335 // possibly accessed by the instruction. Note that BasicTTI already
1336 // handles the LT.first term for us.
1337 if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1338 LT.second.isVector())
1339 BaseCost *= getLMULCost(LT.second);
1340 return Cost + BaseCost;
1341
1342}
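// For example, a load of <vscale x 8 x i32> legalizes to an LMUL=4 register
// group, so the base memory-op cost is scaled by 4 here to reflect the four
// vector registers being touched.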
1343
1345 Type *CondTy,
1346 CmpInst::Predicate VecPred,
1348 const Instruction *I) {
1350 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1351 I);
1352
1353 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1354 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1355 I);
1356
1357 // Skip if scalar size of ValTy is bigger than ELEN.
1358 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELEN())
1359 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1360 I);
1361
1362 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1363 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1364 if (CondTy->isVectorTy()) {
1365 if (ValTy->getScalarSizeInBits() == 1) {
1366 // vmandn.mm v8, v8, v9
1367 // vmand.mm v9, v0, v9
1368 // vmor.mm v0, v9, v8
1369 return LT.first * 3;
1370 }
1371 // vselect and max/min are supported natively.
1372 return LT.first * 1;
1373 }
1374
1375 if (ValTy->getScalarSizeInBits() == 1) {
1376 // vmv.v.x v9, a0
1377 // vmsne.vi v9, v9, 0
1378 // vmandn.mm v8, v8, v9
1379 // vmand.mm v9, v0, v9
1380 // vmor.mm v0, v9, v8
1381 return LT.first * 5;
1382 }
1383
1384 // vmv.v.x v10, a0
1385 // vmsne.vi v0, v10, 0
1386 // vmerge.vvm v8, v9, v8, v0
1387 return LT.first * 3;
1388 }
1389
1390 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1391 ValTy->isVectorTy()) {
1392 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1393
1394 // Supported natively.
1395 if (CmpInst::isIntPredicate(VecPred))
1396 return LT.first * 1;
1397
1398 // If we do not support the input floating-point vector type, use the base
1399 // implementation, which calculates the cost as:
1400 // ScalarizeCost + Num * Cost for a fixed vector,
1401 // InvalidCost for a scalable vector.
1402 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
1403 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
1404 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
1405 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1406 I);
1407 switch (VecPred) {
1408 // Supported natively.
1409 case CmpInst::FCMP_OEQ:
1410 case CmpInst::FCMP_OGT:
1411 case CmpInst::FCMP_OGE:
1412 case CmpInst::FCMP_OLT:
1413 case CmpInst::FCMP_OLE:
1414 case CmpInst::FCMP_UNE:
1415 return LT.first * 1;
1416 // TODO: Other comparisons?
1417 default:
1418 break;
1419 }
1420 }
1421
1422 // TODO: Add cost for scalar type.
1423
1424 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1425}
1426
1429 unsigned Index, Value *Op0,
1430 Value *Op1) {
1431 assert(Val->isVectorTy() && "This must be a vector type");
1432
1433 if (Opcode != Instruction::ExtractElement &&
1434 Opcode != Instruction::InsertElement)
1435 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1436
1437 // Legalize the type.
1438 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
1439
1440 // This type is legalized to a scalar type.
1441 if (!LT.second.isVector())
1442 return 0;
1443
1444 // For unsupported scalable vector.
1445 if (LT.second.isScalableVector() && !LT.first.isValid())
1446 return LT.first;
1447
1448 if (!isTypeLegal(Val))
1449 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1450
1451 // In RVV, we can use vslidedown + vmv.x.s to extract an element from a vector
1452 // and vslideup + vmv.s.x to insert an element into a vector.
1453 unsigned BaseCost = 1;
1454 // For insertelement we must also compute index + 1 to use as the input of vslideup.
1455 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
1456
1457 if (Index != -1U) {
1458 // The type may be split. For fixed-width vectors we can normalize the
1459 // index to the new type.
1460 if (LT.second.isFixedLengthVector()) {
1461 unsigned Width = LT.second.getVectorNumElements();
1462 Index = Index % Width;
1463 }
1464
1465 // We could extract/insert the first element without vslidedown/vslideup.
1466 if (Index == 0)
1467 SlideCost = 0;
1468 else if (Opcode == Instruction::InsertElement)
1469 SlideCost = 1; // With a constant index, we do not need to use addi.
1470 }
1471
1472 // Extracting/inserting an element of a mask vector is different from the normal case.
1473 if (Val->getScalarSizeInBits() == 1) {
1474 // For extractelement, we need the following instructions:
1475 // vmv.v.i v8, 0
1476 // vmerge.vim v8, v8, 1, v0
1477 // vsetivli zero, 1, e8, m2, ta, mu (not counted)
1478 // vslidedown.vx v8, v8, a0
1479 // vmv.x.s a0, v8
1480
1481 // For insertelement, we need the following instructions:
1482 // vsetvli a2, zero, e8, m1, ta, mu (not counted)
1483 // vmv.s.x v8, a0
1484 // vmv.v.i v9, 0
1485 // vmerge.vim v9, v9, 1, v0
1486 // addi a0, a1, 1
1487 // vsetvli zero, a0, e8, m1, tu, mu (not counted)
1488 // vslideup.vx v9, v8, a1
1489 // vsetvli a0, zero, e8, m1, ta, mu (not counted)
1490 // vand.vi v8, v9, 1
1491 // vmsne.vi v0, v8, 0
1492
1493 // TODO: should we count these special vsetvlis?
1494 BaseCost = Opcode == Instruction::InsertElement ? 5 : 3;
1495 }
1496 // Extracting an i64 on a target with XLEN=32 needs more instructions.
1497 if (Val->getScalarType()->isIntegerTy() &&
1498 ST->getXLen() < Val->getScalarSizeInBits()) {
1499 // For extractelement, we need the following instructions:
1500 // vsetivli zero, 1, e64, m1, ta, mu (not counted)
1501 // vslidedown.vx v8, v8, a0
1502 // vmv.x.s a0, v8
1503 // li a1, 32
1504 // vsrl.vx v8, v8, a1
1505 // vmv.x.s a1, v8
1506
1507 // For insertelement, we need the following instructions:
1508 // vsetivli zero, 2, e32, m4, ta, mu (not counted)
1509 // vmv.v.i v12, 0
1510 // vslide1up.vx v16, v12, a1
1511 // vslide1up.vx v12, v16, a0
1512 // addi a0, a2, 1
1513 // vsetvli zero, a0, e64, m4, tu, mu (not counted)
1514 // vslideup.vx v8, v12, a2
1515
1516 // TODO: should we count these special vsetvlis?
1517 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
1518 }
1519 return BaseCost + SlideCost;
1520}
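// Worked example: extractelement <8 x i32> at index 3 is modeled as
// vslidedown.vi + vmv.x.s (cost 2), while extracting lane 0 is a single
// vmv.x.s (cost 1) since the slide is skipped.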
1521
1523 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1525 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1526
1527 // TODO: Handle more cost kinds.
1529 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1530 Args, CxtI);
1531
1532 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1533 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1534 Args, CxtI);
1535
1536 // Skip if scalar size of Ty is bigger than ELEN.
1537 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELEN())
1538 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1539 Args, CxtI);
1540
1541 // Legalize the type.
1542 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1543
1544 // TODO: Handle scalar type.
1545 if (!LT.second.isVector())
1546 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1547 Args, CxtI);
1548
1549
1550 auto getConstantMatCost =
1551 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
1552 if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
1553 // Two sub-cases:
1554 // * Has a 5-bit immediate operand which can be splatted.
1555 // * Has a larger immediate which must be materialized in a scalar register.
1556 // We return 0 for both as we currently ignore the cost of materializing
1557 // scalar constants in GPRs.
1558 return 0;
1559
1560 return getConstantPoolLoadCost(Ty, CostKind);
1561 };
1562
1563 // Add the cost of materializing any constant vectors required.
1564 InstructionCost ConstantMatCost = 0;
1565 if (Op1Info.isConstant())
1566 ConstantMatCost += getConstantMatCost(0, Op1Info);
1567 if (Op2Info.isConstant())
1568 ConstantMatCost += getConstantMatCost(1, Op2Info);
1569
1570 switch (TLI->InstructionOpcodeToISD(Opcode)) {
1571 case ISD::ADD:
1572 case ISD::SUB:
1573 case ISD::AND:
1574 case ISD::OR:
1575 case ISD::XOR:
1576 case ISD::SHL:
1577 case ISD::SRL:
1578 case ISD::SRA:
1579 case ISD::MUL:
1580 case ISD::MULHS:
1581 case ISD::MULHU:
1582 case ISD::FADD:
1583 case ISD::FSUB:
1584 case ISD::FMUL:
1585 case ISD::FNEG: {
1586 return ConstantMatCost + getLMULCost(LT.second) * LT.first * 1;
1587 }
1588 default:
1589 return ConstantMatCost +
1590 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1591 Args, CxtI);
1592 }
1593}
1594
1595// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
1596InstructionCost RISCVTTIImpl::getPointersChainCost(
1597 ArrayRef<const Value *> Ptrs, const Value *Base,
1598    const TTI::PointersChainInfo &Info, Type *AccessTy,
1599    TTI::TargetCostKind CostKind) {
1600  InstructionCost Cost = TTI::TCC_Free;
1601  // In the basic model we only take GEP instructions into account (although
1602  // an alloca instruction, a plain value, constants and/or constant
1603  // expressions, PHIs, bitcasts ... anything allowed to be used as a pointer
1604  // may appear here). Typically, if Base is not a GEP instruction and all
1605  // the pointers are relative to the same base address, the rest are either
1606  // GEP instructions, PHIs, bitcasts or constants. When they share a base,
1607  // we just calculate the cost of each non-Base GEP as an ADD operation if
1608  // any of its indices is a non-constant.
1609  // If there are no known dependencies between the pointers, the cost is
1610  // calculated as the sum of the costs of the GEP instructions.
1611 for (auto [I, V] : enumerate(Ptrs)) {
1612 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
1613 if (!GEP)
1614 continue;
1615 if (Info.isSameBase() && V != Base) {
1616 if (GEP->hasAllConstantIndices())
1617 continue;
1618 // If the chain is unit-stride and BaseReg + stride*i is a legal
1619 // addressing mode, then presume the base GEP is sitting around in a
1620 // register somewhere and check if we can fold the offset relative to
1621 // it.
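      // For example, for a chain of i32 accesses p[0], p[1], p[2] off the
      // same base, the I-th pointer's byte offset is 4 * I, which normally
      // fits the 12-bit signed immediate of a scalar load/store, so the GEP
      // folds away and is not charged.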
1622 unsigned Stride = DL.getTypeStoreSize(AccessTy);
1623 if (Info.isUnitStride() &&
1624 isLegalAddressingMode(AccessTy,
1625 /* BaseGV */ nullptr,
1626 /* BaseOffset */ Stride * I,
1627 /* HasBaseReg */ true,
1628 /* Scale */ 0,
1629 GEP->getType()->getPointerAddressSpace()))
1630 continue;
1631 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
1632 {TTI::OK_AnyValue, TTI::OP_None},
1633 {TTI::OK_AnyValue, TTI::OP_None},
1634 std::nullopt);
1635 } else {
1636 SmallVector<const Value *> Indices(GEP->indices());
1637 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
1638 Indices, CostKind);
1639 }
1640 }
1641 return Cost;
1642}
1643
1644void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1645                                           TTI::UnrollingPreferences &UP,
1646                                           OptimizationRemarkEmitter *ORE) {
1647 // TODO: More tuning on benchmarks and metrics with changes as needed
1648 // would apply to all settings below to enable performance.
1649
1650
1651 if (ST->enableDefaultUnroll())
1652 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
1653
1654  // Enable upper-bound unrolling universally; it does not depend on the
1655  // conditions below.
1656 UP.UpperBound = true;
1657
1658 // Disable loop unrolling for Oz and Os.
1659  UP.OptSizeThreshold = 0;
1660  UP.PartialOptSizeThreshold = 0;
1661 if (L->getHeader()->getParent()->hasOptSize())
1662 return;
1663
1664 SmallVector<BasicBlock *, 4> ExitingBlocks;
1665 L->getExitingBlocks(ExitingBlocks);
1666 LLVM_DEBUG(dbgs() << "Loop has:\n"
1667 << "Blocks: " << L->getNumBlocks() << "\n"
1668 << "Exit blocks: " << ExitingBlocks.size() << "\n");
1669
1670  // Only allow one exit other than the latch. This acts as an early exit, as
1671  // it mirrors the profitability calculation of the runtime unroller.
1672 if (ExitingBlocks.size() > 2)
1673 return;
1674
1675 // Limit the CFG of the loop body for targets with a branch predictor.
1676 // Allowing 4 blocks permits if-then-else diamonds in the body.
1677 if (L->getNumBlocks() > 4)
1678 return;
1679
1680 // Don't unroll vectorized loops, including the remainder loop
1681 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
1682 return;
1683
1684 // Scan the loop: don't unroll loops with calls as this could prevent
1685  // inlining.
1686  InstructionCost Cost = 0;
1687 for (auto *BB : L->getBlocks()) {
1688 for (auto &I : *BB) {
1689 // Initial setting - Don't unroll loops containing vectorized
1690 // instructions.
1691 if (I.getType()->isVectorTy())
1692 return;
1693
1694 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1695 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1696 if (!isLoweredToCall(F))
1697 continue;
1698 }
1699 return;
1700 }
1701
1702      SmallVector<const Value *> Operands(I.operand_values());
1703      Cost += getInstructionCost(&I, Operands,
1704                                 TargetTransformInfo::TCK_SizeAndLatency);
1705 }
1706 }
1707
1708 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1709
1710 UP.Partial = true;
1711 UP.Runtime = true;
1712 UP.UnrollRemainder = true;
1713  UP.UnrollAndJam = true;
1714  UP.UnrollAndJamInnerLoopThreshold = 60;
1715
1716  // Forcing unrolling of small loops can be very useful because of the
1717  // branch-taken cost of the backedge.
1718 if (Cost < 12)
1719 UP.Force = true;
1720}
1721
1722void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1723                                         TTI::PeelingPreferences &PP) {
1724  BaseT::getPeelingPreferences(L, SE, PP);
1725}
1726
1727unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
1728  TypeSize Size = DL.getTypeSizeInBits(Ty);
1729 if (Ty->isVectorTy()) {
1730 if (Size.isScalable() && ST->hasVInstructions())
1731 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
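      // For example, <vscale x 4 x i32> has a known minimum size of 128 bits,
      // so with RVVBitsPerBlock == 64 it is counted as 2 vector registers
      // (i.e. an LMUL of 2).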
1732
1733    if (ST->useRVVForFixedLengthVectors())
1734 return divideCeil(Size, ST->getRealMinVLen());
1735 }
1736
1737 return BaseT::getRegUsageForType(Ty);
1738}
1739
1740unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1741 // This interface is currently only used by SLP. Returning 1 (which is the
1742 // default value for SLPMaxVF) disables SLP. We currently have a cost modeling
1743  // problem with constant materialization which causes SLP to perform highly
1744  // unprofitable transformations.
1745 // TODO: Figure out constant materialization cost modeling and remove.
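  // The limit can be raised for experiments via the riscv-v-slp-max-vf option
  // (e.g. -mllvm -riscv-v-slp-max-vf=4 from clang, or passed directly to opt).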
1746 return SLPMaxVF;
1747}
1748
1749bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
1750 const TargetTransformInfo::LSRCost &C2) {
1751  // RISC-V specific: give the instruction count first priority over the other costs.
1752 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
1753 C1.NumIVMuls, C1.NumBaseAdds,
1754 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
1755 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
1756 C2.NumIVMuls, C2.NumBaseAdds,
1757 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
1758}