LLVM 17.0.0git
RISCVTargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
15#include <cmath>
16#include <optional>
17using namespace llvm;
18
19#define DEBUG_TYPE "riscvtti"
20
22 "riscv-v-register-bit-width-lmul",
24 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
25 "by autovectorized code. Fractional LMULs are not supported."),
27
29 "riscv-v-slp-max-vf",
31 "Result used for getMaximumVF query which is used exclusively by "
32 "SLP vectorizer. Defaults to 1 which disables SLP."),
34
35InstructionCost RISCVTTIImpl::getLMULCost(MVT VT) {
36 // TODO: Here assume reciprocal throughput is 1 for LMUL_1, it is
37 // implementation-defined.
38 if (!VT.isVector())
40 unsigned Cost;
41 if (VT.isScalableVector()) {
42 unsigned LMul;
43 bool Fractional;
44 std::tie(LMul, Fractional) =
46 if (Fractional)
47 Cost = 1;
48 else
49 Cost = LMul;
50 } else {
51 Cost = VT.getSizeInBits() / ST->getRealMinVLen();
52 }
53 return std::max<unsigned>(Cost, 1);
54}
55
58 assert(Ty->isIntegerTy() &&
59 "getIntImmCost can only estimate cost of materialising integers");
60
61 // We have a Zero register, so 0 is always free.
62 if (Imm == 0)
63 return TTI::TCC_Free;
64
65 // Otherwise, we check how many instructions it will take to materialise.
66 const DataLayout &DL = getDataLayout();
68 getST()->getFeatureBits());
69}
70
71// Look for patterns of shift followed by AND that can be turned into a pair of
72// shifts. We won't need to materialize an immediate for the AND so these can
73// be considered free.
74static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
75 uint64_t Mask = Imm.getZExtValue();
76 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
77 if (!BO || !BO->hasOneUse())
78 return false;
79
80 if (BO->getOpcode() != Instruction::Shl)
81 return false;
82
83 if (!isa<ConstantInt>(BO->getOperand(1)))
84 return false;
85
86 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
87 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
88 // is a mask shifted by c2 bits with c3 leading zeros.
89 if (isShiftedMask_64(Mask)) {
90 unsigned Trailing = llvm::countr_zero(Mask);
91 if (ShAmt == Trailing)
92 return true;
93 }
94
95 return false;
96}
97
99 const APInt &Imm, Type *Ty,
101 Instruction *Inst) {
102 assert(Ty->isIntegerTy() &&
103 "getIntImmCost can only estimate cost of materialising integers");
104
105 // We have a Zero register, so 0 is always free.
106 if (Imm == 0)
107 return TTI::TCC_Free;
108
109 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
110 // commutative, in others the immediate comes from a specific argument index.
111 bool Takes12BitImm = false;
112 unsigned ImmArgIdx = ~0U;
113
114 switch (Opcode) {
115 case Instruction::GetElementPtr:
116 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
117 // split up large offsets in GEP into better parts than ConstantHoisting
118 // can.
119 return TTI::TCC_Free;
120 case Instruction::And:
121 // zext.h
122 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
123 return TTI::TCC_Free;
124 // zext.w
125 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
126 return TTI::TCC_Free;
127 // bclri
128 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
129 return TTI::TCC_Free;
130 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
131 canUseShiftPair(Inst, Imm))
132 return TTI::TCC_Free;
133 Takes12BitImm = true;
134 break;
135 case Instruction::Add:
136 Takes12BitImm = true;
137 break;
138 case Instruction::Or:
139 case Instruction::Xor:
140 // bseti/binvi
141 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
142 return TTI::TCC_Free;
143 Takes12BitImm = true;
144 break;
145 case Instruction::Mul:
146 // Negated power of 2 is a shift and a negate.
147 if (Imm.isNegatedPowerOf2())
148 return TTI::TCC_Free;
149 // FIXME: There is no MULI instruction.
150 Takes12BitImm = true;
151 break;
152 case Instruction::Sub:
153 case Instruction::Shl:
154 case Instruction::LShr:
155 case Instruction::AShr:
156 Takes12BitImm = true;
157 ImmArgIdx = 1;
158 break;
159 default:
160 break;
161 }
162
163 if (Takes12BitImm) {
164 // Check immediate is the correct argument...
165 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
166 // ... and fits into the 12-bit immediate.
167 if (Imm.getMinSignedBits() <= 64 &&
168 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
169 return TTI::TCC_Free;
170 }
171 }
172
173 // Otherwise, use the full materialisation cost.
174 return getIntImmCost(Imm, Ty, CostKind);
175 }
176
177 // By default, prevent hoisting.
178 return TTI::TCC_Free;
179}
180
183 const APInt &Imm, Type *Ty,
185 // Prevent hoisting in unknown cases.
186 return TTI::TCC_Free;
187}
188
191 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
192 return ST->hasStdExtZbb() ? TTI::PSK_FastHardware : TTI::PSK_Software;
193}
194
196 // Currently, the ExpandReductions pass can't expand scalable-vector
197 // reductions, but we still request expansion as RVV doesn't support certain
198 // reductions and the SelectionDAG can't legalize them either.
199 switch (II->getIntrinsicID()) {
200 default:
201 return false;
202 // These reductions have no equivalent in RVV
203 case Intrinsic::vector_reduce_mul:
204 case Intrinsic::vector_reduce_fmul:
205 return true;
206 }
207}
208
209std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
210 if (ST->hasVInstructions())
212 return BaseT::getMaxVScale();
213}
214
215std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
216 if (ST->hasVInstructions())
217 if (unsigned MinVLen = ST->getRealMinVLen();
218 MinVLen >= RISCV::RVVBitsPerBlock)
219 return MinVLen / RISCV::RVVBitsPerBlock;
221}
222
225 unsigned LMUL =
226 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
227 switch (K) {
229 return TypeSize::getFixed(ST->getXLen());
231 return TypeSize::getFixed(
232 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
235 (ST->hasVInstructions() &&
238 : 0);
239 }
240
241 llvm_unreachable("Unsupported register kind");
242}
243
245 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
246
247 unsigned Cost = 2; // vslidedown+vslideup.
248 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
249 // of similar code, but I think we expand through memory.
250 return Cost * LT.first * getLMULCost(LT.second);
251}
252
254 VectorType *Tp, ArrayRef<int> Mask,
256 int Index, VectorType *SubTp,
258 if (isa<ScalableVectorType>(Tp)) {
259 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
260 switch (Kind) {
261 default:
262 // Fallthrough to generic handling.
263 // TODO: Most of these cases will return getInvalid in generic code, and
264 // must be implemented here.
265 break;
266 case TTI::SK_Broadcast: {
267 return LT.first * 1;
268 }
269 case TTI::SK_Splice:
270 return getSpliceCost(Tp, Index);
271 case TTI::SK_Reverse:
272 // Most of the cost here is producing the vrgather index register
273 // Example sequence:
274 // csrr a0, vlenb
275 // srli a0, a0, 3
276 // addi a0, a0, -1
277 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
278 // vid.v v9
279 // vrsub.vx v10, v9, a0
280 // vrgather.vv v9, v8, v10
281 if (Tp->getElementType()->isIntegerTy(1))
282 // Mask operation additionally required extend and truncate
283 return LT.first * 9;
284 return LT.first * 6;
285 }
286 }
287
288 if (isa<FixedVectorType>(Tp) && Kind == TargetTransformInfo::SK_Broadcast) {
289 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
290 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
291 Instruction::InsertElement);
292 if (LT.second.getScalarSizeInBits() == 1) {
293 if (HasScalar) {
294 // Example sequence:
295 // andi a0, a0, 1
296 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
297 // vmv.v.x v8, a0
298 // vmsne.vi v0, v8, 0
299 return LT.first * getLMULCost(LT.second) * 3;
300 }
301 // Example sequence:
302 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
303 // vmv.v.i v8, 0
304 // vmerge.vim v8, v8, 1, v0
305 // vmv.x.s a0, v8
306 // andi a0, a0, 1
307 // vmv.v.x v8, a0
308 // vmsne.vi v0, v8, 0
309
310 return LT.first * getLMULCost(LT.second) * 6;
311 }
312
313 if (HasScalar) {
314 // Example sequence:
315 // vmv.v.x v8, a0
316 return LT.first * getLMULCost(LT.second);
317 }
318
319 // Example sequence:
320 // vrgather.vi v9, v8, 0
321 // TODO: vrgather could be slower than vmv.v.x. It is
322 // implementation-dependent.
323 return LT.first * getLMULCost(LT.second);
324 }
325
326 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
327}
328
330RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
331 unsigned AddressSpace,
333 if (!isLegalMaskedLoadStore(Src, Alignment) ||
335 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
336 CostKind);
337
338 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
339}
340
342 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
343 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
345 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
346 Alignment, CostKind, I);
347
348 if ((Opcode == Instruction::Load &&
349 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
350 (Opcode == Instruction::Store &&
351 !isLegalMaskedScatter(DataTy, Align(Alignment))))
352 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
353 Alignment, CostKind, I);
354
355 // Cost is proportional to the number of memory operations implied. For
356 // scalable vectors, we use an estimate on that number since we don't
357 // know exactly what VL will be.
358 auto &VTy = *cast<VectorType>(DataTy);
359 InstructionCost MemOpCost =
360 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
361 {TTI::OK_AnyValue, TTI::OP_None}, I);
362 unsigned NumLoads = getEstimatedVLFor(&VTy);
363 return NumLoads * MemOpCost;
364}
365
366// Currently, these represent both throughput and codesize costs
367// for the respective intrinsics. The costs in this table are simply
368// instruction counts with the following adjustments made:
369// * One vsetvli is considered free.
371 {Intrinsic::floor, MVT::v2f32, 9},
372 {Intrinsic::floor, MVT::v4f32, 9},
373 {Intrinsic::floor, MVT::v8f32, 9},
374 {Intrinsic::floor, MVT::v16f32, 9},
375 {Intrinsic::floor, MVT::nxv1f32, 9},
376 {Intrinsic::floor, MVT::nxv2f32, 9},
377 {Intrinsic::floor, MVT::nxv4f32, 9},
378 {Intrinsic::floor, MVT::nxv8f32, 9},
379 {Intrinsic::floor, MVT::nxv16f32, 9},
380 {Intrinsic::floor, MVT::v2f64, 9},
381 {Intrinsic::floor, MVT::v4f64, 9},
382 {Intrinsic::floor, MVT::v8f64, 9},
383 {Intrinsic::floor, MVT::v16f64, 9},
384 {Intrinsic::floor, MVT::nxv1f64, 9},
385 {Intrinsic::floor, MVT::nxv2f64, 9},
386 {Intrinsic::floor, MVT::nxv4f64, 9},
387 {Intrinsic::floor, MVT::nxv8f64, 9},
388 {Intrinsic::ceil, MVT::v2f32, 9},
389 {Intrinsic::ceil, MVT::v4f32, 9},
390 {Intrinsic::ceil, MVT::v8f32, 9},
391 {Intrinsic::ceil, MVT::v16f32, 9},
392 {Intrinsic::ceil, MVT::nxv1f32, 9},
393 {Intrinsic::ceil, MVT::nxv2f32, 9},
394 {Intrinsic::ceil, MVT::nxv4f32, 9},
395 {Intrinsic::ceil, MVT::nxv8f32, 9},
396 {Intrinsic::ceil, MVT::nxv16f32, 9},
397 {Intrinsic::ceil, MVT::v2f64, 9},
398 {Intrinsic::ceil, MVT::v4f64, 9},
399 {Intrinsic::ceil, MVT::v8f64, 9},
400 {Intrinsic::ceil, MVT::v16f64, 9},
401 {Intrinsic::ceil, MVT::nxv1f64, 9},
402 {Intrinsic::ceil, MVT::nxv2f64, 9},
403 {Intrinsic::ceil, MVT::nxv4f64, 9},
404 {Intrinsic::ceil, MVT::nxv8f64, 9},
405 {Intrinsic::trunc, MVT::v2f32, 7},
406 {Intrinsic::trunc, MVT::v4f32, 7},
407 {Intrinsic::trunc, MVT::v8f32, 7},
408 {Intrinsic::trunc, MVT::v16f32, 7},
409 {Intrinsic::trunc, MVT::nxv1f32, 7},
410 {Intrinsic::trunc, MVT::nxv2f32, 7},
411 {Intrinsic::trunc, MVT::nxv4f32, 7},
412 {Intrinsic::trunc, MVT::nxv8f32, 7},
413 {Intrinsic::trunc, MVT::nxv16f32, 7},
414 {Intrinsic::trunc, MVT::v2f64, 7},
415 {Intrinsic::trunc, MVT::v4f64, 7},
416 {Intrinsic::trunc, MVT::v8f64, 7},
417 {Intrinsic::trunc, MVT::v16f64, 7},
418 {Intrinsic::trunc, MVT::nxv1f64, 7},
419 {Intrinsic::trunc, MVT::nxv2f64, 7},
420 {Intrinsic::trunc, MVT::nxv4f64, 7},
421 {Intrinsic::trunc, MVT::nxv8f64, 7},
422 {Intrinsic::round, MVT::v2f32, 9},
423 {Intrinsic::round, MVT::v4f32, 9},
424 {Intrinsic::round, MVT::v8f32, 9},
425 {Intrinsic::round, MVT::v16f32, 9},
426 {Intrinsic::round, MVT::nxv1f32, 9},
427 {Intrinsic::round, MVT::nxv2f32, 9},
428 {Intrinsic::round, MVT::nxv4f32, 9},
429 {Intrinsic::round, MVT::nxv8f32, 9},
430 {Intrinsic::round, MVT::nxv16f32, 9},
431 {Intrinsic::round, MVT::v2f64, 9},
432 {Intrinsic::round, MVT::v4f64, 9},
433 {Intrinsic::round, MVT::v8f64, 9},
434 {Intrinsic::round, MVT::v16f64, 9},
435 {Intrinsic::round, MVT::nxv1f64, 9},
436 {Intrinsic::round, MVT::nxv2f64, 9},
437 {Intrinsic::round, MVT::nxv4f64, 9},
438 {Intrinsic::round, MVT::nxv8f64, 9},
439 {Intrinsic::roundeven, MVT::v2f32, 9},
440 {Intrinsic::roundeven, MVT::v4f32, 9},
441 {Intrinsic::roundeven, MVT::v8f32, 9},
442 {Intrinsic::roundeven, MVT::v16f32, 9},
443 {Intrinsic::roundeven, MVT::nxv1f32, 9},
444 {Intrinsic::roundeven, MVT::nxv2f32, 9},
445 {Intrinsic::roundeven, MVT::nxv4f32, 9},
446 {Intrinsic::roundeven, MVT::nxv8f32, 9},
447 {Intrinsic::roundeven, MVT::nxv16f32, 9},
448 {Intrinsic::roundeven, MVT::v2f64, 9},
449 {Intrinsic::roundeven, MVT::v4f64, 9},
450 {Intrinsic::roundeven, MVT::v8f64, 9},
451 {Intrinsic::roundeven, MVT::v16f64, 9},
452 {Intrinsic::roundeven, MVT::nxv1f64, 9},
453 {Intrinsic::roundeven, MVT::nxv2f64, 9},
454 {Intrinsic::roundeven, MVT::nxv4f64, 9},
455 {Intrinsic::roundeven, MVT::nxv8f64, 9},
456 {Intrinsic::bswap, MVT::v2i16, 3},
457 {Intrinsic::bswap, MVT::v4i16, 3},
458 {Intrinsic::bswap, MVT::v8i16, 3},
459 {Intrinsic::bswap, MVT::v16i16, 3},
460 {Intrinsic::bswap, MVT::nxv1i16, 3},
461 {Intrinsic::bswap, MVT::nxv2i16, 3},
462 {Intrinsic::bswap, MVT::nxv4i16, 3},
463 {Intrinsic::bswap, MVT::nxv8i16, 3},
464 {Intrinsic::bswap, MVT::nxv16i16, 3},
465 {Intrinsic::bswap, MVT::v2i32, 12},
466 {Intrinsic::bswap, MVT::v4i32, 12},
467 {Intrinsic::bswap, MVT::v8i32, 12},
468 {Intrinsic::bswap, MVT::v16i32, 12},
469 {Intrinsic::bswap, MVT::nxv1i32, 12},
470 {Intrinsic::bswap, MVT::nxv2i32, 12},
471 {Intrinsic::bswap, MVT::nxv4i32, 12},
472 {Intrinsic::bswap, MVT::nxv8i32, 12},
473 {Intrinsic::bswap, MVT::nxv16i32, 12},
474 {Intrinsic::bswap, MVT::v2i64, 31},
475 {Intrinsic::bswap, MVT::v4i64, 31},
476 {Intrinsic::bswap, MVT::v8i64, 31},
477 {Intrinsic::bswap, MVT::v16i64, 31},
478 {Intrinsic::bswap, MVT::nxv1i64, 31},
479 {Intrinsic::bswap, MVT::nxv2i64, 31},
480 {Intrinsic::bswap, MVT::nxv4i64, 31},
481 {Intrinsic::bswap, MVT::nxv8i64, 31},
482 {Intrinsic::vp_bswap, MVT::v2i16, 3},
483 {Intrinsic::vp_bswap, MVT::v4i16, 3},
484 {Intrinsic::vp_bswap, MVT::v8i16, 3},
485 {Intrinsic::vp_bswap, MVT::v16i16, 3},
486 {Intrinsic::vp_bswap, MVT::nxv1i16, 3},
487 {Intrinsic::vp_bswap, MVT::nxv2i16, 3},
488 {Intrinsic::vp_bswap, MVT::nxv4i16, 3},
489 {Intrinsic::vp_bswap, MVT::nxv8i16, 3},
490 {Intrinsic::vp_bswap, MVT::nxv16i16, 3},
491 {Intrinsic::vp_bswap, MVT::v2i32, 12},
492 {Intrinsic::vp_bswap, MVT::v4i32, 12},
493 {Intrinsic::vp_bswap, MVT::v8i32, 12},
494 {Intrinsic::vp_bswap, MVT::v16i32, 12},
495 {Intrinsic::vp_bswap, MVT::nxv1i32, 12},
496 {Intrinsic::vp_bswap, MVT::nxv2i32, 12},
497 {Intrinsic::vp_bswap, MVT::nxv4i32, 12},
498 {Intrinsic::vp_bswap, MVT::nxv8i32, 12},
499 {Intrinsic::vp_bswap, MVT::nxv16i32, 12},
500 {Intrinsic::vp_bswap, MVT::v2i64, 31},
501 {Intrinsic::vp_bswap, MVT::v4i64, 31},
502 {Intrinsic::vp_bswap, MVT::v8i64, 31},
503 {Intrinsic::vp_bswap, MVT::v16i64, 31},
504 {Intrinsic::vp_bswap, MVT::nxv1i64, 31},
505 {Intrinsic::vp_bswap, MVT::nxv2i64, 31},
506 {Intrinsic::vp_bswap, MVT::nxv4i64, 31},
507 {Intrinsic::vp_bswap, MVT::nxv8i64, 31},
508 {Intrinsic::vp_fshl, MVT::v2i8, 7},
509 {Intrinsic::vp_fshl, MVT::v4i8, 7},
510 {Intrinsic::vp_fshl, MVT::v8i8, 7},
511 {Intrinsic::vp_fshl, MVT::v16i8, 7},
512 {Intrinsic::vp_fshl, MVT::nxv1i8, 7},
513 {Intrinsic::vp_fshl, MVT::nxv2i8, 7},
514 {Intrinsic::vp_fshl, MVT::nxv4i8, 7},
515 {Intrinsic::vp_fshl, MVT::nxv8i8, 7},
516 {Intrinsic::vp_fshl, MVT::nxv16i8, 7},
517 {Intrinsic::vp_fshl, MVT::nxv32i8, 7},
518 {Intrinsic::vp_fshl, MVT::nxv64i8, 7},
519 {Intrinsic::vp_fshl, MVT::v2i16, 7},
520 {Intrinsic::vp_fshl, MVT::v4i16, 7},
521 {Intrinsic::vp_fshl, MVT::v8i16, 7},
522 {Intrinsic::vp_fshl, MVT::v16i16, 7},
523 {Intrinsic::vp_fshl, MVT::nxv1i16, 7},
524 {Intrinsic::vp_fshl, MVT::nxv2i16, 7},
525 {Intrinsic::vp_fshl, MVT::nxv4i16, 7},
526 {Intrinsic::vp_fshl, MVT::nxv8i16, 7},
527 {Intrinsic::vp_fshl, MVT::nxv16i16, 7},
528 {Intrinsic::vp_fshl, MVT::nxv32i16, 7},
529 {Intrinsic::vp_fshl, MVT::v2i32, 7},
530 {Intrinsic::vp_fshl, MVT::v4i32, 7},
531 {Intrinsic::vp_fshl, MVT::v8i32, 7},
532 {Intrinsic::vp_fshl, MVT::v16i32, 7},
533 {Intrinsic::vp_fshl, MVT::nxv1i32, 7},
534 {Intrinsic::vp_fshl, MVT::nxv2i32, 7},
535 {Intrinsic::vp_fshl, MVT::nxv4i32, 7},
536 {Intrinsic::vp_fshl, MVT::nxv8i32, 7},
537 {Intrinsic::vp_fshl, MVT::nxv16i32, 7},
538 {Intrinsic::vp_fshl, MVT::v2i64, 7},
539 {Intrinsic::vp_fshl, MVT::v4i64, 7},
540 {Intrinsic::vp_fshl, MVT::v8i64, 7},
541 {Intrinsic::vp_fshl, MVT::v16i64, 7},
542 {Intrinsic::vp_fshl, MVT::nxv1i64, 7},
543 {Intrinsic::vp_fshl, MVT::nxv2i64, 7},
544 {Intrinsic::vp_fshl, MVT::nxv4i64, 7},
545 {Intrinsic::vp_fshl, MVT::nxv8i64, 7},
546 {Intrinsic::vp_fshr, MVT::v2i8, 7},
547 {Intrinsic::vp_fshr, MVT::v4i8, 7},
548 {Intrinsic::vp_fshr, MVT::v8i8, 7},
549 {Intrinsic::vp_fshr, MVT::v16i8, 7},
550 {Intrinsic::vp_fshr, MVT::nxv1i8, 7},
551 {Intrinsic::vp_fshr, MVT::nxv2i8, 7},
552 {Intrinsic::vp_fshr, MVT::nxv4i8, 7},
553 {Intrinsic::vp_fshr, MVT::nxv8i8, 7},
554 {Intrinsic::vp_fshr, MVT::nxv16i8, 7},
555 {Intrinsic::vp_fshr, MVT::nxv32i8, 7},
556 {Intrinsic::vp_fshr, MVT::nxv64i8, 7},
557 {Intrinsic::vp_fshr, MVT::v2i16, 7},
558 {Intrinsic::vp_fshr, MVT::v4i16, 7},
559 {Intrinsic::vp_fshr, MVT::v8i16, 7},
560 {Intrinsic::vp_fshr, MVT::v16i16, 7},
561 {Intrinsic::vp_fshr, MVT::nxv1i16, 7},
562 {Intrinsic::vp_fshr, MVT::nxv2i16, 7},
563 {Intrinsic::vp_fshr, MVT::nxv4i16, 7},
564 {Intrinsic::vp_fshr, MVT::nxv8i16, 7},
565 {Intrinsic::vp_fshr, MVT::nxv16i16, 7},
566 {Intrinsic::vp_fshr, MVT::nxv32i16, 7},
567 {Intrinsic::vp_fshr, MVT::v2i32, 7},
568 {Intrinsic::vp_fshr, MVT::v4i32, 7},
569 {Intrinsic::vp_fshr, MVT::v8i32, 7},
570 {Intrinsic::vp_fshr, MVT::v16i32, 7},
571 {Intrinsic::vp_fshr, MVT::nxv1i32, 7},
572 {Intrinsic::vp_fshr, MVT::nxv2i32, 7},
573 {Intrinsic::vp_fshr, MVT::nxv4i32, 7},
574 {Intrinsic::vp_fshr, MVT::nxv8i32, 7},
575 {Intrinsic::vp_fshr, MVT::nxv16i32, 7},
576 {Intrinsic::vp_fshr, MVT::v2i64, 7},
577 {Intrinsic::vp_fshr, MVT::v4i64, 7},
578 {Intrinsic::vp_fshr, MVT::v8i64, 7},
579 {Intrinsic::vp_fshr, MVT::v16i64, 7},
580 {Intrinsic::vp_fshr, MVT::nxv1i64, 7},
581 {Intrinsic::vp_fshr, MVT::nxv2i64, 7},
582 {Intrinsic::vp_fshr, MVT::nxv4i64, 7},
583 {Intrinsic::vp_fshr, MVT::nxv8i64, 7},
584 {Intrinsic::bitreverse, MVT::v2i8, 17},
585 {Intrinsic::bitreverse, MVT::v4i8, 17},
586 {Intrinsic::bitreverse, MVT::v8i8, 17},
587 {Intrinsic::bitreverse, MVT::v16i8, 17},
588 {Intrinsic::bitreverse, MVT::nxv1i8, 17},
589 {Intrinsic::bitreverse, MVT::nxv2i8, 17},
590 {Intrinsic::bitreverse, MVT::nxv4i8, 17},
591 {Intrinsic::bitreverse, MVT::nxv8i8, 17},
592 {Intrinsic::bitreverse, MVT::nxv16i8, 17},
593 {Intrinsic::bitreverse, MVT::v2i16, 24},
594 {Intrinsic::bitreverse, MVT::v4i16, 24},
595 {Intrinsic::bitreverse, MVT::v8i16, 24},
596 {Intrinsic::bitreverse, MVT::v16i16, 24},
597 {Intrinsic::bitreverse, MVT::nxv1i16, 24},
598 {Intrinsic::bitreverse, MVT::nxv2i16, 24},
599 {Intrinsic::bitreverse, MVT::nxv4i16, 24},
600 {Intrinsic::bitreverse, MVT::nxv8i16, 24},
601 {Intrinsic::bitreverse, MVT::nxv16i16, 24},
602 {Intrinsic::bitreverse, MVT::v2i32, 33},
603 {Intrinsic::bitreverse, MVT::v4i32, 33},
604 {Intrinsic::bitreverse, MVT::v8i32, 33},
605 {Intrinsic::bitreverse, MVT::v16i32, 33},
606 {Intrinsic::bitreverse, MVT::nxv1i32, 33},
607 {Intrinsic::bitreverse, MVT::nxv2i32, 33},
608 {Intrinsic::bitreverse, MVT::nxv4i32, 33},
609 {Intrinsic::bitreverse, MVT::nxv8i32, 33},
610 {Intrinsic::bitreverse, MVT::nxv16i32, 33},
611 {Intrinsic::bitreverse, MVT::v2i64, 52},
612 {Intrinsic::bitreverse, MVT::v4i64, 52},
613 {Intrinsic::bitreverse, MVT::v8i64, 52},
614 {Intrinsic::bitreverse, MVT::v16i64, 52},
615 {Intrinsic::bitreverse, MVT::nxv1i64, 52},
616 {Intrinsic::bitreverse, MVT::nxv2i64, 52},
617 {Intrinsic::bitreverse, MVT::nxv4i64, 52},
618 {Intrinsic::bitreverse, MVT::nxv8i64, 52},
619 {Intrinsic::vp_bitreverse, MVT::v2i8, 17},
620 {Intrinsic::vp_bitreverse, MVT::v4i8, 17},
621 {Intrinsic::vp_bitreverse, MVT::v8i8, 17},
622 {Intrinsic::vp_bitreverse, MVT::v16i8, 17},
623 {Intrinsic::vp_bitreverse, MVT::nxv1i8, 17},
624 {Intrinsic::vp_bitreverse, MVT::nxv2i8, 17},
625 {Intrinsic::vp_bitreverse, MVT::nxv4i8, 17},
626 {Intrinsic::vp_bitreverse, MVT::nxv8i8, 17},
627 {Intrinsic::vp_bitreverse, MVT::nxv16i8, 17},
628 {Intrinsic::vp_bitreverse, MVT::v2i16, 24},
629 {Intrinsic::vp_bitreverse, MVT::v4i16, 24},
630 {Intrinsic::vp_bitreverse, MVT::v8i16, 24},
631 {Intrinsic::vp_bitreverse, MVT::v16i16, 24},
632 {Intrinsic::vp_bitreverse, MVT::nxv1i16, 24},
633 {Intrinsic::vp_bitreverse, MVT::nxv2i16, 24},
634 {Intrinsic::vp_bitreverse, MVT::nxv4i16, 24},
635 {Intrinsic::vp_bitreverse, MVT::nxv8i16, 24},
636 {Intrinsic::vp_bitreverse, MVT::nxv16i16, 24},
637 {Intrinsic::vp_bitreverse, MVT::v2i32, 33},
638 {Intrinsic::vp_bitreverse, MVT::v4i32, 33},
639 {Intrinsic::vp_bitreverse, MVT::v8i32, 33},
640 {Intrinsic::vp_bitreverse, MVT::v16i32, 33},
641 {Intrinsic::vp_bitreverse, MVT::nxv1i32, 33},
642 {Intrinsic::vp_bitreverse, MVT::nxv2i32, 33},
643 {Intrinsic::vp_bitreverse, MVT::nxv4i32, 33},
644 {Intrinsic::vp_bitreverse, MVT::nxv8i32, 33},
645 {Intrinsic::vp_bitreverse, MVT::nxv16i32, 33},
646 {Intrinsic::vp_bitreverse, MVT::v2i64, 52},
647 {Intrinsic::vp_bitreverse, MVT::v4i64, 52},
648 {Intrinsic::vp_bitreverse, MVT::v8i64, 52},
649 {Intrinsic::vp_bitreverse, MVT::v16i64, 52},
650 {Intrinsic::vp_bitreverse, MVT::nxv1i64, 52},
651 {Intrinsic::vp_bitreverse, MVT::nxv2i64, 52},
652 {Intrinsic::vp_bitreverse, MVT::nxv4i64, 52},
653 {Intrinsic::vp_bitreverse, MVT::nxv8i64, 52},
654 {Intrinsic::ctpop, MVT::v2i8, 12},
655 {Intrinsic::ctpop, MVT::v4i8, 12},
656 {Intrinsic::ctpop, MVT::v8i8, 12},
657 {Intrinsic::ctpop, MVT::v16i8, 12},
658 {Intrinsic::ctpop, MVT::nxv1i8, 12},
659 {Intrinsic::ctpop, MVT::nxv2i8, 12},
660 {Intrinsic::ctpop, MVT::nxv4i8, 12},
661 {Intrinsic::ctpop, MVT::nxv8i8, 12},
662 {Intrinsic::ctpop, MVT::nxv16i8, 12},
663 {Intrinsic::ctpop, MVT::v2i16, 19},
664 {Intrinsic::ctpop, MVT::v4i16, 19},
665 {Intrinsic::ctpop, MVT::v8i16, 19},
666 {Intrinsic::ctpop, MVT::v16i16, 19},
667 {Intrinsic::ctpop, MVT::nxv1i16, 19},
668 {Intrinsic::ctpop, MVT::nxv2i16, 19},
669 {Intrinsic::ctpop, MVT::nxv4i16, 19},
670 {Intrinsic::ctpop, MVT::nxv8i16, 19},
671 {Intrinsic::ctpop, MVT::nxv16i16, 19},
672 {Intrinsic::ctpop, MVT::v2i32, 20},
673 {Intrinsic::ctpop, MVT::v4i32, 20},
674 {Intrinsic::ctpop, MVT::v8i32, 20},
675 {Intrinsic::ctpop, MVT::v16i32, 20},
676 {Intrinsic::ctpop, MVT::nxv1i32, 20},
677 {Intrinsic::ctpop, MVT::nxv2i32, 20},
678 {Intrinsic::ctpop, MVT::nxv4i32, 20},
679 {Intrinsic::ctpop, MVT::nxv8i32, 20},
680 {Intrinsic::ctpop, MVT::nxv16i32, 20},
681 {Intrinsic::ctpop, MVT::v2i64, 21},
682 {Intrinsic::ctpop, MVT::v4i64, 21},
683 {Intrinsic::ctpop, MVT::v8i64, 21},
684 {Intrinsic::ctpop, MVT::v16i64, 21},
685 {Intrinsic::ctpop, MVT::nxv1i64, 21},
686 {Intrinsic::ctpop, MVT::nxv2i64, 21},
687 {Intrinsic::ctpop, MVT::nxv4i64, 21},
688 {Intrinsic::ctpop, MVT::nxv8i64, 21},
689 {Intrinsic::vp_ctpop, MVT::v2i8, 12},
690 {Intrinsic::vp_ctpop, MVT::v4i8, 12},
691 {Intrinsic::vp_ctpop, MVT::v8i8, 12},
692 {Intrinsic::vp_ctpop, MVT::v16i8, 12},
693 {Intrinsic::vp_ctpop, MVT::nxv1i8, 12},
694 {Intrinsic::vp_ctpop, MVT::nxv2i8, 12},
695 {Intrinsic::vp_ctpop, MVT::nxv4i8, 12},
696 {Intrinsic::vp_ctpop, MVT::nxv8i8, 12},
697 {Intrinsic::vp_ctpop, MVT::nxv16i8, 12},
698 {Intrinsic::vp_ctpop, MVT::v2i16, 19},
699 {Intrinsic::vp_ctpop, MVT::v4i16, 19},
700 {Intrinsic::vp_ctpop, MVT::v8i16, 19},
701 {Intrinsic::vp_ctpop, MVT::v16i16, 19},
702 {Intrinsic::vp_ctpop, MVT::nxv1i16, 19},
703 {Intrinsic::vp_ctpop, MVT::nxv2i16, 19},
704 {Intrinsic::vp_ctpop, MVT::nxv4i16, 19},
705 {Intrinsic::vp_ctpop, MVT::nxv8i16, 19},
706 {Intrinsic::vp_ctpop, MVT::nxv16i16, 19},
707 {Intrinsic::vp_ctpop, MVT::v2i32, 20},
708 {Intrinsic::vp_ctpop, MVT::v4i32, 20},
709 {Intrinsic::vp_ctpop, MVT::v8i32, 20},
710 {Intrinsic::vp_ctpop, MVT::v16i32, 20},
711 {Intrinsic::vp_ctpop, MVT::nxv1i32, 20},
712 {Intrinsic::vp_ctpop, MVT::nxv2i32, 20},
713 {Intrinsic::vp_ctpop, MVT::nxv4i32, 20},
714 {Intrinsic::vp_ctpop, MVT::nxv8i32, 20},
715 {Intrinsic::vp_ctpop, MVT::nxv16i32, 20},
716 {Intrinsic::vp_ctpop, MVT::v2i64, 21},
717 {Intrinsic::vp_ctpop, MVT::v4i64, 21},
718 {Intrinsic::vp_ctpop, MVT::v8i64, 21},
719 {Intrinsic::vp_ctpop, MVT::v16i64, 21},
720 {Intrinsic::vp_ctpop, MVT::nxv1i64, 21},
721 {Intrinsic::vp_ctpop, MVT::nxv2i64, 21},
722 {Intrinsic::vp_ctpop, MVT::nxv4i64, 21},
723 {Intrinsic::vp_ctpop, MVT::nxv8i64, 21},
724 {Intrinsic::vp_ctlz, MVT::v2i8, 19},
725 {Intrinsic::vp_ctlz, MVT::v4i8, 19},
726 {Intrinsic::vp_ctlz, MVT::v8i8, 19},
727 {Intrinsic::vp_ctlz, MVT::v16i8, 19},
728 {Intrinsic::vp_ctlz, MVT::nxv1i8, 19},
729 {Intrinsic::vp_ctlz, MVT::nxv2i8, 19},
730 {Intrinsic::vp_ctlz, MVT::nxv4i8, 19},
731 {Intrinsic::vp_ctlz, MVT::nxv8i8, 19},
732 {Intrinsic::vp_ctlz, MVT::nxv16i8, 19},
733 {Intrinsic::vp_ctlz, MVT::nxv32i8, 19},
734 {Intrinsic::vp_ctlz, MVT::nxv64i8, 19},
735 {Intrinsic::vp_ctlz, MVT::v2i16, 28},
736 {Intrinsic::vp_ctlz, MVT::v4i16, 28},
737 {Intrinsic::vp_ctlz, MVT::v8i16, 28},
738 {Intrinsic::vp_ctlz, MVT::v16i16, 28},
739 {Intrinsic::vp_ctlz, MVT::nxv1i16, 28},
740 {Intrinsic::vp_ctlz, MVT::nxv2i16, 28},
741 {Intrinsic::vp_ctlz, MVT::nxv4i16, 28},
742 {Intrinsic::vp_ctlz, MVT::nxv8i16, 28},
743 {Intrinsic::vp_ctlz, MVT::nxv16i16, 28},
744 {Intrinsic::vp_ctlz, MVT::nxv32i16, 28},
745 {Intrinsic::vp_ctlz, MVT::v2i32, 31},
746 {Intrinsic::vp_ctlz, MVT::v4i32, 31},
747 {Intrinsic::vp_ctlz, MVT::v8i32, 31},
748 {Intrinsic::vp_ctlz, MVT::v16i32, 31},
749 {Intrinsic::vp_ctlz, MVT::nxv1i32, 31},
750 {Intrinsic::vp_ctlz, MVT::nxv2i32, 31},
751 {Intrinsic::vp_ctlz, MVT::nxv4i32, 31},
752 {Intrinsic::vp_ctlz, MVT::nxv8i32, 31},
753 {Intrinsic::vp_ctlz, MVT::nxv16i32, 31},
754 {Intrinsic::vp_ctlz, MVT::v2i64, 35},
755 {Intrinsic::vp_ctlz, MVT::v4i64, 35},
756 {Intrinsic::vp_ctlz, MVT::v8i64, 35},
757 {Intrinsic::vp_ctlz, MVT::v16i64, 35},
758 {Intrinsic::vp_ctlz, MVT::nxv1i64, 35},
759 {Intrinsic::vp_ctlz, MVT::nxv2i64, 35},
760 {Intrinsic::vp_ctlz, MVT::nxv4i64, 35},
761 {Intrinsic::vp_ctlz, MVT::nxv8i64, 35},
762 {Intrinsic::vp_cttz, MVT::v2i8, 16},
763 {Intrinsic::vp_cttz, MVT::v4i8, 16},
764 {Intrinsic::vp_cttz, MVT::v8i8, 16},
765 {Intrinsic::vp_cttz, MVT::v16i8, 16},
766 {Intrinsic::vp_cttz, MVT::nxv1i8, 16},
767 {Intrinsic::vp_cttz, MVT::nxv2i8, 16},
768 {Intrinsic::vp_cttz, MVT::nxv4i8, 16},
769 {Intrinsic::vp_cttz, MVT::nxv8i8, 16},
770 {Intrinsic::vp_cttz, MVT::nxv16i8, 16},
771 {Intrinsic::vp_cttz, MVT::nxv32i8, 16},
772 {Intrinsic::vp_cttz, MVT::nxv64i8, 16},
773 {Intrinsic::vp_cttz, MVT::v2i16, 23},
774 {Intrinsic::vp_cttz, MVT::v4i16, 23},
775 {Intrinsic::vp_cttz, MVT::v8i16, 23},
776 {Intrinsic::vp_cttz, MVT::v16i16, 23},
777 {Intrinsic::vp_cttz, MVT::nxv1i16, 23},
778 {Intrinsic::vp_cttz, MVT::nxv2i16, 23},
779 {Intrinsic::vp_cttz, MVT::nxv4i16, 23},
780 {Intrinsic::vp_cttz, MVT::nxv8i16, 23},
781 {Intrinsic::vp_cttz, MVT::nxv16i16, 23},
782 {Intrinsic::vp_cttz, MVT::nxv32i16, 23},
783 {Intrinsic::vp_cttz, MVT::v2i32, 24},
784 {Intrinsic::vp_cttz, MVT::v4i32, 24},
785 {Intrinsic::vp_cttz, MVT::v8i32, 24},
786 {Intrinsic::vp_cttz, MVT::v16i32, 24},
787 {Intrinsic::vp_cttz, MVT::nxv1i32, 24},
788 {Intrinsic::vp_cttz, MVT::nxv2i32, 24},
789 {Intrinsic::vp_cttz, MVT::nxv4i32, 24},
790 {Intrinsic::vp_cttz, MVT::nxv8i32, 24},
791 {Intrinsic::vp_cttz, MVT::nxv16i32, 24},
792 {Intrinsic::vp_cttz, MVT::v2i64, 25},
793 {Intrinsic::vp_cttz, MVT::v4i64, 25},
794 {Intrinsic::vp_cttz, MVT::v8i64, 25},
795 {Intrinsic::vp_cttz, MVT::v16i64, 25},
796 {Intrinsic::vp_cttz, MVT::nxv1i64, 25},
797 {Intrinsic::vp_cttz, MVT::nxv2i64, 25},
798 {Intrinsic::vp_cttz, MVT::nxv4i64, 25},
799 {Intrinsic::vp_cttz, MVT::nxv8i64, 25},
800};
801
803 switch (ID) {
804#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
805 case Intrinsic::VPID: \
806 return ISD::VPSD;
807#include "llvm/IR/VPIntrinsics.def"
808#undef HELPER_MAP_VPID_TO_VPSD
809 }
810 return ISD::DELETED_NODE;
811}
812
816 auto *RetTy = ICA.getReturnType();
817 switch (ICA.getID()) {
818 case Intrinsic::ceil:
819 case Intrinsic::floor:
820 case Intrinsic::trunc:
821 case Intrinsic::rint:
822 case Intrinsic::round:
823 case Intrinsic::roundeven: {
824 // These all use the same code.
826 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
827 return LT.first * 8;
828 break;
829 }
830 case Intrinsic::umin:
831 case Intrinsic::umax:
832 case Intrinsic::smin:
833 case Intrinsic::smax: {
835 if ((ST->hasVInstructions() && LT.second.isVector()) ||
836 (LT.second.isScalarInteger() && ST->hasStdExtZbb()))
837 return LT.first;
838 break;
839 }
840 case Intrinsic::sadd_sat:
841 case Intrinsic::ssub_sat:
842 case Intrinsic::uadd_sat:
843 case Intrinsic::usub_sat: {
845 if (ST->hasVInstructions() && LT.second.isVector())
846 return LT.first;
847 break;
848 }
849 case Intrinsic::abs: {
851 if (ST->hasVInstructions() && LT.second.isVector()) {
852 // vrsub.vi v10, v8, 0
853 // vmax.vv v8, v8, v10
854 return LT.first * 2;
855 }
856 break;
857 }
858 case Intrinsic::fabs:
859 case Intrinsic::sqrt: {
861 if (ST->hasVInstructions() && LT.second.isVector())
862 return LT.first;
863 break;
864 }
865 // TODO: add more intrinsic
866 case Intrinsic::experimental_stepvector: {
867 unsigned Cost = 1; // vid
869 return Cost + (LT.first - 1);
870 }
871 case Intrinsic::vp_rint: {
872 // RISC-V target uses at least 5 instructions to lower rounding intrinsics.
873 unsigned Cost = 5;
875 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
876 return Cost * LT.first;
877 break;
878 }
879 case Intrinsic::vp_nearbyint: {
880 // More one read and one write for fflags than vp_rint.
881 unsigned Cost = 7;
883 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
884 return Cost * LT.first;
885 break;
886 }
887 case Intrinsic::vp_ceil:
888 case Intrinsic::vp_floor:
889 case Intrinsic::vp_round:
890 case Intrinsic::vp_roundeven:
891 case Intrinsic::vp_roundtozero: {
892 // Rounding with static rounding mode needs two more instructions to
893 // swap/write FRM than vp_rint.
894 unsigned Cost = 7;
896 unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
897 if (TLI->isOperationCustom(VPISD, LT.second))
898 return Cost * LT.first;
899 break;
900 }
901 }
902
903 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
905 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
906 ICA.getID(), LT.second))
907 return LT.first * Entry->Cost;
908 }
909
911}
912
914 Type *Src,
917 const Instruction *I) {
918 if (isa<VectorType>(Dst) && isa<VectorType>(Src)) {
919 // FIXME: Need to compute legalizing cost for illegal types.
920 if (!isTypeLegal(Src) || !isTypeLegal(Dst))
921 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
922
923 // Skip if element size of Dst or Src is bigger than ELEN.
924 if (Src->getScalarSizeInBits() > ST->getELEN() ||
925 Dst->getScalarSizeInBits() > ST->getELEN())
926 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
927
928 int ISD = TLI->InstructionOpcodeToISD(Opcode);
929 assert(ISD && "Invalid opcode");
930
931 // FIXME: Need to consider vsetvli and lmul.
932 int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
933 (int)Log2_32(Src->getScalarSizeInBits());
934 switch (ISD) {
935 case ISD::SIGN_EXTEND:
936 case ISD::ZERO_EXTEND:
937 if (Src->getScalarSizeInBits() == 1) {
938 // We do not use vsext/vzext to extend from mask vector.
939 // Instead we use the following instructions to extend from mask vector:
940 // vmv.v.i v8, 0
941 // vmerge.vim v8, v8, -1, v0
942 return 2;
943 }
944 return 1;
945 case ISD::TRUNCATE:
946 if (Dst->getScalarSizeInBits() == 1) {
947 // We do not use several vncvt to truncate to mask vector. So we could
948 // not use PowDiff to calculate it.
949 // Instead we use the following instructions to truncate to mask vector:
950 // vand.vi v8, v8, 1
951 // vmsne.vi v0, v8, 0
952 return 2;
953 }
954 [[fallthrough]];
955 case ISD::FP_EXTEND:
956 case ISD::FP_ROUND:
957 // Counts of narrow/widen instructions.
958 return std::abs(PowDiff);
959 case ISD::FP_TO_SINT:
960 case ISD::FP_TO_UINT:
961 case ISD::SINT_TO_FP:
962 case ISD::UINT_TO_FP:
963 if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
964 // The cost of convert from or to mask vector is different from other
965 // cases. We could not use PowDiff to calculate it.
966 // For mask vector to fp, we should use the following instructions:
967 // vmv.v.i v8, 0
968 // vmerge.vim v8, v8, -1, v0
969 // vfcvt.f.x.v v8, v8
970
971 // And for fp vector to mask, we use:
972 // vfncvt.rtz.x.f.w v9, v8
973 // vand.vi v8, v9, 1
974 // vmsne.vi v0, v8, 0
975 return 3;
976 }
977 if (std::abs(PowDiff) <= 1)
978 return 1;
979 // Backend could lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
980 // so it only need two conversion.
981 if (Src->isIntOrIntVectorTy())
982 return 2;
983 // Counts of narrow/widen instructions.
984 return std::abs(PowDiff);
985 }
986 }
987 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
988}
989
990unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
991 if (isa<ScalableVectorType>(Ty)) {
992 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
993 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
994 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
995 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
996 }
997 return cast<FixedVectorType>(Ty)->getNumElements();
998}
999
1002 bool IsUnsigned,
1004 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1005 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
1006
1007 // Skip if scalar size of Ty is bigger than ELEN.
1008 if (Ty->getScalarSizeInBits() > ST->getELEN())
1009 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
1010
1011 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1012 if (Ty->getElementType()->isIntegerTy(1))
1013 // vcpop sequences, see vreduction-mask.ll. umax, smin actually only
1014 // cost 2, but we don't have enough info here so we slightly over cost.
1015 return (LT.first - 1) + 3;
1016
1017 // IR Reduction is composed by two vmv and one rvv reduction instruction.
1018 InstructionCost BaseCost = 2;
1019 unsigned VL = getEstimatedVLFor(Ty);
1020 return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
1021}
1022
1025 std::optional<FastMathFlags> FMF,
1027 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1028 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1029
1030 // Skip if scalar size of Ty is bigger than ELEN.
1031 if (Ty->getScalarSizeInBits() > ST->getELEN())
1032 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1033
1034 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1035 assert(ISD && "Invalid opcode");
1036
1037 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1038 ISD != ISD::FADD)
1039 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1040
1041 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1042 if (Ty->getElementType()->isIntegerTy(1))
1043 // vcpop sequences, see vreduction-mask.ll
1044 return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);
1045
1046 // IR Reduction is composed by two vmv and one rvv reduction instruction.
1047 InstructionCost BaseCost = 2;
1048 unsigned VL = getEstimatedVLFor(Ty);
1050 return (LT.first - 1) + BaseCost + VL;
1051 return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
1052}
1053
1055 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1056 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) {
1057 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1058 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1059 FMF, CostKind);
1060
1061 // Skip if scalar size of ResTy is bigger than ELEN.
1062 if (ResTy->getScalarSizeInBits() > ST->getELEN())
1063 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1064 FMF, CostKind);
1065
1066 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1067 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1068 FMF, CostKind);
1069
1070 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1071
1072 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1073 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1074 FMF, CostKind);
1075
1076 return (LT.first - 1) +
1077 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1078}
1079
1081 TTI::OperandValueInfo OpInfo,
1083 assert(OpInfo.isConstant() && "non constant operand?");
1084 if (!isa<VectorType>(Ty))
1085 // FIXME: We need to account for immediate materialization here, but doing
1086 // a decent job requires more knowledge about the immediate than we
1087 // currently have here.
1088 return 0;
1089
1090 if (OpInfo.isUniform())
1091 // vmv.x.i, vmv.v.x, or vfmv.v.f
1092 // We ignore the cost of the scalar constant materialization to be consistent
1093 // with how we treat scalar constants themselves just above.
1094 return 1;
1095
1096 // Add a cost of address generation + the cost of the vector load. The
1097 // address is expected to be a PC relative offset to a constant pool entry
1098 // using auipc/addi.
1099 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
1100 /*AddressSpace=*/0, CostKind);
1101}
1102
1103
1105 MaybeAlign Alignment,
1106 unsigned AddressSpace,
1108 TTI::OperandValueInfo OpInfo,
1109 const Instruction *I) {
1111 if (Opcode == Instruction::Store && OpInfo.isConstant())
1112 Cost += getStoreImmCost(Src, OpInfo, CostKind);
1113 return Cost + BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1114 CostKind, OpInfo, I);
1115}
1116
1118 Type *CondTy,
1119 CmpInst::Predicate VecPred,
1121 const Instruction *I) {
1123 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1124 I);
1125
1126 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1127 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1128 I);
1129
1130 // Skip if scalar size of ValTy is bigger than ELEN.
1131 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELEN())
1132 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1133 I);
1134
1135 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1136 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1137 if (CondTy->isVectorTy()) {
1138 if (ValTy->getScalarSizeInBits() == 1) {
1139 // vmandn.mm v8, v8, v9
1140 // vmand.mm v9, v0, v9
1141 // vmor.mm v0, v9, v8
1142 return LT.first * 3;
1143 }
1144 // vselect and max/min are supported natively.
1145 return LT.first * 1;
1146 }
1147
1148 if (ValTy->getScalarSizeInBits() == 1) {
1149 // vmv.v.x v9, a0
1150 // vmsne.vi v9, v9, 0
1151 // vmandn.mm v8, v8, v9
1152 // vmand.mm v9, v0, v9
1153 // vmor.mm v0, v9, v8
1154 return LT.first * 5;
1155 }
1156
1157 // vmv.v.x v10, a0
1158 // vmsne.vi v0, v10, 0
1159 // vmerge.vvm v8, v9, v8, v0
1160 return LT.first * 3;
1161 }
1162
1163 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1164 ValTy->isVectorTy()) {
1165 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1166
1167 // Support natively.
1168 if (CmpInst::isIntPredicate(VecPred))
1169 return LT.first * 1;
1170
1171 // If we do not support the input floating point vector type, use the base
1172 // one which will calculate as:
1173 // ScalarizeCost + Num * Cost for fixed vector,
1174 // InvalidCost for scalable vector.
1175 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
1176 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
1177 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
1178 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1179 I);
1180 switch (VecPred) {
1181 // Support natively.
1182 case CmpInst::FCMP_OEQ:
1183 case CmpInst::FCMP_OGT:
1184 case CmpInst::FCMP_OGE:
1185 case CmpInst::FCMP_OLT:
1186 case CmpInst::FCMP_OLE:
1187 case CmpInst::FCMP_UNE:
1188 return LT.first * 1;
1189 // TODO: Other comparisons?
1190 default:
1191 break;
1192 }
1193 }
1194
1195 // TODO: Add cost for scalar type.
1196
1197 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1198}
1199
1202 unsigned Index, Value *Op0,
1203 Value *Op1) {
1204 assert(Val->isVectorTy() && "This must be a vector type");
1205
1206 if (Opcode != Instruction::ExtractElement &&
1207 Opcode != Instruction::InsertElement)
1208 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1209
1210 // Legalize the type.
1211 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
1212
1213 // This type is legalized to a scalar type.
1214 if (!LT.second.isVector())
1215 return 0;
1216
1217 // For unsupported scalable vector.
1218 if (LT.second.isScalableVector() && !LT.first.isValid())
1219 return LT.first;
1220
1221 if (!isTypeLegal(Val))
1222 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1223
1224 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
1225 // and vslideup + vmv.s.x to insert element to vector.
1226 unsigned BaseCost = 1;
1227 // When insertelement we should add the index with 1 as the input of vslideup.
1228 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
1229
1230 if (Index != -1U) {
1231 // The type may be split. For fixed-width vectors we can normalize the
1232 // index to the new type.
1233 if (LT.second.isFixedLengthVector()) {
1234 unsigned Width = LT.second.getVectorNumElements();
1235 Index = Index % Width;
1236 }
1237
1238 // We could extract/insert the first element without vslidedown/vslideup.
1239 if (Index == 0)
1240 SlideCost = 0;
1241 else if (Opcode == Instruction::InsertElement)
1242 SlideCost = 1; // With a constant index, we do not need to use addi.
1243 }
1244
1245 // Mask vector extract/insert element is different from normal case.
1246 if (Val->getScalarSizeInBits() == 1) {
1247 // For extractelement, we need the following instructions:
1248 // vmv.v.i v8, 0
1249 // vmerge.vim v8, v8, 1, v0
1250 // vsetivli zero, 1, e8, m2, ta, mu (not count)
1251 // vslidedown.vx v8, v8, a0
1252 // vmv.x.s a0, v8
1253
1254 // For insertelement, we need the following instructions:
1255 // vsetvli a2, zero, e8, m1, ta, mu (not count)
1256 // vmv.s.x v8, a0
1257 // vmv.v.i v9, 0
1258 // vmerge.vim v9, v9, 1, v0
1259 // addi a0, a1, 1
1260 // vsetvli zero, a0, e8, m1, tu, mu (not count)
1261 // vslideup.vx v9, v8, a1
1262 // vsetvli a0, zero, e8, m1, ta, mu (not count)
1263 // vand.vi v8, v9, 1
1264 // vmsne.vi v0, v8, 0
1265
1266 // TODO: should we count these special vsetvlis?
1267 BaseCost = Opcode == Instruction::InsertElement ? 5 : 3;
1268 }
1269 // Extract i64 in the target that has XLEN=32 need more instruction.
1270 if (Val->getScalarType()->isIntegerTy() &&
1271 ST->getXLen() < Val->getScalarSizeInBits()) {
1272 // For extractelement, we need the following instructions:
1273 // vsetivli zero, 1, e64, m1, ta, mu (not count)
1274 // vslidedown.vx v8, v8, a0
1275 // vmv.x.s a0, v8
1276 // li a1, 32
1277 // vsrl.vx v8, v8, a1
1278 // vmv.x.s a1, v8
1279
1280 // For insertelement, we need the following instructions:
1281 // vsetivli zero, 2, e32, m4, ta, mu (not count)
1282 // vmv.v.i v12, 0
1283 // vslide1up.vx v16, v12, a1
1284 // vslide1up.vx v12, v16, a0
1285 // addi a0, a2, 1
1286 // vsetvli zero, a0, e64, m4, tu, mu (not count)
1287 // vslideup.vx v8, v12, a2
1288
1289 // TODO: should we count these special vsetvlis?
1290 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
1291 }
1292 return BaseCost + SlideCost;
1293}
1294
1296 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1298 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1299
1300 // TODO: Handle more cost kinds.
1302 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1303 Args, CxtI);
1304
1305 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1306 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1307 Args, CxtI);
1308
1309 // Skip if scalar size of Ty is bigger than ELEN.
1310 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELEN())
1311 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1312 Args, CxtI);
1313
1314 // Legalize the type.
1315 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1316
1317 // TODO: Handle scalar type.
1318 if (!LT.second.isVector())
1319 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1320 Args, CxtI);
1321
1322
1323 auto getConstantMatCost =
1324 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
1325 if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
1326 // Two sub-cases:
1327 // * Has a 5 bit immediate operand which can be splatted.
1328 // * Has a larger immediate which must be materialized in scalar register
1329 // We return 0 for both as we currently ignore the cost of materializing
1330 // scalar constants in GPRs.
1331 return 0;
1332
1333 // Add a cost of address generation + the cost of the vector load. The
1334 // address is expected to be a PC relative offset to a constant pool entry
1335 // using auipc/addi.
1336 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
1337 /*AddressSpace=*/0, CostKind);
1338 };
1339
1340 // Add the cost of materializing any constant vectors required.
1341 InstructionCost ConstantMatCost = 0;
1342 if (Op1Info.isConstant())
1343 ConstantMatCost += getConstantMatCost(0, Op1Info);
1344 if (Op2Info.isConstant())
1345 ConstantMatCost += getConstantMatCost(1, Op2Info);
1346
1347 switch (TLI->InstructionOpcodeToISD(Opcode)) {
1348 case ISD::ADD:
1349 case ISD::SUB:
1350 case ISD::AND:
1351 case ISD::OR:
1352 case ISD::XOR:
1353 case ISD::SHL:
1354 case ISD::SRL:
1355 case ISD::SRA:
1356 case ISD::MUL:
1357 case ISD::MULHS:
1358 case ISD::MULHU:
1359 case ISD::FADD:
1360 case ISD::FSUB:
1361 case ISD::FMUL:
1362 case ISD::FNEG: {
1363 return ConstantMatCost + getLMULCost(LT.second) * LT.first * 1;
1364 }
1365 default:
1366 return ConstantMatCost +
1367 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1368 Args, CxtI);
1369 }
1370}
1371
1375 // TODO: More tuning on benchmarks and metrics with changes as needed
1376 // would apply to all settings below to enable performance.
1377
1378
1379 if (ST->enableDefaultUnroll())
1380 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
1381
1382 // Enable Upper bound unrolling universally, not dependant upon the conditions
1383 // below.
1384 UP.UpperBound = true;
1385
1386 // Disable loop unrolling for Oz and Os.
1387 UP.OptSizeThreshold = 0;
1389 if (L->getHeader()->getParent()->hasOptSize())
1390 return;
1391
1392 SmallVector<BasicBlock *, 4> ExitingBlocks;
1393 L->getExitingBlocks(ExitingBlocks);
1394 LLVM_DEBUG(dbgs() << "Loop has:\n"
1395 << "Blocks: " << L->getNumBlocks() << "\n"
1396 << "Exit blocks: " << ExitingBlocks.size() << "\n");
1397
1398 // Only allow another exit other than the latch. This acts as an early exit
1399 // as it mirrors the profitability calculation of the runtime unroller.
1400 if (ExitingBlocks.size() > 2)
1401 return;
1402
1403 // Limit the CFG of the loop body for targets with a branch predictor.
1404 // Allowing 4 blocks permits if-then-else diamonds in the body.
1405 if (L->getNumBlocks() > 4)
1406 return;
1407
1408 // Don't unroll vectorized loops, including the remainder loop
1409 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
1410 return;
1411
1412 // Scan the loop: don't unroll loops with calls as this could prevent
1413 // inlining.
1415 for (auto *BB : L->getBlocks()) {
1416 for (auto &I : *BB) {
1417 // Initial setting - Don't unroll loops containing vectorized
1418 // instructions.
1419 if (I.getType()->isVectorTy())
1420 return;
1421
1422 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1423 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1424 if (!isLoweredToCall(F))
1425 continue;
1426 }
1427 return;
1428 }
1429
1430 SmallVector<const Value *> Operands(I.operand_values());
1433 }
1434 }
1435
1436 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1437
1438 UP.Partial = true;
1439 UP.Runtime = true;
1440 UP.UnrollRemainder = true;
1441 UP.UnrollAndJam = true;
1443
1444 // Force unrolling small loops can be very useful because of the branch
1445 // taken cost of the backedge.
1446 if (Cost < 12)
1447 UP.Force = true;
1448}
1449
1453}
1454
1457 if (Ty->isVectorTy()) {
1458 if (Size.isScalable() && ST->hasVInstructions())
1459 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
1460
1462 return divideCeil(Size, ST->getRealMinVLen());
1463 }
1464
1465 return BaseT::getRegUsageForType(Ty);
1466}
1467
1468unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1469 // This interface is currently only used by SLP. Returning 1 (which is the
1470 // default value for SLPMaxVF) disables SLP. We currently have a cost modeling
1471 // problem w/ constant materialization which causes SLP to perform majorly
1472 // unprofitable transformations.
1473 // TODO: Figure out constant materialization cost modeling and remove.
1474 return SLPMaxVF;
1475}
1476
1478 const TargetTransformInfo::LSRCost &C2) {
1479 // RISCV specific here are "instruction number 1st priority".
1480 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
1481 C1.NumIVMuls, C1.NumBaseAdds,
1482 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
1483 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
1484 C2.NumIVMuls, C2.NumBaseAdds,
1485 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
1486}
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Size
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
static cl::opt< unsigned > RVVRegisterWidthLMUL("riscv-v-register-bit-width-lmul", cl::desc("The LMUL to use for getRegisterBitWidth queries. Affects LMUL used " "by autovectorized code. Fractional LMULs are not supported."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > SLPMaxVF("riscv-v-slp-max-vf", cl::desc("Result used for getMaximumVF query which is used exclusively by " "SLP vectorizer. Defaults to 1 which disables SLP."), cl::init(1), cl::Hidden)
static const CostTblEntry VectorIntrinsicCostTable[]
static bool canUseShiftPair(Instruction *Inst, const APInt &Imm)
static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID)
This file defines a TargetTransformInfo::Concept conforming object specific to the RISC-V target mach...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition: APInt.h:75
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:112
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:538
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
std::optional< unsigned > getVScaleForTuning() const
Definition: BasicTTIImpl.h:715
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:849
std::optional< unsigned > getMaxVScale() const
Definition: BasicTTIImpl.h:714
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
Definition: BasicTTIImpl.h:963
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:610
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:813
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:993
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:718
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:721
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:724
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:722
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:723
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:725
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:734
bool isIntPredicate() const
Definition: InstrTypes.h:826
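A quick sketch of how these predicate values are typically inspected; this is generic CmpInst usage, not logic from this file:

#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// FCMP_OLT is "ordered and less than"; isIntPredicate() distinguishes the
// integer ICMP_* predicates from the floating-point FCMP_* ones.
bool isOrderedFloatLessThan(CmpInst::Predicate Pred) {
  return !CmpInst::isIntPredicate(Pred) && Pred == CmpInst::FCMP_OLT;
}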
A parsed version of the target data layout string and methods for querying it.
Definition: DataLayout.h:114
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:836
TypeSize getTypeSizeInBits(Type *Ty) const
Returns the number of bits needed to hold the specified type.
Definition: DataLayout.h:676
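A minimal sketch of the two DataLayout queries above; the layout string is an assumed RV64-style string used only for illustration:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;

void dataLayoutQueries() {
  LLVMContext Ctx;
  DataLayout DL("e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"); // assumed layout string
  Type *I64 = Type::getInt64Ty(Ctx);
  Align ABIAlign = DL.getABITypeAlign(I64);   // minimum ABI alignment: 8 bytes here
  TypeSize Bits = DL.getTypeSizeInBits(I64);  // 64 bits, fixed (not scalable)
  (void)ABIAlign;
  (void)Bits;
}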
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:644
static InstructionCost getInvalid(CostType Val=0)
bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
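A short sketch of the IntrinsicInst wrapper in use; the choice of llvm.fmuladd is arbitrary:

#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;

// Returns true if I is a call to the llvm.fmuladd intrinsic.
bool isFmulAddCall(const Instruction &I) {
  if (const auto *II = dyn_cast<IntrinsicInst>(&I))
    return II->getIntrinsicID() == Intrinsic::fmuladd;
  return false;
}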
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:202
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
Definition: LoopInfoImpl.h:33
BlockT * getHeader() const
Definition: LoopInfo.h:105
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:188
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:547
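A small sketch of these Loop queries, of the kind a cost heuristic might perform; the block-count threshold is made up:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
using namespace llvm;

// True if the loop has a single exiting block and a small body.
bool isSmallSingleExitLoop(const Loop &L) {
  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L.getExitingBlocks(ExitingBlocks); // blocks with successors outside the loop
  return ExitingBlocks.size() == 1 && L.getNumBlocks() <= 4;
}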
Machine Value Type.
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
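A minimal sketch of the MVT queries above; note that MachineValueType.h has moved between llvm/Support and llvm/CodeGen across LLVM versions, so the include path may need adjusting:

#include "llvm/Support/MachineValueType.h" // llvm/CodeGen/MachineValueType.h in newer trees
using namespace llvm;

void mvtQueries() {
  MVT Scalable = MVT::nxv4i32;                   // <vscale x 4 x i32>
  bool IsVec = Scalable.isVector();              // true
  bool IsScalable = Scalable.isScalableVector(); // true: length scales with vscale
  unsigned MinBits = Scalable.getSizeInBits().getKnownMinValue(); // 128 (minimum)
  (void)IsVec; (void)IsScalable; (void)MinBits;
}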
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition: Operator.h:41
The optimization diagnostic interface.
bool hasVInstructionsF64() const
unsigned getRealMinVLen() const
bool useRVVForFixedLengthVectors() const
unsigned getXLen() const
unsigned getELEN() const
bool hasVInstructionsF16() const
bool hasVInstructions() const
unsigned getRealMaxVLen() const
bool hasVInstructionsF32() const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind)
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getSpliceCost(VectorType *Tp, int Index)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
bool isLegalMaskedLoadStore(Type *DataType, Align Alignment)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
std::optional< unsigned > getVScaleForTuning() const
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
std::optional< unsigned > getMaxVScale() const
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
bool shouldExpandReduction(const IntrinsicInst *II) const
InstructionCost getStoreImmCost(Type *VecTy, TTI::OperandValueInfo OpInfo, TTI::TargetCostKind CostKind)
Return the cost of materializing an immediate for a value operand of a store instruction.
unsigned getRegUsageForType(Type *Ty)
bool isLegalMaskedGather(Type *DataType, Align Alignment)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpdInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isLegalMaskedScatter(Type *DataType, Align Alignment)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
bool canSplatOperand(Instruction *I, int Operand) const
Return true if the (vector) instruction I will be lowered to an instruction with a scalar splat opera...
static unsigned computeVLMAX(unsigned VectorBits, unsigned EltSize, unsigned MinSize)
static RISCVII::VLMUL getLMUL(MVT VT)
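The RISCVTTIImpl hooks above are reached through the TargetTransformInfo analysis result; a minimal sketch of such a query (the opcode and cost kind are illustrative):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Cost of an integer add on type Ty, measured as reciprocal throughput.
InstructionCost addCostFor(const TargetTransformInfo &TTI, Type *Ty) {
  return TTI.getArithmeticInstrCost(Instruction::Add, Ty,
                                    TargetTransformInfo::TCK_RecipThroughput);
}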
The main scalar evolution driver.
size_t size() const
Definition: SmallVector.h:91
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
const DataLayout & getDataLayout() const
bool isLoweredToCall(const Function *F) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind)
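A hedged sketch of combining InstructionOpcodeToISD and isOperationCustom, assuming a TargetLoweringBase reference obtained from the subtarget:

#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
using namespace llvm;

// True if the target uses custom lowering for the ISD node corresponding to
// the given IR opcode on value type VT.
bool usesCustomLowering(const TargetLoweringBase &TLI, unsigned IROpcode, EVT VT) {
  int ISD = TLI.InstructionOpcodeToISD(IROpcode); // e.g. Instruction::Add -> ISD::ADD
  return TLI.isOperationCustom(ISD, VT);
}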
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
CastContextHint
Represents a hint about the context in which a cast is used.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:322
static constexpr TypeSize getScalable(ScalarTy MinimunSize)
Definition: TypeSize.h:325
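A minimal sketch of fixed versus scalable TypeSize values; getKnownMinValue() (documented further below) returns the minimum size, which is exact for fixed sizes:

#include "llvm/Support/TypeSize.h"
#include <cstdint>
using namespace llvm;

void typeSizeExamples() {
  TypeSize Fixed = TypeSize::getFixed(128);      // exactly 128 bits
  TypeSize Scalable = TypeSize::getScalable(64); // 64 bits multiplied by vscale
  bool IsScalable = Scalable.isScalable();       // true
  uint64_t MinBits = Scalable.getKnownMinValue(); // 64
  (void)Fixed; (void)IsScalable; (void)MinBits;
}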
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:258
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:222
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:341
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
Type * getElementType() const
Definition: DerivedTypes.h:422
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:163
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:773
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:760
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:910
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:637
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:691
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:763
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:870
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:819
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:666
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:852
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:769
int getIntMatCost(const APInt &Val, unsigned Size, const FeatureBitset &ActiveFeatures, bool CompressionCost)
std::pair< unsigned, bool > decodeVLMUL(RISCVII::VLMUL VLMUL)
static constexpr unsigned RVVBitsPerBlock
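A sketch of decoding LMUL encodings; this relies on the backend-internal MCTargetDesc/RISCVBaseInfo.h header, so it only builds inside the RISC-V target:

#include "MCTargetDesc/RISCVBaseInfo.h"

void decodeLMULExamples() {
  // LMUL_2 is a whole (non-fractional) group multiplier of 2.
  auto [LMul, Fractional] =
      llvm::RISCVVType::decodeVLMUL(llvm::RISCVII::VLMUL::LMUL_2);  // {2, false}
  // LMUL_F4 is the fractional multiplier 1/4.
  auto [Denominator, IsFractional] =
      llvm::RISCVVType::decodeVLMUL(llvm::RISCVII::VLMUL::LMUL_F4); // {4, true}
  (void)LMul; (void)Fractional; (void)Denominator; (void)IsFractional;
  // RISCV::RVVBitsPerBlock (64) is the per-block bit count used when sizing
  // scalable RVV types.
}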
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
The llvm namespace, which encloses the entire LLVM API.
Definition: AddressRanges.h:18
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition: MathExtras.h:386
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
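The canonical CostTableLookup pattern maps an IR opcode to an ISD node and then consults a static per-type table; the entries below are made-up costs for illustration, not real RISC-V numbers:

#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
using namespace llvm;

unsigned lookupExampleCost(int ISD, MVT VT) {
  static const CostTblEntry ExampleTbl[] = {
      {ISD::ADD, MVT::v4i32, 1}, // assumed cost
      {ISD::MUL, MVT::v4i32, 3}, // assumed cost
  };
  if (const auto *Entry = CostTableLookup(ExampleTbl, ISD, VT))
    return Entry->Cost;
  return 0; // no entry: the caller would fall back to a generic estimate
}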
bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
Definition: LoopInfo.cpp:1085
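A one-line sketch of reading a boolean loop attribute; "llvm.loop.isvectorized" is a standard loop-metadata key, used here purely as an example:

#include "llvm/Analysis/LoopInfo.h"
using namespace llvm;

// True if the loop is already marked as vectorized in its metadata.
bool alreadyVectorized(const Loop *L) {
  return getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
}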
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:508
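A quick sketch of the two ceiling helpers (Log2_32_Ceil above and divideCeil here), with expected results in the asserts:

#include "llvm/Support/MathExtras.h"
#include <cassert>
using namespace llvm;

void ceilingHelpers() {
  assert(Log2_32_Ceil(33) == 6); // smallest n with 2^n >= 33
  assert(Log2_32_Ceil(32) == 5);
  assert(divideCeil(7, 2) == 4); // integer ceil(7 / 2)
}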
AddressSpace
Definition: NVPTXBaseInfo.h:21
int countr_zero(T Val)
Count the number of 0 bits from the least significant bit upward, stopping at the first 1.
Definition: bit.h:179
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:282
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:373
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:288
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:291
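A compact sketch of the bit-manipulation helpers listed above (countr_zero, isShiftedMask_64, Log2_32, isPowerOf2_32, bit_floor), with expected results in the asserts:

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
using namespace llvm;

void bitHelpers() {
  assert(countr_zero(0x8u) == 3);              // 0b1000 has three trailing zeros
  assert(isShiftedMask_64(UINT64_C(0x0FF0)));  // one contiguous run of ones
  assert(!isShiftedMask_64(UINT64_C(0x0F0F))); // two separate runs
  assert(Log2_32(32) == 5);                    // floor log2
  assert(isPowerOf2_32(64) && !isPowerOf2_32(48));
  assert(bit_floor(10u) == 8u);                // largest power of two <= 10
}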
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
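A small sketch contrasting Align and MaybeAlign:

#include "llvm/Support/Alignment.h"
using namespace llvm;

void alignmentExamples() {
  Align A(16);                      // always a valid power-of-two alignment
  MaybeAlign MA;                    // default-constructed: alignment unknown
  Align Resolved = MA.valueOrOne(); // falls back to 1-byte alignment when unset
  (void)A; (void)Resolved;
}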
unsigned Insns
TODO: Some of these could be merged.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
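As a hypothetical sketch, a target hook might populate these preferences roughly as follows; the particular choices are illustrative and are not the RISC-V policy implemented in this file:

#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

void tunePreferences(TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Partial = true;         // allow partial unrolling
  UP.Runtime = true;         // allow runtime unrolling
  UP.UpperBound = true;      // may unroll using the trip-count upper bound
  UP.UnrollRemainder = true; // allow unrolling the runtime remainder loop
  UP.OptSizeThreshold = 0;           // do not unroll under -Os / -Oz
  UP.PartialOptSizeThreshold = 0;
}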