1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "RISCVTargetTransformInfo.h"
10#include "MCTargetDesc/RISCVMatInt.h"
11#include "llvm/ADT/STLExtras.h"
12#include "llvm/Analysis/TargetTransformInfo.h"
13#include "llvm/CodeGen/BasicTTIImpl.h"
14#include "llvm/CodeGen/CostTable.h"
15#include "llvm/CodeGen/TargetLowering.h"
16#include "llvm/IR/Instructions.h"
17#include <cmath>
18#include <optional>
19using namespace llvm;
20
21#define DEBUG_TYPE "riscvtti"
22
23static cl::opt<unsigned> RVVRegisterWidthLMUL(
24 "riscv-v-register-bit-width-lmul",
25 cl::desc(
26 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
27 "by autovectorized code. Fractional LMULs are not supported."),
28 cl::init(2), cl::Hidden);
29
30static cl::opt<unsigned> SLPMaxVF(
31 "riscv-v-slp-max-vf",
32 cl::desc(
33 "Overrides result used for getMaximumVF query which is used "
34 "exclusively by SLP vectorizer."),
35 cl::Hidden);
36
37InstructionCost
38RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
39 TTI::TargetCostKind CostKind) {
40 // Check if the type is valid for all CostKind
41 if (!VT.isVector())
42 return InstructionCost::getInvalid();
43 size_t NumInstr = OpCodes.size();
44 if (CostKind == TTI::TCK_CodeSize)
45 return NumInstr;
46 InstructionCost LMULCost = TLI->getLMULCost(VT);
47 if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
48 return LMULCost * NumInstr;
49 InstructionCost Cost = 0;
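// For throughput and latency, refine the cost per opcode: gathers, slides and
// reductions get dedicated estimates, scalar<->vector moves and mask ops count
// as a single instruction, and everything else falls back to the LMUL-based
// cost.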
50 for (auto Op : OpCodes) {
51 switch (Op) {
52 case RISCV::VRGATHER_VI:
53 Cost += TLI->getVRGatherVICost(VT);
54 break;
55 case RISCV::VRGATHER_VV:
56 Cost += TLI->getVRGatherVVCost(VT);
57 break;
58 case RISCV::VSLIDEUP_VI:
59 case RISCV::VSLIDEDOWN_VI:
60 Cost += TLI->getVSlideVICost(VT);
61 break;
62 case RISCV::VSLIDEUP_VX:
63 case RISCV::VSLIDEDOWN_VX:
64 Cost += TLI->getVSlideVXCost(VT);
65 break;
66 case RISCV::VREDMAX_VS:
67 case RISCV::VREDMIN_VS:
68 case RISCV::VREDMAXU_VS:
69 case RISCV::VREDMINU_VS:
70 case RISCV::VREDSUM_VS:
71 case RISCV::VREDAND_VS:
72 case RISCV::VREDOR_VS:
73 case RISCV::VREDXOR_VS:
74 case RISCV::VFREDMAX_VS:
75 case RISCV::VFREDMIN_VS:
76 case RISCV::VFREDUSUM_VS: {
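// Unordered reductions can be evaluated as a tree of element-wise operations,
// so their cost is modeled as log2 of the (estimated) number of elements.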
77 unsigned VL = VT.getVectorMinNumElements();
78 if (!VT.isFixedLengthVector())
79 VL *= *getVScaleForTuning();
80 Cost += Log2_32_Ceil(VL);
81 break;
82 }
83 case RISCV::VFREDOSUM_VS: {
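// An ordered FP reduction must accumulate the elements sequentially, so its
// cost scales linearly with the (estimated) number of elements.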
84 unsigned VL = VT.getVectorMinNumElements();
85 if (!VT.isFixedLengthVector())
86 VL *= *getVScaleForTuning();
87 Cost += VL;
88 break;
89 }
90 case RISCV::VMV_X_S:
91 case RISCV::VMV_S_X:
92 case RISCV::VFMV_F_S:
93 case RISCV::VFMV_S_F:
94 case RISCV::VMOR_MM:
95 case RISCV::VMXOR_MM:
96 case RISCV::VMAND_MM:
97 case RISCV::VMANDN_MM:
98 case RISCV::VMNAND_MM:
99 case RISCV::VCPOP_M:
100 case RISCV::VFIRST_M:
101 Cost += 1;
102 break;
103 default:
104 Cost += LMULCost;
105 }
106 }
107 return Cost;
108}
109
110InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
111 TTI::TargetCostKind CostKind) {
112 assert(Ty->isIntegerTy() &&
113 "getIntImmCost can only estimate cost of materialising integers");
114
115 // We have a Zero register, so 0 is always free.
116 if (Imm == 0)
117 return TTI::TCC_Free;
118
119 // Otherwise, we check how many instructions it will take to materialise.
120 const DataLayout &DL = getDataLayout();
121 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *getST());
122}
123
124// Look for patterns of shift followed by AND that can be turned into a pair of
125// shifts. We won't need to materialize an immediate for the AND so these can
126// be considered free.
127static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
128 uint64_t Mask = Imm.getZExtValue();
129 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
130 if (!BO || !BO->hasOneUse())
131 return false;
132
133 if (BO->getOpcode() != Instruction::Shl)
134 return false;
135
136 if (!isa<ConstantInt>(BO->getOperand(1)))
137 return false;
138
139 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
140 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
141 // is a mask shifted by c2 bits with c3 leading zeros.
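 // For example, with c1 = 0xff0 and c2 = 4 (so c3 = 52 leading zeros),
 // (and (shl x, 4), 0xff0) becomes (srli (slli x, 56), 52), avoiding the
 // materialization of the 0xff0 constant.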
142 if (isShiftedMask_64(Mask)) {
143 unsigned Trailing = llvm::countr_zero(Mask);
144 if (ShAmt == Trailing)
145 return true;
146 }
147
148 return false;
149}
150
152 const APInt &Imm, Type *Ty,
154 Instruction *Inst) {
155 assert(Ty->isIntegerTy() &&
156 "getIntImmCost can only estimate cost of materialising integers");
157
158 // We have a Zero register, so 0 is always free.
159 if (Imm == 0)
160 return TTI::TCC_Free;
161
162 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
163 // commutative, in others the immediate comes from a specific argument index.
164 bool Takes12BitImm = false;
165 unsigned ImmArgIdx = ~0U;
166
167 switch (Opcode) {
168 case Instruction::GetElementPtr:
169 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
170 // split up large offsets in GEP into better parts than ConstantHoisting
171 // can.
172 return TTI::TCC_Free;
173 case Instruction::Store:
174 // If the address is a constant, use the materialization cost.
175 if (Idx == 1)
176 return getIntImmCost(Imm, Ty, CostKind);
177 return TTI::TCC_Free;
178 case Instruction::Load:
179 // If the address is a constant, use the materialization cost.
180 return getIntImmCost(Imm, Ty, CostKind);
181 case Instruction::And:
182 // zext.h
183 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
184 return TTI::TCC_Free;
185 // zext.w
186 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
187 return TTI::TCC_Free;
188 // bclri
189 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
190 return TTI::TCC_Free;
191 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
192 canUseShiftPair(Inst, Imm))
193 return TTI::TCC_Free;
194 Takes12BitImm = true;
195 break;
196 case Instruction::Add:
197 Takes12BitImm = true;
198 break;
199 case Instruction::Or:
200 case Instruction::Xor:
201 // bseti/binvi
202 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
203 return TTI::TCC_Free;
204 Takes12BitImm = true;
205 break;
206 case Instruction::Mul:
207 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
208 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
209 return TTI::TCC_Free;
210 // One more or less than a power of 2 can use SLLI+ADD/SUB.
211 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
212 return TTI::TCC_Free;
213 // FIXME: There is no MULI instruction.
214 Takes12BitImm = true;
215 break;
216 case Instruction::Sub:
217 case Instruction::Shl:
218 case Instruction::LShr:
219 case Instruction::AShr:
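 // Sub and the shift instructions are not commutative; the 12-bit immediate
 // form is only available when the constant is the second operand.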
220 Takes12BitImm = true;
221 ImmArgIdx = 1;
222 break;
223 default:
224 break;
225 }
226
227 if (Takes12BitImm) {
228 // Check immediate is the correct argument...
229 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
230 // ... and fits into the 12-bit immediate.
231 if (Imm.getSignificantBits() <= 64 &&
232 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
233 return TTI::TCC_Free;
234 }
235 }
236
237 // Otherwise, use the full materialisation cost.
238 return getIntImmCost(Imm, Ty, CostKind);
239 }
240
241 // By default, prevent hoisting.
242 return TTI::TCC_Free;
243}
244
247 const APInt &Imm, Type *Ty,
249 // Prevent hoisting in unknown cases.
250 return TTI::TCC_Free;
251}
252
253bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
254 return ST->hasVInstructions();
255}
256
257TargetTransformInfo::PopcntSupportKind
258RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
259 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
260 return ST->hasStdExtZbb() || ST->hasVendorXCVbitmanip()
261 ? TTI::PSK_FastHardware
262 : TTI::PSK_Software;
263}
264
265bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
266 // Currently, the ExpandReductions pass can't expand scalable-vector
267 // reductions, but we still request expansion as RVV doesn't support certain
268 // reductions and the SelectionDAG can't legalize them either.
269 switch (II->getIntrinsicID()) {
270 default:
271 return false;
272 // These reductions have no equivalent in RVV
273 case Intrinsic::vector_reduce_mul:
274 case Intrinsic::vector_reduce_fmul:
275 return true;
276 }
277}
278
279std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
280 if (ST->hasVInstructions())
281 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
282 return BaseT::getMaxVScale();
283}
284
285std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
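 // vscale corresponds to VLEN / RVVBitsPerBlock (64), so derive the tuning
 // value from the minimum VLEN the subtarget is known to implement.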
286 if (ST->hasVInstructions())
287 if (unsigned MinVLen = ST->getRealMinVLen();
288 MinVLen >= RISCV::RVVBitsPerBlock)
289 return MinVLen / RISCV::RVVBitsPerBlock;
290 return std::nullopt;
291}
292
293TypeSize
294RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
295 unsigned LMUL =
296 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
297 switch (K) {
298 case TargetTransformInfo::RGK_Scalar:
299 return TypeSize::getFixed(ST->getXLen());
300 case TargetTransformInfo::RGK_FixedWidthVector:
301 return TypeSize::getFixed(
302 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
303 case TargetTransformInfo::RGK_ScalableVector:
304 return TypeSize::getScalable(
305 (ST->hasVInstructions() &&
306 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
307 ? LMUL * RISCV::RVVBitsPerBlock
308 : 0);
309 }
310
311 llvm_unreachable("Unsupported register kind");
312}
313
315RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
316 // Add a cost of address generation + the cost of the load. The address
317 // is expected to be a PC relative offset to a constant pool entry
318 // using auipc/addi.
319 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
320 /*AddressSpace=*/0, CostKind);
321}
322
323static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
324 LLVMContext &C) {
325 assert((DataVT.getScalarSizeInBits() != 8 ||
326 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
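 // vrgather.vv needs index elements as wide as the data; when that exceeds
 // XLEN, lowering is assumed to use vrgatherei16.vv, so i16 indices are
 // costed instead.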
327 MVT IndexVT = DataVT.changeTypeToInteger();
328 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
329 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
330 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
331}
332
333InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
334 VectorType *Tp, ArrayRef<int> Mask,
335 TTI::TargetCostKind CostKind,
336 int Index, VectorType *SubTp,
337 ArrayRef<const Value *> Args,
338 const Instruction *CxtI) {
339 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
340
341 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
342
343 // First, handle cases where having a fixed length vector enables us to
344 // give a more accurate cost than falling back to generic scalable codegen.
345 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
346 if (isa<FixedVectorType>(Tp)) {
347 switch (Kind) {
348 default:
349 break;
350 case TTI::SK_PermuteSingleSrc: {
351 if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
352 MVT EltTp = LT.second.getVectorElementType();
353 // If the size of the element is < ELEN then shuffles of interleaves and
354 // deinterleaves of 2 vectors can be lowered into the following
355 // sequences
356 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
357 // Example sequence:
358 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
359 // vwaddu.vv v10, v8, v9
360 // li a0, -1 (ignored)
361 // vwmaccu.vx v10, a0, v9
362 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
363 return 2 * LT.first * TLI->getLMULCost(LT.second);
364
365 if (Mask[0] == 0 || Mask[0] == 1) {
366 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
367 // Example sequence:
368 // vnsrl.wi v10, v8, 0
369 if (equal(DeinterleaveMask, Mask))
370 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
371 LT.second, CostKind);
372 }
373 }
374 }
375 // vrgather + cost of generating the mask constant.
376 // We model this for an unknown mask with a single vrgather.
377 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
378 (LT.second.getScalarSizeInBits() != 8 ||
379 LT.second.getVectorNumElements() <= 256)) {
380 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());
381 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
382 return IndexCost +
383 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
384 }
385 [[fallthrough]];
386 }
387 case TTI::SK_Transpose:
388 case TTI::SK_PermuteTwoSrc: {
389 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
390 // register for the second vrgather. We model this for an unknown
391 // (shuffle) mask.
392 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
393 (LT.second.getScalarSizeInBits() != 8 ||
394 LT.second.getVectorNumElements() <= 256)) {
395 auto &C = Tp->getContext();
396 auto EC = Tp->getElementCount();
397 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
398 VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
399 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
400 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
401 return 2 * IndexCost +
402 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
403 LT.second, CostKind) +
404 MaskCost;
405 }
406 [[fallthrough]];
407 }
408 case TTI::SK_Select: {
409 // We are going to permute multiple sources and the result will be in
410 // multiple destinations. Providing an accurate cost only for splits where
411 // the element type remains the same.
412 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
413 LT.second.isFixedLengthVector() &&
414 LT.second.getVectorElementType().getSizeInBits() ==
416 LT.second.getVectorNumElements() <
417 cast<FixedVectorType>(Tp)->getNumElements() &&
418 divideCeil(Mask.size(),
419 cast<FixedVectorType>(Tp)->getNumElements()) ==
420 static_cast<unsigned>(*LT.first.getValue())) {
421 unsigned NumRegs = *LT.first.getValue();
422 unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
423 unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
424 auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);
425
427 for (unsigned I = 0; I < NumRegs; ++I) {
428 bool IsSingleVector = true;
429 SmallVector<int> SubMask(SubVF, PoisonMaskElem);
430 transform(Mask.slice(I * SubVF,
431 I == NumRegs - 1 ? Mask.size() % SubVF : SubVF),
432 SubMask.begin(), [&](int I) {
433 bool SingleSubVector = I / VF == 0;
434 IsSingleVector &= SingleSubVector;
435 return (SingleSubVector ? 0 : 1) * SubVF + I % VF;
436 });
439 SubVecTy, SubMask, CostKind, 0, nullptr);
440 return Cost;
441 }
442 }
443 break;
444 }
445 }
446 };
447
448 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
449 switch (Kind) {
450 default:
451 // Fallthrough to generic handling.
452 // TODO: Most of these cases will return getInvalid in generic code, and
453 // must be implemented here.
454 break;
455 case TTI::SK_ExtractSubvector:
456 // Extract at zero is always a subregister extract
457 if (Index == 0)
458 return TTI::TCC_Free;
459
460 // If we're extracting a subvector of at most m1 size at a sub-register
461 // boundary - which unfortunately we need exact vlen to identify - this is
462 // a subregister extract at worst and thus won't require a vslidedown.
463 // TODO: Extend for aligned m2, m4 subvector extracts
464 // TODO: Extend for misaligned (but contained) extracts
465 // TODO: Extend for scalable subvector types
466 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
467 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
468 const unsigned MinVLen = ST->getRealMinVLen();
469 const unsigned MaxVLen = ST->getRealMaxVLen();
470 if (MinVLen == MaxVLen &&
471 SubLT.second.getScalarSizeInBits() * Index % MinVLen == 0 &&
472 SubLT.second.getSizeInBits() <= MinVLen)
473 return TTI::TCC_Free;
474 }
475
476 // Example sequence:
477 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
478 // vslidedown.vi v8, v9, 2
479 return LT.first *
480 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
481 case TTI::SK_InsertSubvector:
482 // Example sequence:
483 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
484 // vslideup.vi v8, v9, 2
485 return LT.first *
486 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
487 case TTI::SK_Select: {
488 // Example sequence:
489 // li a0, 90
490 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
491 // vmv.s.x v0, a0
492 // vmerge.vvm v8, v9, v8, v0
493 // We use 2 for the cost of the mask materialization as this is the true
494 // cost for small masks and most shuffles are small. At worst, this cost
495 // should be a very small constant for the constant pool load. As such,
496 // we may bias towards large selects slightly more than truly warranted.
497 return LT.first *
498 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
499 LT.second, CostKind));
500 }
501 case TTI::SK_Broadcast: {
502 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
503 Instruction::InsertElement);
504 if (LT.second.getScalarSizeInBits() == 1) {
505 if (HasScalar) {
506 // Example sequence:
507 // andi a0, a0, 1
508 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
509 // vmv.v.x v8, a0
510 // vmsne.vi v0, v8, 0
511 return LT.first *
512 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
513 LT.second, CostKind));
514 }
515 // Example sequence:
516 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
517 // vmv.v.i v8, 0
518 // vmerge.vim v8, v8, 1, v0
519 // vmv.x.s a0, v8
520 // andi a0, a0, 1
521 // vmv.v.x v8, a0
522 // vmsne.vi v0, v8, 0
523
524 return LT.first *
525 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
526 RISCV::VMV_X_S, RISCV::VMV_V_X,
527 RISCV::VMSNE_VI},
528 LT.second, CostKind));
529 }
530
531 if (HasScalar) {
532 // Example sequence:
533 // vmv.v.x v8, a0
534 return LT.first *
535 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
536 }
537
538 // Example sequence:
539 // vrgather.vi v9, v8, 0
540 return LT.first *
541 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
542 }
543 case TTI::SK_Splice: {
544 // vslidedown+vslideup.
545 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
546 // of similar code, but I think we expand through memory.
547 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
548 if (Index >= 0 && Index < 32)
549 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
550 else if (Index < 0 && Index > -32)
551 Opcodes[1] = RISCV::VSLIDEUP_VI;
552 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
553 }
554 case TTI::SK_Reverse: {
555 // TODO: Cases to improve here:
556 // * Illegal vector types
557 // * i64 on RV32
558 // * i1 vector
559 // At low LMUL, most of the cost is producing the vrgather index register.
560 // At high LMUL, the cost of the vrgather itself will dominate.
561 // Example sequence:
562 // csrr a0, vlenb
563 // srli a0, a0, 3
564 // addi a0, a0, -1
565 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
566 // vid.v v9
567 // vrsub.vx v10, v9, a0
568 // vrgather.vv v9, v8, v10
569 InstructionCost LenCost = 3;
570 if (LT.second.isFixedLengthVector())
571 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
572 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
573 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
574 if (LT.second.isFixedLengthVector() &&
575 isInt<5>(LT.second.getVectorNumElements() - 1))
576 Opcodes[1] = RISCV::VRSUB_VI;
577 InstructionCost GatherCost =
578 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
579 // Reversing a mask vector additionally requires an extend and a truncate.
580 InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
581 return LT.first * (LenCost + GatherCost + ExtendCost);
582 }
583 }
584 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
585}
586
587InstructionCost
588RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
589 unsigned AddressSpace,
590 TTI::TargetCostKind CostKind) {
591 if (!isLegalMaskedLoadStore(Src, Alignment) ||
592 CostKind != TTI::TCK_RecipThroughput)
593 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
594 CostKind);
595
596 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
597}
598
599InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
600 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
601 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
602 bool UseMaskForCond, bool UseMaskForGaps) {
603 if (isa<ScalableVectorType>(VecTy) && Factor != 2)
604 return InstructionCost::getInvalid();
605
606 // The interleaved memory access pass will lower interleaved memory ops
607 // (i.e. a load or store combined with a specific shuffle) to vlseg/vsseg
608 // intrinsics. In those cases we can treat it as if it's just one (legal)
609 // memory op.
610 if (!UseMaskForCond && !UseMaskForGaps &&
611 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
612 auto *VTy = cast<VectorType>(VecTy);
613 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
614 // Need to make sure the type hasn't been scalarized.
615 if (LT.second.isVector()) {
616 auto *LegalVTy = VectorType::get(VTy->getElementType(),
617 LT.second.getVectorElementCount());
618 // FIXME: We use the memory op cost of the *legalized* type here, because
619 // its getMemoryOpCost returns a really expensive cost for types like
620 // <6 x i8>, which show up when doing interleaves of Factor=3 etc.
621 // Should the memory op cost of these be cheaper?
622 if (TLI->isLegalInterleavedAccessType(LegalVTy, Factor, Alignment,
623 AddressSpace, DL)) {
624 InstructionCost LegalMemCost = getMemoryOpCost(
625 Opcode, LegalVTy, Alignment, AddressSpace, CostKind);
626 return LT.first + LegalMemCost;
627 }
628 }
629 }
630
631 // TODO: Return the cost of interleaved accesses for scalable vector when
632 // unable to convert to segment accesses instructions.
633 if (isa<ScalableVectorType>(VecTy))
634 return InstructionCost::getInvalid();
635
636 auto *FVTy = cast<FixedVectorType>(VecTy);
637 InstructionCost MemCost =
638 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
639 unsigned VF = FVTy->getNumElements() / Factor;
640
641 // An interleaved load will look like this for Factor=3:
642 // %wide.vec = load <12 x i32>, ptr %3, align 4
643 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
644 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
645 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
646 if (Opcode == Instruction::Load) {
647 InstructionCost Cost = MemCost;
648 for (unsigned Index : Indices) {
649 FixedVectorType *SubVecTy =
650 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
651 auto Mask = createStrideMask(Index, Factor, VF);
652 InstructionCost ShuffleCost =
654 CostKind, 0, nullptr, {});
655 Cost += ShuffleCost;
656 }
657 return Cost;
658 }
659
660 // TODO: Model for NF > 2
661 // We'll need to enhance getShuffleCost to model shuffles that are just
662 // inserts and extracts into subvectors, since they won't have the full cost
663 // of a vrgather.
664 // An interleaved store for 3 vectors of 4 lanes will look like
665 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
666 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
667 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
668 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
669 // store <12 x i32> %interleaved.vec, ptr %10, align 4
670 if (Factor != 2)
671 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
672 Alignment, AddressSpace, CostKind,
673 UseMaskForCond, UseMaskForGaps);
674
675 assert(Opcode == Instruction::Store && "Opcode must be a store");
676 // For an interleaving store of 2 vectors, we perform one large interleaving
677 // shuffle that goes into the wide store
678 auto Mask = createInterleaveMask(VF, Factor);
679 InstructionCost ShuffleCost =
681 CostKind, 0, nullptr, {});
682 return MemCost + ShuffleCost;
683}
684
685InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
686 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
687 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
688 if (CostKind != TTI::TCK_RecipThroughput)
689 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
690 Alignment, CostKind, I);
691
692 if ((Opcode == Instruction::Load &&
693 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
694 (Opcode == Instruction::Store &&
695 !isLegalMaskedScatter(DataTy, Align(Alignment))))
696 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
697 Alignment, CostKind, I);
698
699 // Cost is proportional to the number of memory operations implied. For
700 // scalable vectors, we use an estimate on that number since we don't
701 // know exactly what VL will be.
702 auto &VTy = *cast<VectorType>(DataTy);
703 InstructionCost MemOpCost =
704 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
705 {TTI::OK_AnyValue, TTI::OP_None}, I);
706 unsigned NumLoads = getEstimatedVLFor(&VTy);
707 return NumLoads * MemOpCost;
708}
709
710InstructionCost RISCVTTIImpl::getStridedMemoryOpCost(
711 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
712 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
713 if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
714 !isLegalStridedLoadStore(DataTy, Alignment)) ||
715 (Opcode != Instruction::Load && Opcode != Instruction::Store))
716 return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
717 Alignment, CostKind, I);
718
719 if (CostKind == TTI::TCK_CodeSize)
720 return TTI::TCC_Basic;
721
722 // Cost is proportional to the number of memory operations implied. For
723 // scalable vectors, we use an estimate on that number since we don't
724 // know exactly what VL will be.
725 auto &VTy = *cast<VectorType>(DataTy);
726 InstructionCost MemOpCost =
727 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
728 {TTI::OK_AnyValue, TTI::OP_None}, I);
729 unsigned NumLoads = getEstimatedVLFor(&VTy);
730 return NumLoads * MemOpCost;
731}
732
733// Currently, these represent both throughput and codesize costs
734// for the respective intrinsics. The costs in this table are simply
735// instruction counts with the following adjustments made:
736// * One vsetvli is considered free.
737static const CostTblEntry VectorIntrinsicCostTable[]{
738 {Intrinsic::floor, MVT::f32, 9},
739 {Intrinsic::floor, MVT::f64, 9},
740 {Intrinsic::ceil, MVT::f32, 9},
741 {Intrinsic::ceil, MVT::f64, 9},
742 {Intrinsic::trunc, MVT::f32, 7},
743 {Intrinsic::trunc, MVT::f64, 7},
744 {Intrinsic::round, MVT::f32, 9},
745 {Intrinsic::round, MVT::f64, 9},
746 {Intrinsic::roundeven, MVT::f32, 9},
747 {Intrinsic::roundeven, MVT::f64, 9},
748 {Intrinsic::rint, MVT::f32, 7},
749 {Intrinsic::rint, MVT::f64, 7},
750 {Intrinsic::lrint, MVT::i32, 1},
751 {Intrinsic::lrint, MVT::i64, 1},
752 {Intrinsic::llrint, MVT::i64, 1},
753 {Intrinsic::nearbyint, MVT::f32, 9},
754 {Intrinsic::nearbyint, MVT::f64, 9},
755 {Intrinsic::bswap, MVT::i16, 3},
756 {Intrinsic::bswap, MVT::i32, 12},
757 {Intrinsic::bswap, MVT::i64, 31},
758 {Intrinsic::vp_bswap, MVT::i16, 3},
759 {Intrinsic::vp_bswap, MVT::i32, 12},
760 {Intrinsic::vp_bswap, MVT::i64, 31},
761 {Intrinsic::vp_fshl, MVT::i8, 7},
762 {Intrinsic::vp_fshl, MVT::i16, 7},
763 {Intrinsic::vp_fshl, MVT::i32, 7},
764 {Intrinsic::vp_fshl, MVT::i64, 7},
765 {Intrinsic::vp_fshr, MVT::i8, 7},
766 {Intrinsic::vp_fshr, MVT::i16, 7},
767 {Intrinsic::vp_fshr, MVT::i32, 7},
768 {Intrinsic::vp_fshr, MVT::i64, 7},
769 {Intrinsic::bitreverse, MVT::i8, 17},
770 {Intrinsic::bitreverse, MVT::i16, 24},
771 {Intrinsic::bitreverse, MVT::i32, 33},
772 {Intrinsic::bitreverse, MVT::i64, 52},
773 {Intrinsic::vp_bitreverse, MVT::i8, 17},
774 {Intrinsic::vp_bitreverse, MVT::i16, 24},
775 {Intrinsic::vp_bitreverse, MVT::i32, 33},
776 {Intrinsic::vp_bitreverse, MVT::i64, 52},
777 {Intrinsic::ctpop, MVT::i8, 12},
778 {Intrinsic::ctpop, MVT::i16, 19},
779 {Intrinsic::ctpop, MVT::i32, 20},
780 {Intrinsic::ctpop, MVT::i64, 21},
781 {Intrinsic::vp_ctpop, MVT::i8, 12},
782 {Intrinsic::vp_ctpop, MVT::i16, 19},
783 {Intrinsic::vp_ctpop, MVT::i32, 20},
784 {Intrinsic::vp_ctpop, MVT::i64, 21},
785 {Intrinsic::vp_ctlz, MVT::i8, 19},
786 {Intrinsic::vp_ctlz, MVT::i16, 28},
787 {Intrinsic::vp_ctlz, MVT::i32, 31},
788 {Intrinsic::vp_ctlz, MVT::i64, 35},
789 {Intrinsic::vp_cttz, MVT::i8, 16},
790 {Intrinsic::vp_cttz, MVT::i16, 23},
791 {Intrinsic::vp_cttz, MVT::i32, 24},
792 {Intrinsic::vp_cttz, MVT::i64, 25},
793};
794
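// Map a VP intrinsic ID to its corresponding VP ISD opcode. The case
// statements below are generated by expanding HELPER_MAP_VPID_TO_VPSD for
// every entry in llvm/IR/VPIntrinsics.def; unknown IDs fall through to
// ISD::DELETED_NODE.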
795static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
796 switch (ID) {
797#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
798 case Intrinsic::VPID: \
799 return ISD::VPSD;
800#include "llvm/IR/VPIntrinsics.def"
801#undef HELPER_MAP_VPID_TO_VPSD
802 }
803 return ISD::DELETED_NODE;
804}
805
806InstructionCost
807RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
808 TTI::TargetCostKind CostKind) {
809 auto *RetTy = ICA.getReturnType();
810 switch (ICA.getID()) {
811 case Intrinsic::ceil:
812 case Intrinsic::floor:
813 case Intrinsic::trunc:
814 case Intrinsic::rint:
815 case Intrinsic::lrint:
816 case Intrinsic::llrint:
817 case Intrinsic::round:
818 case Intrinsic::roundeven: {
819 // These all use the same code.
821 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
822 return LT.first * 8;
823 break;
824 }
825 case Intrinsic::umin:
826 case Intrinsic::umax:
827 case Intrinsic::smin:
828 case Intrinsic::smax: {
830 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
831 return LT.first;
832
833 if (ST->hasVInstructions() && LT.second.isVector()) {
834 unsigned Op;
835 switch (ICA.getID()) {
836 case Intrinsic::umin:
837 Op = RISCV::VMINU_VV;
838 break;
839 case Intrinsic::umax:
840 Op = RISCV::VMAXU_VV;
841 break;
842 case Intrinsic::smin:
843 Op = RISCV::VMIN_VV;
844 break;
845 case Intrinsic::smax:
846 Op = RISCV::VMAX_VV;
847 break;
848 }
849 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
850 }
851 break;
852 }
853 case Intrinsic::sadd_sat:
854 case Intrinsic::ssub_sat:
855 case Intrinsic::uadd_sat:
856 case Intrinsic::usub_sat:
857 case Intrinsic::fabs:
858 case Intrinsic::sqrt: {
860 if (ST->hasVInstructions() && LT.second.isVector())
861 return LT.first;
862 break;
863 }
864 case Intrinsic::ctpop: {
866 if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector())
867 return LT.first;
868 break;
869 }
870 case Intrinsic::abs: {
872 if (ST->hasVInstructions() && LT.second.isVector()) {
873 // vrsub.vi v10, v8, 0
874 // vmax.vv v8, v8, v10
875 return LT.first * 2;
876 }
877 break;
878 }
879 case Intrinsic::get_active_lane_mask: {
880 if (ST->hasVInstructions()) {
881 Type *ExpRetTy = VectorType::get(
882 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
883 auto LT = getTypeLegalizationCost(ExpRetTy);
884
885 // vid.v v8 // considered hoisted
886 // vsaddu.vx v8, v8, a0
887 // vmsltu.vx v0, v8, a1
888 return LT.first *
889 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
890 LT.second, CostKind);
891 }
892 break;
893 }
894 // TODO: add more intrinsics
895 case Intrinsic::experimental_stepvector: {
897 // Legalisation of illegal types involves an `index' instruction plus
898 // (LT.first - 1) vector adds.
899 if (ST->hasVInstructions())
900 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
901 (LT.first - 1) *
902 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
903 return 1 + (LT.first - 1);
904 }
905 case Intrinsic::experimental_cttz_elts: {
906 Type *ArgTy = ICA.getArgTypes()[0];
907 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
908 if (getTLI()->shouldExpandCttzElements(ArgType))
909 break;
910 InstructionCost Cost = getRISCVInstructionCost(
911 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
912
913 // If zero_is_poison is false, then we will generate additional
914 // cmp + select instructions to convert -1 to EVL.
915 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
916 if (ICA.getArgs().size() > 1 &&
917 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
918 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
920 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
922
923 return Cost;
924 }
925 case Intrinsic::vp_rint: {
926 // RISC-V target uses at least 5 instructions to lower rounding intrinsics.
927 unsigned Cost = 5;
929 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
930 return Cost * LT.first;
931 break;
932 }
933 case Intrinsic::vp_nearbyint: {
934 // One more read and one more write of fflags than vp_rint.
935 unsigned Cost = 7;
937 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
938 return Cost * LT.first;
939 break;
940 }
941 case Intrinsic::vp_ceil:
942 case Intrinsic::vp_floor:
943 case Intrinsic::vp_round:
944 case Intrinsic::vp_roundeven:
945 case Intrinsic::vp_roundtozero: {
946 // Rounding with static rounding mode needs two more instructions to
947 // swap/write FRM than vp_rint.
948 unsigned Cost = 7;
950 unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
951 if (TLI->isOperationCustom(VPISD, LT.second))
952 return Cost * LT.first;
953 break;
954 }
955 }
956
957 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
958 if (auto LT = getTypeLegalizationCost(RetTy);
959 LT.second.isVector()) {
960 MVT EltTy = LT.second.getVectorElementType();
961 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
962 ICA.getID(), EltTy))
963 return LT.first * Entry->Cost;
964 }
965 }
966
966
967 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
968}
969
970InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
971 Type *Src,
972 TTI::CastContextHint CCH,
973 TTI::TargetCostKind CostKind,
974 const Instruction *I) {
975 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
976 if (!IsVectorType)
977 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
978
979 bool IsTypeLegal = isTypeLegal(Src) && isTypeLegal(Dst) &&
980 (Src->getScalarSizeInBits() <= ST->getELen()) &&
981 (Dst->getScalarSizeInBits() <= ST->getELen());
982
983 // FIXME: Need to compute legalizing cost for illegal types.
984 if (!IsTypeLegal)
985 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
986
987 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
988 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
989
990 int ISD = TLI->InstructionOpcodeToISD(Opcode);
991 assert(ISD && "Invalid opcode");
992
993 int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
994 (int)Log2_32(Src->getScalarSizeInBits());
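 // PowDiff is the log2 of the ratio between destination and source element
 // widths; each doubling or halving step generally maps to one widening or
 // narrowing instruction below.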
995 switch (ISD) {
996 case ISD::SIGN_EXTEND:
997 case ISD::ZERO_EXTEND: {
998 const unsigned SrcEltSize = Src->getScalarSizeInBits();
999 if (SrcEltSize == 1) {
1000 // We do not use vsext/vzext to extend from mask vector.
1001 // Instead we use the following instructions to extend from mask vector:
1002 // vmv.v.i v8, 0
1003 // vmerge.vim v8, v8, -1, v0
1004 return getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM},
1005 DstLT.second, CostKind);
1006 }
1007 if ((PowDiff < 1) || (PowDiff > 3))
1008 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1009 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1010 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1011 unsigned Op =
1012 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1013 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1014 }
1015 case ISD::TRUNCATE:
1016 if (Dst->getScalarSizeInBits() == 1) {
1017 // We do not use a chain of vncvt instructions to truncate to a mask
1018 // vector, so we cannot use PowDiff to calculate the cost.
1019 // Instead we use the following instructions to truncate to a mask vector:
1020 // vand.vi v8, v8, 1
1021 // vmsne.vi v0, v8, 0
1022 return getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1023 SrcLT.second, CostKind);
1024 }
1025 [[fallthrough]];
1026 case ISD::FP_EXTEND:
1027 case ISD::FP_ROUND: {
1028 // Counts of narrow/widen instructions.
1029 unsigned SrcEltSize = Src->getScalarSizeInBits();
1030 unsigned DstEltSize = Dst->getScalarSizeInBits();
1031
1032 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1033 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1034 : RISCV::VFNCVT_F_F_W;
1035 InstructionCost Cost = 0;
1036 for (; SrcEltSize != DstEltSize;) {
1037 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1038 ? MVT::getIntegerVT(DstEltSize)
1039 : MVT::getFloatingPointVT(DstEltSize);
1040 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1041 DstEltSize =
1042 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1043 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1044 }
1045 return Cost;
1046 }
1047 case ISD::FP_TO_SINT:
1048 case ISD::FP_TO_UINT:
1049 case ISD::SINT_TO_FP:
1050 case ISD::UINT_TO_FP:
1051 if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
1052 // The cost of converting from or to a mask vector differs from the
1053 // other cases, so we cannot use PowDiff to calculate it.
1054 // For mask vector to fp, we should use the following instructions:
1055 // vmv.v.i v8, 0
1056 // vmerge.vim v8, v8, -1, v0
1057 // vfcvt.f.x.v v8, v8
1058
1059 // And for fp vector to mask, we use:
1060 // vfncvt.rtz.x.f.w v9, v8
1061 // vand.vi v8, v9, 1
1062 // vmsne.vi v0, v8, 0
1063 return 3;
1064 }
1065 if (std::abs(PowDiff) <= 1)
1066 return 1;
1067 // The backend can lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
1068 // so it only needs two conversions.
1069 if (Src->isIntOrIntVectorTy())
1070 return 2;
1071 // Counts of narrow/widen instructions.
1072 return std::abs(PowDiff);
1073 }
1074 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1075}
1076
1077unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
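// For scalable vectors VL is unknown at compile time, so estimate VLMAX from
// the tuning vscale; for fixed vectors the element count is exact.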
1078 if (isa<ScalableVectorType>(Ty)) {
1079 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1080 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1081 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1082 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1083 }
1084 return cast<FixedVectorType>(Ty)->getNumElements();
1085}
1086
1087InstructionCost
1088RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1089 FastMathFlags FMF,
1090 TTI::TargetCostKind CostKind) {
1091 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1092 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1093
1094 // Skip if scalar size of Ty is bigger than ELEN.
1095 if (Ty->getScalarSizeInBits() > ST->getELen())
1096 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1097
1098 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1099 if (Ty->getElementType()->isIntegerTy(1)) {
1100 // SelectionDAGBuilder does the following transforms:
1101 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1102 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
1103 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1104 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1105 else
1106 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1107 }
1108
1109 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1110 SmallVector<unsigned, 3> Opcodes;
1111 InstructionCost ExtraCost = 0;
1112 switch (IID) {
1113 case Intrinsic::maximum:
1114 if (FMF.noNaNs()) {
1115 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1116 } else {
1117 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1118 RISCV::VFMV_F_S};
1119 // Cost of materializing the canonical NaN + branch
1120 // lui a0, 523264
1121 // fmv.w.x fa0, a0
1122 Type *DstTy = Ty->getScalarType();
1123 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1124 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1125 ExtraCost = 1 +
1126 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1128 getCFInstrCost(Instruction::Br, CostKind);
1129 }
1130 break;
1131
1132 case Intrinsic::minimum:
1133 if (FMF.noNaNs()) {
1134 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1135 } else {
1136 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1137 RISCV::VFMV_F_S};
1138 // Cost of materializing the canonical NaN + branch
1139 // lui a0, 523264
1140 // fmv.w.x fa0, a0
1141 Type *DstTy = Ty->getScalarType();
1142 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1143 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1144 ExtraCost = 1 +
1145 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1147 getCFInstrCost(Instruction::Br, CostKind);
1148 }
1149 break;
1150 }
1151 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1152 }
1153
1154 // An IR reduction is composed of two vmv and one rvv reduction instruction.
1155 unsigned SplitOp;
1156 SmallVector<unsigned, 3> Opcodes;
1157 switch (IID) {
1158 default:
1159 llvm_unreachable("Unsupported intrinsic");
1160 case Intrinsic::smax:
1161 SplitOp = RISCV::VMAX_VV;
1162 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1163 break;
1164 case Intrinsic::smin:
1165 SplitOp = RISCV::VMIN_VV;
1166 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMIN_VS, RISCV::VMV_X_S};
1167 break;
1168 case Intrinsic::umax:
1169 SplitOp = RISCV::VMAXU_VV;
1170 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1171 break;
1172 case Intrinsic::umin:
1173 SplitOp = RISCV::VMINU_VV;
1174 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMINU_VS, RISCV::VMV_X_S};
1175 break;
1176 case Intrinsic::maxnum:
1177 SplitOp = RISCV::VFMAX_VV;
1178 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1179 break;
1180 case Intrinsic::minnum:
1181 SplitOp = RISCV::VFMIN_VV;
1182 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1183 break;
1184 }
1185 // Add a cost for data larger than LMUL8
1186 InstructionCost SplitCost =
1187 (LT.first > 1) ? (LT.first - 1) *
1188 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1189 : 0;
1190 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1191}
1192
1193InstructionCost
1194RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
1195 std::optional<FastMathFlags> FMF,
1196 TTI::TargetCostKind CostKind) {
1197 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1198 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1199
1200 // Skip if scalar size of Ty is bigger than ELEN.
1201 if (Ty->getScalarSizeInBits() > ST->getELen())
1202 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1203
1204 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1205 assert(ISD && "Invalid opcode");
1206
1207 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1208 ISD != ISD::FADD)
1209 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1210
1211 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1212 SmallVector<unsigned, 3> Opcodes;
1213 Type *ElementTy = Ty->getElementType();
1214 if (ElementTy->isIntegerTy(1)) {
1215 if (ISD == ISD::AND) {
1216 // Example sequences:
1217 // vsetvli a0, zero, e8, mf8, ta, ma
1218 // vmnot.m v8, v0
1219 // vcpop.m a0, v8
1220 // seqz a0, a0
1221 Opcodes = {RISCV::VMNAND_MM, RISCV::VCPOP_M};
1222 return (LT.first - 1) +
1223 getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
1224 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1226 } else {
1227 // Example sequences:
1228 // vsetvli a0, zero, e8, mf8, ta, ma
1229 // vcpop.m a0, v0
1230 // snez a0, a0
1231 Opcodes = {RISCV::VCPOP_M};
1232 return (LT.first - 1) +
1233 getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
1234 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1236 }
1237 }
1238
1239 // An IR reduction is composed of two vmv and one rvv reduction instruction.
1240 if (TTI::requiresOrderedReduction(FMF)) {
1241 Opcodes.push_back(RISCV::VFMV_S_F);
1242 for (unsigned i = 0; i < LT.first.getValue(); i++)
1243 Opcodes.push_back(RISCV::VFREDOSUM_VS);
1244 Opcodes.push_back(RISCV::VFMV_F_S);
1245 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1246 }
1247 unsigned SplitOp;
1248 switch (ISD) {
1249 case ISD::ADD:
1250 SplitOp = RISCV::VADD_VV;
1251 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
1252 break;
1253 case ISD::OR:
1254 SplitOp = RISCV::VOR_VV;
1255 Opcodes = {RISCV::VMV_S_X, RISCV::VREDOR_VS, RISCV::VMV_X_S};
1256 break;
1257 case ISD::XOR:
1258 SplitOp = RISCV::VXOR_VV;
1259 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
1260 break;
1261 case ISD::AND:
1262 SplitOp = RISCV::VAND_VV;
1263 Opcodes = {RISCV::VMV_S_X, RISCV::VREDAND_VS, RISCV::VMV_X_S};
1264 break;
1265 case ISD::FADD:
1266 SplitOp = RISCV::VFADD_VV;
1267 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
1268 break;
1269 }
1270 // Add a cost for data larger than LMUL8
1271 InstructionCost SplitCost =
1272 (LT.first > 1) ? (LT.first - 1) *
1273 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1274 : 0;
1275 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1276}
1277
1279 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1281 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1282 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1283 FMF, CostKind);
1284
1285 // Skip if scalar size of ResTy is bigger than ELEN.
1286 if (ResTy->getScalarSizeInBits() > ST->getELen())
1287 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1288 FMF, CostKind);
1289
1290 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1291 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1292 FMF, CostKind);
1293
1294 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1295
1296 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1297 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1298 FMF, CostKind);
1299
1300 return (LT.first - 1) +
1301 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1302}
1303
1304InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
1305 TTI::OperandValueInfo OpInfo,
1306 TTI::TargetCostKind CostKind) {
1307 assert(OpInfo.isConstant() && "non constant operand?");
1308 if (!isa<VectorType>(Ty))
1309 // FIXME: We need to account for immediate materialization here, but doing
1310 // a decent job requires more knowledge about the immediate than we
1311 // currently have here.
1312 return 0;
1313
1314 if (OpInfo.isUniform())
1315 // vmv.v.i, vmv.v.x, or vfmv.v.f
1316 // We ignore the cost of the scalar constant materialization to be consistent
1317 // with how we treat scalar constants themselves just above.
1318 return 1;
1319
1320 return getConstantPoolLoadCost(Ty, CostKind);
1321}
1322
1323
1325 MaybeAlign Alignment,
1326 unsigned AddressSpace,
1328 TTI::OperandValueInfo OpInfo,
1329 const Instruction *I) {
1330 EVT VT = TLI->getValueType(DL, Src, true);
1331 // Type legalization can't handle structs
1332 if (VT == MVT::Other)
1333 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1334 CostKind, OpInfo, I);
1335
1336 InstructionCost Cost = 0;
1337 if (Opcode == Instruction::Store && OpInfo.isConstant())
1338 Cost += getStoreImmCost(Src, OpInfo, CostKind);
1339 InstructionCost BaseCost =
1340 BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1341 CostKind, OpInfo, I);
1342 // Assume memory ops cost scale with the number of vector registers
1343 // possibly accessed by the instruction. Note that BasicTTI already
1344 // handles the LT.first term for us.
1345 if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1346 LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
1347 BaseCost *= TLI->getLMULCost(LT.second);
1348 return Cost + BaseCost;
1349
1350}
1351
1352InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
1353 Type *CondTy,
1354 CmpInst::Predicate VecPred,
1355 TTI::TargetCostKind CostKind,
1356 const Instruction *I) {
1357 if (CostKind != TTI::TCK_RecipThroughput)
1358 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1359 I);
1360
1361 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1362 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1363 I);
1364
1365 // Skip if scalar size of ValTy is bigger than ELEN.
1366 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
1367 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1368 I);
1369
1370 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1371 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1372 if (CondTy->isVectorTy()) {
1373 if (ValTy->getScalarSizeInBits() == 1) {
1374 // vmandn.mm v8, v8, v9
1375 // vmand.mm v9, v0, v9
1376 // vmor.mm v0, v9, v8
1377 return LT.first *
1378 getRISCVInstructionCost(
1379 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1380 LT.second, CostKind);
1381 }
1382 // vselect and max/min are supported natively.
1383 return LT.first *
1384 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
1385 }
1386
1387 if (ValTy->getScalarSizeInBits() == 1) {
1388 // vmv.v.x v9, a0
1389 // vmsne.vi v9, v9, 0
1390 // vmandn.mm v8, v8, v9
1391 // vmand.mm v9, v0, v9
1392 // vmor.mm v0, v9, v8
1393 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
1394 return LT.first *
1395 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
1396 InterimVT, CostKind) +
1397 LT.first * getRISCVInstructionCost(
1398 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1399 LT.second, CostKind);
1400 }
1401
1402 // vmv.v.x v10, a0
1403 // vmsne.vi v0, v10, 0
1404 // vmerge.vvm v8, v9, v8, v0
1405 return LT.first * getRISCVInstructionCost(
1406 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
1407 LT.second, CostKind);
1408 }
1409
1410 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
1411 CmpInst::isIntPredicate(VecPred)) {
1412 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
1413 // provided they incur the same cost across all implementations
1414 return LT.first *
1415 getRISCVInstructionCost(RISCV::VMSLT_VV, LT.second, CostKind);
1416 }
1417
1418 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
1419 CmpInst::isFPPredicate(VecPred)) {
1420
1421 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
1422 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
1423 return getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
1424
1425 // If we do not support the input floating point vector type, use the base
1426 // one which will calculate as:
1427 // ScalarizeCost + Num * Cost for fixed vector,
1428 // InvalidCost for scalable vector.
1429 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
1430 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
1431 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
1432 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1433 I);
1434
1435 // Assuming vector fp compare and mask instructions are all the same cost
1436 // until a need arises to differentiate them.
1437 switch (VecPred) {
1438 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
1439 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
1440 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
1441 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
1442 return LT.first * getRISCVInstructionCost(
1443 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
1444 LT.second, CostKind);
1445
1446 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
1447 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
1448 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
1449 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
1450 return LT.first *
1451 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
1452 LT.second, CostKind);
1453
1454 case CmpInst::FCMP_OEQ: // vmfeq.vv
1455 case CmpInst::FCMP_OGT: // vmflt.vv
1456 case CmpInst::FCMP_OGE: // vmfle.vv
1457 case CmpInst::FCMP_OLT: // vmflt.vv
1458 case CmpInst::FCMP_OLE: // vmfle.vv
1459 case CmpInst::FCMP_UNE: // vmfne.vv
1460 return LT.first *
1461 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
1462 default:
1463 break;
1464 }
1465 }
1466
1467 // TODO: Add cost for scalar type.
1468
1469 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1470}
1471
1472InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
1473 TTI::TargetCostKind CostKind,
1474 const Instruction *I) {
1475 if (CostKind != TTI::TCK_RecipThroughput)
1476 return Opcode == Instruction::PHI ? 0 : 1;
1477 // Branches are assumed to be predicted.
1478 return 0;
1479}
1480
1483 unsigned Index, Value *Op0,
1484 Value *Op1) {
1485 assert(Val->isVectorTy() && "This must be a vector type");
1486
1487 if (Opcode != Instruction::ExtractElement &&
1488 Opcode != Instruction::InsertElement)
1489 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1490
1491 // Legalize the type.
1492 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
1493
1494 // This type is legalized to a scalar type.
1495 if (!LT.second.isVector()) {
1496 auto *FixedVecTy = cast<FixedVectorType>(Val);
1497 // If Index is a known constant, cost is zero.
1498 if (Index != -1U)
1499 return 0;
1500 // Extract/InsertElement with non-constant index is very costly when
1501 // scalarized; estimate cost of loads/stores sequence via the stack:
1502 // ExtractElement cost: store vector to stack, load scalar;
1503 // InsertElement cost: store vector to stack, store scalar, load vector.
1504 Type *ElemTy = FixedVecTy->getElementType();
1505 auto NumElems = FixedVecTy->getNumElements();
1506 auto Align = DL.getPrefTypeAlign(ElemTy);
1507 InstructionCost LoadCost =
1508 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
1509 InstructionCost StoreCost =
1510 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
1511 return Opcode == Instruction::ExtractElement
1512 ? StoreCost * NumElems + LoadCost
1513 : (StoreCost + LoadCost) * NumElems + StoreCost;
1514 }
1515
1516 // For unsupported scalable vector.
1517 if (LT.second.isScalableVector() && !LT.first.isValid())
1518 return LT.first;
1519
1520 if (!isTypeLegal(Val))
1521 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1522
1523 // Mask vector extract/insert is expanded via e8.
1524 if (Val->getScalarSizeInBits() == 1) {
1525 VectorType *WideTy =
1527 cast<VectorType>(Val)->getElementCount());
1528 if (Opcode == Instruction::ExtractElement) {
1529 InstructionCost ExtendCost
1530 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1532 InstructionCost ExtractCost
1533 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1534 return ExtendCost + ExtractCost;
1535 }
1536 InstructionCost ExtendCost
1537 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1539 InstructionCost InsertCost
1540 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1541 InstructionCost TruncCost
1542 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
1544 return ExtendCost + InsertCost + TruncCost;
1545 }
1546
1547
1548 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
1549 // and vslideup + vmv.s.x to insert element to vector.
1550 unsigned BaseCost = 1;
1551 // For insertelement we additionally need an addi to add 1 to the index, which is used as the input of vslideup.
1552 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
1553
1554 if (Index != -1U) {
1555 // The type may be split. For fixed-width vectors we can normalize the
1556 // index to the new type.
1557 if (LT.second.isFixedLengthVector()) {
1558 unsigned Width = LT.second.getVectorNumElements();
1559 Index = Index % Width;
1560 }
1561
1562 // We could extract/insert the first element without vslidedown/vslideup.
1563 if (Index == 0)
1564 SlideCost = 0;
1565 else if (Opcode == Instruction::InsertElement)
1566 SlideCost = 1; // With a constant index, we do not need to use addi.
1567 }
1568
1569 // Extracting an i64 element on a target with XLEN=32 needs more instructions.
1570 if (Val->getScalarType()->isIntegerTy() &&
1571 ST->getXLen() < Val->getScalarSizeInBits()) {
1572 // For extractelement, we need the following instructions:
1573 // vsetivli zero, 1, e64, m1, ta, mu (not count)
1574 // vslidedown.vx v8, v8, a0
1575 // vmv.x.s a0, v8
1576 // li a1, 32
1577 // vsrl.vx v8, v8, a1
1578 // vmv.x.s a1, v8
1579
1580 // For insertelement, we need the following instructions:
1581 // vsetivli zero, 2, e32, m4, ta, mu (not count)
1582 // vmv.v.i v12, 0
1583 // vslide1up.vx v16, v12, a1
1584 // vslide1up.vx v12, v16, a0
1585 // addi a0, a2, 1
1586 // vsetvli zero, a0, e64, m4, tu, mu (not count)
1587 // vslideup.vx v8, v12, a2
1588
1589 // TODO: should we count these special vsetvlis?
1590 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
1591 }
1592 return BaseCost + SlideCost;
1593}
1594
1595InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
1596 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1597 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1598 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1599
1600 // TODO: Handle more cost kinds.
1601 if (CostKind != TTI::TCK_RecipThroughput)
1602 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1603 Args, CxtI);
1604
1605 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1606 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1607 Args, CxtI);
1608
1609 // Skip if scalar size of Ty is bigger than ELEN.
1610 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
1611 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1612 Args, CxtI);
1613
1614 // Legalize the type.
1615 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1616
1617 // TODO: Handle scalar type.
1618 if (!LT.second.isVector())
1619 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1620 Args, CxtI);
1621
1622
1623 auto getConstantMatCost =
1624 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
1625 if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
1626 // Two sub-cases:
1627 // * Has a 5 bit immediate operand which can be splatted.
1628 // * Has a larger immediate which must be materialized in scalar register
1629 // We return 0 for both as we currently ignore the cost of materializing
1630 // scalar constants in GPRs.
1631 return 0;
1632
1633 return getConstantPoolLoadCost(Ty, CostKind);
1634 };
1635
1636 // Add the cost of materializing any constant vectors required.
1637 InstructionCost ConstantMatCost = 0;
1638 if (Op1Info.isConstant())
1639 ConstantMatCost += getConstantMatCost(0, Op1Info);
1640 if (Op2Info.isConstant())
1641 ConstantMatCost += getConstantMatCost(1, Op2Info);
1642
1643 unsigned Op;
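 // Map the IR opcode to a representative RVV instruction; its LMUL-scaled
 // cost is used as a stand-in for the whole class of similar operations.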
1644 switch (TLI->InstructionOpcodeToISD(Opcode)) {
1645 case ISD::ADD:
1646 case ISD::SUB:
1647 Op = RISCV::VADD_VV;
1648 break;
1649 case ISD::SHL:
1650 case ISD::SRL:
1651 case ISD::SRA:
1652 Op = RISCV::VSLL_VV;
1653 break;
1654 case ISD::AND:
1655 case ISD::OR:
1656 case ISD::XOR:
1657 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
1658 break;
1659 case ISD::MUL:
1660 case ISD::MULHS:
1661 case ISD::MULHU:
1662 Op = RISCV::VMUL_VV;
1663 break;
1664 case ISD::SDIV:
1665 case ISD::UDIV:
1666 Op = RISCV::VDIV_VV;
1667 break;
1668 case ISD::SREM:
1669 case ISD::UREM:
1670 Op = RISCV::VREM_VV;
1671 break;
1672 case ISD::FADD:
1673 case ISD::FSUB:
1674 // TODO: Address FP16 with VFHMIN
1675 Op = RISCV::VFADD_VV;
1676 break;
1677 case ISD::FMUL:
1678 // TODO: Address FP16 with VFHMIN
1679 Op = RISCV::VFMUL_VV;
1680 break;
1681 case ISD::FDIV:
1682 Op = RISCV::VFDIV_VV;
1683 break;
1684 case ISD::FNEG:
1685 Op = RISCV::VFSGNJN_VV;
1686 break;
1687 default:
1688 // Assuming all other instructions have the same cost until a need arises to
1689 // differentiate them.
1690 return ConstantMatCost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
1691 Op1Info, Op2Info,
1692 Args, CxtI);
1693 }
1694 return ConstantMatCost +
1695 LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1696}
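// [Editorial sketch, not part of the upstream file] How this hook resolves a
// vector multiply; the TTI handle, the LLVMContext Ctx and the chosen type
// are assumptions of the example:
//
//   InstructionCost C = TTI.getArithmeticInstrCost(
//       Instruction::Mul, ScalableVectorType::get(Type::getInt32Ty(Ctx), 4),
//       TTI::TCK_RecipThroughput);
//
// The type legalizes to nxv4i32, ISD::MUL maps to RISCV::VMUL_VV above, and
// the result is LT.first * getRISCVInstructionCost(VMUL_VV, nxv4i32, ...),
// plus any constant-materialization cost for constant operands.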
1697
1698// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
1699 InstructionCost RISCVTTIImpl::getPointersChainCost(
1700 ArrayRef<const Value *> Ptrs, const Value *Base,
1701 const TTI::PointersChainInfo &Info, Type *AccessTy,
1702 TTI::TargetCostKind CostKind) {
1703 InstructionCost Cost = TTI::TCC_Free;
1704 // In the basic model we only take GEP instructions into account
1705 // (although an alloca instruction, a value, constants and/or constant
1706 // expressions, PHIs, bitcasts ... whatever is allowed to be used as a
1707 // pointer may also appear here). Typically, if Base is not a GEP
1708 // instruction and all the pointers are relative to the same base address,
1709 // all the rest are either GEP instructions, PHIs, bitcasts or constants.
1710 // When they share a base, we just model each non-Base GEP as an ADD
1711 // operation if any of its indices is non-constant.
1712 // If there are no known dependencies between the pointers, the cost is
1713 // calculated as the sum of the costs of the GEP instructions.
1714 for (auto [I, V] : enumerate(Ptrs)) {
1715 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
1716 if (!GEP)
1717 continue;
1718 if (Info.isSameBase() && V != Base) {
1719 if (GEP->hasAllConstantIndices())
1720 continue;
1721 // If the chain is unit-stride and BaseReg + stride*i is a legal
1722 // addressing mode, then presume the base GEP is sitting around in a
1723 // register somewhere and check if we can fold the offset relative to
1724 // it.
1725 unsigned Stride = DL.getTypeStoreSize(AccessTy);
1726 if (Info.isUnitStride() &&
1727 isLegalAddressingMode(AccessTy,
1728 /* BaseGV */ nullptr,
1729 /* BaseOffset */ Stride * I,
1730 /* HasBaseReg */ true,
1731 /* Scale */ 0,
1732 GEP->getType()->getPointerAddressSpace()))
1733 continue;
1734 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
1735 {TTI::OK_AnyValue, TTI::OP_None},
1736 {TTI::OK_AnyValue, TTI::OP_None},
1737 std::nullopt);
1738 } else {
1739 SmallVector<const Value *> Indices(GEP->indices());
1740 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
1741 Indices, AccessTy, CostKind);
1742 }
1743 }
1744 return Cost;
1745}
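// [Editorial note, not part of the upstream file] In the model above, GEPs
// off the same base with all-constant indices are free, and a unit-stride
// chain whose byte offset (Stride * I) fits a legal reg+imm addressing mode
// is also free; every remaining non-Base GEP is charged as a single Add via
// getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind, ...).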
1746
1747 void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1748 TTI::UnrollingPreferences &UP,
1749 OptimizationRemarkEmitter *ORE) {
1750 // TODO: More tuning based on benchmarks and metrics, with changes as
1751 // needed, should be applied to all the settings below to improve performance.
1752
1753
1754 if (ST->enableDefaultUnroll())
1755 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
1756
1757 // Enable upper-bound unrolling universally, independent of the
1758 // conditions below.
1759 UP.UpperBound = true;
1760
1761 // Disable loop unrolling for Oz and Os.
1762 UP.OptSizeThreshold = 0;
1763 UP.PartialOptSizeThreshold = 0;
1764 if (L->getHeader()->getParent()->hasOptSize())
1765 return;
1766
1767 SmallVector<BasicBlock *, 4> ExitingBlocks;
1768 L->getExitingBlocks(ExitingBlocks);
1769 LLVM_DEBUG(dbgs() << "Loop has:\n"
1770 << "Blocks: " << L->getNumBlocks() << "\n"
1771 << "Exit blocks: " << ExitingBlocks.size() << "\n");
1772
1773 // Allow at most one exit in addition to the latch. This acts as an early
1774 // exit, as it mirrors the profitability calculation of the runtime unroller.
1775 if (ExitingBlocks.size() > 2)
1776 return;
1777
1778 // Limit the CFG of the loop body for targets with a branch predictor.
1779 // Allowing 4 blocks permits if-then-else diamonds in the body.
1780 if (L->getNumBlocks() > 4)
1781 return;
1782
1783 // Don't unroll vectorized loops, including the remainder loop
1784 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
1785 return;
1786
1787 // Scan the loop: don't unroll loops with calls as this could prevent
1788 // inlining.
1789 InstructionCost Cost = 0;
1790 for (auto *BB : L->getBlocks()) {
1791 for (auto &I : *BB) {
1792 // Initial setting - Don't unroll loops containing vectorized
1793 // instructions.
1794 if (I.getType()->isVectorTy())
1795 return;
1796
1797 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1798 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1799 if (!isLoweredToCall(F))
1800 continue;
1801 }
1802 return;
1803 }
1804
1805 SmallVector<const Value *> Operands(I.operand_values());
1806 Cost += getInstructionCost(&I, Operands,
1807 TargetTransformInfo::TCK_SizeAndLatency);
1808 }
1809 }
1810
1811 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1812
1813 UP.Partial = true;
1814 UP.Runtime = true;
1815 UP.UnrollRemainder = true;
1816 UP.UnrollAndJam = true;
1817 UP.UnrollAndJamInnerLoopThreshold = 60;
1818
1819 // Forcing unrolling of small loops can be very useful because of the
1820 // branch-taken cost of the backedge.
1821 if (Cost < 12)
1822 UP.Force = true;
1823}
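// [Editorial note, not part of the upstream file] Putting the heuristic above
// together: a loop of at most four blocks, with at most one non-latch exit,
// no calls, no vector values, and a summed TCK_SizeAndLatency cost below 12
// gets Partial/Runtime/UnrollRemainder/UnrollAndJam enabled and is
// force-unrolled; larger or call-containing loops fall through untouched.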
1824
1825 void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1826 TTI::PeelingPreferences &PP) {
1827 BaseT::getPeelingPreferences(L, SE, PP);
1828}
1829
1830 unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
1831 TypeSize Size = DL.getTypeSizeInBits(Ty);
1832 if (Ty->isVectorTy()) {
1833 if (Size.isScalable() && ST->hasVInstructions())
1834 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
1835
1836 if (ST->useRVVForFixedLengthVectors())
1837 return divideCeil(Size, ST->getRealMinVLen());
1838 }
1839
1840 return BaseT::getRegUsageForType(Ty);
1841}
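// [Editorial example, not part of the upstream file] A <vscale x 4 x i32>
// value has a known-minimum size of 128 bits, so with V available it counts
// as divideCeil(128, RISCV::RVVBitsPerBlock = 64) = 2 registers; a fixed
// <8 x i32> (256 bits) on an assumed Zvl128b subtarget (RealMinVLen = 128)
// likewise counts as divideCeil(256, 128) = 2 when RVV is used for
// fixed-length vectors.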
1842
1843unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1844 if (SLPMaxVF.getNumOccurrences())
1845 return SLPMaxVF;
1846
1847 // Return how many elements can fit in getRegisterBitWidth. This is the
1848 // same routine as is used in the LoopVectorizer. We should probably be
1849 // accounting for whether we actually have instructions with the right
1850 // lane type, but we don't have enough information to do that without
1851 // some additional plumbing which hasn't been justified yet.
1852 TypeSize RegWidth =
1853 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
1854 // If no vector registers, or absurd element widths, disable
1855 // vectorization by returning 1.
1856 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
1857}
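// [Editorial example, not part of the upstream file] If getRegisterBitWidth
// reports, say, a 128-bit fixed-width vector register and ElemWidth is 32,
// the SLP vectorizer is offered a maximum VF of 128 / 32 = 4; the
// -riscv-v-slp-max-vf option overrides this computation entirely.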
1858
1859 bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
1860 const TargetTransformInfo::LSRCost &C2) {
1861 // The RISC-V-specific choice here is to give the instruction count first priority.
1862 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
1863 C1.NumIVMuls, C1.NumBaseAdds,
1864 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
1865 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
1866 C2.NumIVMuls, C2.NumBaseAdds,
1867 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
1868}
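// [Editorial example, not part of the upstream file] Because Insns is
// compared first, an LSR candidate with {Insns = 3, NumRegs = 6} is
// preferred over one with {Insns = 4, NumRegs = 2}; the later fields only
// break ties lexicographically.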
1869
1870 bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
1871 auto *VTy = dyn_cast<VectorType>(DataTy);
1872 if (!VTy || VTy->isScalableTy())
1873 return false;
1874
1875 if (!isLegalMaskedLoadStore(DataTy, Alignment))
1876 return false;
1877 return true;
1878}
1879
1880 bool RISCVTTIImpl::areInlineCompatible(const Function *Caller,
1881 const Function *Callee) const {
1882 const TargetMachine &TM = getTLI()->getTargetMachine();
1883
1884 const FeatureBitset &CallerBits =
1885 TM.getSubtargetImpl(*Caller)->getFeatureBits();
1886 const FeatureBitset &CalleeBits =
1887 TM.getSubtargetImpl(*Callee)->getFeatureBits();
1888
1889 // Inline a callee if its target-features are a subset of the caller's
1890 // target-features.
1891 return (CallerBits & CalleeBits) == CalleeBits;
1892}
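// [Editorial example, not part of the upstream file] A caller built with
// features such as {+m, +a, +v, +zba} may inline a callee requiring {+m, +v},
// since (CallerBits & CalleeBits) == CalleeBits; a callee that additionally
// requires a feature the caller lacks (e.g. +zvfh) is rejected.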