1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
18#include <cmath>
19#include <optional>
20using namespace llvm;
21using namespace llvm::PatternMatch;
22
23#define DEBUG_TYPE "riscvtti"
24
26 "riscv-v-register-bit-width-lmul",
28 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
29 "by autovectorized code. Fractional LMULs are not supported."),
31
33 "riscv-v-slp-max-vf",
35 "Overrides result used for getMaximumVF query which is used "
36 "exclusively by SLP vectorizer."),
38
40RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
42 // Check if the type is valid for all CostKind
43 if (!VT.isVector())
45 size_t NumInstr = OpCodes.size();
47 return NumInstr;
48 InstructionCost LMULCost = TLI->getLMULCost(VT);
50 return LMULCost * NumInstr;
52 for (auto Op : OpCodes) {
53 switch (Op) {
54 case RISCV::VRGATHER_VI:
55 Cost += TLI->getVRGatherVICost(VT);
56 break;
57 case RISCV::VRGATHER_VV:
58 Cost += TLI->getVRGatherVVCost(VT);
59 break;
60 case RISCV::VSLIDEUP_VI:
61 case RISCV::VSLIDEDOWN_VI:
62 Cost += TLI->getVSlideVICost(VT);
63 break;
64 case RISCV::VSLIDEUP_VX:
65 case RISCV::VSLIDEDOWN_VX:
66 Cost += TLI->getVSlideVXCost(VT);
67 break;
68 case RISCV::VREDMAX_VS:
69 case RISCV::VREDMIN_VS:
70 case RISCV::VREDMAXU_VS:
71 case RISCV::VREDMINU_VS:
72 case RISCV::VREDSUM_VS:
73 case RISCV::VREDAND_VS:
74 case RISCV::VREDOR_VS:
75 case RISCV::VREDXOR_VS:
76 case RISCV::VFREDMAX_VS:
77 case RISCV::VFREDMIN_VS:
78 case RISCV::VFREDUSUM_VS: {
79 unsigned VL = VT.getVectorMinNumElements();
80 if (!VT.isFixedLengthVector())
81 VL *= *getVScaleForTuning();
82 Cost += Log2_32_Ceil(VL);
83 break;
84 }
85 case RISCV::VFREDOSUM_VS: {
86 unsigned VL = VT.getVectorMinNumElements();
87 if (!VT.isFixedLengthVector())
88 VL *= *getVScaleForTuning();
89 Cost += VL;
90 break;
91 }
92 case RISCV::VMV_X_S:
93 case RISCV::VMV_S_X:
94 case RISCV::VFMV_F_S:
95 case RISCV::VFMV_S_F:
96 case RISCV::VMOR_MM:
97 case RISCV::VMXOR_MM:
98 case RISCV::VMAND_MM:
99 case RISCV::VMANDN_MM:
100 case RISCV::VMNAND_MM:
101 case RISCV::VCPOP_M:
102 case RISCV::VFIRST_M:
103 Cost += 1;
104 break;
105 default:
106 Cost += LMULCost;
107 }
108 }
109 return Cost;
110}
111
114 assert(Ty->isIntegerTy() &&
115 "getIntImmCost can only estimate cost of materialising integers");
116
117 // We have a Zero register, so 0 is always free.
118 if (Imm == 0)
119 return TTI::TCC_Free;
120
121 // Otherwise, we check how many instructions it will take to materialise.
122 const DataLayout &DL = getDataLayout();
123 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *getST());
124}
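// For example, a 32-bit constant such as 0x12345678 typically materialises
// as a LUI+ADDI(W) pair, so its cost here is 2, while 0 is always free
// thanks to the zero register.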
125
126// Look for patterns of shift followed by AND that can be turned into a pair of
127// shifts. We won't need to materialize an immediate for the AND so these can
128// be considered free.
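// For example, with a 64-bit mask of 0x00000000ffff0000 (16 trailing zeros)
// and a shift amount of 16:
//   (and (shl x, 16), 0xffff0000) -> (srli (slli x, 48), 32)
// so no constant needs to be materialized for the AND mask.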
129static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
130 uint64_t Mask = Imm.getZExtValue();
131 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
132 if (!BO || !BO->hasOneUse())
133 return false;
134
135 if (BO->getOpcode() != Instruction::Shl)
136 return false;
137
138 if (!isa<ConstantInt>(BO->getOperand(1)))
139 return false;
140
141 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
142 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
143 // is a mask shifted by c2 bits with c3 leading zeros.
144 if (isShiftedMask_64(Mask)) {
145 unsigned Trailing = llvm::countr_zero(Mask);
146 if (ShAmt == Trailing)
147 return true;
148 }
149
150 return false;
151}
152
154 const APInt &Imm, Type *Ty,
156 Instruction *Inst) {
157 assert(Ty->isIntegerTy() &&
158 "getIntImmCost can only estimate cost of materialising integers");
159
160 // We have a Zero register, so 0 is always free.
161 if (Imm == 0)
162 return TTI::TCC_Free;
163
 164 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
 165 // commutative; in others the immediate comes from a specific argument index.
166 bool Takes12BitImm = false;
167 unsigned ImmArgIdx = ~0U;
168
169 switch (Opcode) {
170 case Instruction::GetElementPtr:
171 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
172 // split up large offsets in GEP into better parts than ConstantHoisting
173 // can.
174 return TTI::TCC_Free;
175 case Instruction::Store:
176 // If the address is a constant, use the materialization cost.
177 if (Idx == 1)
178 return getIntImmCost(Imm, Ty, CostKind);
179 return TTI::TCC_Free;
180 case Instruction::Load:
181 // If the address is a constant, use the materialization cost.
182 return getIntImmCost(Imm, Ty, CostKind);
183 case Instruction::And:
184 // zext.h
185 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
186 return TTI::TCC_Free;
187 // zext.w
188 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
189 return TTI::TCC_Free;
190 // bclri
191 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
192 return TTI::TCC_Free;
193 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
194 canUseShiftPair(Inst, Imm))
195 return TTI::TCC_Free;
196 Takes12BitImm = true;
197 break;
198 case Instruction::Add:
199 Takes12BitImm = true;
200 break;
201 case Instruction::Or:
202 case Instruction::Xor:
203 // bseti/binvi
204 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
205 return TTI::TCC_Free;
206 Takes12BitImm = true;
207 break;
208 case Instruction::Mul:
209 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
210 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
211 return TTI::TCC_Free;
212 // One more or less than a power of 2 can use SLLI+ADD/SUB.
213 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
214 return TTI::TCC_Free;
215 // FIXME: There is no MULI instruction.
216 Takes12BitImm = true;
217 break;
218 case Instruction::Sub:
219 case Instruction::Shl:
220 case Instruction::LShr:
221 case Instruction::AShr:
222 Takes12BitImm = true;
223 ImmArgIdx = 1;
224 break;
225 default:
226 break;
227 }
228
229 if (Takes12BitImm) {
230 // Check immediate is the correct argument...
231 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
232 // ... and fits into the 12-bit immediate.
233 if (Imm.getSignificantBits() <= 64 &&
234 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
235 return TTI::TCC_Free;
236 }
237 }
238
239 // Otherwise, use the full materialisation cost.
240 return getIntImmCost(Imm, Ty, CostKind);
241 }
242
243 // By default, prevent hoisting.
244 return TTI::TCC_Free;
245}
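// For instance, an `add` with immediate 100 fits the signed 12-bit range
// [-2048, 2047] and is reported as TCC_Free, while an `add` with immediate
// 4096 does not and is charged the full materialisation cost so that
// ConstantHoisting can consider reusing it.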
246
249 const APInt &Imm, Type *Ty,
251 // Prevent hoisting in unknown cases.
252 return TTI::TCC_Free;
253}
254
255bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
256 return ST->hasVInstructions();
257}
258
261 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
262 return ST->hasStdExtZbb() || ST->hasVendorXCVbitmanip()
265}
266
268 // Currently, the ExpandReductions pass can't expand scalable-vector
269 // reductions, but we still request expansion as RVV doesn't support certain
270 // reductions and the SelectionDAG can't legalize them either.
271 switch (II->getIntrinsicID()) {
272 default:
273 return false;
274 // These reductions have no equivalent in RVV
275 case Intrinsic::vector_reduce_mul:
276 case Intrinsic::vector_reduce_fmul:
277 return true;
278 }
279}
280
281std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
282 if (ST->hasVInstructions())
284 return BaseT::getMaxVScale();
285}
286
287std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
288 if (ST->hasVInstructions())
289 if (unsigned MinVLen = ST->getRealMinVLen();
290 MinVLen >= RISCV::RVVBitsPerBlock)
291 return MinVLen / RISCV::RVVBitsPerBlock;
293}
294
297 unsigned LMUL =
298 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
299 switch (K) {
301 return TypeSize::getFixed(ST->getXLen());
303 return TypeSize::getFixed(
304 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
307 (ST->hasVInstructions() &&
310 : 0);
311 }
312
313 llvm_unreachable("Unsupported register kind");
314}
315
317RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
318 // Add a cost of address generation + the cost of the load. The address
319 // is expected to be a PC relative offset to a constant pool entry
320 // using auipc/addi.
321 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
322 /*AddressSpace=*/0, CostKind);
323}
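// This is used below to price vrgather index/mask constants and non-uniform
// constant operands that have to be pulled from the constant pool.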
324
326 LLVMContext &C) {
327 assert((DataVT.getScalarSizeInBits() != 8 ||
328 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
329 MVT IndexVT = DataVT.changeTypeToInteger();
330 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
331 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
332 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
333}
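// Falling back to i16 indices above mirrors the vrgatherei16 form used when
// the natural index width would exceed XLEN (e.g. i64 elements on RV32).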
334
336 VectorType *Tp, ArrayRef<int> Mask,
338 int Index, VectorType *SubTp,
340 const Instruction *CxtI) {
341 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
342
343 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
344
345 // First, handle cases where having a fixed length vector enables us to
346 // give a more accurate cost than falling back to generic scalable codegen.
347 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
348 if (isa<FixedVectorType>(Tp)) {
349 switch (Kind) {
350 default:
351 break;
353 if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
354 MVT EltTp = LT.second.getVectorElementType();
355 // If the size of the element is < ELEN then shuffles of interleaves and
356 // deinterleaves of 2 vectors can be lowered into the following
357 // sequences
358 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
359 // Example sequence:
360 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
361 // vwaddu.vv v10, v8, v9
362 // li a0, -1 (ignored)
363 // vwmaccu.vx v10, a0, v9
364 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
365 return 2 * LT.first * TLI->getLMULCost(LT.second);
366
367 if (Mask[0] == 0 || Mask[0] == 1) {
368 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
369 // Example sequence:
370 // vnsrl.wi v10, v8, 0
371 if (equal(DeinterleaveMask, Mask))
372 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
373 LT.second, CostKind);
374 }
375 }
376 }
377 // vrgather + cost of generating the mask constant.
378 // We model this for an unknown mask with a single vrgather.
379 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
380 (LT.second.getScalarSizeInBits() != 8 ||
381 LT.second.getVectorNumElements() <= 256)) {
382 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());
383 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
384 return IndexCost +
385 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
386 }
387 [[fallthrough]];
388 }
391 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
392 // register for the second vrgather. We model this for an unknown
393 // (shuffle) mask.
394 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
395 (LT.second.getScalarSizeInBits() != 8 ||
396 LT.second.getVectorNumElements() <= 256)) {
397 auto &C = Tp->getContext();
398 auto EC = Tp->getElementCount();
399 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
401 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
402 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
403 return 2 * IndexCost +
404 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
405 LT.second, CostKind) +
406 MaskCost;
407 }
408 [[fallthrough]];
409 }
410 case TTI::SK_Select: {
 411 // We are going to permute multiple sources and the result will be in
 412 // multiple destinations. We provide an accurate cost only for splits
 413 // where the element type remains the same.
414 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
415 LT.second.isFixedLengthVector() &&
416 LT.second.getVectorElementType().getSizeInBits() ==
418 LT.second.getVectorNumElements() <
419 cast<FixedVectorType>(Tp)->getNumElements() &&
420 divideCeil(Mask.size(),
421 cast<FixedVectorType>(Tp)->getNumElements()) ==
422 static_cast<unsigned>(*LT.first.getValue())) {
423 unsigned NumRegs = *LT.first.getValue();
424 unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
425 unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
426 auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);
427
429 for (unsigned I = 0; I < NumRegs; ++I) {
430 bool IsSingleVector = true;
431 SmallVector<int> SubMask(SubVF, PoisonMaskElem);
432 transform(Mask.slice(I * SubVF,
433 I == NumRegs - 1 ? Mask.size() % SubVF : SubVF),
434 SubMask.begin(), [&](int I) {
435 bool SingleSubVector = I / VF == 0;
436 IsSingleVector &= SingleSubVector;
437 return (SingleSubVector ? 0 : 1) * SubVF + I % VF;
438 });
441 SubVecTy, SubMask, CostKind, 0, nullptr);
442 return Cost;
443 }
444 }
445 break;
446 }
447 }
448 };
449
450 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
451 switch (Kind) {
452 default:
453 // Fallthrough to generic handling.
454 // TODO: Most of these cases will return getInvalid in generic code, and
455 // must be implemented here.
456 break;
458 // Extract at zero is always a subregister extract
459 if (Index == 0)
460 return TTI::TCC_Free;
461
462 // If we're extracting a subvector of at most m1 size at a sub-register
463 // boundary - which unfortunately we need exact vlen to identify - this is
464 // a subregister extract at worst and thus won't require a vslidedown.
465 // TODO: Extend for aligned m2, m4 subvector extracts
 466 // TODO: Extend for misaligned (but contained) extracts
467 // TODO: Extend for scalable subvector types
468 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
469 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
470 const unsigned MinVLen = ST->getRealMinVLen();
471 const unsigned MaxVLen = ST->getRealMaxVLen();
472 if (MinVLen == MaxVLen &&
473 SubLT.second.getScalarSizeInBits() * Index % MinVLen == 0 &&
474 SubLT.second.getSizeInBits() <= MinVLen)
475 return TTI::TCC_Free;
476 }
477
478 // Example sequence:
479 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
480 // vslidedown.vi v8, v9, 2
481 return LT.first *
482 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
484 // Example sequence:
485 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
486 // vslideup.vi v8, v9, 2
487 return LT.first *
488 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
489 case TTI::SK_Select: {
490 // Example sequence:
491 // li a0, 90
492 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
493 // vmv.s.x v0, a0
494 // vmerge.vvm v8, v9, v8, v0
495 // We use 2 for the cost of the mask materialization as this is the true
496 // cost for small masks and most shuffles are small. At worst, this cost
497 // should be a very small constant for the constant pool load. As such,
 498 // we may bias towards large selects slightly more than truly warranted.
499 return LT.first *
500 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
501 LT.second, CostKind));
502 }
503 case TTI::SK_Broadcast: {
504 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
505 Instruction::InsertElement);
506 if (LT.second.getScalarSizeInBits() == 1) {
507 if (HasScalar) {
508 // Example sequence:
509 // andi a0, a0, 1
510 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
511 // vmv.v.x v8, a0
512 // vmsne.vi v0, v8, 0
513 return LT.first *
514 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
515 LT.second, CostKind));
516 }
517 // Example sequence:
518 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
519 // vmv.v.i v8, 0
520 // vmerge.vim v8, v8, 1, v0
521 // vmv.x.s a0, v8
522 // andi a0, a0, 1
523 // vmv.v.x v8, a0
524 // vmsne.vi v0, v8, 0
525
526 return LT.first *
527 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
528 RISCV::VMV_X_S, RISCV::VMV_V_X,
529 RISCV::VMSNE_VI},
530 LT.second, CostKind));
531 }
532
533 if (HasScalar) {
534 // Example sequence:
535 // vmv.v.x v8, a0
536 return LT.first *
537 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
538 }
539
540 // Example sequence:
541 // vrgather.vi v9, v8, 0
542 return LT.first *
543 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
544 }
545 case TTI::SK_Splice: {
546 // vslidedown+vslideup.
547 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
548 // of similar code, but I think we expand through memory.
549 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
550 if (Index >= 0 && Index < 32)
551 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
552 else if (Index < 0 && Index > -32)
553 Opcodes[1] = RISCV::VSLIDEUP_VI;
554 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
555 }
556 case TTI::SK_Reverse: {
557 // TODO: Cases to improve here:
558 // * Illegal vector types
559 // * i64 on RV32
560 // * i1 vector
561 // At low LMUL, most of the cost is producing the vrgather index register.
562 // At high LMUL, the cost of the vrgather itself will dominate.
563 // Example sequence:
564 // csrr a0, vlenb
565 // srli a0, a0, 3
566 // addi a0, a0, -1
567 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
568 // vid.v v9
569 // vrsub.vx v10, v9, a0
570 // vrgather.vv v9, v8, v10
571 InstructionCost LenCost = 3;
572 if (LT.second.isFixedLengthVector())
573 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
574 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
575 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
576 if (LT.second.isFixedLengthVector() &&
577 isInt<5>(LT.second.getVectorNumElements() - 1))
578 Opcodes[1] = RISCV::VRSUB_VI;
579 InstructionCost GatherCost =
580 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
 581 // Mask operations additionally require an extend and a truncate
582 InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
583 return LT.first * (LenCost + GatherCost + ExtendCost);
584 }
585 }
586 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
587}
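// Rough example: reversing a type that legalizes to a single register costs
// 3 scalar instructions to compute VL-1 plus the vid.v/vrsub/vrgather.vv
// sequence, i.e. about 6 when each vector op costs 1 at that LMUL; i1
// vectors pay the extra extend/truncate noted above.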
588
590RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
591 unsigned AddressSpace,
593 if (!isLegalMaskedLoadStore(Src, Alignment) ||
595 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
596 CostKind);
597
598 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
599}
600
602 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
603 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
604 bool UseMaskForCond, bool UseMaskForGaps) {
605 if (isa<ScalableVectorType>(VecTy) && Factor != 2)
607
 608 // The interleaved memory access pass will lower interleaved memory ops (i.e.
 609 // a load or store combined with a specific shuffle) to vlseg/vsseg
 610 // intrinsics. In those cases we can treat it as if it were just one (legal)
 611 // memory op.
612 if (!UseMaskForCond && !UseMaskForGaps &&
613 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
614 auto *VTy = cast<VectorType>(VecTy);
615 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
 616 // Need to make sure the type hasn't been scalarized
617 if (LT.second.isVector()) {
618 auto *SubVecTy =
619 VectorType::get(VTy->getElementType(),
620 VTy->getElementCount().divideCoefficientBy(Factor));
621
622 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
623 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
624 AddressSpace, DL)) {
625 // FIXME: We use the memory op cost of the *legalized* type here,
 626 // because getMemoryOpCost otherwise returns a really expensive cost for
627 // types like <6 x i8>, which show up when doing interleaves of
628 // Factor=3 etc. Should the memory op cost of these be cheaper?
629 auto *LegalVTy = VectorType::get(VTy->getElementType(),
630 LT.second.getVectorElementCount());
631 InstructionCost LegalMemCost = getMemoryOpCost(
632 Opcode, LegalVTy, Alignment, AddressSpace, CostKind);
633 return LT.first + LegalMemCost;
634 }
635 }
636 }
637
 638 // TODO: Return the cost of interleaved accesses for scalable vectors when
 639 // we are unable to convert them to segment access instructions.
640 if (isa<ScalableVectorType>(VecTy))
642
643 auto *FVTy = cast<FixedVectorType>(VecTy);
644 InstructionCost MemCost =
645 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
646 unsigned VF = FVTy->getNumElements() / Factor;
647
648 // An interleaved load will look like this for Factor=3:
649 // %wide.vec = load <12 x i32>, ptr %3, align 4
650 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
651 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
652 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
653 if (Opcode == Instruction::Load) {
654 InstructionCost Cost = MemCost;
655 for (unsigned Index : Indices) {
656 FixedVectorType *SubVecTy =
657 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
658 auto Mask = createStrideMask(Index, Factor, VF);
659 InstructionCost ShuffleCost =
661 CostKind, 0, nullptr, {});
662 Cost += ShuffleCost;
663 }
664 return Cost;
665 }
666
667 // TODO: Model for NF > 2
668 // We'll need to enhance getShuffleCost to model shuffles that are just
669 // inserts and extracts into subvectors, since they won't have the full cost
670 // of a vrgather.
671 // An interleaved store for 3 vectors of 4 lanes will look like
672 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
673 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
674 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
675 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
676 // store <12 x i32> %interleaved.vec, ptr %10, align 4
677 if (Factor != 2)
678 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
679 Alignment, AddressSpace, CostKind,
680 UseMaskForCond, UseMaskForGaps);
681
682 assert(Opcode == Instruction::Store && "Opcode must be a store");
683 // For an interleaving store of 2 vectors, we perform one large interleaving
684 // shuffle that goes into the wide store
685 auto Mask = createInterleaveMask(VF, Factor);
686 InstructionCost ShuffleCost =
688 CostKind, 0, nullptr, {});
689 return MemCost + ShuffleCost;
690}
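// E.g. a Factor=2 store of two <4 x i32> vectors is modelled as one wide
// <8 x i32> store plus a single interleaving shuffle, rather than two
// separate stores.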
691
693 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
694 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
696 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
697 Alignment, CostKind, I);
698
699 if ((Opcode == Instruction::Load &&
700 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
701 (Opcode == Instruction::Store &&
702 !isLegalMaskedScatter(DataTy, Align(Alignment))))
703 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
704 Alignment, CostKind, I);
705
706 // Cost is proportional to the number of memory operations implied. For
707 // scalable vectors, we use an estimate on that number since we don't
708 // know exactly what VL will be.
709 auto &VTy = *cast<VectorType>(DataTy);
710 InstructionCost MemOpCost =
711 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
712 {TTI::OK_AnyValue, TTI::OP_None}, I);
713 unsigned NumLoads = getEstimatedVLFor(&VTy);
714 return NumLoads * MemOpCost;
715}
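// E.g. a gather of <4 x i32> is costed as 4 scalar loads; for scalable types
// the element count is estimated from the tuning vscale via
// getEstimatedVLFor.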
716
718 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
719 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
720 if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
721 !isLegalStridedLoadStore(DataTy, Alignment)) ||
722 (Opcode != Instruction::Load && Opcode != Instruction::Store))
723 return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
724 Alignment, CostKind, I);
725
727 return TTI::TCC_Basic;
728
729 // Cost is proportional to the number of memory operations implied. For
730 // scalable vectors, we use an estimate on that number since we don't
731 // know exactly what VL will be.
732 auto &VTy = *cast<VectorType>(DataTy);
733 InstructionCost MemOpCost =
734 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
735 {TTI::OK_AnyValue, TTI::OP_None}, I);
736 unsigned NumLoads = getEstimatedVLFor(&VTy);
737 return NumLoads * MemOpCost;
738}
739
740// Currently, these represent both throughput and codesize costs
741// for the respective intrinsics. The costs in this table are simply
742// instruction counts with the following adjustments made:
743// * One vsetvli is considered free.
745 {Intrinsic::floor, MVT::f32, 9},
746 {Intrinsic::floor, MVT::f64, 9},
747 {Intrinsic::ceil, MVT::f32, 9},
748 {Intrinsic::ceil, MVT::f64, 9},
749 {Intrinsic::trunc, MVT::f32, 7},
750 {Intrinsic::trunc, MVT::f64, 7},
751 {Intrinsic::round, MVT::f32, 9},
752 {Intrinsic::round, MVT::f64, 9},
753 {Intrinsic::roundeven, MVT::f32, 9},
754 {Intrinsic::roundeven, MVT::f64, 9},
755 {Intrinsic::rint, MVT::f32, 7},
756 {Intrinsic::rint, MVT::f64, 7},
757 {Intrinsic::lrint, MVT::i32, 1},
758 {Intrinsic::lrint, MVT::i64, 1},
759 {Intrinsic::llrint, MVT::i64, 1},
760 {Intrinsic::nearbyint, MVT::f32, 9},
761 {Intrinsic::nearbyint, MVT::f64, 9},
762 {Intrinsic::bswap, MVT::i16, 3},
763 {Intrinsic::bswap, MVT::i32, 12},
764 {Intrinsic::bswap, MVT::i64, 31},
765 {Intrinsic::vp_bswap, MVT::i16, 3},
766 {Intrinsic::vp_bswap, MVT::i32, 12},
767 {Intrinsic::vp_bswap, MVT::i64, 31},
768 {Intrinsic::vp_fshl, MVT::i8, 7},
769 {Intrinsic::vp_fshl, MVT::i16, 7},
770 {Intrinsic::vp_fshl, MVT::i32, 7},
771 {Intrinsic::vp_fshl, MVT::i64, 7},
772 {Intrinsic::vp_fshr, MVT::i8, 7},
773 {Intrinsic::vp_fshr, MVT::i16, 7},
774 {Intrinsic::vp_fshr, MVT::i32, 7},
775 {Intrinsic::vp_fshr, MVT::i64, 7},
776 {Intrinsic::bitreverse, MVT::i8, 17},
777 {Intrinsic::bitreverse, MVT::i16, 24},
778 {Intrinsic::bitreverse, MVT::i32, 33},
779 {Intrinsic::bitreverse, MVT::i64, 52},
780 {Intrinsic::vp_bitreverse, MVT::i8, 17},
781 {Intrinsic::vp_bitreverse, MVT::i16, 24},
782 {Intrinsic::vp_bitreverse, MVT::i32, 33},
783 {Intrinsic::vp_bitreverse, MVT::i64, 52},
784 {Intrinsic::ctpop, MVT::i8, 12},
785 {Intrinsic::ctpop, MVT::i16, 19},
786 {Intrinsic::ctpop, MVT::i32, 20},
787 {Intrinsic::ctpop, MVT::i64, 21},
788 {Intrinsic::vp_ctpop, MVT::i8, 12},
789 {Intrinsic::vp_ctpop, MVT::i16, 19},
790 {Intrinsic::vp_ctpop, MVT::i32, 20},
791 {Intrinsic::vp_ctpop, MVT::i64, 21},
792 {Intrinsic::vp_ctlz, MVT::i8, 19},
793 {Intrinsic::vp_ctlz, MVT::i16, 28},
794 {Intrinsic::vp_ctlz, MVT::i32, 31},
795 {Intrinsic::vp_ctlz, MVT::i64, 35},
796 {Intrinsic::vp_cttz, MVT::i8, 16},
797 {Intrinsic::vp_cttz, MVT::i16, 23},
798 {Intrinsic::vp_cttz, MVT::i32, 24},
799 {Intrinsic::vp_cttz, MVT::i64, 25},
800};
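// Example: a legal vector bswap on i32 elements is looked up above as 12
// instructions, and the final cost below is that entry multiplied by
// LT.first for types that legalize to several registers.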
801
803 switch (ID) {
804#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
805 case Intrinsic::VPID: \
806 return ISD::VPSD;
807#include "llvm/IR/VPIntrinsics.def"
808#undef HELPER_MAP_VPID_TO_VPSD
809 }
810 return ISD::DELETED_NODE;
811}
812
816 auto *RetTy = ICA.getReturnType();
817 switch (ICA.getID()) {
818 case Intrinsic::ceil:
819 case Intrinsic::floor:
820 case Intrinsic::trunc:
821 case Intrinsic::rint:
822 case Intrinsic::lrint:
823 case Intrinsic::llrint:
824 case Intrinsic::round:
825 case Intrinsic::roundeven: {
826 // These all use the same code.
828 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
829 return LT.first * 8;
830 break;
831 }
832 case Intrinsic::umin:
833 case Intrinsic::umax:
834 case Intrinsic::smin:
835 case Intrinsic::smax: {
837 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
838 return LT.first;
839
840 if (ST->hasVInstructions() && LT.second.isVector()) {
841 unsigned Op;
842 switch (ICA.getID()) {
843 case Intrinsic::umin:
844 Op = RISCV::VMINU_VV;
845 break;
846 case Intrinsic::umax:
847 Op = RISCV::VMAXU_VV;
848 break;
849 case Intrinsic::smin:
850 Op = RISCV::VMIN_VV;
851 break;
852 case Intrinsic::smax:
853 Op = RISCV::VMAX_VV;
854 break;
855 }
856 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
857 }
858 break;
859 }
860 case Intrinsic::sadd_sat:
861 case Intrinsic::ssub_sat:
862 case Intrinsic::uadd_sat:
863 case Intrinsic::usub_sat:
864 case Intrinsic::fabs:
865 case Intrinsic::sqrt: {
867 if (ST->hasVInstructions() && LT.second.isVector())
868 return LT.first;
869 break;
870 }
871 case Intrinsic::ctpop: {
873 if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector())
874 return LT.first;
875 break;
876 }
877 case Intrinsic::abs: {
879 if (ST->hasVInstructions() && LT.second.isVector()) {
880 // vrsub.vi v10, v8, 0
881 // vmax.vv v8, v8, v10
882 return LT.first * 2;
883 }
884 break;
885 }
886 case Intrinsic::get_active_lane_mask: {
887 if (ST->hasVInstructions()) {
888 Type *ExpRetTy = VectorType::get(
889 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
890 auto LT = getTypeLegalizationCost(ExpRetTy);
891
892 // vid.v v8 // considered hoisted
893 // vsaddu.vx v8, v8, a0
894 // vmsltu.vx v0, v8, a1
895 return LT.first *
896 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
897 LT.second, CostKind);
898 }
899 break;
900 }
 901 // TODO: add more intrinsics
902 case Intrinsic::experimental_stepvector: {
904 // Legalisation of illegal types involves an `index' instruction plus
905 // (LT.first - 1) vector adds.
906 if (ST->hasVInstructions())
907 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
908 (LT.first - 1) *
909 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
910 return 1 + (LT.first - 1);
911 }
912 case Intrinsic::experimental_cttz_elts: {
913 Type *ArgTy = ICA.getArgTypes()[0];
914 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
915 if (getTLI()->shouldExpandCttzElements(ArgType))
916 break;
917 InstructionCost Cost = getRISCVInstructionCost(
918 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
919
920 // If zero_is_poison is false, then we will generate additional
921 // cmp + select instructions to convert -1 to EVL.
922 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
923 if (ICA.getArgs().size() > 1 &&
924 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
925 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
927 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
929
930 return Cost;
931 }
932 case Intrinsic::vp_rint: {
933 // RISC-V target uses at least 5 instructions to lower rounding intrinsics.
934 unsigned Cost = 5;
936 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
937 return Cost * LT.first;
938 break;
939 }
940 case Intrinsic::vp_nearbyint: {
 941 // One more read and one write of fflags than vp_rint.
942 unsigned Cost = 7;
944 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
945 return Cost * LT.first;
946 break;
947 }
948 case Intrinsic::vp_ceil:
949 case Intrinsic::vp_floor:
950 case Intrinsic::vp_round:
951 case Intrinsic::vp_roundeven:
952 case Intrinsic::vp_roundtozero: {
 953 // Rounding with a static rounding mode needs two more instructions than
 954 // vp_rint to swap/write FRM.
955 unsigned Cost = 7;
957 unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
958 if (TLI->isOperationCustom(VPISD, LT.second))
959 return Cost * LT.first;
960 break;
961 }
962 }
963
964 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
965 if (auto LT = getTypeLegalizationCost(RetTy);
966 LT.second.isVector()) {
967 MVT EltTy = LT.second.getVectorElementType();
968 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
969 ICA.getID(), EltTy))
970 return LT.first * Entry->Cost;
971 }
972 }
973
975}
976
978 Type *Src,
981 const Instruction *I) {
982 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
983 if (!IsVectorType)
984 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
985
986 bool IsTypeLegal = isTypeLegal(Src) && isTypeLegal(Dst) &&
987 (Src->getScalarSizeInBits() <= ST->getELen()) &&
988 (Dst->getScalarSizeInBits() <= ST->getELen());
989
990 // FIXME: Need to compute legalizing cost for illegal types.
991 if (!IsTypeLegal)
992 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
993
994 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
995 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
996
997 int ISD = TLI->InstructionOpcodeToISD(Opcode);
998 assert(ISD && "Invalid opcode");
999
1000 int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
1001 (int)Log2_32(Src->getScalarSizeInBits());
1002 switch (ISD) {
1003 case ISD::SIGN_EXTEND:
1004 case ISD::ZERO_EXTEND: {
1005 const unsigned SrcEltSize = Src->getScalarSizeInBits();
1006 if (SrcEltSize == 1) {
1007 // We do not use vsext/vzext to extend from mask vector.
1008 // Instead we use the following instructions to extend from mask vector:
1009 // vmv.v.i v8, 0
1010 // vmerge.vim v8, v8, -1, v0
1011 return getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM},
1012 DstLT.second, CostKind);
1013 }
1014 if ((PowDiff < 1) || (PowDiff > 3))
1015 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1016 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1017 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1018 unsigned Op =
1019 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1020 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1021 }
1022 case ISD::TRUNCATE:
1023 if (Dst->getScalarSizeInBits() == 1) {
 1024 // We do not use a series of vncvt instructions to truncate to a mask
 1025 // vector, so we cannot use PowDiff to calculate the cost. Instead we use
 1026 // the following instructions to truncate to a mask vector:
1027 // vand.vi v8, v8, 1
1028 // vmsne.vi v0, v8, 0
1029 return getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1030 SrcLT.second, CostKind);
1031 }
1032 [[fallthrough]];
1033 case ISD::FP_EXTEND:
1034 case ISD::FP_ROUND: {
1035 // Counts of narrow/widen instructions.
1036 unsigned SrcEltSize = Src->getScalarSizeInBits();
1037 unsigned DstEltSize = Dst->getScalarSizeInBits();
1038
1039 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1040 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1041 : RISCV::VFNCVT_F_F_W;
1043 for (; SrcEltSize != DstEltSize;) {
1044 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1045 ? MVT::getIntegerVT(DstEltSize)
1046 : MVT::getFloatingPointVT(DstEltSize);
1047 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1048 DstEltSize =
1049 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1050 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1051 }
1052 return Cost;
1053 }
1054 case ISD::FP_TO_SINT:
1055 case ISD::FP_TO_UINT:
1056 case ISD::SINT_TO_FP:
1057 case ISD::UINT_TO_FP:
1058 if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
 1059 // The cost of converting from or to a mask vector is different from the
 1060 // other cases, so we cannot use PowDiff to calculate it.
1061 // For mask vector to fp, we should use the following instructions:
1062 // vmv.v.i v8, 0
1063 // vmerge.vim v8, v8, -1, v0
1064 // vfcvt.f.x.v v8, v8
1065
1066 // And for fp vector to mask, we use:
1067 // vfncvt.rtz.x.f.w v9, v8
1068 // vand.vi v8, v9, 1
1069 // vmsne.vi v0, v8, 0
1070 return 3;
1071 }
1072 if (std::abs(PowDiff) <= 1)
1073 return 1;
 1074 // The backend can lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
 1075 // so it only needs two conversions.
1076 if (Src->isIntOrIntVectorTy())
1077 return 2;
1078 // Counts of narrow/widen instructions.
1079 return std::abs(PowDiff);
1080 }
1081 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1082}
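// For instance, sign-extending a legal <vscale x 4 x i8> to
// <vscale x 4 x i64> has PowDiff = 3 and is priced as a single vsext.vf8,
// while an i1 source is priced as the vmv.v.i + vmerge.vim expansion shown
// above.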
1083
1084unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
1085 if (isa<ScalableVectorType>(Ty)) {
1086 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1087 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1088 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1089 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1090 }
1091 return cast<FixedVectorType>(Ty)->getNumElements();
1092}
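// E.g. for <vscale x 4 x i32> with a tuning vscale of 2 this yields a VLMAX
// of 8, matching the 8 lanes such a vector would hold at VLEN=128.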
1093
1096 FastMathFlags FMF,
1098 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1099 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1100
1101 // Skip if scalar size of Ty is bigger than ELEN.
1102 if (Ty->getScalarSizeInBits() > ST->getELen())
1103 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1104
1105 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1106 if (Ty->getElementType()->isIntegerTy(1)) {
 1107 // SelectionDAGBuilder does the following transforms:
1108 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1109 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
1110 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1111 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1112 else
1113 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1114 }
1115
1116 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1118 InstructionCost ExtraCost = 0;
1119 switch (IID) {
1120 case Intrinsic::maximum:
1121 if (FMF.noNaNs()) {
1122 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1123 } else {
1124 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1125 RISCV::VFMV_F_S};
 1126 // Cost of canonical NaN + branch
1127 // lui a0, 523264
1128 // fmv.w.x fa0, a0
1129 Type *DstTy = Ty->getScalarType();
1130 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1131 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1132 ExtraCost = 1 +
1133 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1135 getCFInstrCost(Instruction::Br, CostKind);
1136 }
1137 break;
1138
1139 case Intrinsic::minimum:
1140 if (FMF.noNaNs()) {
1141 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1142 } else {
1143 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1144 RISCV::VFMV_F_S};
 1145 // Cost of canonical NaN + branch
1146 // lui a0, 523264
1147 // fmv.w.x fa0, a0
1148 Type *DstTy = Ty->getScalarType();
1149 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1150 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1151 ExtraCost = 1 +
1152 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1154 getCFInstrCost(Instruction::Br, CostKind);
1155 }
1156 break;
1157 }
1158 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1159 }
1160
 1161 // An IR reduction is composed of two vmv instructions and one RVV reduction instruction.
1162 unsigned SplitOp;
1164 switch (IID) {
1165 default:
1166 llvm_unreachable("Unsupported intrinsic");
1167 case Intrinsic::smax:
1168 SplitOp = RISCV::VMAX_VV;
1169 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1170 break;
1171 case Intrinsic::smin:
1172 SplitOp = RISCV::VMIN_VV;
1173 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMIN_VS, RISCV::VMV_X_S};
1174 break;
1175 case Intrinsic::umax:
1176 SplitOp = RISCV::VMAXU_VV;
1177 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1178 break;
1179 case Intrinsic::umin:
1180 SplitOp = RISCV::VMINU_VV;
1181 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMINU_VS, RISCV::VMV_X_S};
1182 break;
1183 case Intrinsic::maxnum:
1184 SplitOp = RISCV::VFMAX_VV;
1185 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1186 break;
1187 case Intrinsic::minnum:
1188 SplitOp = RISCV::VFMIN_VV;
1189 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1190 break;
1191 }
1192 // Add a cost for data larger than LMUL8
1193 InstructionCost SplitCost =
1194 (LT.first > 1) ? (LT.first - 1) *
1195 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1196 : 0;
1197 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1198}
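// E.g. an integer smax reduction is modelled as vmv.s.x + vredmax.vs +
// vmv.x.s, where the vredmax component scales with log2 of the estimated VL,
// plus one vmax.vv per extra register when LT.first > 1.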
1199
1202 std::optional<FastMathFlags> FMF,
1204 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1205 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1206
1207 // Skip if scalar size of Ty is bigger than ELEN.
1208 if (Ty->getScalarSizeInBits() > ST->getELen())
1209 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1210
1211 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1212 assert(ISD && "Invalid opcode");
1213
1214 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1215 ISD != ISD::FADD)
1216 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1217
1218 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1220 Type *ElementTy = Ty->getElementType();
1221 if (ElementTy->isIntegerTy(1)) {
1222 if (ISD == ISD::AND) {
1223 // Example sequences:
1224 // vsetvli a0, zero, e8, mf8, ta, ma
1225 // vmnot.m v8, v0
1226 // vcpop.m a0, v8
1227 // seqz a0, a0
1228 Opcodes = {RISCV::VMNAND_MM, RISCV::VCPOP_M};
1229 return (LT.first - 1) +
1230 getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
1231 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1233 } else {
1234 // Example sequences:
1235 // vsetvli a0, zero, e8, mf8, ta, ma
1236 // vcpop.m a0, v0
1237 // snez a0, a0
1238 Opcodes = {RISCV::VCPOP_M};
1239 return (LT.first - 1) +
1240 getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
1241 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1243 }
1244 }
1245
 1246 // An IR reduction is composed of two vmv instructions and one RVV reduction instruction.
1248 Opcodes.push_back(RISCV::VFMV_S_F);
1249 for (unsigned i = 0; i < LT.first.getValue(); i++)
1250 Opcodes.push_back(RISCV::VFREDOSUM_VS);
1251 Opcodes.push_back(RISCV::VFMV_F_S);
1252 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1253 }
1254 unsigned SplitOp;
1255 switch (ISD) {
1256 case ISD::ADD:
1257 SplitOp = RISCV::VADD_VV;
1258 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
1259 break;
1260 case ISD::OR:
1261 SplitOp = RISCV::VOR_VV;
1262 Opcodes = {RISCV::VMV_S_X, RISCV::VREDOR_VS, RISCV::VMV_X_S};
1263 break;
1264 case ISD::XOR:
1265 SplitOp = RISCV::VXOR_VV;
1266 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
1267 break;
1268 case ISD::AND:
1269 SplitOp = RISCV::VAND_VV;
1270 Opcodes = {RISCV::VMV_S_X, RISCV::VREDAND_VS, RISCV::VMV_X_S};
1271 break;
1272 case ISD::FADD:
1273 SplitOp = RISCV::VFADD_VV;
1274 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
1275 break;
1276 }
1277 // Add a cost for data larger than LMUL8
1278 InstructionCost SplitCost =
1279 (LT.first > 1) ? (LT.first - 1) *
1280 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1281 : 0;
1282 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1283}
1284
1286 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1288 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1289 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1290 FMF, CostKind);
1291
1292 // Skip if scalar size of ResTy is bigger than ELEN.
1293 if (ResTy->getScalarSizeInBits() > ST->getELen())
1294 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1295 FMF, CostKind);
1296
1297 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1298 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1299 FMF, CostKind);
1300
1301 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1302
1303 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1304 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1305 FMF, CostKind);
1306
1307 return (LT.first - 1) +
1308 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1309}
1310
1312 TTI::OperandValueInfo OpInfo,
1314 assert(OpInfo.isConstant() && "non constant operand?");
1315 if (!isa<VectorType>(Ty))
1316 // FIXME: We need to account for immediate materialization here, but doing
1317 // a decent job requires more knowledge about the immediate than we
1318 // currently have here.
1319 return 0;
1320
1321 if (OpInfo.isUniform())
1322 // vmv.x.i, vmv.v.x, or vfmv.v.f
1323 // We ignore the cost of the scalar constant materialization to be consistent
1324 // with how we treat scalar constants themselves just above.
1325 return 1;
1326
1327 return getConstantPoolLoadCost(Ty, CostKind);
1328}
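// E.g. storing a splat-of-42 vector adds 1 for the vmv.v.x, while storing a
// non-uniform constant vector adds the constant pool load cost computed
// above.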
1329
1330
1332 MaybeAlign Alignment,
1333 unsigned AddressSpace,
1335 TTI::OperandValueInfo OpInfo,
1336 const Instruction *I) {
1337 EVT VT = TLI->getValueType(DL, Src, true);
1338 // Type legalization can't handle structs
1339 if (VT == MVT::Other)
1340 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1341 CostKind, OpInfo, I);
1342
1344 if (Opcode == Instruction::Store && OpInfo.isConstant())
1345 Cost += getStoreImmCost(Src, OpInfo, CostKind);
1346 InstructionCost BaseCost =
1347 BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1348 CostKind, OpInfo, I);
 1349 // Assume memory op costs scale with the number of vector registers
 1350 // possibly accessed by the instruction. Note that BasicTTI already
1351 // handles the LT.first term for us.
1352 if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1353 LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
1354 BaseCost *= TLI->getLMULCost(LT.second);
1355 return Cost + BaseCost;
1356
1357}
1358
1360 Type *CondTy,
1361 CmpInst::Predicate VecPred,
1363 const Instruction *I) {
1365 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1366 I);
1367
1368 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1369 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1370 I);
1371
1372 // Skip if scalar size of ValTy is bigger than ELEN.
1373 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
1374 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1375 I);
1376
1377 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1378 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1379 if (CondTy->isVectorTy()) {
1380 if (ValTy->getScalarSizeInBits() == 1) {
1381 // vmandn.mm v8, v8, v9
1382 // vmand.mm v9, v0, v9
1383 // vmor.mm v0, v9, v8
1384 return LT.first *
1385 getRISCVInstructionCost(
1386 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1387 LT.second, CostKind);
1388 }
1389 // vselect and max/min are supported natively.
1390 return LT.first *
1391 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
1392 }
1393
1394 if (ValTy->getScalarSizeInBits() == 1) {
1395 // vmv.v.x v9, a0
1396 // vmsne.vi v9, v9, 0
1397 // vmandn.mm v8, v8, v9
1398 // vmand.mm v9, v0, v9
1399 // vmor.mm v0, v9, v8
1400 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
1401 return LT.first *
1402 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
1403 InterimVT, CostKind) +
1404 LT.first * getRISCVInstructionCost(
1405 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1406 LT.second, CostKind);
1407 }
1408
1409 // vmv.v.x v10, a0
1410 // vmsne.vi v0, v10, 0
1411 // vmerge.vvm v8, v9, v8, v0
1412 return LT.first * getRISCVInstructionCost(
1413 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
1414 LT.second, CostKind);
1415 }
1416
1417 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
1418 CmpInst::isIntPredicate(VecPred)) {
1419 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
1420 // provided they incur the same cost across all implementations
1421 return LT.first *
1422 getRISCVInstructionCost(RISCV::VMSLT_VV, LT.second, CostKind);
1423 }
1424
1425 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
1426 CmpInst::isFPPredicate(VecPred)) {
1427
1428 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
1429 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
1430 return getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
1431
1432 // If we do not support the input floating point vector type, use the base
1433 // one which will calculate as:
1434 // ScalarizeCost + Num * Cost for fixed vector,
1435 // InvalidCost for scalable vector.
1436 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
1437 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
1438 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
1439 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1440 I);
1441
1442 // Assuming vector fp compare and mask instructions are all the same cost
1443 // until a need arises to differentiate them.
1444 switch (VecPred) {
1445 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
1446 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
1447 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
1448 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
1449 return LT.first * getRISCVInstructionCost(
1450 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
1451 LT.second, CostKind);
1452
1453 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
1454 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
1455 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
1456 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
1457 return LT.first *
1458 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
1459 LT.second, CostKind);
1460
1461 case CmpInst::FCMP_OEQ: // vmfeq.vv
1462 case CmpInst::FCMP_OGT: // vmflt.vv
1463 case CmpInst::FCMP_OGE: // vmfle.vv
1464 case CmpInst::FCMP_OLT: // vmflt.vv
1465 case CmpInst::FCMP_OLE: // vmfle.vv
1466 case CmpInst::FCMP_UNE: // vmfne.vv
1467 return LT.first *
1468 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
1469 default:
1470 break;
1471 }
1472 }
1473
 1474 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
 1475 // instructions will lower to SELECT_CC and then to PseudoCCMOVGPR, which will
 1476 // generate a conditional branch + mv. The cost of the scalar (icmp + select)
 1477 // will be (0 + select instr cost).
1478 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
1479 ValTy->isIntegerTy() && !I->user_empty()) {
1480 if (all_of(I->users(), [&](const User *U) {
1481 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
1482 U->getType()->isIntegerTy() &&
1483 !isa<ConstantData>(U->getOperand(1)) &&
1484 !isa<ConstantData>(U->getOperand(2));
1485 }))
1486 return 0;
1487 }
1488
1489 // TODO: Add cost for scalar type.
1490
1491 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1492}
1493
1496 const Instruction *I) {
1498 return Opcode == Instruction::PHI ? 0 : 1;
1499 // Branches are assumed to be predicted.
1500 return 0;
1501}
1502
1505 unsigned Index, Value *Op0,
1506 Value *Op1) {
1507 assert(Val->isVectorTy() && "This must be a vector type");
1508
1509 if (Opcode != Instruction::ExtractElement &&
1510 Opcode != Instruction::InsertElement)
1511 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1512
1513 // Legalize the type.
1514 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
1515
1516 // This type is legalized to a scalar type.
1517 if (!LT.second.isVector()) {
1518 auto *FixedVecTy = cast<FixedVectorType>(Val);
1519 // If Index is a known constant, cost is zero.
1520 if (Index != -1U)
1521 return 0;
1522 // Extract/InsertElement with non-constant index is very costly when
1523 // scalarized; estimate cost of loads/stores sequence via the stack:
1524 // ExtractElement cost: store vector to stack, load scalar;
1525 // InsertElement cost: store vector to stack, store scalar, load vector.
1526 Type *ElemTy = FixedVecTy->getElementType();
1527 auto NumElems = FixedVecTy->getNumElements();
1528 auto Align = DL.getPrefTypeAlign(ElemTy);
1529 InstructionCost LoadCost =
1530 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
1531 InstructionCost StoreCost =
1532 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
1533 return Opcode == Instruction::ExtractElement
1534 ? StoreCost * NumElems + LoadCost
1535 : (StoreCost + LoadCost) * NumElems + StoreCost;
1536 }
1537
1538 // For unsupported scalable vector.
1539 if (LT.second.isScalableVector() && !LT.first.isValid())
1540 return LT.first;
1541
1542 if (!isTypeLegal(Val))
1543 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1544
1545 // Mask vector extract/insert is expanded via e8.
1546 if (Val->getScalarSizeInBits() == 1) {
1547 VectorType *WideTy =
1549 cast<VectorType>(Val)->getElementCount());
1550 if (Opcode == Instruction::ExtractElement) {
1551 InstructionCost ExtendCost
1552 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1554 InstructionCost ExtractCost
1555 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1556 return ExtendCost + ExtractCost;
1557 }
1558 InstructionCost ExtendCost
1559 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1561 InstructionCost InsertCost
1562 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1563 InstructionCost TruncCost
1564 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
1566 return ExtendCost + InsertCost + TruncCost;
1567 }
1568
1569
 1570 // In RVV, we could use vslidedown + vmv.x.s to extract an element from a
 1571 // vector and vslideup + vmv.s.x to insert an element into a vector.
1572 unsigned BaseCost = 1;
 1574 // For insertelement we also need to add 1 to the index to form the input of vslideup.
1574 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
1575
1576 if (Index != -1U) {
1577 // The type may be split. For fixed-width vectors we can normalize the
1578 // index to the new type.
1579 if (LT.second.isFixedLengthVector()) {
1580 unsigned Width = LT.second.getVectorNumElements();
1581 Index = Index % Width;
1582 }
1583
1584 // We could extract/insert the first element without vslidedown/vslideup.
1585 if (Index == 0)
1586 SlideCost = 0;
1587 else if (Opcode == Instruction::InsertElement)
1588 SlideCost = 1; // With a constant index, we do not need to use addi.
1589 }
1590
 1591 // Extracting an i64 on a target with XLEN=32 needs more instructions.
1592 if (Val->getScalarType()->isIntegerTy() &&
1593 ST->getXLen() < Val->getScalarSizeInBits()) {
1594 // For extractelement, we need the following instructions:
1595 // vsetivli zero, 1, e64, m1, ta, mu (not count)
1596 // vslidedown.vx v8, v8, a0
1597 // vmv.x.s a0, v8
1598 // li a1, 32
1599 // vsrl.vx v8, v8, a1
1600 // vmv.x.s a1, v8
1601
1602 // For insertelement, we need the following instructions:
1603 // vsetivli zero, 2, e32, m4, ta, mu (not count)
1604 // vmv.v.i v12, 0
1605 // vslide1up.vx v16, v12, a1
1606 // vslide1up.vx v12, v16, a0
1607 // addi a0, a2, 1
1608 // vsetvli zero, a0, e64, m4, tu, mu (not count)
1609 // vslideup.vx v8, v12, a2
1610
1611 // TODO: should we count these special vsetvlis?
1612 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
1613 }
1614 return BaseCost + SlideCost;
1615}
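// E.g. extracting lane 3 of a legal <8 x i32> costs 2 (vslidedown.vi +
// vmv.x.s), extracting lane 0 costs just 1, and an i64 extract on RV32
// grows to the longer sequence described above.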
1616
1618 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1620 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1621
1622 // TODO: Handle more cost kinds.
1624 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1625 Args, CxtI);
1626
1627 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1628 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1629 Args, CxtI);
1630
1631 // Skip if scalar size of Ty is bigger than ELEN.
1632 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
1633 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1634 Args, CxtI);
1635
1636 // Legalize the type.
1637 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1638
1639 // TODO: Handle scalar type.
1640 if (!LT.second.isVector())
1641 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1642 Args, CxtI);
1643
1644
1645 auto getConstantMatCost =
1646 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
1647 if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
1648 // Two sub-cases:
1649 // * Has a 5 bit immediate operand which can be splatted.
 1650 // * Has a larger immediate which must be materialized in a scalar register.
1651 // We return 0 for both as we currently ignore the cost of materializing
1652 // scalar constants in GPRs.
1653 return 0;
1654
1655 return getConstantPoolLoadCost(Ty, CostKind);
1656 };
1657
1658 // Add the cost of materializing any constant vectors required.
1659 InstructionCost ConstantMatCost = 0;
1660 if (Op1Info.isConstant())
1661 ConstantMatCost += getConstantMatCost(0, Op1Info);
1662 if (Op2Info.isConstant())
1663 ConstantMatCost += getConstantMatCost(1, Op2Info);
1664
1665 unsigned Op;
1666 switch (TLI->InstructionOpcodeToISD(Opcode)) {
1667 case ISD::ADD:
1668 case ISD::SUB:
1669 Op = RISCV::VADD_VV;
1670 break;
1671 case ISD::SHL:
1672 case ISD::SRL:
1673 case ISD::SRA:
1674 Op = RISCV::VSLL_VV;
1675 break;
1676 case ISD::AND:
1677 case ISD::OR:
1678 case ISD::XOR:
1679 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
1680 break;
1681 case ISD::MUL:
1682 case ISD::MULHS:
1683 case ISD::MULHU:
1684 Op = RISCV::VMUL_VV;
1685 break;
1686 case ISD::SDIV:
1687 case ISD::UDIV:
1688 Op = RISCV::VDIV_VV;
1689 break;
1690 case ISD::SREM:
1691 case ISD::UREM:
1692 Op = RISCV::VREM_VV;
1693 break;
1694 case ISD::FADD:
1695 case ISD::FSUB:
1696 // TODO: Address FP16 with VFHMIN
1697 Op = RISCV::VFADD_VV;
1698 break;
1699 case ISD::FMUL:
1700 // TODO: Address FP16 with VFHMIN
1701 Op = RISCV::VFMUL_VV;
1702 break;
1703 case ISD::FDIV:
1704 Op = RISCV::VFDIV_VV;
1705 break;
1706 case ISD::FNEG:
1707 Op = RISCV::VFSGNJN_VV;
1708 break;
1709 default:
1710 // Assuming all other instructions have the same cost until a need arises to
1711 // differentiate them.
1712 return ConstantMatCost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
1713 Op1Info, Op2Info,
1714 Args, CxtI);
1715 }
1716 return ConstantMatCost +
1717 LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1718}
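The overall shape of the computation above is: constant-materialization cost plus (number of legalized parts, LT.first) times (the cost of one representative RVV instruction). Below is a standalone sketch of just the opcode-to-representative-instruction mapping from the switch; the enum, function name, and mnemonic strings are illustrative stand-ins for the RISCV:: opcodes used by the real cost model.

// Standalone sketch of the opcode-to-representative-instruction mapping above.
#include <cstdio>

enum class SketchOp { Add, Sub, Shl, And, Or, Xor, Mul, SDiv, SRem,
                      FAdd, FMul, FDiv, FNeg, Other };

const char *representativeRVVInst(SketchOp Op, bool IsMaskVector) {
  switch (Op) {
  case SketchOp::Add:
  case SketchOp::Sub:  return "vadd.vv";
  case SketchOp::Shl:  return "vsll.vv";
  case SketchOp::And:
  case SketchOp::Or:
  case SketchOp::Xor:  return IsMaskVector ? "vmand.mm" : "vand.vv";
  case SketchOp::Mul:  return "vmul.vv";
  case SketchOp::SDiv: return "vdiv.vv";
  case SketchOp::SRem: return "vrem.vv";
  case SketchOp::FAdd: return "vfadd.vv";
  case SketchOp::FMul: return "vfmul.vv";
  case SketchOp::FDiv: return "vfdiv.vv";
  case SketchOp::FNeg: return "vfsgnjn.vv";
  default:             return "<fall back to the base implementation>";
  }
}

int main() {
  // An i1 vector AND is costed as a mask-register operation.
  std::printf("%s\n", representativeRVVInst(SketchOp::And, /*IsMaskVector=*/true));
}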
1719
1720// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
1721InstructionCost RISCVTTIImpl::getPointersChainCost(
1722 ArrayRef<const Value *> Ptrs, const Value *Base,
1723 const TTI::PointersChainInfo &Info, Type *AccessTy,
1724 TTI::TargetCostKind CostKind) {
1725 InstructionCost Cost = TTI::TCC_Free;
1726 // In the basic model we take into account GEP instructions only
1727 // (although an alloca instruction, a value, constants and/or constant
1728 // expressions, PHIs, bitcasts ... anything allowed to be used as a pointer
1729 // may also appear here). Typically, if Base is not a GEP instruction and all
1730 // the pointers are relative to the same base address, the rest are either
1731 // GEP instructions, PHIs, bitcasts or constants. When we have the same base,
1732 // we just calculate the cost of each non-Base GEP as an ADD operation if any
1733 // of its indices is non-constant.
1734 // If there are no known dependencies between the pointers, the cost is
1735 // calculated as a sum of the costs of the GEP instructions.
1736 for (auto [I, V] : enumerate(Ptrs)) {
1737 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
1738 if (!GEP)
1739 continue;
1740 if (Info.isSameBase() && V != Base) {
1741 if (GEP->hasAllConstantIndices())
1742 continue;
1743 // If the chain is unit-stride and BaseReg + stride*i is a legal
1744 // addressing mode, then presume the base GEP is sitting around in a
1745 // register somewhere and check if we can fold the offset relative to
1746 // it.
1747 unsigned Stride = DL.getTypeStoreSize(AccessTy);
1748 if (Info.isUnitStride() &&
1749 isLegalAddressingMode(AccessTy,
1750 /* BaseGV */ nullptr,
1751 /* BaseOffset */ Stride * I,
1752 /* HasBaseReg */ true,
1753 /* Scale */ 0,
1754 GEP->getType()->getPointerAddressSpace()))
1755 continue;
1756 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
1757 {TTI::OK_AnyValue, TTI::OP_None},
1758 {TTI::OK_AnyValue, TTI::OP_None},
1759 std::nullopt);
1760 } else {
1761 SmallVector<const Value *> Indices(GEP->indices());
1762 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
1763 Indices, AccessTy, CostKind);
1764 }
1765 }
1766 return Cost;
1767}
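A minimal standalone sketch of the pointer-chain policy described above, under the assumption that for each pointer we already know whether it is a GEP, whether its indices are all constant, and whether the unit-stride offset would fold into a reg+imm addressing mode. The struct and function names are invented.

// Standalone sketch of the pointers-chain costing policy above. Each non-base
// GEP in a same-base chain is free when its indices are all constant or when
// the unit-stride offset folds into a reg+imm addressing mode; otherwise it
// costs one ADD. Without a common base, each GEP is costed individually.
#include <cstdio>
#include <vector>

struct SketchPtr {
  bool IsGEP;
  bool IsBase;
  bool AllConstantIndices;
  bool UnitStrideOffsetFolds; // BaseReg + Stride*i is a legal addressing mode
  unsigned FullGEPCost;       // cost if the whole GEP must be materialized
};

unsigned sketchPointersChainCost(const std::vector<SketchPtr> &Ptrs,
                                 bool SameBase, unsigned AddCost) {
  unsigned Cost = 0;
  for (const SketchPtr &P : Ptrs) {
    if (!P.IsGEP)
      continue;
    if (SameBase && !P.IsBase) {
      if (P.AllConstantIndices || P.UnitStrideOffsetFolds)
        continue;            // folds away, no extra instructions
      Cost += AddCost;       // shared base register + variable index: one add
    } else {
      Cost += P.FullGEPCost; // the base itself, or no shared base at all
    }
  }
  return Cost;
}

int main() {
  std::vector<SketchPtr> Chain = {
      {false, true, false, false, 0}, // the base is a plain pointer, not a GEP
      {true, false, true, false, 1},  // constant offset from the base: free
      {true, false, false, false, 1}, // variable index: one add
  };
  std::printf("%u\n", sketchPointersChainCost(Chain, /*SameBase=*/true,
                                              /*AddCost=*/1)); // 1
}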
1768
1769void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1770 TTI::UnrollingPreferences &UP,
1771 OptimizationRemarkEmitter *ORE) {
1772 // TODO: All of the settings below would benefit from more tuning on
1773 // benchmarks and metrics, with changes applied as needed.
1774
1775
1776 if (ST->enableDefaultUnroll())
1777 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
1778
1779 // Enable upper-bound unrolling universally, not dependent upon the conditions
1780 // below.
1781 UP.UpperBound = true;
1782
1783 // Disable loop unrolling for Oz and Os.
1784 UP.OptSizeThreshold = 0;
1785 UP.PartialOptSizeThreshold = 0;
1786 if (L->getHeader()->getParent()->hasOptSize())
1787 return;
1788
1789 SmallVector<BasicBlock *, 4> ExitingBlocks;
1790 L->getExitingBlocks(ExitingBlocks);
1791 LLVM_DEBUG(dbgs() << "Loop has:\n"
1792 << "Blocks: " << L->getNumBlocks() << "\n"
1793 << "Exit blocks: " << ExitingBlocks.size() << "\n");
1794
1795 // Only allow at most one exit other than the latch. This acts as an early
1796 // exit as it mirrors the profitability calculation of the runtime unroller.
1797 if (ExitingBlocks.size() > 2)
1798 return;
1799
1800 // Limit the CFG of the loop body for targets with a branch predictor.
1801 // Allowing 4 blocks permits if-then-else diamonds in the body.
1802 if (L->getNumBlocks() > 4)
1803 return;
1804
1805 // Don't unroll vectorized loops, including the remainder loop
1806 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
1807 return;
1808
1809 // Scan the loop: don't unroll loops with calls as this could prevent
1810 // inlining.
1811 InstructionCost Cost = 0;
1812 for (auto *BB : L->getBlocks()) {
1813 for (auto &I : *BB) {
1814 // Initial setting - Don't unroll loops containing vectorized
1815 // instructions.
1816 if (I.getType()->isVectorTy())
1817 return;
1818
1819 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1820 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1821 if (!isLoweredToCall(F))
1822 continue;
1823 }
1824 return;
1825 }
1826
1827 SmallVector<const Value *> Operands(I.operand_values());
1828 Cost += getInstructionCost(&I, Operands,
1829 TargetTransformInfo::TCK_SizeAndLatency);
1830 }
1831 }
1832
1833 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1834
1835 UP.Partial = true;
1836 UP.Runtime = true;
1837 UP.UnrollRemainder = true;
1838 UP.UnrollAndJam = true;
1839 UP.UnrollAndJamInnerLoopThreshold = 60;
1840
1841 // Forcing unrolling of small loops can be very useful because of the
1842 // branch-taken cost of the backedge.
1843 if (Cost < 12)
1844 UP.Force = true;
1845}
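The unrolling policy above amounts to a few structural gates followed by a size-based decision on UP.Force. Below is a compressed standalone sketch of that decision; the thresholds 2, 4, and 12 are taken from the listing, the struct and names are illustrative, and the real code additionally enables partial/runtime unrolling once the gates pass.

// Standalone sketch of the unrolling gates above: bail out on multi-exit or
// larger CFGs, already-vectorized loops, vector instructions, or calls that
// are really lowered to calls; then force-unroll bodies cheaper than 12.
#include <cstdio>

struct SketchLoopShape {
  unsigned NumExitingBlocks;
  unsigned NumBlocks;
  bool IsVectorized;          // llvm.loop.isvectorized is set
  bool HasVectorInstructions;
  bool HasLoweredCalls;
  unsigned BodyCost;          // summed size-and-latency cost of the body
};

bool sketchShouldForceUnroll(const SketchLoopShape &L) {
  if (L.NumExitingBlocks > 2) // at most one exit besides the latch
    return false;
  if (L.NumBlocks > 4)        // allow an if-then-else diamond, nothing larger
    return false;
  if (L.IsVectorized || L.HasVectorInstructions || L.HasLoweredCalls)
    return false;
  return L.BodyCost < 12;     // small bodies: the backedge cost dominates
}

int main() {
  SketchLoopShape L{1, 2, false, false, false, 8};
  std::printf("%d\n", sketchShouldForceUnroll(L)); // 1
}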
1846
1847void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1848 TTI::PeelingPreferences &PP) {
1849 BaseT::getPeelingPreferences(L, SE, PP);
1850}
1851
1852unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
1853 TypeSize Size = DL.getTypeSizeInBits(Ty);
1854 if (Ty->isVectorTy()) {
1855 if (Size.isScalable() && ST->hasVInstructions())
1856 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
1857
1858 if (ST->useRVVForFixedLengthVectors())
1859 return divideCeil(Size, ST->getRealMinVLen());
1860 }
1861
1862 return BaseT::getRegUsageForType(Ty);
1863}
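The register-usage estimate above is a ceiling division of the type's bit size by the per-register width: RVVBitsPerBlock (64 in the RISC-V backend) for scalable vectors, and the subtarget's minimum VLEN for fixed-length vectors lowered to RVV. A tiny standalone sketch of that arithmetic with invented names:

// Standalone sketch of the register-usage arithmetic above. RVVBitsPerBlock is
// 64 in the RISC-V backend; VLenMin stands for the subtarget's minimum VLEN.
#include <cstdio>

unsigned divideCeilSketch(unsigned Numerator, unsigned Denominator) {
  return (Numerator + Denominator - 1) / Denominator;
}

unsigned sketchVectorRegUsage(unsigned KnownMinBits, bool Scalable,
                              unsigned VLenMin) {
  const unsigned RVVBitsPerBlock = 64;
  if (Scalable)
    return divideCeilSketch(KnownMinBits, RVVBitsPerBlock); // an LMUL estimate
  return divideCeilSketch(KnownMinBits, VLenMin);
}

int main() {
  // <vscale x 4 x i64>: known minimum size 256 bits -> 4 registers.
  std::printf("%u\n", sketchVectorRegUsage(256, /*Scalable=*/true, 128));
  // Fixed <8 x i32> (256 bits) with a 128-bit minimum VLEN -> 2 registers.
  std::printf("%u\n", sketchVectorRegUsage(256, /*Scalable=*/false, 128));
}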
1864
1865unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1866 if (SLPMaxVF.getNumOccurrences())
1867 return SLPMaxVF;
1868
1869 // Return how many elements can fit in getRegisterBitWidth. This is the
1870 // same routine as used in LoopVectorizer. We should probably be
1871 // accounting for whether we actually have instructions with the right
1872 // lane type, but we don't have enough information to do that without
1873 // some additional plumbing which hasn't been justified yet.
1874 TypeSize RegWidth =
1875 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
1876 // If no vector registers, or absurd element widths, disable
1877 // vectorization by returning 1.
1878 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
1879}
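The maximum-VF query above reduces to how many ElemWidth-bit lanes fit in one fixed-width vector register, clamped to at least 1. A one-function standalone sketch (names are illustrative):

// Standalone sketch: maximum SLP VF = fixed register width / element width,
// clamped to at least 1 so an absent vector unit simply disables vectorization.
#include <algorithm>
#include <cstdio>

unsigned sketchMaximumVF(unsigned RegWidthBits, unsigned ElemWidthBits) {
  return std::max(1u, RegWidthBits / ElemWidthBits);
}

int main() {
  std::printf("%u\n", sketchMaximumVF(128, 32)); // 4 lanes of i32 per register
  std::printf("%u\n", sketchMaximumVF(0, 32));   // no vector registers -> 1
}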
1880
1881bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
1882 const TargetTransformInfo::LSRCost &C2) {
1883 // The RISC-V specific change here is that instruction count gets first priority.
1884 // If we need to emit adds inside the loop to add up base registers, then
1885 // we need at least one extra temporary register.
1886 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
1887 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
1888 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
1889 C1.NumIVMuls, C1.NumBaseAdds,
1890 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
1891 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
1892 C2.NumIVMuls, C2.NumBaseAdds,
1893 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
1894}
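The comparison above is a lexicographic tie-break over the LSR cost fields, with the twist that a formula requiring in-loop base adds is charged one extra register. A standalone sketch with an invented struct mirroring the fields used:

// Standalone sketch of the LSR cost ordering above: instruction count first,
// then register count (plus one if base adds are needed), then the remaining
// fields, all compared lexicographically.
#include <cstdio>
#include <tuple>

struct SketchLSRCost {
  unsigned Insns, NumRegs, AddRecCost, NumIVMuls, NumBaseAdds;
  unsigned ScaleCost, ImmCost, SetupCost;
};

bool sketchIsLSRCostLess(const SketchLSRCost &C1, const SketchLSRCost &C2) {
  // Emitting adds inside the loop implies at least one extra temporary register.
  unsigned C1Regs = C1.NumRegs + (C1.NumBaseAdds != 0);
  unsigned C2Regs = C2.NumRegs + (C2.NumBaseAdds != 0);
  return std::tie(C1.Insns, C1Regs, C1.AddRecCost, C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2Regs, C2.AddRecCost, C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}

int main() {
  SketchLSRCost A{4, 3, 0, 0, 0, 0, 0, 0};
  SketchLSRCost B{4, 3, 0, 0, 1, 0, 0, 0}; // same registers but needs a base add
  std::printf("%d\n", sketchIsLSRCostLess(A, B)); // 1: A is preferred
}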
1895
1896bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
1897 auto *VTy = dyn_cast<VectorType>(DataTy);
1898 if (!VTy || VTy->isScalableTy())
1899 return false;
1900
1901 if (!isLegalMaskedLoadStore(DataTy, Alignment))
1902 return false;
1903 return true;
1904}
1905
1906bool RISCVTTIImpl::areInlineCompatible(const Function *Caller,
1907 const Function *Callee) const {
1908 const TargetMachine &TM = getTLI()->getTargetMachine();
1909
1910 const FeatureBitset &CallerBits =
1911 TM.getSubtargetImpl(*Caller)->getFeatureBits();
1912 const FeatureBitset &CalleeBits =
1913 TM.getSubtargetImpl(*Callee)->getFeatureBits();
1914
1915 // Inline a callee if its target-features are a subset of the caller's
1916 // target-features.
1917 return (CallerBits & CalleeBits) == CalleeBits;
1918}
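The inline-compatibility rule above is a pure subset test on feature bit sets: the callee can be inlined only if every target feature it requires is also enabled in the caller. A standalone sketch using std::bitset in place of LLVM's FeatureBitset (the 64-bit width is illustrative):

// Standalone sketch of the feature-subset test above.
#include <bitset>
#include <cstdio>

using SketchFeatureBits = std::bitset<64>; // width is illustrative

bool sketchAreInlineCompatible(const SketchFeatureBits &CallerBits,
                               const SketchFeatureBits &CalleeBits) {
  // The callee's features must be a subset of the caller's features.
  return (CallerBits & CalleeBits) == CalleeBits;
}

int main() {
  SketchFeatureBits Caller("1111"); // caller compiled with more extensions
  SketchFeatureBits Callee("0101");
  std::printf("%d\n", sketchAreInlineCompatible(Caller, Callee)); // 1
}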