1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
17#include <cmath>
18#include <optional>
19using namespace llvm;
20
21#define DEBUG_TYPE "riscvtti"
22
24 "riscv-v-register-bit-width-lmul",
26 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
27 "by autovectorized code. Fractional LMULs are not supported."),
29
31 "riscv-v-slp-max-vf",
33 "Overrides result used for getMaximumVF query which is used "
34 "exclusively by SLP vectorizer."),
36
38RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
40 // Check if the type is valid for all CostKind values
41 if (!VT.isVector())
43 size_t NumInstr = OpCodes.size();
45 return NumInstr;
46 InstructionCost LMULCost = TLI->getLMULCost(VT);
48 return LMULCost * NumInstr;
50 for (auto Op : OpCodes) {
51 switch (Op) {
52 case RISCV::VRGATHER_VI:
53 Cost += TLI->getVRGatherVICost(VT);
54 break;
55 case RISCV::VRGATHER_VV:
56 Cost += TLI->getVRGatherVVCost(VT);
57 break;
58 case RISCV::VSLIDEUP_VI:
59 case RISCV::VSLIDEDOWN_VI:
60 Cost += TLI->getVSlideVICost(VT);
61 break;
62 case RISCV::VSLIDEUP_VX:
63 case RISCV::VSLIDEDOWN_VX:
64 Cost += TLI->getVSlideVXCost(VT);
65 break;
66 case RISCV::VREDMAX_VS:
67 case RISCV::VREDMIN_VS:
68 case RISCV::VREDMAXU_VS:
69 case RISCV::VREDMINU_VS:
70 case RISCV::VREDSUM_VS:
71 case RISCV::VREDAND_VS:
72 case RISCV::VREDOR_VS:
73 case RISCV::VREDXOR_VS:
74 case RISCV::VFREDMAX_VS:
75 case RISCV::VFREDMIN_VS:
76 case RISCV::VFREDUSUM_VS: {
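      // Unordered reductions are modeled as a log2-depth reduction tree over
      // the (estimated) VL; e.g. reducing 16 elements adds a cost of 4. The
      // ordered VFREDOSUM_VS case below is instead linear in VL.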
77 unsigned VL = VT.getVectorMinNumElements();
78 if (!VT.isFixedLengthVector())
79 VL *= *getVScaleForTuning();
80 Cost += Log2_32_Ceil(VL);
81 break;
82 }
83 case RISCV::VFREDOSUM_VS: {
84 unsigned VL = VT.getVectorMinNumElements();
85 if (!VT.isFixedLengthVector())
86 VL *= *getVScaleForTuning();
87 Cost += VL;
88 break;
89 }
90 case RISCV::VMV_X_S:
91 case RISCV::VMV_S_X:
92 case RISCV::VFMV_F_S:
93 case RISCV::VFMV_S_F:
94 case RISCV::VMOR_MM:
95 case RISCV::VMXOR_MM:
96 case RISCV::VMAND_MM:
97 case RISCV::VMANDN_MM:
98 case RISCV::VMNAND_MM:
99 case RISCV::VCPOP_M:
100 Cost += 1;
101 break;
102 default:
103 Cost += LMULCost;
104 }
105 }
106 return Cost;
107}
108
111 assert(Ty->isIntegerTy() &&
112 "getIntImmCost can only estimate cost of materialising integers");
113
114 // We have a Zero register, so 0 is always free.
115 if (Imm == 0)
116 return TTI::TCC_Free;
117
118 // Otherwise, we check how many instructions it will take to materialise.
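  // For example, materialising 0x12345678 takes lui+addi (cost 2), while an
  // arbitrary 64-bit constant can take noticeably more.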
119 const DataLayout &DL = getDataLayout();
120 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *getST());
121}
122
123// Look for patterns of shift followed by AND that can be turned into a pair of
124// shifts. We won't need to materialize an immediate for the AND so these can
125// be considered free.
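// For example (on RV64), (and (shl x, 4), 0xFF0) uses the shifted mask 0xFF0,
// whose 4 trailing zeros match the shift amount, so it can be lowered as
// (srli (slli x, 56), 52) without materializing the mask.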
126static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
127 uint64_t Mask = Imm.getZExtValue();
128 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
129 if (!BO || !BO->hasOneUse())
130 return false;
131
132 if (BO->getOpcode() != Instruction::Shl)
133 return false;
134
135 if (!isa<ConstantInt>(BO->getOperand(1)))
136 return false;
137
138 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
139 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
140 // is a mask shifted by c2 bits with c3 leading zeros.
141 if (isShiftedMask_64(Mask)) {
142 unsigned Trailing = llvm::countr_zero(Mask);
143 if (ShAmt == Trailing)
144 return true;
145 }
146
147 return false;
148}
149
151 const APInt &Imm, Type *Ty,
153 Instruction *Inst) {
154 assert(Ty->isIntegerTy() &&
155 "getIntImmCost can only estimate cost of materialising integers");
156
157 // We have a Zero register, so 0 is always free.
158 if (Imm == 0)
159 return TTI::TCC_Free;
160
161 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
162 // commutative; in others the immediate comes from a specific argument index.
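  // For example, addi/andi/ori/xori accept a sign-extended 12-bit immediate
  // (-2048..2047); anything wider has to be materialized separately.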
163 bool Takes12BitImm = false;
164 unsigned ImmArgIdx = ~0U;
165
166 switch (Opcode) {
167 case Instruction::GetElementPtr:
168 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
169 // split up large offsets in GEP into better parts than ConstantHoisting
170 // can.
171 return TTI::TCC_Free;
172 case Instruction::Store:
173 // If the address is a constant, use the materialization cost.
174 if (Idx == 1)
175 return getIntImmCost(Imm, Ty, CostKind);
176 return TTI::TCC_Free;
177 case Instruction::Load:
178 // If the address is a constant, use the materialization cost.
179 return getIntImmCost(Imm, Ty, CostKind);
180 case Instruction::And:
181 // zext.h
182 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
183 return TTI::TCC_Free;
184 // zext.w
185 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
186 return TTI::TCC_Free;
187 // bclri
188 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
189 return TTI::TCC_Free;
190 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
191 canUseShiftPair(Inst, Imm))
192 return TTI::TCC_Free;
193 Takes12BitImm = true;
194 break;
195 case Instruction::Add:
196 Takes12BitImm = true;
197 break;
198 case Instruction::Or:
199 case Instruction::Xor:
200 // bseti/binvi
201 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
202 return TTI::TCC_Free;
203 Takes12BitImm = true;
204 break;
205 case Instruction::Mul:
206 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
207 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
208 return TTI::TCC_Free;
209 // One more or less than a power of 2 can use SLLI+ADD/SUB.
210 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
211 return TTI::TCC_Free;
212 // FIXME: There is no MULI instruction.
213 Takes12BitImm = true;
214 break;
215 case Instruction::Sub:
216 case Instruction::Shl:
217 case Instruction::LShr:
218 case Instruction::AShr:
219 Takes12BitImm = true;
220 ImmArgIdx = 1;
221 break;
222 default:
223 break;
224 }
225
226 if (Takes12BitImm) {
227 // Check immediate is the correct argument...
228 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
229 // ... and fits into the 12-bit immediate.
230 if (Imm.getSignificantBits() <= 64 &&
231 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
232 return TTI::TCC_Free;
233 }
234 }
235
236 // Otherwise, use the full materialisation cost.
237 return getIntImmCost(Imm, Ty, CostKind);
238 }
239
240 // By default, prevent hoisting.
241 return TTI::TCC_Free;
242}
243
246 const APInt &Imm, Type *Ty,
248 // Prevent hoisting in unknown cases.
249 return TTI::TCC_Free;
250}
251
252bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
253 return ST->hasVInstructions();
254}
255
258 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
259 return ST->hasStdExtZbb() || ST->hasVendorXCVbitmanip()
262}
263
265 // Currently, the ExpandReductions pass can't expand scalable-vector
266 // reductions, but we still request expansion as RVV doesn't support certain
267 // reductions and the SelectionDAG can't legalize them either.
268 switch (II->getIntrinsicID()) {
269 default:
270 return false;
271 // These reductions have no equivalent in RVV
272 case Intrinsic::vector_reduce_mul:
273 case Intrinsic::vector_reduce_fmul:
274 return true;
275 }
276}
277
278std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
279 if (ST->hasVInstructions())
281 return BaseT::getMaxVScale();
282}
283
284std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
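  // For example, with a Zvl128b configuration the real minimum VLEN is 128
  // and RVVBitsPerBlock is 64, so the vscale used for tuning is 2.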
285 if (ST->hasVInstructions())
286 if (unsigned MinVLen = ST->getRealMinVLen();
287 MinVLen >= RISCV::RVVBitsPerBlock)
288 return MinVLen / RISCV::RVVBitsPerBlock;
290}
291
294 unsigned LMUL =
295 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
296 switch (K) {
298 return TypeSize::getFixed(ST->getXLen());
300 return TypeSize::getFixed(
301 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
304 (ST->hasVInstructions() &&
307 : 0);
308 }
309
310 llvm_unreachable("Unsupported register kind");
311}
312
314RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
315 // Add a cost of address generation + the cost of the load. The address
316 // is expected to be a PC relative offset to a constant pool entry
317 // using auipc/addi.
318 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
319 /*AddressSpace=*/0, CostKind);
320}
321
323 LLVMContext &C) {
324 assert((DataVT.getScalarSizeInBits() != 8 ||
325 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
326 MVT IndexVT = DataVT.changeTypeToInteger();
327 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
328 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
329 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
330}
331
333 VectorType *Tp, ArrayRef<int> Mask,
335 int Index, VectorType *SubTp,
337 const Instruction *CxtI) {
338 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
339
340 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
341
342 // First, handle cases where having a fixed length vector enables us to
343 // give a more accurate cost than falling back to generic scalable codegen.
344 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
345 if (isa<FixedVectorType>(Tp)) {
346 switch (Kind) {
347 default:
348 break;
350 if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
351 MVT EltTp = LT.second.getVectorElementType();
352 // If the size of the element is < ELEN then shuffles of interleaves and
353 // deinterleaves of 2 vectors can be lowered into the following
354 // sequences:
355 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
356 // Example sequence:
357 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
358 // vwaddu.vv v10, v8, v9
359 // li a0, -1 (ignored)
360 // vwmaccu.vx v10, a0, v9
361 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
362 return 2 * LT.first * TLI->getLMULCost(LT.second);
363
364 if (Mask[0] == 0 || Mask[0] == 1) {
365 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
366 // Example sequence:
367 // vnsrl.wi v10, v8, 0
368 if (equal(DeinterleaveMask, Mask))
369 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
370 LT.second, CostKind);
371 }
372 }
373 }
374 // vrgather + cost of generating the mask constant.
375 // We model this for an unknown mask with a single vrgather.
376 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
377 (LT.second.getScalarSizeInBits() != 8 ||
378 LT.second.getVectorNumElements() <= 256)) {
379 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());
380 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
381 return IndexCost +
382 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
383 }
384 [[fallthrough]];
385 }
388 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
389 // register for the second vrgather. We model this for an unknown
390 // (shuffle) mask.
391 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
392 (LT.second.getScalarSizeInBits() != 8 ||
393 LT.second.getVectorNumElements() <= 256)) {
394 auto &C = Tp->getContext();
395 auto EC = Tp->getElementCount();
396 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
398 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
399 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
400 return 2 * IndexCost +
401 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
402 LT.second, CostKind) +
403 MaskCost;
404 }
405 [[fallthrough]];
406 }
407 case TTI::SK_Select: {
408 // We are going to permute multiple sources and the result will be in
409 // multiple destinations. We provide an accurate cost only for splits where
410 // the element type remains the same.
411 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
412 LT.second.isFixedLengthVector() &&
413 LT.second.getVectorElementType().getSizeInBits() ==
415 LT.second.getVectorNumElements() <
416 cast<FixedVectorType>(Tp)->getNumElements() &&
417 divideCeil(Mask.size(),
418 cast<FixedVectorType>(Tp)->getNumElements()) ==
419 static_cast<unsigned>(*LT.first.getValue())) {
420 unsigned NumRegs = *LT.first.getValue();
421 unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
422 unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
423 auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);
424
426 for (unsigned I = 0; I < NumRegs; ++I) {
427 bool IsSingleVector = true;
428 SmallVector<int> SubMask(SubVF, PoisonMaskElem);
429 transform(Mask.slice(I * SubVF,
430 I == NumRegs - 1 ? Mask.size() % SubVF : SubVF),
431 SubMask.begin(), [&](int I) {
432 bool SingleSubVector = I / VF == 0;
433 IsSingleVector &= SingleSubVector;
434 return (SingleSubVector ? 0 : 1) * SubVF + I % VF;
435 });
438 SubVecTy, SubMask, CostKind, 0, nullptr);
439 return Cost;
440 }
441 }
442 break;
443 }
444 }
445 };
446
447 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
448 switch (Kind) {
449 default:
450 // Fallthrough to generic handling.
451 // TODO: Most of these cases will return getInvalid in generic code, and
452 // must be implemented here.
453 break;
455 // Extract at zero is always a subregister extract
456 if (Index == 0)
457 return TTI::TCC_Free;
458
459 // If we're extracting a subvector of at most m1 size at a sub-register
460 // boundary - which unfortunately we need exact vlen to identify - this is
461 // a subregister extract at worst and thus won't require a vslidedown.
462 // TODO: Extend for aligned m2, m4 subvector extracts
464 // TODO: Extend for misaligned (but contained) extracts
464 // TODO: Extend for scalable subvector types
465 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
466 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
467 const unsigned MinVLen = ST->getRealMinVLen();
468 const unsigned MaxVLen = ST->getRealMaxVLen();
469 if (MinVLen == MaxVLen &&
470 SubLT.second.getScalarSizeInBits() * Index % MinVLen == 0 &&
471 SubLT.second.getSizeInBits() <= MinVLen)
472 return TTI::TCC_Free;
473 }
474
475 // Example sequence:
476 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
477 // vslidedown.vi v8, v9, 2
478 return LT.first *
479 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
481 // Example sequence:
482 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
483 // vslideup.vi v8, v9, 2
484 return LT.first *
485 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
486 case TTI::SK_Select: {
487 // Example sequence:
488 // li a0, 90
489 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
490 // vmv.s.x v0, a0
491 // vmerge.vvm v8, v9, v8, v0
492 // We use 2 for the cost of the mask materialization as this is the true
493 // cost for small masks and most shuffles are small. At worst, this cost
494 // should be a very small constant for the constant pool load. As such,
495 // we may bias towards large selects slightly more than truly warranted.
496 return LT.first *
497 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
498 LT.second, CostKind));
499 }
500 case TTI::SK_Broadcast: {
501 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
502 Instruction::InsertElement);
503 if (LT.second.getScalarSizeInBits() == 1) {
504 if (HasScalar) {
505 // Example sequence:
506 // andi a0, a0, 1
507 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
508 // vmv.v.x v8, a0
509 // vmsne.vi v0, v8, 0
510 return LT.first *
511 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
512 LT.second, CostKind));
513 }
514 // Example sequence:
515 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
516 // vmv.v.i v8, 0
517 // vmerge.vim v8, v8, 1, v0
518 // vmv.x.s a0, v8
519 // andi a0, a0, 1
520 // vmv.v.x v8, a0
521 // vmsne.vi v0, v8, 0
522
523 return LT.first *
524 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
525 RISCV::VMV_X_S, RISCV::VMV_V_X,
526 RISCV::VMSNE_VI},
527 LT.second, CostKind));
528 }
529
530 if (HasScalar) {
531 // Example sequence:
532 // vmv.v.x v8, a0
533 return LT.first *
534 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
535 }
536
537 // Example sequence:
538 // vrgather.vi v9, v8, 0
539 return LT.first *
540 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
541 }
542 case TTI::SK_Splice: {
543 // vslidedown+vslideup.
544 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
545 // of similar code, but I think we expand through memory.
546 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
547 if (Index >= 0 && Index < 32)
548 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
549 else if (Index < 0 && Index > -32)
550 Opcodes[1] = RISCV::VSLIDEUP_VI;
551 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
552 }
553 case TTI::SK_Reverse: {
554 // TODO: Cases to improve here:
555 // * Illegal vector types
556 // * i64 on RV32
557 // * i1 vector
558 // At low LMUL, most of the cost is producing the vrgather index register.
559 // At high LMUL, the cost of the vrgather itself will dominate.
560 // Example sequence:
561 // csrr a0, vlenb
562 // srli a0, a0, 3
563 // addi a0, a0, -1
564 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
565 // vid.v v9
566 // vrsub.vx v10, v9, a0
567 // vrgather.vv v9, v8, v10
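    // For a legal fixed-length vector whose element count minus one fits in a
    // 5-bit immediate (e.g. <16 x i8>), no length computation is needed
    // (LenCost becomes 0) and vrsub.vi replaces vrsub.vx below.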
568 InstructionCost LenCost = 3;
569 if (LT.second.isFixedLengthVector())
570 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
571 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
572 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
573 if (LT.second.isFixedLengthVector() &&
574 isInt<5>(LT.second.getVectorNumElements() - 1))
575 Opcodes[1] = RISCV::VRSUB_VI;
576 InstructionCost GatherCost =
577 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
578 // An i1 (mask) vector additionally requires an extend and a truncate.
579 InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
580 return LT.first * (LenCost + GatherCost + ExtendCost);
581 }
582 }
583 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
584}
585
587RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
588 unsigned AddressSpace,
590 if (!isLegalMaskedLoadStore(Src, Alignment) ||
592 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
593 CostKind);
594
595 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
596}
597
599 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
600 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
601 bool UseMaskForCond, bool UseMaskForGaps) {
602 if (isa<ScalableVectorType>(VecTy) && Factor != 2)
604
605 // The interleaved memory access pass will lower interleaved memory ops (i.e.
606 // a load or store followed by a specific shuffle) to vlseg/vsseg
607 // intrinsics. In those cases we can treat it as if it were just one (legal)
608 // memory op.
609 if (!UseMaskForCond && !UseMaskForGaps &&
610 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
611 auto *VTy = cast<VectorType>(VecTy);
612 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
614 // Need to make sure the type hasn't been scalarized
614 if (LT.second.isVector()) {
615 auto *LegalVTy = VectorType::get(VTy->getElementType(),
616 LT.second.getVectorElementCount());
617 // FIXME: We use the memory op cost of the *legalized* type here, because
618 // getMemoryOpCost returns a really expensive cost for types like
619 // <6 x i8>, which show up when doing interleaves of Factor=3 etc.
620 // Should the memory op cost of these be cheaper?
621 if (TLI->isLegalInterleavedAccessType(LegalVTy, Factor, Alignment,
622 AddressSpace, DL)) {
623 InstructionCost LegalMemCost = getMemoryOpCost(
624 Opcode, LegalVTy, Alignment, AddressSpace, CostKind);
625 return LT.first + LegalMemCost;
626 }
627 }
628 }
629
630 // TODO: Return the cost of interleaved accesses for scalable vector when
631 // unable to convert to segment access instructions.
632 if (isa<ScalableVectorType>(VecTy))
634
635 auto *FVTy = cast<FixedVectorType>(VecTy);
636 InstructionCost MemCost =
637 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
638 unsigned VF = FVTy->getNumElements() / Factor;
639
640 // An interleaved load will look like this for Factor=3:
641 // %wide.vec = load <12 x i32>, ptr %3, align 4
642 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
643 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
644 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
645 if (Opcode == Instruction::Load) {
646 InstructionCost Cost = MemCost;
647 for (unsigned Index : Indices) {
648 FixedVectorType *SubVecTy =
649 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
650 auto Mask = createStrideMask(Index, Factor, VF);
651 InstructionCost ShuffleCost =
653 CostKind, 0, nullptr, {});
654 Cost += ShuffleCost;
655 }
656 return Cost;
657 }
658
659 // TODO: Model for NF > 2
660 // We'll need to enhance getShuffleCost to model shuffles that are just
661 // inserts and extracts into subvectors, since they won't have the full cost
662 // of a vrgather.
663 // An interleaved store for 3 vectors of 4 lanes will look like
664 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
665 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
666 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
667 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
668 // store <12 x i32> %interleaved.vec, ptr %10, align 4
669 if (Factor != 2)
670 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
671 Alignment, AddressSpace, CostKind,
672 UseMaskForCond, UseMaskForGaps);
673
674 assert(Opcode == Instruction::Store && "Opcode must be a store");
675 // For an interleaving store of 2 vectors, we perform one large interleaving
676 // shuffle that goes into the wide store
677 auto Mask = createInterleaveMask(VF, Factor);
678 InstructionCost ShuffleCost =
680 CostKind, 0, nullptr, {});
681 return MemCost + ShuffleCost;
682}
683
685 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
686 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
688 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
689 Alignment, CostKind, I);
690
691 if ((Opcode == Instruction::Load &&
692 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
693 (Opcode == Instruction::Store &&
694 !isLegalMaskedScatter(DataTy, Align(Alignment))))
695 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
696 Alignment, CostKind, I);
697
698 // Cost is proportional to the number of memory operations implied. For
699 // scalable vectors, we use an estimate on that number since we don't
700 // know exactly what VL will be.
701 auto &VTy = *cast<VectorType>(DataTy);
702 InstructionCost MemOpCost =
703 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
704 {TTI::OK_AnyValue, TTI::OP_None}, I);
705 unsigned NumLoads = getEstimatedVLFor(&VTy);
706 return NumLoads * MemOpCost;
707}
708
710 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
711 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
712 if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
713 !isLegalStridedLoadStore(DataTy, Alignment)) ||
714 (Opcode != Instruction::Load && Opcode != Instruction::Store))
715 return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
716 Alignment, CostKind, I);
717
719 return TTI::TCC_Basic;
720
721 // Cost is proportional to the number of memory operations implied. For
722 // scalable vectors, we use an estimate on that number since we don't
723 // know exactly what VL will be.
724 auto &VTy = *cast<VectorType>(DataTy);
725 InstructionCost MemOpCost =
726 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
727 {TTI::OK_AnyValue, TTI::OP_None}, I);
728 unsigned NumLoads = getEstimatedVLFor(&VTy);
729 return NumLoads * MemOpCost;
730}
731
732// Currently, these represent both throughput and codesize costs
733// for the respective intrinsics. The costs in this table are simply
734// instruction counts with the following adjustments made:
735// * One vsetvli is considered free.
736static const CostTblEntry VectorIntrinsicCostTable[]{
737 {Intrinsic::floor, MVT::f32, 9},
738 {Intrinsic::floor, MVT::f64, 9},
739 {Intrinsic::ceil, MVT::f32, 9},
740 {Intrinsic::ceil, MVT::f64, 9},
741 {Intrinsic::trunc, MVT::f32, 7},
742 {Intrinsic::trunc, MVT::f64, 7},
743 {Intrinsic::round, MVT::f32, 9},
744 {Intrinsic::round, MVT::f64, 9},
745 {Intrinsic::roundeven, MVT::f32, 9},
746 {Intrinsic::roundeven, MVT::f64, 9},
747 {Intrinsic::rint, MVT::f32, 7},
748 {Intrinsic::rint, MVT::f64, 7},
749 {Intrinsic::lrint, MVT::i32, 1},
750 {Intrinsic::lrint, MVT::i64, 1},
751 {Intrinsic::llrint, MVT::i64, 1},
752 {Intrinsic::nearbyint, MVT::f32, 9},
753 {Intrinsic::nearbyint, MVT::f64, 9},
754 {Intrinsic::bswap, MVT::i16, 3},
755 {Intrinsic::bswap, MVT::i32, 12},
756 {Intrinsic::bswap, MVT::i64, 31},
757 {Intrinsic::vp_bswap, MVT::i16, 3},
758 {Intrinsic::vp_bswap, MVT::i32, 12},
759 {Intrinsic::vp_bswap, MVT::i64, 31},
760 {Intrinsic::vp_fshl, MVT::i8, 7},
761 {Intrinsic::vp_fshl, MVT::i16, 7},
762 {Intrinsic::vp_fshl, MVT::i32, 7},
763 {Intrinsic::vp_fshl, MVT::i64, 7},
764 {Intrinsic::vp_fshr, MVT::i8, 7},
765 {Intrinsic::vp_fshr, MVT::i16, 7},
766 {Intrinsic::vp_fshr, MVT::i32, 7},
767 {Intrinsic::vp_fshr, MVT::i64, 7},
768 {Intrinsic::bitreverse, MVT::i8, 17},
769 {Intrinsic::bitreverse, MVT::i16, 24},
770 {Intrinsic::bitreverse, MVT::i32, 33},
771 {Intrinsic::bitreverse, MVT::i64, 52},
772 {Intrinsic::vp_bitreverse, MVT::i8, 17},
773 {Intrinsic::vp_bitreverse, MVT::i16, 24},
774 {Intrinsic::vp_bitreverse, MVT::i32, 33},
775 {Intrinsic::vp_bitreverse, MVT::i64, 52},
776 {Intrinsic::ctpop, MVT::i8, 12},
777 {Intrinsic::ctpop, MVT::i16, 19},
778 {Intrinsic::ctpop, MVT::i32, 20},
779 {Intrinsic::ctpop, MVT::i64, 21},
780 {Intrinsic::vp_ctpop, MVT::i8, 12},
781 {Intrinsic::vp_ctpop, MVT::i16, 19},
782 {Intrinsic::vp_ctpop, MVT::i32, 20},
783 {Intrinsic::vp_ctpop, MVT::i64, 21},
784 {Intrinsic::vp_ctlz, MVT::i8, 19},
785 {Intrinsic::vp_ctlz, MVT::i16, 28},
786 {Intrinsic::vp_ctlz, MVT::i32, 31},
787 {Intrinsic::vp_ctlz, MVT::i64, 35},
788 {Intrinsic::vp_cttz, MVT::i8, 16},
789 {Intrinsic::vp_cttz, MVT::i16, 23},
790 {Intrinsic::vp_cttz, MVT::i32, 24},
791 {Intrinsic::vp_cttz, MVT::i64, 25},
792};
793
794static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
795 switch (ID) {
796#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
797 case Intrinsic::VPID: \
798 return ISD::VPSD;
799#include "llvm/IR/VPIntrinsics.def"
800#undef HELPER_MAP_VPID_TO_VPSD
801 }
802 return ISD::DELETED_NODE;
803}
804
808 auto *RetTy = ICA.getReturnType();
809 switch (ICA.getID()) {
810 case Intrinsic::ceil:
811 case Intrinsic::floor:
812 case Intrinsic::trunc:
813 case Intrinsic::rint:
814 case Intrinsic::lrint:
815 case Intrinsic::llrint:
816 case Intrinsic::round:
817 case Intrinsic::roundeven: {
818 // These all use the same code.
820 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
821 return LT.first * 8;
822 break;
823 }
824 case Intrinsic::umin:
825 case Intrinsic::umax:
826 case Intrinsic::smin:
827 case Intrinsic::smax: {
829 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
830 return LT.first;
831
832 if (ST->hasVInstructions() && LT.second.isVector()) {
833 unsigned Op;
834 switch (ICA.getID()) {
835 case Intrinsic::umin:
836 Op = RISCV::VMINU_VV;
837 break;
838 case Intrinsic::umax:
839 Op = RISCV::VMAXU_VV;
840 break;
841 case Intrinsic::smin:
842 Op = RISCV::VMIN_VV;
843 break;
844 case Intrinsic::smax:
845 Op = RISCV::VMAX_VV;
846 break;
847 }
848 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
849 }
850 break;
851 }
852 case Intrinsic::sadd_sat:
853 case Intrinsic::ssub_sat:
854 case Intrinsic::uadd_sat:
855 case Intrinsic::usub_sat:
856 case Intrinsic::fabs:
857 case Intrinsic::sqrt: {
859 if (ST->hasVInstructions() && LT.second.isVector())
860 return LT.first;
861 break;
862 }
863 case Intrinsic::ctpop: {
865 if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector())
866 return LT.first;
867 break;
868 }
869 case Intrinsic::abs: {
871 if (ST->hasVInstructions() && LT.second.isVector()) {
872 // vrsub.vi v10, v8, 0
873 // vmax.vv v8, v8, v10
874 return LT.first * 2;
875 }
876 break;
877 }
878 case Intrinsic::get_active_lane_mask: {
879 if (ST->hasVInstructions()) {
880 Type *ExpRetTy = VectorType::get(
881 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
882 auto LT = getTypeLegalizationCost(ExpRetTy);
883
884 // vid.v v8 // considered hoisted
885 // vsaddu.vx v8, v8, a0
886 // vmsltu.vx v0, v8, a1
887 return LT.first *
888 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
889 LT.second, CostKind);
890 }
891 break;
892 }
893 // TODO: add more intrinsics
894 case Intrinsic::experimental_stepvector: {
896 // Legalisation of illegal types involves an `index' instruction plus
897 // (LT.first - 1) vector adds.
898 if (ST->hasVInstructions())
899 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
900 (LT.first - 1) *
901 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
902 return 1 + (LT.first - 1);
903 }
904 case Intrinsic::vp_rint: {
905 // The RISC-V target uses at least 5 instructions to lower rounding intrinsics.
906 unsigned Cost = 5;
908 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
909 return Cost * LT.first;
910 break;
911 }
912 case Intrinsic::vp_nearbyint: {
913 // One more read and one write of fflags than vp_rint.
914 unsigned Cost = 7;
916 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
917 return Cost * LT.first;
918 break;
919 }
920 case Intrinsic::vp_ceil:
921 case Intrinsic::vp_floor:
922 case Intrinsic::vp_round:
923 case Intrinsic::vp_roundeven:
924 case Intrinsic::vp_roundtozero: {
925 // Rounding with static rounding mode needs two more instructions to
926 // swap/write FRM than vp_rint.
927 unsigned Cost = 7;
929 unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
930 if (TLI->isOperationCustom(VPISD, LT.second))
931 return Cost * LT.first;
932 break;
933 }
934 }
935
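  // Otherwise fall back to the per-element-type cost table above; e.g. a
  // bswap on a vector of i32 elements is looked up as cost 12 and scaled by
  // LT.first.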
936 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
937 if (auto LT = getTypeLegalizationCost(RetTy);
938 LT.second.isVector()) {
939 MVT EltTy = LT.second.getVectorElementType();
940 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
941 ICA.getID(), EltTy))
942 return LT.first * Entry->Cost;
943 }
944 }
945
947}
948
950 Type *Src,
953 const Instruction *I) {
954 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
955 if (!IsVectorType)
956 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
957
958 bool IsTypeLegal = isTypeLegal(Src) && isTypeLegal(Dst) &&
959 (Src->getScalarSizeInBits() <= ST->getELen()) &&
960 (Dst->getScalarSizeInBits() <= ST->getELen());
961
962 // FIXME: Need to compute legalizing cost for illegal types.
963 if (!IsTypeLegal)
964 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
965
966 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
967 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
968
969 int ISD = TLI->InstructionOpcodeToISD(Opcode);
970 assert(ISD && "Invalid opcode");
971
972 int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
973 (int)Log2_32(Src->getScalarSizeInBits());
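  // For example, an i8 -> i32 extend has PowDiff == 2, which selects
  // vsext.vf4/vzext.vf4 below.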
974 switch (ISD) {
975 case ISD::SIGN_EXTEND:
976 case ISD::ZERO_EXTEND: {
977 const unsigned SrcEltSize = Src->getScalarSizeInBits();
978 if (SrcEltSize == 1) {
979 // We do not use vsext/vzext to extend from a mask vector.
980 // Instead we use the following instructions to extend from a mask vector:
981 // vmv.v.i v8, 0
982 // vmerge.vim v8, v8, -1, v0
983 return getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM},
984 DstLT.second, CostKind);
985 }
986 if ((PowDiff < 1) || (PowDiff > 3))
987 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
988 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
989 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
990 unsigned Op =
991 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
992 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
993 }
994 case ISD::TRUNCATE:
995 if (Dst->getScalarSizeInBits() == 1) {
996 // We do not use several vncvt instructions to truncate to a mask vector,
997 // so we cannot use PowDiff to calculate the cost.
998 // Instead we use the following instructions to truncate to a mask vector:
999 // vand.vi v8, v8, 1
1000 // vmsne.vi v0, v8, 0
1001 return getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1002 SrcLT.second, CostKind);
1003 }
1004 [[fallthrough]];
1005 case ISD::FP_EXTEND:
1006 case ISD::FP_ROUND: {
1007 // Counts of narrow/widen instructions.
1008 unsigned SrcEltSize = Src->getScalarSizeInBits();
1009 unsigned DstEltSize = Dst->getScalarSizeInBits();
1010
1011 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1012 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1013 : RISCV::VFNCVT_F_F_W;
1015 for (; SrcEltSize != DstEltSize;) {
1016 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1017 ? MVT::getIntegerVT(DstEltSize)
1018 : MVT::getFloatingPointVT(DstEltSize);
1019 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1020 DstEltSize =
1021 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1022 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1023 }
1024 return Cost;
1025 }
1026 case ISD::FP_TO_SINT:
1027 case ISD::FP_TO_UINT:
1028 case ISD::SINT_TO_FP:
1029 case ISD::UINT_TO_FP:
1030 if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
1031 // The cost of converting from or to a mask vector is different from other
1032 // cases, so we cannot use PowDiff to calculate it.
1033 // For mask vector to fp, we should use the following instructions:
1034 // vmv.v.i v8, 0
1035 // vmerge.vim v8, v8, -1, v0
1036 // vfcvt.f.x.v v8, v8
1037
1038 // And for fp vector to mask, we use:
1039 // vfncvt.rtz.x.f.w v9, v8
1040 // vand.vi v8, v9, 1
1041 // vmsne.vi v0, v8, 0
1042 return 3;
1043 }
1044 if (std::abs(PowDiff) <= 1)
1045 return 1;
1046 // The backend could lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
1047 // so it only needs two conversions.
1048 if (Src->isIntOrIntVectorTy())
1049 return 2;
1050 // Counts of narrow/widen instructions.
1051 return std::abs(PowDiff);
1052 }
1053 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1054}
1055
1056unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
1057 if (isa<ScalableVectorType>(Ty)) {
1058 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1059 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1060 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1061 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1062 }
1063 return cast<FixedVectorType>(Ty)->getNumElements();
1064}
1065
1068 FastMathFlags FMF,
1070 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1071 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1072
1073 // Skip if scalar size of Ty is bigger than ELEN.
1074 if (Ty->getScalarSizeInBits() > ST->getELen())
1075 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1076
1077 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1078 if (Ty->getElementType()->isIntegerTy(1)) {
1079 // SelectionDAGBuilder does the following transforms:
1080 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1081 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
1082 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1083 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1084 else
1085 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1086 }
1087
1088 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1090 InstructionCost ExtraCost = 0;
1091 switch (IID) {
1092 case Intrinsic::maximum:
1093 if (FMF.noNaNs()) {
1094 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1095 } else {
1096 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1097 RISCV::VFMV_F_S};
1098 // Cost of canonical NaN + branch
1099 // lui a0, 523264
1100 // fmv.w.x fa0, a0
1101 Type *DstTy = Ty->getScalarType();
1102 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1103 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1104 ExtraCost = 1 +
1105 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1107 getCFInstrCost(Instruction::Br, CostKind);
1108 }
1109 break;
1110
1111 case Intrinsic::minimum:
1112 if (FMF.noNaNs()) {
1113 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1114 } else {
1115 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1116 RISCV::VFMV_F_S};
1117 // Cost of canonical NaN + branch
1118 // lui a0, 523264
1119 // fmv.w.x fa0, a0
1120 Type *DstTy = Ty->getScalarType();
1121 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1122 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1123 ExtraCost = 1 +
1124 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1126 getCFInstrCost(Instruction::Br, CostKind);
1127 }
1128 break;
1129 }
1130 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1131 }
1132
1133 // An IR reduction is composed of two vmv and one RVV reduction instruction.
1134 unsigned SplitOp;
1136 switch (IID) {
1137 default:
1138 llvm_unreachable("Unsupported intrinsic");
1139 case Intrinsic::smax:
1140 SplitOp = RISCV::VMAX_VV;
1141 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1142 break;
1143 case Intrinsic::smin:
1144 SplitOp = RISCV::VMIN_VV;
1145 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMIN_VS, RISCV::VMV_X_S};
1146 break;
1147 case Intrinsic::umax:
1148 SplitOp = RISCV::VMAXU_VV;
1149 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1150 break;
1151 case Intrinsic::umin:
1152 SplitOp = RISCV::VMINU_VV;
1153 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMINU_VS, RISCV::VMV_X_S};
1154 break;
1155 case Intrinsic::maxnum:
1156 SplitOp = RISCV::VFMAX_VV;
1157 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1158 break;
1159 case Intrinsic::minnum:
1160 SplitOp = RISCV::VFMIN_VV;
1161 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1162 break;
1163 }
1164 // Add a cost for data larger than LMUL8
1165 InstructionCost SplitCost =
1166 (LT.first > 1) ? (LT.first - 1) *
1167 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1168 : 0;
1169 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1170}
1171
1174 std::optional<FastMathFlags> FMF,
1176 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1177 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1178
1179 // Skip if scalar size of Ty is bigger than ELEN.
1180 if (Ty->getScalarSizeInBits() > ST->getELen())
1181 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1182
1183 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1184 assert(ISD && "Invalid opcode");
1185
1186 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1187 ISD != ISD::FADD)
1188 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1189
1190 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1192 Type *ElementTy = Ty->getElementType();
1193 if (ElementTy->isIntegerTy(1)) {
1194 if (ISD == ISD::AND) {
1195 // Example sequences:
1196 // vsetvli a0, zero, e8, mf8, ta, ma
1197 // vmnot.m v8, v0
1198 // vcpop.m a0, v8
1199 // seqz a0, a0
1200 Opcodes = {RISCV::VMNAND_MM, RISCV::VCPOP_M};
1201 return (LT.first - 1) +
1202 getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
1203 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1205 } else {
1206 // Example sequences:
1207 // vsetvli a0, zero, e8, mf8, ta, ma
1208 // vcpop.m a0, v0
1209 // snez a0, a0
1210 Opcodes = {RISCV::VCPOP_M};
1211 return (LT.first - 1) +
1212 getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
1213 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1215 }
1216 }
1217
1218 // An IR reduction is composed of two vmv and one RVV reduction instruction.
1220 Opcodes.push_back(RISCV::VFMV_S_F);
1221 for (unsigned i = 0; i < LT.first.getValue(); i++)
1222 Opcodes.push_back(RISCV::VFREDOSUM_VS);
1223 Opcodes.push_back(RISCV::VFMV_F_S);
1224 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1225 }
1226 unsigned SplitOp;
1227 switch (ISD) {
1228 case ISD::ADD:
1229 SplitOp = RISCV::VADD_VV;
1230 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
1231 break;
1232 case ISD::OR:
1233 SplitOp = RISCV::VOR_VV;
1234 Opcodes = {RISCV::VMV_S_X, RISCV::VREDOR_VS, RISCV::VMV_X_S};
1235 break;
1236 case ISD::XOR:
1237 SplitOp = RISCV::VXOR_VV;
1238 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
1239 break;
1240 case ISD::AND:
1241 SplitOp = RISCV::VAND_VV;
1242 Opcodes = {RISCV::VMV_S_X, RISCV::VREDAND_VS, RISCV::VMV_X_S};
1243 break;
1244 case ISD::FADD:
1245 SplitOp = RISCV::VFADD_VV;
1246 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
1247 break;
1248 }
1249 // Add a cost for data larger than LMUL8
1250 InstructionCost SplitCost =
1251 (LT.first > 1) ? (LT.first - 1) *
1252 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1253 : 0;
1254 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1255}
1256
1258 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1260 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1261 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1262 FMF, CostKind);
1263
1264 // Skip if scalar size of ResTy is bigger than ELEN.
1265 if (ResTy->getScalarSizeInBits() > ST->getELen())
1266 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1267 FMF, CostKind);
1268
1269 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1270 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1271 FMF, CostKind);
1272
1273 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1274
1275 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1276 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1277 FMF, CostKind);
1278
1279 return (LT.first - 1) +
1280 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1281}
1282
1284 TTI::OperandValueInfo OpInfo,
1286 assert(OpInfo.isConstant() && "non constant operand?");
1287 if (!isa<VectorType>(Ty))
1288 // FIXME: We need to account for immediate materialization here, but doing
1289 // a decent job requires more knowledge about the immediate than we
1290 // currently have here.
1291 return 0;
1292
1293 if (OpInfo.isUniform())
1294 // vmv.x.i, vmv.v.x, or vfmv.v.f
1295 // We ignore the cost of the scalar constant materialization to be consistent
1296 // with how we treat scalar constants themselves just above.
1297 return 1;
1298
1299 return getConstantPoolLoadCost(Ty, CostKind);
1300}
1301
1302
1304 MaybeAlign Alignment,
1305 unsigned AddressSpace,
1307 TTI::OperandValueInfo OpInfo,
1308 const Instruction *I) {
1309 EVT VT = TLI->getValueType(DL, Src, true);
1310 // Type legalization can't handle structs
1311 if (VT == MVT::Other)
1312 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1313 CostKind, OpInfo, I);
1314
1316 if (Opcode == Instruction::Store && OpInfo.isConstant())
1317 Cost += getStoreImmCost(Src, OpInfo, CostKind);
1318 InstructionCost BaseCost =
1319 BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1320 CostKind, OpInfo, I);
1321 // Assume memory op costs scale with the number of vector registers
1322 // possibly accessed by the instruction. Note that BasicTTI already
1323 // handles the LT.first term for us.
1324 if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1325 LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
1326 BaseCost *= TLI->getLMULCost(LT.second);
1327 return Cost + BaseCost;
1328
1329}
1330
1332 Type *CondTy,
1333 CmpInst::Predicate VecPred,
1335 const Instruction *I) {
1337 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1338 I);
1339
1340 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1341 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1342 I);
1343
1344 // Skip if scalar size of ValTy is bigger than ELEN.
1345 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
1346 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1347 I);
1348
1349 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1350 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1351 if (CondTy->isVectorTy()) {
1352 if (ValTy->getScalarSizeInBits() == 1) {
1353 // vmandn.mm v8, v8, v9
1354 // vmand.mm v9, v0, v9
1355 // vmor.mm v0, v9, v8
1356 return LT.first *
1357 getRISCVInstructionCost(
1358 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1359 LT.second, CostKind);
1360 }
1361 // vselect and max/min are supported natively.
1362 return LT.first *
1363 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
1364 }
1365
1366 if (ValTy->getScalarSizeInBits() == 1) {
1367 // vmv.v.x v9, a0
1368 // vmsne.vi v9, v9, 0
1369 // vmandn.mm v8, v8, v9
1370 // vmand.mm v9, v0, v9
1371 // vmor.mm v0, v9, v8
1372 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
1373 return LT.first *
1374 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
1375 InterimVT, CostKind) +
1376 LT.first * getRISCVInstructionCost(
1377 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1378 LT.second, CostKind);
1379 }
1380
1381 // vmv.v.x v10, a0
1382 // vmsne.vi v0, v10, 0
1383 // vmerge.vvm v8, v9, v8, v0
1384 return LT.first * getRISCVInstructionCost(
1385 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
1386 LT.second, CostKind);
1387 }
1388
1389 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
1390 CmpInst::isIntPredicate(VecPred)) {
1391 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE,
1392 // assuming they incur the same cost across all implementations.
1393 return LT.first *
1394 getRISCVInstructionCost(RISCV::VMSLT_VV, LT.second, CostKind);
1395 }
1396
1397 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
1398 CmpInst::isFPPredicate(VecPred)) {
1399
1400 // Use VMXOR_MM and VMXNOR_MM to generate an all-false or all-true mask.
1401 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
1402 return getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
1403
1404 // If we do not support the input floating point vector type, use the base
1405 // one which will calculate as:
1406 // ScalarizeCost + Num * Cost for fixed vector,
1407 // InvalidCost for scalable vector.
1408 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
1409 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
1410 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
1411 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1412 I);
1413
1414 // Assume vector fp compare and mask instructions all have the same cost
1415 // until a need arises to differentiate them.
1416 switch (VecPred) {
1417 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
1418 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
1419 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
1420 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
1421 return LT.first * getRISCVInstructionCost(
1422 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
1423 LT.second, CostKind);
1424
1425 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
1426 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
1427 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
1428 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
1429 return LT.first *
1430 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
1431 LT.second, CostKind);
1432
1433 case CmpInst::FCMP_OEQ: // vmfeq.vv
1434 case CmpInst::FCMP_OGT: // vmflt.vv
1435 case CmpInst::FCMP_OGE: // vmfle.vv
1436 case CmpInst::FCMP_OLT: // vmflt.vv
1437 case CmpInst::FCMP_OLE: // vmfle.vv
1438 case CmpInst::FCMP_UNE: // vmfne.vv
1439 return LT.first *
1440 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
1441 default:
1442 break;
1443 }
1444 }
1445
1446 // TODO: Add cost for scalar type.
1447
1448 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1449}
1450
1453 const Instruction *I) {
1455 return Opcode == Instruction::PHI ? 0 : 1;
1456 // Branches are assumed to be predicted.
1457 return 0;
1458}
1459
1462 unsigned Index, Value *Op0,
1463 Value *Op1) {
1464 assert(Val->isVectorTy() && "This must be a vector type");
1465
1466 if (Opcode != Instruction::ExtractElement &&
1467 Opcode != Instruction::InsertElement)
1468 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1469
1470 // Legalize the type.
1471 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
1472
1473 // This type is legalized to a scalar type.
1474 if (!LT.second.isVector()) {
1475 auto *FixedVecTy = cast<FixedVectorType>(Val);
1476 // If Index is a known constant, cost is zero.
1477 if (Index != -1U)
1478 return 0;
1479 // Extract/InsertElement with non-constant index is very costly when
1480 // scalarized; estimate cost of loads/stores sequence via the stack:
1481 // ExtractElement cost: store vector to stack, load scalar;
1482 // InsertElement cost: store vector to stack, store scalar, load vector.
1483 Type *ElemTy = FixedVecTy->getElementType();
1484 auto NumElems = FixedVecTy->getNumElements();
1485 auto Align = DL.getPrefTypeAlign(ElemTy);
1486 InstructionCost LoadCost =
1487 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
1488 InstructionCost StoreCost =
1489 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
1490 return Opcode == Instruction::ExtractElement
1491 ? StoreCost * NumElems + LoadCost
1492 : (StoreCost + LoadCost) * NumElems + StoreCost;
1493 }
1494
1495 // For unsupported scalable vector.
1496 if (LT.second.isScalableVector() && !LT.first.isValid())
1497 return LT.first;
1498
1499 if (!isTypeLegal(Val))
1500 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1501
1502 // Mask vector extract/insert is expanded via e8.
1503 if (Val->getScalarSizeInBits() == 1) {
1504 VectorType *WideTy =
1506 cast<VectorType>(Val)->getElementCount());
1507 if (Opcode == Instruction::ExtractElement) {
1508 InstructionCost ExtendCost
1509 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1511 InstructionCost ExtractCost
1512 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1513 return ExtendCost + ExtractCost;
1514 }
1515 InstructionCost ExtendCost
1516 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1518 InstructionCost InsertCost
1519 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1520 InstructionCost TruncCost
1521 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
1523 return ExtendCost + InsertCost + TruncCost;
1524 }
1525
1526
1527 // In RVV, we could use vslidedown + vmv.x.s to extract an element from a
1528 // vector and vslideup + vmv.s.x to insert an element into a vector.
1529 unsigned BaseCost = 1;
1530 // For insertelement we also need to add 1 to the index as the input of vslideup.
1531 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
1532
1533 if (Index != -1U) {
1534 // The type may be split. For fixed-width vectors we can normalize the
1535 // index to the new type.
1536 if (LT.second.isFixedLengthVector()) {
1537 unsigned Width = LT.second.getVectorNumElements();
1538 Index = Index % Width;
1539 }
1540
1541 // We could extract/insert the first element without vslidedown/vslideup.
1542 if (Index == 0)
1543 SlideCost = 0;
1544 else if (Opcode == Instruction::InsertElement)
1545 SlideCost = 1; // With a constant index, we do not need to use addi.
1546 }
1547
1548 // Extracting an i64 on a target with XLEN=32 needs more instructions.
1549 if (Val->getScalarType()->isIntegerTy() &&
1550 ST->getXLen() < Val->getScalarSizeInBits()) {
1551 // For extractelement, we need the following instructions:
1552 // vsetivli zero, 1, e64, m1, ta, mu (not count)
1553 // vslidedown.vx v8, v8, a0
1554 // vmv.x.s a0, v8
1555 // li a1, 32
1556 // vsrl.vx v8, v8, a1
1557 // vmv.x.s a1, v8
1558
1559 // For insertelement, we need the following instructions:
1560 // vsetivli zero, 2, e32, m4, ta, mu (not count)
1561 // vmv.v.i v12, 0
1562 // vslide1up.vx v16, v12, a1
1563 // vslide1up.vx v12, v16, a0
1564 // addi a0, a2, 1
1565 // vsetvli zero, a0, e64, m4, tu, mu (not count)
1566 // vslideup.vx v8, v12, a2
1567
1568 // TODO: should we count these special vsetvlis?
1569 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
1570 }
1571 return BaseCost + SlideCost;
1572}
1573
1575 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1577 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1578
1579 // TODO: Handle more cost kinds.
1581 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1582 Args, CxtI);
1583
1584 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1585 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1586 Args, CxtI);
1587
1588 // Skip if scalar size of Ty is bigger than ELEN.
1589 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
1590 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1591 Args, CxtI);
1592
1593 // Legalize the type.
1594 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1595
1596 // TODO: Handle scalar type.
1597 if (!LT.second.isVector())
1598 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1599 Args, CxtI);
1600
1601
1602 auto getConstantMatCost =
1603 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
1604 if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
1605 // Two sub-cases:
1606 // * Has a 5 bit immediate operand which can be splatted.
1607 // * Has a larger immediate which must be materialized in a scalar register
1608 // We return 0 for both as we currently ignore the cost of materializing
1609 // scalar constants in GPRs.
1610 return 0;
1611
1612 return getConstantPoolLoadCost(Ty, CostKind);
1613 };
1614
1615 // Add the cost of materializing any constant vectors required.
1616 InstructionCost ConstantMatCost = 0;
1617 if (Op1Info.isConstant())
1618 ConstantMatCost += getConstantMatCost(0, Op1Info);
1619 if (Op2Info.isConstant())
1620 ConstantMatCost += getConstantMatCost(1, Op2Info);
1621
1622 switch (TLI->InstructionOpcodeToISD(Opcode)) {
1623 case ISD::ADD:
1624 case ISD::SUB:
1625 case ISD::AND:
1626 case ISD::OR:
1627 case ISD::XOR:
1628 case ISD::SHL:
1629 case ISD::SRL:
1630 case ISD::SRA:
1631 case ISD::MUL:
1632 case ISD::MULHS:
1633 case ISD::MULHU:
1634 case ISD::FADD:
1635 case ISD::FSUB:
1636 case ISD::FMUL:
1637 case ISD::FNEG: {
1638 return ConstantMatCost + TLI->getLMULCost(LT.second) * LT.first * 1;
1639 }
1640 default:
1641 return ConstantMatCost +
1642 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1643 Args, CxtI);
1644 }
1645}
1646
1647// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
1649 ArrayRef<const Value *> Ptrs, const Value *Base,
1650 const TTI::PointersChainInfo &Info, Type *AccessTy,
1653 // In the basic model we take into account GEP instructions only
1654 // (although an alloca, a plain value, constants and/or constant
1655 // expressions, PHIs and bitcasts, i.e. anything that may be used as a
1656 // pointer, can also appear here). Typically, if Base is not a GEP
1657 // instruction and all the pointers are relative to the same base
1658 // address, the rest are either GEP instructions, PHIs, bitcasts or
1659 // constants. When they share a base, we just cost each non-Base GEP as
1660 // an ADD operation if any of its indices is non-constant.
1661 // If there are no known dependencies between the pointers, the cost is
1662 // calculated as the sum of the costs of the GEP instructions.
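  // Illustrative note (editorial): when the caller (e.g. the SLP vectorizer)
  // has already proved the pointers form a unit-stride chain over AccessTy,
  // pointer number I is simply Base plus I * sizeof(AccessTy) bytes, e.g.
  // byte offsets 0, 4, 8, ... for i32 accesses. Each such offset that fits
  // the reg+imm addressing mode makes the corresponding non-Base GEP free
  // below; otherwise that GEP is costed as an ADD.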
1663 for (auto [I, V] : enumerate(Ptrs)) {
1664 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
1665 if (!GEP)
1666 continue;
1667 if (Info.isSameBase() && V != Base) {
1668 if (GEP->hasAllConstantIndices())
1669 continue;
1670 // If the chain is unit-stride and BaseReg + stride*i is a legal
1671 // addressing mode, then presume the base GEP is sitting around in a
1672 // register somewhere and check if we can fold the offset relative to
1673 // it.
1674 unsigned Stride = DL.getTypeStoreSize(AccessTy);
1675 if (Info.isUnitStride() &&
1676 isLegalAddressingMode(AccessTy,
1677 /* BaseGV */ nullptr,
1678 /* BaseOffset */ Stride * I,
1679 /* HasBaseReg */ true,
1680 /* Scale */ 0,
1681 GEP->getType()->getPointerAddressSpace()))
1682 continue;
1683 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
1684 {TTI::OK_AnyValue, TTI::OP_None},
1685 {TTI::OK_AnyValue, TTI::OP_None},
1686 std::nullopt);
1687 } else {
1688 SmallVector<const Value *> Indices(GEP->indices());
1689 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
1690 Indices, AccessTy, CostKind);
1691 }
1692 }
1693 return Cost;
1694}
1695
1696void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1697 TTI::UnrollingPreferences &UP,
1698 OptimizationRemarkEmitter *ORE) {
1699 // TODO: More tuning on benchmarks and metrics with changes as needed
1700 // would apply to all settings below to enable performance.
1701
1702
1703 if (ST->enableDefaultUnroll())
1704 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
1705
1706 // Enable upper-bound unrolling universally, independent of the conditions
1707 // below.
1708 UP.UpperBound = true;
1709
1710 // Disable loop unrolling for Oz and Os.
1711 UP.OptSizeThreshold = 0;
1712 UP.PartialOptSizeThreshold = 0;
1713 if (L->getHeader()->getParent()->hasOptSize())
1714 return;
1715
1716 SmallVector<BasicBlock *, 4> ExitingBlocks;
1717 L->getExitingBlocks(ExitingBlocks);
1718 LLVM_DEBUG(dbgs() << "Loop has:\n"
1719 << "Blocks: " << L->getNumBlocks() << "\n"
1720 << "Exit blocks: " << ExitingBlocks.size() << "\n");
1721
1722 // Allow at most one exit other than the latch. This early bail-out mirrors
1723 // the profitability calculation of the runtime unroller.
1724 if (ExitingBlocks.size() > 2)
1725 return;
1726
1727 // Limit the CFG of the loop body for targets with a branch predictor.
1728 // Allowing 4 blocks permits if-then-else diamonds in the body.
1729 if (L->getNumBlocks() > 4)
1730 return;
1731
1732 // Don't unroll vectorized loops, including the remainder loop
1733 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
1734 return;
1735
1736 // Scan the loop: don't unroll loops with calls as this could prevent
1737 // inlining.
1738 InstructionCost Cost = 0;
1739 for (auto *BB : L->getBlocks()) {
1740 for (auto &I : *BB) {
1741 // Initial setting - Don't unroll loops containing vectorized
1742 // instructions.
1743 if (I.getType()->isVectorTy())
1744 return;
1745
1746 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1747 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1748 if (!isLoweredToCall(F))
1749 continue;
1750 }
1751 return;
1752 }
1753
1754 SmallVector<const Value *> Operands(I.operand_values());
1755 Cost += getInstructionCost(&I, Operands,
1756 TargetTransformInfo::TCK_SizeAndLatency);
1757 }
1758 }
1759
1760 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1761
1762 UP.Partial = true;
1763 UP.Runtime = true;
1764 UP.UnrollRemainder = true;
1765 UP.UnrollAndJam = true;
1766 UP.UnrollAndJamInnerLoopThreshold = 60;
1767
1768 // Force-unrolling small loops can be very useful because of the
1769 // branch-taken cost of the backedge.
1770 if (Cost < 12)
1771 UP.Force = true;
1772}
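// Illustrative example (editorial): a short scalar loop such as
//   for (int i = 0; i < n; ++i)
//     sum += a[i];
// contains only a handful of instructions (load, add, increment, compare,
// branch), so its size-and-latency cost stays below the threshold of 12 and
// UP.Force is set, while a loop containing a vector instruction, or a call
// that is really lowered to a call, bails out early and keeps the defaults.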
1773
1774void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1775 TTI::PeelingPreferences &PP) {
1776 BaseT::getPeelingPreferences(L, SE, PP);
1777}
1778
1779unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
1780 TypeSize Size = DL.getTypeSizeInBits(Ty);
1781 if (Ty->isVectorTy()) {
1782 if (Size.isScalable() && ST->hasVInstructions())
1783 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
1784
1785 if (ST->useRVVForFixedLengthVectors())
1786 return divideCeil(Size, ST->getRealMinVLen());
1787 }
1788
1789 return BaseT::getRegUsageForType(Ty);
1790}
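// Worked example (illustrative): <vscale x 4 x i32> has a known minimum size
// of 128 bits, so with RVVBitsPerBlock == 64 this reports 2 registers (an
// LMUL=2 register group); a fixed <8 x i32> on a subtarget that uses RVV for
// fixed-length vectors with a minimum VLEN of 128 likewise reports
// ceil(256 / 128) = 2.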
1791
1792unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1793 if (SLPMaxVF.getNumOccurrences())
1794 return SLPMaxVF;
1795
1796 // Return how many elements can fit in getRegisterBitWidth. This is the
1797 // same routine as used in the LoopVectorizer. We should probably be
1798 // accounting for whether we actually have instructions with the right
1799 // lane type, but we don't have enough information to do that without
1800 // some additional plumbing which hasn't been justified yet.
1801 TypeSize RegWidth =
1802 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
1803 // If no vector registers, or absurd element widths, disable
1804 // vectorization by returning 1.
1805 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
1806}
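// Worked example (illustrative; assumes RVV is enabled for fixed-length
// vectors): with a minimum VLEN of 128 and the default
// riscv-v-register-bit-width-lmul of 2, getRegisterBitWidth reports 256
// fixed bits, so a 32-bit element width yields a maximum SLP VF of
// 256 / 32 = 8.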
1807
1808bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
1809 const TargetTransformInfo::LSRCost &C2) {
1810 // The RISC-V-specific choice here is "instruction count first".
1811 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
1812 C1.NumIVMuls, C1.NumBaseAdds,
1813 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
1814 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
1815 C2.NumIVMuls, C2.NumBaseAdds,
1816 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
1817}
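// Editorial note: this is a plain lexicographic comparison, so for example an
// LSR solution needing 5 instructions and 6 registers is preferred over one
// needing 6 instructions and 2 registers; the later fields only matter when
// the earlier ones tie.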
1818
1819bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
1820 auto *VTy = dyn_cast<VectorType>(DataTy);
1821 if (!VTy || VTy->isScalableTy())
1822 return false;
1823
1824 if (!isLegalMaskedLoadStore(DataTy, Alignment))
1825 return false;
1826 return true;
1827}
1828
1829bool RISCVTTIImpl::areInlineCompatible(const Function *Caller,
1830 const Function *Callee) const {
1831 const TargetMachine &TM = getTLI()->getTargetMachine();
1832
1833 const FeatureBitset &CallerBits =
1834 TM.getSubtargetImpl(*Caller)->getFeatureBits();
1835 const FeatureBitset &CalleeBits =
1836 TM.getSubtargetImpl(*Callee)->getFeatureBits();
1837
1838 // Inline a callee if its target features are a subset of the caller's
1839 // target features.
1840 return (CallerBits & CalleeBits) == CalleeBits;
1841}
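// Illustrative example (editorial, hypothetical feature strings): a caller
// built with +m,+a,+f,+d,+c,+v may inline a callee built with +m,+f (a subset
// of the caller's features), but not one that additionally requires an
// extension the caller lacks, since the inlined body could then execute
// instructions the caller's subtarget does not provide.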