RISCVTargetTransformInfo.cpp
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
18#include <cmath>
19#include <optional>
20using namespace llvm;
21using namespace llvm::PatternMatch;
22
23#define DEBUG_TYPE "riscvtti"
24
26 "riscv-v-register-bit-width-lmul",
28 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
29 "by autovectorized code. Fractional LMULs are not supported."),
31
33 "riscv-v-slp-max-vf",
35 "Overrides result used for getMaximumVF query which is used "
36 "exclusively by SLP vectorizer."),
38
40RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
42 // Check if the type is valid for all CostKind
43 if (!VT.isVector())
45 size_t NumInstr = OpCodes.size();
47 return NumInstr;
48 InstructionCost LMULCost = TLI->getLMULCost(VT);
50 return LMULCost * NumInstr;
52 for (auto Op : OpCodes) {
53 switch (Op) {
54 case RISCV::VRGATHER_VI:
55 Cost += TLI->getVRGatherVICost(VT);
56 break;
57 case RISCV::VRGATHER_VV:
58 Cost += TLI->getVRGatherVVCost(VT);
59 break;
60 case RISCV::VSLIDEUP_VI:
61 case RISCV::VSLIDEDOWN_VI:
62 Cost += TLI->getVSlideVICost(VT);
63 break;
64 case RISCV::VSLIDEUP_VX:
65 case RISCV::VSLIDEDOWN_VX:
66 Cost += TLI->getVSlideVXCost(VT);
67 break;
68 case RISCV::VREDMAX_VS:
69 case RISCV::VREDMIN_VS:
70 case RISCV::VREDMAXU_VS:
71 case RISCV::VREDMINU_VS:
72 case RISCV::VREDSUM_VS:
73 case RISCV::VREDAND_VS:
74 case RISCV::VREDOR_VS:
75 case RISCV::VREDXOR_VS:
76 case RISCV::VFREDMAX_VS:
77 case RISCV::VFREDMIN_VS:
78 case RISCV::VFREDUSUM_VS: {
79 unsigned VL = VT.getVectorMinNumElements();
80 if (!VT.isFixedLengthVector())
81 VL *= *getVScaleForTuning();
82 Cost += Log2_32_Ceil(VL);
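      // Illustrative example (assuming getVScaleForTuning() == 2): for
      // nxv4i32, VL = 4 * 2 = 8, so the reduction adds ceil(log2(8)) = 3,
      // modelling the depth of the log2(VL) reduction tree.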
83 break;
84 }
85 case RISCV::VFREDOSUM_VS: {
86 unsigned VL = VT.getVectorMinNumElements();
87 if (!VT.isFixedLengthVector())
88 VL *= *getVScaleForTuning();
89 Cost += VL;
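      // Illustrative example: the ordered vfredosum reduction is costed
      // linearly in VL, i.e. 8 for the nxv4i32 example above.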
90 break;
91 }
92 case RISCV::VMV_X_S:
93 case RISCV::VMV_S_X:
94 case RISCV::VFMV_F_S:
95 case RISCV::VFMV_S_F:
96 case RISCV::VMOR_MM:
97 case RISCV::VMXOR_MM:
98 case RISCV::VMAND_MM:
99 case RISCV::VMANDN_MM:
100 case RISCV::VMNAND_MM:
101 case RISCV::VCPOP_M:
102 case RISCV::VFIRST_M:
103 Cost += 1;
104 break;
105 default:
106 Cost += LMULCost;
107 }
108 }
109 return Cost;
110}
111
113 const RISCVSubtarget *ST,
114 const APInt &Imm, Type *Ty,
116 bool FreeZeroes) {
117 assert(Ty->isIntegerTy() &&
118 "getIntImmCost can only estimate cost of materialising integers");
119
120 // We have a Zero register, so 0 is always free.
121 if (Imm == 0)
122 return TTI::TCC_Free;
123
124 // Otherwise, we check how many instructions it will take to materialise.
125 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
126 /*CompressionCost=*/false, FreeZeroes);
127}
128
131 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
132}
133
134// Look for patterns of shift followed by AND that can be turned into a pair of
135// shifts. We won't need to materialize an immediate for the AND so these can
136// be considered free.
137static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
138 uint64_t Mask = Imm.getZExtValue();
139 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
140 if (!BO || !BO->hasOneUse())
141 return false;
142
143 if (BO->getOpcode() != Instruction::Shl)
144 return false;
145
146 if (!isa<ConstantInt>(BO->getOperand(1)))
147 return false;
148
149 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
150 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
151 // is a mask shifted by c2 bits with c3 leading zeros.
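  // For example (illustrative): (and (shl x, 11), 0x7ff800) has a mask
  // shifted by c2 = 11 with c3 = 41 leading zeros on RV64, so it can be
  // lowered as (srli (slli x, 52), 41) without materializing the AND
  // immediate.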
152 if (isShiftedMask_64(Mask)) {
153 unsigned Trailing = llvm::countr_zero(Mask);
154 if (ShAmt == Trailing)
155 return true;
156 }
157
158 return false;
159}
160
162 const APInt &Imm, Type *Ty,
164 Instruction *Inst) {
165 assert(Ty->isIntegerTy() &&
166 "getIntImmCost can only estimate cost of materialising integers");
167
168 // We have a Zero register, so 0 is always free.
169 if (Imm == 0)
170 return TTI::TCC_Free;
171
172 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
173 // commutative, in others the immediate comes from a specific argument index.
174 bool Takes12BitImm = false;
175 unsigned ImmArgIdx = ~0U;
176
177 switch (Opcode) {
178 case Instruction::GetElementPtr:
179 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
180 // split up large offsets in GEP into better parts than ConstantHoisting
181 // can.
182 return TTI::TCC_Free;
183 case Instruction::Store: {
184 // Use the materialization cost regardless of whether it's the address or
185 // the value that is constant, except when the store is misaligned and
186 // misaligned accesses are not legal (experience shows constant hoisting
187 // can sometimes be harmful in such cases).
188 if (Idx == 1 || !Inst)
189 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
190 /*FreeZeroes=*/true);
191
192 StoreInst *ST = cast<StoreInst>(Inst);
193 if (!getTLI()->allowsMemoryAccessForAlignment(
194 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
195 ST->getPointerAddressSpace(), ST->getAlign()))
196 return TTI::TCC_Free;
197
198 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
199 /*FreeZeroes=*/true);
200 }
201 case Instruction::Load:
202 // If the address is a constant, use the materialization cost.
203 return getIntImmCost(Imm, Ty, CostKind);
204 case Instruction::And:
205 // zext.h
206 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
207 return TTI::TCC_Free;
208 // zext.w
209 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
210 return TTI::TCC_Free;
211 // bclri
212 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
213 return TTI::TCC_Free;
214 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
215 canUseShiftPair(Inst, Imm))
216 return TTI::TCC_Free;
217 Takes12BitImm = true;
218 break;
219 case Instruction::Add:
220 Takes12BitImm = true;
221 break;
222 case Instruction::Or:
223 case Instruction::Xor:
224 // bseti/binvi
225 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
226 return TTI::TCC_Free;
227 Takes12BitImm = true;
228 break;
229 case Instruction::Mul:
230 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
231 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
232 return TTI::TCC_Free;
233 // One more or less than a power of 2 can use SLLI+ADD/SUB.
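    // e.g. (illustrative) x * 9 == (x << 3) + x and x * 7 == (x << 3) - x.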
234 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
235 return TTI::TCC_Free;
236 // FIXME: There is no MULI instruction.
237 Takes12BitImm = true;
238 break;
239 case Instruction::Sub:
240 case Instruction::Shl:
241 case Instruction::LShr:
242 case Instruction::AShr:
243 Takes12BitImm = true;
244 ImmArgIdx = 1;
245 break;
246 default:
247 break;
248 }
249
250 if (Takes12BitImm) {
251 // Check immediate is the correct argument...
252 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
253 // ... and fits into the 12-bit immediate.
254 if (Imm.getSignificantBits() <= 64 &&
255 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
256 return TTI::TCC_Free;
257 }
258 }
259
260 // Otherwise, use the full materialisation cost.
261 return getIntImmCost(Imm, Ty, CostKind);
262 }
263
264 // By default, prevent hoisting.
265 return TTI::TCC_Free;
266}
267
270 const APInt &Imm, Type *Ty,
272 // Prevent hoisting in unknown cases.
273 return TTI::TCC_Free;
274}
275
276bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
277 return ST->hasVInstructions();
278}
279
282 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
283 return ST->hasStdExtZbb() || ST->hasVendorXCVbitmanip()
286}
287
289 // Currently, the ExpandReductions pass can't expand scalable-vector
290 // reductions, but we still request expansion as RVV doesn't support certain
291 // reductions and the SelectionDAG can't legalize them either.
292 switch (II->getIntrinsicID()) {
293 default:
294 return false;
295 // These reductions have no equivalent in RVV
296 case Intrinsic::vector_reduce_mul:
297 case Intrinsic::vector_reduce_fmul:
298 return true;
299 }
300}
301
302std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
303 if (ST->hasVInstructions())
305 return BaseT::getMaxVScale();
306}
307
308std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
309 if (ST->hasVInstructions())
310 if (unsigned MinVLen = ST->getRealMinVLen();
311 MinVLen >= RISCV::RVVBitsPerBlock)
312 return MinVLen / RISCV::RVVBitsPerBlock;
314}
315
318 unsigned LMUL =
319 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
320 switch (K) {
322 return TypeSize::getFixed(ST->getXLen());
324 return TypeSize::getFixed(
325 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
328 (ST->hasVInstructions() &&
331 : 0);
332 }
333
334 llvm_unreachable("Unsupported register kind");
335}
336
338RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
339 // Add a cost of address generation + the cost of the load. The address
340 // is expected to be a PC relative offset to a constant pool entry
341 // using auipc/addi.
342 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
343 /*AddressSpace=*/0, CostKind);
344}
345
347 LLVMContext &C) {
348 assert((DataVT.getScalarSizeInBits() != 8 ||
349 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
350 MVT IndexVT = DataVT.changeTypeToInteger();
351 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
352 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
353 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
354}
355
357 VectorType *Tp, ArrayRef<int> Mask,
359 int Index, VectorType *SubTp,
361 const Instruction *CxtI) {
362 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
363
364 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
365
366 // First, handle cases where having a fixed length vector enables us to
367 // give a more accurate cost than falling back to generic scalable codegen.
368 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
369 if (isa<FixedVectorType>(Tp)) {
370 switch (Kind) {
371 default:
372 break;
374 if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
375 MVT EltTp = LT.second.getVectorElementType();
376 // If the size of the element is < ELEN then shuffles of interleaves and
377 // deinterleaves of 2 vectors can be lowered into the following
378 // sequences
379 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
380 // Example sequence:
381 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
382 // vwaddu.vv v10, v8, v9
383 // li a0, -1 (ignored)
384 // vwmaccu.vx v10, a0, v9
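          // e.g. (illustrative) interleaving two 4-element vectors
          // corresponds to the shuffle mask <0, 4, 1, 5, 2, 6, 3, 7>.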
385 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
386 return 2 * LT.first * TLI->getLMULCost(LT.second);
387
388 if (Mask[0] == 0 || Mask[0] == 1) {
389 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
390 // Example sequence:
391 // vnsrl.wi v10, v8, 0
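          // e.g. (illustrative) extracting the even elements of an
          // 8-element input uses the stride mask <0, 2, 4, 6>.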
392 if (equal(DeinterleaveMask, Mask))
393 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
394 LT.second, CostKind);
395 }
396 }
397 }
398 // vrgather + cost of generating the mask constant.
399 // We model this for an unknown mask with a single vrgather.
400 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
401 (LT.second.getScalarSizeInBits() != 8 ||
402 LT.second.getVectorNumElements() <= 256)) {
403 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());
404 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
405 return IndexCost +
406 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
407 }
408 [[fallthrough]];
409 }
412 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
413 // register for the second vrgather. We model this for an unknown
414 // (shuffle) mask.
415 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
416 (LT.second.getScalarSizeInBits() != 8 ||
417 LT.second.getVectorNumElements() <= 256)) {
418 auto &C = Tp->getContext();
419 auto EC = Tp->getElementCount();
420 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
422 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
423 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
424 return 2 * IndexCost +
425 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
426 LT.second, CostKind) +
427 MaskCost;
428 }
429 [[fallthrough]];
430 }
431 case TTI::SK_Select: {
432 // We are going to permute multiple sources and the result will be in
433 // multiple destinations. We provide an accurate cost only for splits
434 // where the element type remains the same.
435 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
436 LT.second.isFixedLengthVector() &&
437 LT.second.getVectorElementType().getSizeInBits() ==
439 LT.second.getVectorNumElements() <
440 cast<FixedVectorType>(Tp)->getNumElements() &&
441 divideCeil(Mask.size(),
442 cast<FixedVectorType>(Tp)->getNumElements()) ==
443 static_cast<unsigned>(*LT.first.getValue())) {
444 unsigned NumRegs = *LT.first.getValue();
445 unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
446 unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
447 auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);
448
450 for (unsigned I = 0; I < NumRegs; ++I) {
451 bool IsSingleVector = true;
452 SmallVector<int> SubMask(SubVF, PoisonMaskElem);
453 transform(Mask.slice(I * SubVF,
454 I == NumRegs - 1 ? Mask.size() % SubVF : SubVF),
455 SubMask.begin(), [&](int I) {
456 bool SingleSubVector = I / VF == 0;
457 IsSingleVector &= SingleSubVector;
458 return (SingleSubVector ? 0 : 1) * SubVF + I % VF;
459 });
462 SubVecTy, SubMask, CostKind, 0, nullptr);
463 return Cost;
464 }
465 }
466 break;
467 }
468 }
469 };
470
471 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
472 switch (Kind) {
473 default:
474 // Fallthrough to generic handling.
475 // TODO: Most of these cases will return getInvalid in generic code, and
476 // must be implemented here.
477 break;
479 // Extract at zero is always a subregister extract
480 if (Index == 0)
481 return TTI::TCC_Free;
482
483 // If we're extracting a subvector of at most m1 size at a sub-register
484 // boundary - which unfortunately we need exact vlen to identify - this is
485 // a subregister extract at worst and thus won't require a vslidedown.
486 // TODO: Extend for aligned m2, m4 subvector extracts
487 // TODO: Extend for misaligned (but contained) extracts
488 // TODO: Extend for scalable subvector types
489 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
490 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
491 const unsigned MinVLen = ST->getRealMinVLen();
492 const unsigned MaxVLen = ST->getRealMaxVLen();
493 if (MinVLen == MaxVLen &&
494 SubLT.second.getScalarSizeInBits() * Index % MinVLen == 0 &&
495 SubLT.second.getSizeInBits() <= MinVLen)
496 return TTI::TCC_Free;
497 }
498
499 // Example sequence:
500 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
501 // vslidedown.vi v8, v9, 2
502 return LT.first *
503 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
505 // Example sequence:
506 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
507 // vslideup.vi v8, v9, 2
508 return LT.first *
509 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
510 case TTI::SK_Select: {
511 // Example sequence:
512 // li a0, 90
513 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
514 // vmv.s.x v0, a0
515 // vmerge.vvm v8, v9, v8, v0
516 // We use 2 for the cost of the mask materialization as this is the true
517 // cost for small masks and most shuffles are small. At worst, this cost
518 // should be a very small constant for the constant pool load. As such,
519 // we may bias towards large selects slightly more than truly warranted.
520 return LT.first *
521 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
522 LT.second, CostKind));
523 }
524 case TTI::SK_Broadcast: {
525 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
526 Instruction::InsertElement);
527 if (LT.second.getScalarSizeInBits() == 1) {
528 if (HasScalar) {
529 // Example sequence:
530 // andi a0, a0, 1
531 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
532 // vmv.v.x v8, a0
533 // vmsne.vi v0, v8, 0
534 return LT.first *
535 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
536 LT.second, CostKind));
537 }
538 // Example sequence:
539 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
540 // vmv.v.i v8, 0
541 // vmerge.vim v8, v8, 1, v0
542 // vmv.x.s a0, v8
543 // andi a0, a0, 1
544 // vmv.v.x v8, a0
545 // vmsne.vi v0, v8, 0
546
547 return LT.first *
548 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
549 RISCV::VMV_X_S, RISCV::VMV_V_X,
550 RISCV::VMSNE_VI},
551 LT.second, CostKind));
552 }
553
554 if (HasScalar) {
555 // Example sequence:
556 // vmv.v.x v8, a0
557 return LT.first *
558 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
559 }
560
561 // Example sequence:
562 // vrgather.vi v9, v8, 0
563 return LT.first *
564 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
565 }
566 case TTI::SK_Splice: {
567 // vslidedown+vslideup.
568 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
569 // of similar code, but I think we expand through memory.
570 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
571 if (Index >= 0 && Index < 32)
572 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
573 else if (Index < 0 && Index > -32)
574 Opcodes[1] = RISCV::VSLIDEUP_VI;
575 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
576 }
577 case TTI::SK_Reverse: {
578 // TODO: Cases to improve here:
579 // * Illegal vector types
580 // * i64 on RV32
581 // * i1 vector
582 // At low LMUL, most of the cost is producing the vrgather index register.
583 // At high LMUL, the cost of the vrgather itself will dominate.
584 // Example sequence:
585 // csrr a0, vlenb
586 // srli a0, a0, 3
587 // addi a0, a0, -1
588 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
589 // vid.v v9
590 // vrsub.vx v10, v9, a0
591 // vrgather.vv v9, v8, v10
592 InstructionCost LenCost = 3;
593 if (LT.second.isFixedLengthVector())
594 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
595 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
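    // e.g. (illustrative) for a fixed <16 x i8> vector, 15 fits in the
    // 5-bit immediate of vrsub.vi, so no separate li is needed and
    // LenCost is 0.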
596 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
597 if (LT.second.isFixedLengthVector() &&
598 isInt<5>(LT.second.getVectorNumElements() - 1))
599 Opcodes[1] = RISCV::VRSUB_VI;
600 InstructionCost GatherCost =
601 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
602 // Reversing a mask vector additionally requires an extend and a truncate
603 InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
604 return LT.first * (LenCost + GatherCost + ExtendCost);
605 }
606 }
607 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
608}
609
611RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
612 unsigned AddressSpace,
614 if (!isLegalMaskedLoadStore(Src, Alignment) ||
616 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
617 CostKind);
618
619 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
620}
621
623 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
624 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
625 bool UseMaskForCond, bool UseMaskForGaps) {
626 if (isa<ScalableVectorType>(VecTy) && Factor != 2)
628
629 // The interleaved memory access pass will lower interleaved memory ops (i.e.
630 // a load or store followed by a specific shuffle) to vlseg/vsseg
631 // intrinsics. In those cases we can treat it as if it's just one (legal)
632 // memory op
633 if (!UseMaskForCond && !UseMaskForGaps &&
634 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
635 auto *VTy = cast<VectorType>(VecTy);
636 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
637 // Need to make sure the type hasn't been scalarized
638 if (LT.second.isVector()) {
639 auto *SubVecTy =
640 VectorType::get(VTy->getElementType(),
641 VTy->getElementCount().divideCoefficientBy(Factor));
642
643 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
644 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
645 AddressSpace, DL)) {
646 // FIXME: We use the memory op cost of the *legalized* type here,
647 // because getMemoryOpCost returns a really expensive cost for
648 // types like <6 x i8>, which show up when doing interleaves of
649 // Factor=3 etc. Should the memory op cost of these be cheaper?
650 auto *LegalVTy = VectorType::get(VTy->getElementType(),
651 LT.second.getVectorElementCount());
652 InstructionCost LegalMemCost = getMemoryOpCost(
653 Opcode, LegalVTy, Alignment, AddressSpace, CostKind);
654 return LT.first + LegalMemCost;
655 }
656 }
657 }
658
659 // TODO: Return the cost of interleaved accesses for scalable vector when
660 // unable to convert to segment access instructions.
661 if (isa<ScalableVectorType>(VecTy))
663
664 auto *FVTy = cast<FixedVectorType>(VecTy);
665 InstructionCost MemCost =
666 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
667 unsigned VF = FVTy->getNumElements() / Factor;
668
669 // An interleaved load will look like this for Factor=3:
670 // %wide.vec = load <12 x i32>, ptr %3, align 4
671 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
672 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
673 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
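  // e.g. (illustrative) for Factor=3 and VF=4, the cost is the single
  // <12 x i32> load plus one deinterleaving shuffle per requested index.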
674 if (Opcode == Instruction::Load) {
675 InstructionCost Cost = MemCost;
676 for (unsigned Index : Indices) {
677 FixedVectorType *SubVecTy =
678 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
679 auto Mask = createStrideMask(Index, Factor, VF);
680 InstructionCost ShuffleCost =
682 CostKind, 0, nullptr, {});
683 Cost += ShuffleCost;
684 }
685 return Cost;
686 }
687
688 // TODO: Model for NF > 2
689 // We'll need to enhance getShuffleCost to model shuffles that are just
690 // inserts and extracts into subvectors, since they won't have the full cost
691 // of a vrgather.
692 // An interleaved store for 3 vectors of 4 lanes will look like
693 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
694 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
695 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
696 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
697 // store <12 x i32> %interleaved.vec, ptr %10, align 4
698 if (Factor != 2)
699 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
700 Alignment, AddressSpace, CostKind,
701 UseMaskForCond, UseMaskForGaps);
702
703 assert(Opcode == Instruction::Store && "Opcode must be a store");
704 // For an interleaving store of 2 vectors, we perform one large interleaving
705 // shuffle that goes into the wide store
706 auto Mask = createInterleaveMask(VF, Factor);
707 InstructionCost ShuffleCost =
709 CostKind, 0, nullptr, {});
710 return MemCost + ShuffleCost;
711}
712
714 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
715 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
717 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
718 Alignment, CostKind, I);
719
720 if ((Opcode == Instruction::Load &&
721 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
722 (Opcode == Instruction::Store &&
723 !isLegalMaskedScatter(DataTy, Align(Alignment))))
724 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
725 Alignment, CostKind, I);
726
727 // Cost is proportional to the number of memory operations implied. For
728 // scalable vectors, we use an estimate on that number since we don't
729 // know exactly what VL will be.
730 auto &VTy = *cast<VectorType>(DataTy);
731 InstructionCost MemOpCost =
732 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
733 {TTI::OK_AnyValue, TTI::OP_None}, I);
734 unsigned NumLoads = getEstimatedVLFor(&VTy);
735 return NumLoads * MemOpCost;
736}
737
739 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
740 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
741 if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
742 !isLegalStridedLoadStore(DataTy, Alignment)) ||
743 (Opcode != Instruction::Load && Opcode != Instruction::Store))
744 return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
745 Alignment, CostKind, I);
746
748 return TTI::TCC_Basic;
749
750 // Cost is proportional to the number of memory operations implied. For
751 // scalable vectors, we use an estimate on that number since we don't
752 // know exactly what VL will be.
753 auto &VTy = *cast<VectorType>(DataTy);
754 InstructionCost MemOpCost =
755 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
756 {TTI::OK_AnyValue, TTI::OP_None}, I);
757 unsigned NumLoads = getEstimatedVLFor(&VTy);
758 return NumLoads * MemOpCost;
759}
760
761// Currently, these represent both throughput and codesize costs
762// for the respective intrinsics. The costs in this table are simply
763// instruction counts with the following adjustments made:
764// * One vsetvli is considered free.
766 {Intrinsic::floor, MVT::f32, 9},
767 {Intrinsic::floor, MVT::f64, 9},
768 {Intrinsic::ceil, MVT::f32, 9},
769 {Intrinsic::ceil, MVT::f64, 9},
770 {Intrinsic::trunc, MVT::f32, 7},
771 {Intrinsic::trunc, MVT::f64, 7},
772 {Intrinsic::round, MVT::f32, 9},
773 {Intrinsic::round, MVT::f64, 9},
774 {Intrinsic::roundeven, MVT::f32, 9},
775 {Intrinsic::roundeven, MVT::f64, 9},
776 {Intrinsic::rint, MVT::f32, 7},
777 {Intrinsic::rint, MVT::f64, 7},
778 {Intrinsic::lrint, MVT::i32, 1},
779 {Intrinsic::lrint, MVT::i64, 1},
780 {Intrinsic::llrint, MVT::i64, 1},
781 {Intrinsic::nearbyint, MVT::f32, 9},
782 {Intrinsic::nearbyint, MVT::f64, 9},
783 {Intrinsic::bswap, MVT::i16, 3},
784 {Intrinsic::bswap, MVT::i32, 12},
785 {Intrinsic::bswap, MVT::i64, 31},
786 {Intrinsic::vp_bswap, MVT::i16, 3},
787 {Intrinsic::vp_bswap, MVT::i32, 12},
788 {Intrinsic::vp_bswap, MVT::i64, 31},
789 {Intrinsic::vp_fshl, MVT::i8, 7},
790 {Intrinsic::vp_fshl, MVT::i16, 7},
791 {Intrinsic::vp_fshl, MVT::i32, 7},
792 {Intrinsic::vp_fshl, MVT::i64, 7},
793 {Intrinsic::vp_fshr, MVT::i8, 7},
794 {Intrinsic::vp_fshr, MVT::i16, 7},
795 {Intrinsic::vp_fshr, MVT::i32, 7},
796 {Intrinsic::vp_fshr, MVT::i64, 7},
797 {Intrinsic::bitreverse, MVT::i8, 17},
798 {Intrinsic::bitreverse, MVT::i16, 24},
799 {Intrinsic::bitreverse, MVT::i32, 33},
800 {Intrinsic::bitreverse, MVT::i64, 52},
801 {Intrinsic::vp_bitreverse, MVT::i8, 17},
802 {Intrinsic::vp_bitreverse, MVT::i16, 24},
803 {Intrinsic::vp_bitreverse, MVT::i32, 33},
804 {Intrinsic::vp_bitreverse, MVT::i64, 52},
805 {Intrinsic::ctpop, MVT::i8, 12},
806 {Intrinsic::ctpop, MVT::i16, 19},
807 {Intrinsic::ctpop, MVT::i32, 20},
808 {Intrinsic::ctpop, MVT::i64, 21},
809 {Intrinsic::vp_ctpop, MVT::i8, 12},
810 {Intrinsic::vp_ctpop, MVT::i16, 19},
811 {Intrinsic::vp_ctpop, MVT::i32, 20},
812 {Intrinsic::vp_ctpop, MVT::i64, 21},
813 {Intrinsic::vp_ctlz, MVT::i8, 19},
814 {Intrinsic::vp_ctlz, MVT::i16, 28},
815 {Intrinsic::vp_ctlz, MVT::i32, 31},
816 {Intrinsic::vp_ctlz, MVT::i64, 35},
817 {Intrinsic::vp_cttz, MVT::i8, 16},
818 {Intrinsic::vp_cttz, MVT::i16, 23},
819 {Intrinsic::vp_cttz, MVT::i32, 24},
820 {Intrinsic::vp_cttz, MVT::i64, 25},
821};
822
824 switch (ID) {
825#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
826 case Intrinsic::VPID: \
827 return ISD::VPSD;
828#include "llvm/IR/VPIntrinsics.def"
829#undef HELPER_MAP_VPID_TO_VPSD
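  // e.g. (illustrative) this maps Intrinsic::vp_add to ISD::VP_ADD.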
830 }
831 return ISD::DELETED_NODE;
832}
833
837 auto *RetTy = ICA.getReturnType();
838 switch (ICA.getID()) {
839 case Intrinsic::ceil:
840 case Intrinsic::floor:
841 case Intrinsic::trunc:
842 case Intrinsic::rint:
843 case Intrinsic::lrint:
844 case Intrinsic::llrint:
845 case Intrinsic::round:
846 case Intrinsic::roundeven: {
847 // These all use the same code.
849 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
850 return LT.first * 8;
851 break;
852 }
853 case Intrinsic::umin:
854 case Intrinsic::umax:
855 case Intrinsic::smin:
856 case Intrinsic::smax: {
858 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
859 return LT.first;
860
861 if (ST->hasVInstructions() && LT.second.isVector()) {
862 unsigned Op;
863 switch (ICA.getID()) {
864 case Intrinsic::umin:
865 Op = RISCV::VMINU_VV;
866 break;
867 case Intrinsic::umax:
868 Op = RISCV::VMAXU_VV;
869 break;
870 case Intrinsic::smin:
871 Op = RISCV::VMIN_VV;
872 break;
873 case Intrinsic::smax:
874 Op = RISCV::VMAX_VV;
875 break;
876 }
877 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
878 }
879 break;
880 }
881 case Intrinsic::sadd_sat:
882 case Intrinsic::ssub_sat:
883 case Intrinsic::uadd_sat:
884 case Intrinsic::usub_sat:
885 case Intrinsic::fabs:
886 case Intrinsic::sqrt: {
888 if (ST->hasVInstructions() && LT.second.isVector())
889 return LT.first;
890 break;
891 }
892 case Intrinsic::ctpop: {
894 if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector())
895 return LT.first;
896 break;
897 }
898 case Intrinsic::abs: {
900 if (ST->hasVInstructions() && LT.second.isVector()) {
901 // vrsub.vi v10, v8, 0
902 // vmax.vv v8, v8, v10
903 return LT.first * 2;
904 }
905 break;
906 }
907 case Intrinsic::get_active_lane_mask: {
908 if (ST->hasVInstructions()) {
909 Type *ExpRetTy = VectorType::get(
910 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
911 auto LT = getTypeLegalizationCost(ExpRetTy);
912
913 // vid.v v8 // considered hoisted
914 // vsaddu.vx v8, v8, a0
915 // vmsltu.vx v0, v8, a1
916 return LT.first *
917 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
918 LT.second, CostKind);
919 }
920 break;
921 }
922 // TODO: add more intrinsics
923 case Intrinsic::experimental_stepvector: {
925 // Legalisation of illegal types involves an `index' instruction plus
926 // (LT.first - 1) vector adds.
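    // e.g. (illustrative) a step vector that legalizes into two vector
    // registers is costed as one vid.v plus one vadd.vx.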
927 if (ST->hasVInstructions())
928 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
929 (LT.first - 1) *
930 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
931 return 1 + (LT.first - 1);
932 }
933 case Intrinsic::experimental_cttz_elts: {
934 Type *ArgTy = ICA.getArgTypes()[0];
935 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
936 if (getTLI()->shouldExpandCttzElements(ArgType))
937 break;
938 InstructionCost Cost = getRISCVInstructionCost(
939 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
940
941 // If zero_is_poison is false, then we will generate additional
942 // cmp + select instructions to convert -1 to EVL.
943 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
944 if (ICA.getArgs().size() > 1 &&
945 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
946 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
948 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
950
951 return Cost;
952 }
953 case Intrinsic::vp_rint: {
954 // RISC-V target uses at least 5 instructions to lower rounding intrinsics.
955 unsigned Cost = 5;
957 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
958 return Cost * LT.first;
959 break;
960 }
961 case Intrinsic::vp_nearbyint: {
962 // One more read and one write of fflags than vp_rint.
963 unsigned Cost = 7;
965 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
966 return Cost * LT.first;
967 break;
968 }
969 case Intrinsic::vp_ceil:
970 case Intrinsic::vp_floor:
971 case Intrinsic::vp_round:
972 case Intrinsic::vp_roundeven:
973 case Intrinsic::vp_roundtozero: {
974 // Rounding with static rounding mode needs two more instructions to
975 // swap/write FRM than vp_rint.
976 unsigned Cost = 7;
978 unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
979 if (TLI->isOperationCustom(VPISD, LT.second))
980 return Cost * LT.first;
981 break;
982 }
983 // vp integer arithmetic ops.
984 case Intrinsic::vp_add:
985 case Intrinsic::vp_and:
986 case Intrinsic::vp_ashr:
987 case Intrinsic::vp_lshr:
988 case Intrinsic::vp_mul:
989 case Intrinsic::vp_or:
990 case Intrinsic::vp_sdiv:
991 case Intrinsic::vp_shl:
992 case Intrinsic::vp_srem:
993 case Intrinsic::vp_sub:
994 case Intrinsic::vp_udiv:
995 case Intrinsic::vp_urem:
996 case Intrinsic::vp_xor:
997 // vp float arithmetic ops.
998 case Intrinsic::vp_fadd:
999 case Intrinsic::vp_fsub:
1000 case Intrinsic::vp_fmul:
1001 case Intrinsic::vp_fdiv:
1002 case Intrinsic::vp_frem: {
1003 std::optional<unsigned> FOp =
1005 if (FOp)
1006 return getArithmeticInstrCost(*FOp, ICA.getReturnType(), CostKind);
1007 break;
1008 }
1009 }
1010
1011 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1012 if (auto LT = getTypeLegalizationCost(RetTy);
1013 LT.second.isVector()) {
1014 MVT EltTy = LT.second.getVectorElementType();
1015 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1016 ICA.getID(), EltTy))
1017 return LT.first * Entry->Cost;
1018 }
1019 }
1020
1022}
1023
1025 Type *Src,
1028 const Instruction *I) {
1029 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1030 if (!IsVectorType)
1031 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1032
1033 bool IsTypeLegal = isTypeLegal(Src) && isTypeLegal(Dst) &&
1034 (Src->getScalarSizeInBits() <= ST->getELen()) &&
1035 (Dst->getScalarSizeInBits() <= ST->getELen());
1036
1037 // FIXME: Need to compute legalizing cost for illegal types.
1038 if (!IsTypeLegal)
1039 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1040
1041 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1042 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1043
1044 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1045 assert(ISD && "Invalid opcode");
1046
1047 int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
1048 (int)Log2_32(Src->getScalarSizeInBits());
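  // e.g. (illustrative) a zext from i8 to i32 elements gives
  // PowDiff = 5 - 3 = 2, which selects vzext.vf4 below.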
1049 switch (ISD) {
1050 case ISD::SIGN_EXTEND:
1051 case ISD::ZERO_EXTEND: {
1052 const unsigned SrcEltSize = Src->getScalarSizeInBits();
1053 if (SrcEltSize == 1) {
1054 // We do not use vsext/vzext to extend from mask vector.
1055 // Instead we use the following instructions to extend from mask vector:
1056 // vmv.v.i v8, 0
1057 // vmerge.vim v8, v8, -1, v0
1058 return getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM},
1059 DstLT.second, CostKind);
1060 }
1061 if ((PowDiff < 1) || (PowDiff > 3))
1062 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1063 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1064 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1065 unsigned Op =
1066 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1067 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1068 }
1069 case ISD::TRUNCATE:
1070 if (Dst->getScalarSizeInBits() == 1) {
1071 // We do not use several vncvt instructions to truncate to a mask vector,
1072 // so we cannot use PowDiff to calculate it.
1073 // Instead we use the following instructions to truncate to mask vector:
1074 // vand.vi v8, v8, 1
1075 // vmsne.vi v0, v8, 0
1076 return getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1077 SrcLT.second, CostKind);
1078 }
1079 [[fallthrough]];
1080 case ISD::FP_EXTEND:
1081 case ISD::FP_ROUND: {
1082 // Counts of narrow/widen instructions.
1083 unsigned SrcEltSize = Src->getScalarSizeInBits();
1084 unsigned DstEltSize = Dst->getScalarSizeInBits();
1085
1086 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1087 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1088 : RISCV::VFNCVT_F_F_W;
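    // e.g. (illustrative) an fpext from f16 to f64 elements is modelled as
    // two vfwcvt.f.f.v steps (f16 -> f32 -> f64) by the loop below.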
1090 for (; SrcEltSize != DstEltSize;) {
1091 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1092 ? MVT::getIntegerVT(DstEltSize)
1093 : MVT::getFloatingPointVT(DstEltSize);
1094 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1095 DstEltSize =
1096 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1097 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1098 }
1099 return Cost;
1100 }
1101 case ISD::FP_TO_SINT:
1102 case ISD::FP_TO_UINT:
1103 case ISD::SINT_TO_FP:
1104 case ISD::UINT_TO_FP:
1105 if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
1106 // The cost of converting from or to a mask vector is different from the
1107 // other cases, so we cannot use PowDiff to calculate it.
1108 // For mask vector to fp, we should use the following instructions:
1109 // vmv.v.i v8, 0
1110 // vmerge.vim v8, v8, -1, v0
1111 // vfcvt.f.x.v v8, v8
1112
1113 // And for fp vector to mask, we use:
1114 // vfncvt.rtz.x.f.w v9, v8
1115 // vand.vi v8, v9, 1
1116 // vmsne.vi v0, v8, 0
1117 return 3;
1118 }
1119 if (std::abs(PowDiff) <= 1)
1120 return 1;
1121 // The backend can lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
1122 // so it only needs two conversions.
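    // e.g. (illustrative) sitofp from i8 to f64 elements can be lowered as
    // vsext.vf8 followed by vfcvt.f.x.v, hence a cost of 2.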
1123 if (Src->isIntOrIntVectorTy())
1124 return 2;
1125 // Counts of narrow/widen instructions.
1126 return std::abs(PowDiff);
1127 }
1128 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1129}
1130
1131unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
1132 if (isa<ScalableVectorType>(Ty)) {
1133 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1134 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1135 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
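    // e.g. (illustrative) for <vscale x 4 x i32> with a vscale-for-tuning
    // of 2, the estimated VL is 4 * 2 = 8 elements.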
1136 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1137 }
1138 return cast<FixedVectorType>(Ty)->getNumElements();
1139}
1140
1143 FastMathFlags FMF,
1145 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1146 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1147
1148 // Skip if scalar size of Ty is bigger than ELEN.
1149 if (Ty->getScalarSizeInBits() > ST->getELen())
1150 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1151
1152 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1153 if (Ty->getElementType()->isIntegerTy(1)) {
1154 // SelectionDAGBuilder does the following transforms:
1155 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1156 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
1157 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1158 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1159 else
1160 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1161 }
1162
1163 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1165 InstructionCost ExtraCost = 0;
1166 switch (IID) {
1167 case Intrinsic::maximum:
1168 if (FMF.noNaNs()) {
1169 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1170 } else {
1171 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1172 RISCV::VFMV_F_S};
1173 // Cost of Canonical Nan + branch
1174 // lui a0, 523264
1175 // fmv.w.x fa0, a0
1176 Type *DstTy = Ty->getScalarType();
1177 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1178 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1179 ExtraCost = 1 +
1180 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1182 getCFInstrCost(Instruction::Br, CostKind);
1183 }
1184 break;
1185
1186 case Intrinsic::minimum:
1187 if (FMF.noNaNs()) {
1188 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1189 } else {
1190 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1191 RISCV::VFMV_F_S};
1192 // Cost of Canonical Nan + branch
1193 // lui a0, 523264
1194 // fmv.w.x fa0, a0
1195 Type *DstTy = Ty->getScalarType();
1196 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1197 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1198 ExtraCost = 1 +
1199 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1201 getCFInstrCost(Instruction::Br, CostKind);
1202 }
1203 break;
1204 }
1205 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1206 }
1207
1208 // An IR reduction is composed of two vmv instructions and one RVV reduction instruction.
1209 unsigned SplitOp;
1211 switch (IID) {
1212 default:
1213 llvm_unreachable("Unsupported intrinsic");
1214 case Intrinsic::smax:
1215 SplitOp = RISCV::VMAX_VV;
1216 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1217 break;
1218 case Intrinsic::smin:
1219 SplitOp = RISCV::VMIN_VV;
1220 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMIN_VS, RISCV::VMV_X_S};
1221 break;
1222 case Intrinsic::umax:
1223 SplitOp = RISCV::VMAXU_VV;
1224 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1225 break;
1226 case Intrinsic::umin:
1227 SplitOp = RISCV::VMINU_VV;
1228 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMINU_VS, RISCV::VMV_X_S};
1229 break;
1230 case Intrinsic::maxnum:
1231 SplitOp = RISCV::VFMAX_VV;
1232 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1233 break;
1234 case Intrinsic::minnum:
1235 SplitOp = RISCV::VFMIN_VV;
1236 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1237 break;
1238 }
1239 // Add a cost for data larger than LMUL8
1240 InstructionCost SplitCost =
1241 (LT.first > 1) ? (LT.first - 1) *
1242 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1243 : 0;
1244 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1245}
1246
1249 std::optional<FastMathFlags> FMF,
1251 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1252 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1253
1254 // Skip if scalar size of Ty is bigger than ELEN.
1255 if (Ty->getScalarSizeInBits() > ST->getELen())
1256 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1257
1258 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1259 assert(ISD && "Invalid opcode");
1260
1261 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1262 ISD != ISD::FADD)
1263 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1264
1265 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1267 Type *ElementTy = Ty->getElementType();
1268 if (ElementTy->isIntegerTy(1)) {
1269 if (ISD == ISD::AND) {
1270 // Example sequences:
1271 // vsetvli a0, zero, e8, mf8, ta, ma
1272 // vmnot.m v8, v0
1273 // vcpop.m a0, v8
1274 // seqz a0, a0
1275 Opcodes = {RISCV::VMNAND_MM, RISCV::VCPOP_M};
1276 return (LT.first - 1) +
1277 getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
1278 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1280 } else {
1281 // Example sequences:
1282 // vsetvli a0, zero, e8, mf8, ta, ma
1283 // vcpop.m a0, v0
1284 // snez a0, a0
1285 Opcodes = {RISCV::VCPOP_M};
1286 return (LT.first - 1) +
1287 getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
1288 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1290 }
1291 }
1292
1293 // An IR reduction is composed of two vmv instructions and one RVV reduction instruction.
1295 Opcodes.push_back(RISCV::VFMV_S_F);
1296 for (unsigned i = 0; i < LT.first.getValue(); i++)
1297 Opcodes.push_back(RISCV::VFREDOSUM_VS);
1298 Opcodes.push_back(RISCV::VFMV_F_S);
1299 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1300 }
1301 unsigned SplitOp;
1302 switch (ISD) {
1303 case ISD::ADD:
1304 SplitOp = RISCV::VADD_VV;
1305 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
1306 break;
1307 case ISD::OR:
1308 SplitOp = RISCV::VOR_VV;
1309 Opcodes = {RISCV::VMV_S_X, RISCV::VREDOR_VS, RISCV::VMV_X_S};
1310 break;
1311 case ISD::XOR:
1312 SplitOp = RISCV::VXOR_VV;
1313 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
1314 break;
1315 case ISD::AND:
1316 SplitOp = RISCV::VAND_VV;
1317 Opcodes = {RISCV::VMV_S_X, RISCV::VREDAND_VS, RISCV::VMV_X_S};
1318 break;
1319 case ISD::FADD:
1320 SplitOp = RISCV::VFADD_VV;
1321 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
1322 break;
1323 }
1324 // Add a cost for data larger than LMUL8
1325 InstructionCost SplitCost =
1326 (LT.first > 1) ? (LT.first - 1) *
1327 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1328 : 0;
1329 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1330}
1331
1333 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1335 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1336 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1337 FMF, CostKind);
1338
1339 // Skip if scalar size of ResTy is bigger than ELEN.
1340 if (ResTy->getScalarSizeInBits() > ST->getELen())
1341 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1342 FMF, CostKind);
1343
1344 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1345 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1346 FMF, CostKind);
1347
1348 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1349
1350 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1351 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1352 FMF, CostKind);
1353
1354 return (LT.first - 1) +
1355 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1356}
1357
1359 TTI::OperandValueInfo OpInfo,
1361 assert(OpInfo.isConstant() && "non constant operand?");
1362 if (!isa<VectorType>(Ty))
1363 // FIXME: We need to account for immediate materialization here, but doing
1364 // a decent job requires more knowledge about the immediate than we
1365 // currently have here.
1366 return 0;
1367
1368 if (OpInfo.isUniform())
1369 // vmv.x.i, vmv.v.x, or vfmv.v.f
1370 // We ignore the cost of the scalar constant materialization to be consistent
1371 // with how we treat scalar constants themselves just above.
1372 return 1;
1373
1374 return getConstantPoolLoadCost(Ty, CostKind);
1375}
1376
1377
1379 MaybeAlign Alignment,
1380 unsigned AddressSpace,
1382 TTI::OperandValueInfo OpInfo,
1383 const Instruction *I) {
1384 EVT VT = TLI->getValueType(DL, Src, true);
1385 // Type legalization can't handle structs
1386 if (VT == MVT::Other)
1387 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1388 CostKind, OpInfo, I);
1389
1391 if (Opcode == Instruction::Store && OpInfo.isConstant())
1392 Cost += getStoreImmCost(Src, OpInfo, CostKind);
1393 InstructionCost BaseCost =
1394 BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1395 CostKind, OpInfo, I);
1396 // Assume memory op costs scale with the number of vector registers
1397 // possibly accessed by the instruction. Note that BasicTTI already
1398 // handles the LT.first term for us.
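  // e.g. (illustrative) a load of <vscale x 16 x i32> legalizes to an
  // LMUL=8 type, so BaseCost is scaled by the corresponding LMUL factor.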
1399 if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1400 LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
1401 BaseCost *= TLI->getLMULCost(LT.second);
1402 return Cost + BaseCost;
1403
1404}
1405
1407 Type *CondTy,
1408 CmpInst::Predicate VecPred,
1410 const Instruction *I) {
1412 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1413 I);
1414
1415 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1416 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1417 I);
1418
1419 // Skip if scalar size of ValTy is bigger than ELEN.
1420 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
1421 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1422 I);
1423
1424 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1425 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1426 if (CondTy->isVectorTy()) {
1427 if (ValTy->getScalarSizeInBits() == 1) {
1428 // vmandn.mm v8, v8, v9
1429 // vmand.mm v9, v0, v9
1430 // vmor.mm v0, v9, v8
1431 return LT.first *
1432 getRISCVInstructionCost(
1433 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1434 LT.second, CostKind);
1435 }
1436 // vselect and max/min are supported natively.
1437 return LT.first *
1438 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
1439 }
1440
1441 if (ValTy->getScalarSizeInBits() == 1) {
1442 // vmv.v.x v9, a0
1443 // vmsne.vi v9, v9, 0
1444 // vmandn.mm v8, v8, v9
1445 // vmand.mm v9, v0, v9
1446 // vmor.mm v0, v9, v8
1447 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
1448 return LT.first *
1449 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
1450 InterimVT, CostKind) +
1451 LT.first * getRISCVInstructionCost(
1452 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1453 LT.second, CostKind);
1454 }
1455
1456 // vmv.v.x v10, a0
1457 // vmsne.vi v0, v10, 0
1458 // vmerge.vvm v8, v9, v8, v0
1459 return LT.first * getRISCVInstructionCost(
1460 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
1461 LT.second, CostKind);
1462 }
1463
1464 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
1465 CmpInst::isIntPredicate(VecPred)) {
1466 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
1467 // provided they incur the same cost across all implementations
1468 return LT.first *
1469 getRISCVInstructionCost(RISCV::VMSLT_VV, LT.second, CostKind);
1470 }
1471
1472 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
1473 CmpInst::isFPPredicate(VecPred)) {
1474
1475 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
1476 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
1477 return getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
1478
1479 // If we do not support the input floating point vector type, use the base
1480 // one which will calculate as:
1481 // ScalarizeCost + Num * Cost for fixed vector,
1482 // InvalidCost for scalable vector.
1483 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
1484 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
1485 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
1486 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1487 I);
1488
1489 // Assuming vector fp compare and mask instructions are all the same cost
1490 // until a need arises to differentiate them.
1491 switch (VecPred) {
1492 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
1493 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
1494 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
1495 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
1496 return LT.first * getRISCVInstructionCost(
1497 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
1498 LT.second, CostKind);
1499
1500 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
1501 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
1502 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
1503 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
1504 return LT.first *
1505 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
1506 LT.second, CostKind);
1507
1508 case CmpInst::FCMP_OEQ: // vmfeq.vv
1509 case CmpInst::FCMP_OGT: // vmflt.vv
1510 case CmpInst::FCMP_OGE: // vmfle.vv
1511 case CmpInst::FCMP_OLT: // vmflt.vv
1512 case CmpInst::FCMP_OLE: // vmfle.vv
1513 case CmpInst::FCMP_UNE: // vmfne.vv
1514 return LT.first *
1515 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
1516 default:
1517 break;
1518 }
1519 }
1520
1521 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
1522 // instructions will lower to SELECT_CC and then to PseudoCCMOVGPR, which will
1523 // generate a conditional branch + mv. The cost of the scalar (icmp + select)
1524 // will be (0 + select instr cost).
1525 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
1526 ValTy->isIntegerTy() && !I->user_empty()) {
1527 if (all_of(I->users(), [&](const User *U) {
1528 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
1529 U->getType()->isIntegerTy() &&
1530 !isa<ConstantData>(U->getOperand(1)) &&
1531 !isa<ConstantData>(U->getOperand(2));
1532 }))
1533 return 0;
1534 }
1535
1536 // TODO: Add cost for scalar type.
1537
1538 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1539}
1540
1543 const Instruction *I) {
1545 return Opcode == Instruction::PHI ? 0 : 1;
1546 // Branches are assumed to be predicted.
1547 return 0;
1548}
1549
1552 unsigned Index, Value *Op0,
1553 Value *Op1) {
1554 assert(Val->isVectorTy() && "This must be a vector type");
1555
1556 if (Opcode != Instruction::ExtractElement &&
1557 Opcode != Instruction::InsertElement)
1558 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1559
1560 // Legalize the type.
1561 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
1562
1563 // This type is legalized to a scalar type.
1564 if (!LT.second.isVector()) {
1565 auto *FixedVecTy = cast<FixedVectorType>(Val);
1566 // If Index is a known constant, cost is zero.
1567 if (Index != -1U)
1568 return 0;
1569 // Extract/InsertElement with non-constant index is very costly when
1570 // scalarized; estimate cost of loads/stores sequence via the stack:
1571 // ExtractElement cost: store vector to stack, load scalar;
1572 // InsertElement cost: store vector to stack, store scalar, load vector.
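    // e.g. (illustrative) a variable-index extract from a <4 x i32> that
    // was scalarized is costed as 4 element stores plus 1 element load.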
1573 Type *ElemTy = FixedVecTy->getElementType();
1574 auto NumElems = FixedVecTy->getNumElements();
1575 auto Align = DL.getPrefTypeAlign(ElemTy);
1576 InstructionCost LoadCost =
1577 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
1578 InstructionCost StoreCost =
1579 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
1580 return Opcode == Instruction::ExtractElement
1581 ? StoreCost * NumElems + LoadCost
1582 : (StoreCost + LoadCost) * NumElems + StoreCost;
1583 }
1584
1585 // For unsupported scalable vector.
1586 if (LT.second.isScalableVector() && !LT.first.isValid())
1587 return LT.first;
1588
1589 if (!isTypeLegal(Val))
1590 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1591
1592 // Mask vector extract/insert is expanded via e8.
1593 if (Val->getScalarSizeInBits() == 1) {
1594 VectorType *WideTy =
1596 cast<VectorType>(Val)->getElementCount());
1597 if (Opcode == Instruction::ExtractElement) {
1598 InstructionCost ExtendCost
1599 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1601 InstructionCost ExtractCost
1602 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1603 return ExtendCost + ExtractCost;
1604 }
1605 InstructionCost ExtendCost
1606 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1608 InstructionCost InsertCost
1609 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1610 InstructionCost TruncCost
1611 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
1613 return ExtendCost + InsertCost + TruncCost;
1614 }
1615
1616
1617 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
1618 // and vslideup + vmv.s.x to insert element to vector.
1619 unsigned BaseCost = 1;
1621 // For insertelement, an extra addi is needed to compute index+1, which is used as the VL of the vslideup.
1621 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
1622
1623 if (Index != -1U) {
1624 // The type may be split. For fixed-width vectors we can normalize the
1625 // index to the new type.
1626 if (LT.second.isFixedLengthVector()) {
1627 unsigned Width = LT.second.getVectorNumElements();
1628 Index = Index % Width;
1629 }
1630
1631 // We could extract/insert the first element without vslidedown/vslideup.
1632 if (Index == 0)
1633 SlideCost = 0;
1634 else if (Opcode == Instruction::InsertElement)
1635 SlideCost = 1; // With a constant index, we do not need to use addi.
1636 }
1637
1638 // Extracting an i64 element on a target with XLEN=32 needs more instructions.
1639 if (Val->getScalarType()->isIntegerTy() &&
1640 ST->getXLen() < Val->getScalarSizeInBits()) {
1641 // For extractelement, we need the following instructions:
1642 // vsetivli zero, 1, e64, m1, ta, mu (not count)
1643 // vslidedown.vx v8, v8, a0
1644 // vmv.x.s a0, v8
1645 // li a1, 32
1646 // vsrl.vx v8, v8, a1
1647 // vmv.x.s a1, v8
1648
1649 // For insertelement, we need the following instructions:
1650 // vsetivli zero, 2, e32, m4, ta, mu (not count)
1651 // vmv.v.i v12, 0
1652 // vslide1up.vx v16, v12, a1
1653 // vslide1up.vx v12, v16, a0
1654 // addi a0, a2, 1
1655 // vsetvli zero, a0, e64, m4, tu, mu (not count)
1656 // vslideup.vx v8, v12, a2
1657
1658 // TODO: should we count these special vsetvlis?
1659 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
1660 }
1661 return BaseCost + SlideCost;
1662}
1663
1664InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
1665 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1666 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1667 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1668
1669 // TODO: Handle more cost kinds.
1670 if (CostKind != TTI::TCK_RecipThroughput)
1671 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1672 Args, CxtI);
1673
1674 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1675 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1676 Args, CxtI);
1677
1678 // Skip if scalar size of Ty is bigger than ELEN.
1679 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
1680 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1681 Args, CxtI);
1682
1683 // Legalize the type.
1684 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1685
1686 // TODO: Handle scalar type.
1687 if (!LT.second.isVector())
1688 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1689 Args, CxtI);
1690
1691
1692 auto getConstantMatCost =
1693 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
1694 if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
1695 // Two sub-cases:
1696 // * Has a 5 bit immediate operand which can be splatted.
1697 // * Has a larger immediate which must be materialized in a scalar register.
1698 // We return 0 for both as we currently ignore the cost of materializing
1699 // scalar constants in GPRs.
1700 return 0;
1701
1702 return getConstantPoolLoadCost(Ty, CostKind);
1703 };
1704
1705 // Add the cost of materializing any constant vectors required.
1706 InstructionCost ConstantMatCost = 0;
1707 if (Op1Info.isConstant())
1708 ConstantMatCost += getConstantMatCost(0, Op1Info);
1709 if (Op2Info.isConstant())
1710 ConstantMatCost += getConstantMatCost(1, Op2Info);
1711
1712 unsigned Op;
1713 switch (TLI->InstructionOpcodeToISD(Opcode)) {
1714 case ISD::ADD:
1715 case ISD::SUB:
1716 Op = RISCV::VADD_VV;
1717 break;
1718 case ISD::SHL:
1719 case ISD::SRL:
1720 case ISD::SRA:
1721 Op = RISCV::VSLL_VV;
1722 break;
1723 case ISD::AND:
1724 case ISD::OR:
1725 case ISD::XOR:
1726 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
1727 break;
1728 case ISD::MUL:
1729 case ISD::MULHS:
1730 case ISD::MULHU:
1731 Op = RISCV::VMUL_VV;
1732 break;
1733 case ISD::SDIV:
1734 case ISD::UDIV:
1735 Op = RISCV::VDIV_VV;
1736 break;
1737 case ISD::SREM:
1738 case ISD::UREM:
1739 Op = RISCV::VREM_VV;
1740 break;
1741 case ISD::FADD:
1742 case ISD::FSUB:
1743 // TODO: Address FP16 with VFHMIN
1744 Op = RISCV::VFADD_VV;
1745 break;
1746 case ISD::FMUL:
1747 // TODO: Address FP16 with VFHMIN
1748 Op = RISCV::VFMUL_VV;
1749 break;
1750 case ISD::FDIV:
1751 Op = RISCV::VFDIV_VV;
1752 break;
1753 case ISD::FNEG:
1754 Op = RISCV::VFSGNJN_VV;
1755 break;
1756 default:
1757 // Assuming all other instructions have the same cost until a need arises to
1758 // differentiate them.
1759 return ConstantMatCost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
1760 Op1Info, Op2Info,
1761 Args, CxtI);
1762 }
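// The final cost below is LT.first (the number of legalized registers the
// type splits into) times the per-instruction cost of the selected RVV
// opcode, plus any constant-materialization cost computed above.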
1763 return ConstantMatCost +
1764 LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1765}
1766
1767// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
1768InstructionCost RISCVTTIImpl::getPointersChainCost(
1769 ArrayRef<const Value *> Ptrs, const Value *Base,
1770 const TTI::PointersChainInfo &Info, Type *AccessTy,
1771 TTI::TargetCostKind CostKind) {
1772 InstructionCost Cost = TTI::TCC_Free;
1773 // In the basic model we take into account GEP instructions only
1774 // (although here can come alloca instruction, a value, constants and/or
1775 // constant expressions, PHIs, bitcasts ... whatever allowed to be used as a
1776 // pointer). Typically, if Base is a not a GEP-instruction and all the
1777 // pointers are relative to the same base address, all the rest are
1778 // either GEP instructions, PHIs, bitcasts or constants. When we have same
1779 // base, we just calculate the cost of each non-Base GEP as an ADD operation
1780 // if any of its indices is non-constant.
1781 // If there are no known dependencies between the pointers, the cost is
1782 // calculated as a sum of the costs of the GEP instructions.
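// E.g., for a unit-stride chain of accesses off one base pointer, a same-base
// GEP whose constant offset folds into a reg+imm addressing mode is treated
// as free, while a same-base GEP with a variable index costs one ADD.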
1783 for (auto [I, V] : enumerate(Ptrs)) {
1784 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
1785 if (!GEP)
1786 continue;
1787 if (Info.isSameBase() && V != Base) {
1788 if (GEP->hasAllConstantIndices())
1789 continue;
1790 // If the chain is unit-stride and BaseReg + stride*i is a legal
1791 // addressing mode, then presume the base GEP is sitting around in a
1792 // register somewhere and check if we can fold the offset relative to
1793 // it.
1794 unsigned Stride = DL.getTypeStoreSize(AccessTy);
1795 if (Info.isUnitStride() &&
1796 isLegalAddressingMode(AccessTy,
1797 /* BaseGV */ nullptr,
1798 /* BaseOffset */ Stride * I,
1799 /* HasBaseReg */ true,
1800 /* Scale */ 0,
1801 GEP->getType()->getPointerAddressSpace()))
1802 continue;
1803 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
1804 {TTI::OK_AnyValue, TTI::OP_None},
1805 {TTI::OK_AnyValue, TTI::OP_None},
1806 std::nullopt);
1807 } else {
1808 SmallVector<const Value *> Indices(GEP->indices());
1809 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
1810 Indices, AccessTy, CostKind);
1811 }
1812 }
1813 return Cost;
1814}
1815
1816void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1817 TTI::UnrollingPreferences &UP,
1818 OptimizationRemarkEmitter *ORE) {
1819 // TODO: More tuning on benchmarks and metrics is needed; the resulting changes
1820 // would apply to all the settings below to improve performance.
1821
1822
1823 if (ST->enableDefaultUnroll())
1824 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
1825
1826 // Enable upper-bound unrolling universally; it is not dependent on the
1827 // conditions below.
1828 UP.UpperBound = true;
1829
1830 // Disable loop unrolling for Oz and Os.
1831 UP.OptSizeThreshold = 0;
1832 UP.PartialOptSizeThreshold = 0;
1833 if (L->getHeader()->getParent()->hasOptSize())
1834 return;
1835
1836 SmallVector<BasicBlock *, 4> ExitingBlocks;
1837 L->getExitingBlocks(ExitingBlocks);
1838 LLVM_DEBUG(dbgs() << "Loop has:\n"
1839 << "Blocks: " << L->getNumBlocks() << "\n"
1840 << "Exit blocks: " << ExitingBlocks.size() << "\n");
1841
1842 // Allow at most one exit other than the latch. This acts as an early exit,
1843 // as it mirrors the profitability calculation of the runtime unroller.
1844 if (ExitingBlocks.size() > 2)
1845 return;
1846
1847 // Limit the CFG of the loop body for targets with a branch predictor.
1848 // Allowing 4 blocks permits if-then-else diamonds in the body.
1849 if (L->getNumBlocks() > 4)
1850 return;
1851
1852 // Don't unroll vectorized loops, including the remainder loop
1853 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
1854 return;
1855
1856 // Scan the loop: don't unroll loops with calls as this could prevent
1857 // inlining.
1858 InstructionCost Cost = 0;
1859 for (auto *BB : L->getBlocks()) {
1860 for (auto &I : *BB) {
1861 // Initial setting - Don't unroll loops containing vectorized
1862 // instructions.
1863 if (I.getType()->isVectorTy())
1864 return;
1865
1866 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1867 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1868 if (!isLoweredToCall(F))
1869 continue;
1870 }
1871 return;
1872 }
1873
1874 SmallVector<const Value *> Operands(I.operand_values());
1875 Cost += getInstructionCost(&I, Operands,
1876 TargetTransformInfo::TCK_SizeAndLatency);
1877 }
1878 }
1879
1880 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1881
1882 UP.Partial = true;
1883 UP.Runtime = true;
1884 UP.UnrollRemainder = true;
1885 UP.UnrollAndJam = true;
1886 UP.UnrollAndJamInnerLoopThreshold = 60;
1887
1888 // Forcing unrolling of small loops can be very useful because of the
1889 // branch-taken cost of the backedge.
1890 if (Cost < 12)
1891 UP.Force = true;
1892}
1893
1894void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1895 TTI::PeelingPreferences &PP) {
1896 BaseT::getPeelingPreferences(L, SE, PP);
1897}
1898
1899unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
1900 TypeSize Size = DL.getTypeSizeInBits(Ty);
1901 if (Ty->isVectorTy()) {
1902 if (Size.isScalable() && ST->hasVInstructions())
1903 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
1904
1905 if (ST->useRVVForFixedLengthVectors())
1906 return divideCeil(Size, ST->getRealMinVLen());
1907 }
1908
1909 return BaseT::getRegUsageForType(Ty);
1910}
1911
1912unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1913 if (SLPMaxVF.getNumOccurrences())
1914 return SLPMaxVF;
1915
1916 // Return how many elements can fit in getRegisterBitWidth. This is the
1917 // same routine as used in LoopVectorizer. We should probably be
1918 // accounting for whether we actually have instructions with the right
1919 // lane type, but we don't have enough information to do that without
1920 // some additional plumbing which hasn't been justified yet.
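// For instance, if the fixed-width vector register width queried below
// reports 128 bits, 16-bit elements give a maximum SLP VF of 128/16 = 8.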
1921 TypeSize RegWidth =
1922 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
1923 // If no vector registers, or absurd element widths, disable
1924 // vectorization by returning 1.
1925 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
1926}
1927
1928bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
1929 const TargetTransformInfo::LSRCost &C2) {
1930 // The RISC-V-specific part here is giving the instruction count first priority.
1931 // If we need to emit adds inside the loop to add up base registers, then
1932 // we need at least one extra temporary register.
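// E.g., a candidate using 3 registers that also needs base-add instructions
// is compared as if it used 4 registers.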
1933 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
1934 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
1935 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
1936 C1.NumIVMuls, C1.NumBaseAdds,
1937 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
1938 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
1939 C2.NumIVMuls, C2.NumBaseAdds,
1940 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
1941}
1942
1943bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
1944 auto *VTy = dyn_cast<VectorType>(DataTy);
1945 if (!VTy || VTy->isScalableTy())
1946 return false;
1947
1948 if (!isLegalMaskedLoadStore(DataTy, Alignment))
1949 return false;
1950 return true;
1951}
1952
1953bool RISCVTTIImpl::areInlineCompatible(const Function *Caller,
1954 const Function *Callee) const {
1955 const TargetMachine &TM = getTLI()->getTargetMachine();
1956
1957 const FeatureBitset &CallerBits =
1958 TM.getSubtargetImpl(*Caller)->getFeatureBits();
1959 const FeatureBitset &CalleeBits =
1960 TM.getSubtargetImpl(*Callee)->getFeatureBits();
1961
1962 // Inline a callee if its target-features are a subset of the caller's
1963 // target-features.
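// E.g., a caller built with +m,+a,+v may inline a callee requiring only +m,
// but not one requiring an extension (say +zba) that the caller lacks.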
1964 return (CallerBits & CalleeBits) == CalleeBits;
1965}