LLVM 20.0.0git
RISCVTargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
18#include <cmath>
19#include <optional>
20using namespace llvm;
21using namespace llvm::PatternMatch;
22
23#define DEBUG_TYPE "riscvtti"
24
26 "riscv-v-register-bit-width-lmul",
28 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
29 "by autovectorized code. Fractional LMULs are not supported."),
31
33 "riscv-v-slp-max-vf",
35 "Overrides result used for getMaximumVF query which is used "
36 "exclusively by SLP vectorizer."),
38
40RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
42 // Check if the type is valid for all CostKind
43 if (!VT.isVector())
45 size_t NumInstr = OpCodes.size();
47 return NumInstr;
48 InstructionCost LMULCost = TLI->getLMULCost(VT);
50 return LMULCost * NumInstr;
52 for (auto Op : OpCodes) {
53 switch (Op) {
54 case RISCV::VRGATHER_VI:
55 Cost += TLI->getVRGatherVICost(VT);
56 break;
57 case RISCV::VRGATHER_VV:
58 Cost += TLI->getVRGatherVVCost(VT);
59 break;
60 case RISCV::VSLIDEUP_VI:
61 case RISCV::VSLIDEDOWN_VI:
62 Cost += TLI->getVSlideVICost(VT);
63 break;
64 case RISCV::VSLIDEUP_VX:
65 case RISCV::VSLIDEDOWN_VX:
66 Cost += TLI->getVSlideVXCost(VT);
67 break;
68 case RISCV::VREDMAX_VS:
69 case RISCV::VREDMIN_VS:
70 case RISCV::VREDMAXU_VS:
71 case RISCV::VREDMINU_VS:
72 case RISCV::VREDSUM_VS:
73 case RISCV::VREDAND_VS:
74 case RISCV::VREDOR_VS:
75 case RISCV::VREDXOR_VS:
76 case RISCV::VFREDMAX_VS:
77 case RISCV::VFREDMIN_VS:
78 case RISCV::VFREDUSUM_VS: {
79 unsigned VL = VT.getVectorMinNumElements();
80 if (!VT.isFixedLengthVector())
81 VL *= *getVScaleForTuning();
82 Cost += Log2_32_Ceil(VL);
83 break;
84 }
85 case RISCV::VFREDOSUM_VS: {
86 unsigned VL = VT.getVectorMinNumElements();
87 if (!VT.isFixedLengthVector())
88 VL *= *getVScaleForTuning();
89 Cost += VL;
90 break;
91 }
92 case RISCV::VMV_X_S:
93 case RISCV::VMV_S_X:
94 case RISCV::VFMV_F_S:
95 case RISCV::VFMV_S_F:
96 case RISCV::VMOR_MM:
97 case RISCV::VMXOR_MM:
98 case RISCV::VMAND_MM:
99 case RISCV::VMANDN_MM:
100 case RISCV::VMNAND_MM:
101 case RISCV::VCPOP_M:
102 case RISCV::VFIRST_M:
103 Cost += 1;
104 break;
105 default:
106 Cost += LMULCost;
107 }
108 }
109 return Cost;
110}
111
113 const RISCVSubtarget *ST,
114 const APInt &Imm, Type *Ty,
116 bool FreeZeroes) {
117 assert(Ty->isIntegerTy() &&
118 "getIntImmCost can only estimate cost of materialising integers");
119
120 // We have a Zero register, so 0 is always free.
121 if (Imm == 0)
122 return TTI::TCC_Free;
123
124 // Otherwise, we check how many instructions it will take to materialise.
125 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
126 /*CompressionCost=*/false, FreeZeroes);
127}
128
131 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
132}
133
134// Look for patterns of shift followed by AND that can be turned into a pair of
135// shifts. We won't need to materialize an immediate for the AND so these can
136// be considered free.
137static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
138 uint64_t Mask = Imm.getZExtValue();
139 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
140 if (!BO || !BO->hasOneUse())
141 return false;
142
143 if (BO->getOpcode() != Instruction::Shl)
144 return false;
145
146 if (!isa<ConstantInt>(BO->getOperand(1)))
147 return false;
148
149 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
150 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
151 // is a mask shifted by c2 bits with c3 leading zeros.
152 if (isShiftedMask_64(Mask)) {
153 unsigned Trailing = llvm::countr_zero(Mask);
154 if (ShAmt == Trailing)
155 return true;
156 }
157
158 return false;
159}
160
162 const APInt &Imm, Type *Ty,
164 Instruction *Inst) {
165 assert(Ty->isIntegerTy() &&
166 "getIntImmCost can only estimate cost of materialising integers");
167
168 // We have a Zero register, so 0 is always free.
169 if (Imm == 0)
170 return TTI::TCC_Free;
171
172 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
173 // commutative, in others the immediate comes from a specific argument index.
174 bool Takes12BitImm = false;
175 unsigned ImmArgIdx = ~0U;
176
177 switch (Opcode) {
178 case Instruction::GetElementPtr:
179 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
180 // split up large offsets in GEP into better parts than ConstantHoisting
181 // can.
182 return TTI::TCC_Free;
183 case Instruction::Store: {
184 // Use the materialization cost regardless of if it's the address or the
185 // value that is constant, except for if the store is misaligned and
186 // misaligned accesses are not legal (experience shows constant hoisting
187 // can sometimes be harmful in such cases).
188 if (Idx == 1 || !Inst)
189 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
190 /*FreeZeroes=*/true);
191
192 StoreInst *ST = cast<StoreInst>(Inst);
193 if (!getTLI()->allowsMemoryAccessForAlignment(
194 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
195 ST->getPointerAddressSpace(), ST->getAlign()))
196 return TTI::TCC_Free;
197
198 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
199 /*FreeZeroes=*/true);
200 }
201 case Instruction::Load:
202 // If the address is a constant, use the materialization cost.
203 return getIntImmCost(Imm, Ty, CostKind);
204 case Instruction::And:
205 // zext.h
206 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
207 return TTI::TCC_Free;
208 // zext.w
209 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
210 return TTI::TCC_Free;
211 // bclri
212 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
213 return TTI::TCC_Free;
214 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
215 canUseShiftPair(Inst, Imm))
216 return TTI::TCC_Free;
217 Takes12BitImm = true;
218 break;
219 case Instruction::Add:
220 Takes12BitImm = true;
221 break;
222 case Instruction::Or:
223 case Instruction::Xor:
224 // bseti/binvi
225 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
226 return TTI::TCC_Free;
227 Takes12BitImm = true;
228 break;
229 case Instruction::Mul:
230 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
231 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
232 return TTI::TCC_Free;
233 // One more or less than a power of 2 can use SLLI+ADD/SUB.
234 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
235 return TTI::TCC_Free;
236 // FIXME: There is no MULI instruction.
237 Takes12BitImm = true;
238 break;
239 case Instruction::Sub:
240 case Instruction::Shl:
241 case Instruction::LShr:
242 case Instruction::AShr:
243 Takes12BitImm = true;
244 ImmArgIdx = 1;
245 break;
246 default:
247 break;
248 }
249
250 if (Takes12BitImm) {
251 // Check immediate is the correct argument...
252 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
253 // ... and fits into the 12-bit immediate.
254 if (Imm.getSignificantBits() <= 64 &&
255 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
256 return TTI::TCC_Free;
257 }
258 }
259
260 // Otherwise, use the full materialisation cost.
261 return getIntImmCost(Imm, Ty, CostKind);
262 }
263
264 // By default, prevent hoisting.
265 return TTI::TCC_Free;
266}
267
270 const APInt &Imm, Type *Ty,
272 // Prevent hoisting in unknown cases.
273 return TTI::TCC_Free;
274}
275
276bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
277 return ST->hasVInstructions();
278}
279
282 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
283 return ST->hasStdExtZbb() || (ST->hasVendorXCVbitmanip() && !ST->is64Bit())
286}
287
289 // Currently, the ExpandReductions pass can't expand scalable-vector
290 // reductions, but we still request expansion as RVV doesn't support certain
291 // reductions and the SelectionDAG can't legalize them either.
292 switch (II->getIntrinsicID()) {
293 default:
294 return false;
295 // These reductions have no equivalent in RVV
296 case Intrinsic::vector_reduce_mul:
297 case Intrinsic::vector_reduce_fmul:
298 return true;
299 }
300}
301
302std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
303 if (ST->hasVInstructions())
305 return BaseT::getMaxVScale();
306}
307
308std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
309 if (ST->hasVInstructions())
310 if (unsigned MinVLen = ST->getRealMinVLen();
311 MinVLen >= RISCV::RVVBitsPerBlock)
312 return MinVLen / RISCV::RVVBitsPerBlock;
314}
315
318 unsigned LMUL =
319 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
320 switch (K) {
322 return TypeSize::getFixed(ST->getXLen());
324 return TypeSize::getFixed(
325 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
328 (ST->hasVInstructions() &&
331 : 0);
332 }
333
334 llvm_unreachable("Unsupported register kind");
335}
336
338RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
339 // Add a cost of address generation + the cost of the load. The address
340 // is expected to be a PC relative offset to a constant pool entry
341 // using auipc/addi.
342 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
343 /*AddressSpace=*/0, CostKind);
344}
345
347 LLVMContext &C) {
348 assert((DataVT.getScalarSizeInBits() != 8 ||
349 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
350 MVT IndexVT = DataVT.changeTypeToInteger();
351 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
352 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
353 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
354}
355
357 VectorType *Tp, ArrayRef<int> Mask,
359 int Index, VectorType *SubTp,
361 const Instruction *CxtI) {
362 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
363
364 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
365
366 // First, handle cases where having a fixed length vector enables us to
367 // give a more accurate cost than falling back to generic scalable codegen.
368 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
369 if (isa<FixedVectorType>(Tp)) {
370 switch (Kind) {
371 default:
372 break;
374 if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
375 MVT EltTp = LT.second.getVectorElementType();
376 // If the size of the element is < ELEN then shuffles of interleaves and
377 // deinterleaves of 2 vectors can be lowered into the following
378 // sequences
379 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
380 // Example sequence:
381 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
382 // vwaddu.vv v10, v8, v9
383 // li a0, -1 (ignored)
384 // vwmaccu.vx v10, a0, v9
385 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
386 return 2 * LT.first * TLI->getLMULCost(LT.second);
387
388 if (Mask[0] == 0 || Mask[0] == 1) {
389 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
390 // Example sequence:
391 // vnsrl.wi v10, v8, 0
392 if (equal(DeinterleaveMask, Mask))
393 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
394 LT.second, CostKind);
395 }
396 }
397 }
398 // vrgather + cost of generating the mask constant.
399 // We model this for an unknown mask with a single vrgather.
400 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
401 (LT.second.getScalarSizeInBits() != 8 ||
402 LT.second.getVectorNumElements() <= 256)) {
403 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());
404 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
405 return IndexCost +
406 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
407 }
408 [[fallthrough]];
409 }
412 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
413 // register for the second vrgather. We model this for an unknown
414 // (shuffle) mask.
415 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
416 (LT.second.getScalarSizeInBits() != 8 ||
417 LT.second.getVectorNumElements() <= 256)) {
418 auto &C = Tp->getContext();
419 auto EC = Tp->getElementCount();
420 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
422 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
423 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
424 return 2 * IndexCost +
425 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
426 LT.second, CostKind) +
427 MaskCost;
428 }
429 [[fallthrough]];
430 }
431 case TTI::SK_Select: {
432 // We are going to permute multiple sources and the result will be in
433 // multiple destinations. Providing an accurate cost only for splits where
434 // the element type remains the same.
435 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
436 LT.second.isFixedLengthVector() &&
437 LT.second.getVectorElementType().getSizeInBits() ==
439 LT.second.getVectorNumElements() <
440 cast<FixedVectorType>(Tp)->getNumElements() &&
441 divideCeil(Mask.size(),
442 cast<FixedVectorType>(Tp)->getNumElements()) ==
443 static_cast<unsigned>(*LT.first.getValue())) {
444 unsigned NumRegs = *LT.first.getValue();
445 unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
446 unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
447 auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);
448
450 for (unsigned I = 0; I < NumRegs; ++I) {
451 bool IsSingleVector = true;
452 SmallVector<int> SubMask(SubVF, PoisonMaskElem);
453 transform(Mask.slice(I * SubVF,
454 I == NumRegs - 1 ? Mask.size() % SubVF : SubVF),
455 SubMask.begin(), [&](int I) {
456 bool SingleSubVector = I / VF == 0;
457 IsSingleVector &= SingleSubVector;
458 return (SingleSubVector ? 0 : 1) * SubVF + I % VF;
459 });
462 SubVecTy, SubMask, CostKind, 0, nullptr);
463 return Cost;
464 }
465 }
466 break;
467 }
468 }
469 };
470
471 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
472 switch (Kind) {
473 default:
474 // Fallthrough to generic handling.
475 // TODO: Most of these cases will return getInvalid in generic code, and
476 // must be implemented here.
477 break;
479 // Extract at zero is always a subregister extract
480 if (Index == 0)
481 return TTI::TCC_Free;
482
483 // If we're extracting a subvector of at most m1 size at a sub-register
484 // boundary - which unfortunately we need exact vlen to identify - this is
485 // a subregister extract at worst and thus won't require a vslidedown.
486 // TODO: Extend for aligned m2, m4 subvector extracts
487 // TODO: Extend for misalgined (but contained) extracts
488 // TODO: Extend for scalable subvector types
489 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
490 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
491 const unsigned MinVLen = ST->getRealMinVLen();
492 const unsigned MaxVLen = ST->getRealMaxVLen();
493 if (MinVLen == MaxVLen &&
494 SubLT.second.getScalarSizeInBits() * Index % MinVLen == 0 &&
495 SubLT.second.getSizeInBits() <= MinVLen)
496 return TTI::TCC_Free;
497 }
498
499 // Example sequence:
500 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
501 // vslidedown.vi v8, v9, 2
502 return LT.first *
503 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
505 // Example sequence:
506 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
507 // vslideup.vi v8, v9, 2
508 return LT.first *
509 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
510 case TTI::SK_Select: {
511 // Example sequence:
512 // li a0, 90
513 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
514 // vmv.s.x v0, a0
515 // vmerge.vvm v8, v9, v8, v0
516 // We use 2 for the cost of the mask materialization as this is the true
517 // cost for small masks and most shuffles are small. At worst, this cost
518 // should be a very small constant for the constant pool load. As such,
519 // we may bias towards large selects slightly more than truely warranted.
520 return LT.first *
521 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
522 LT.second, CostKind));
523 }
524 case TTI::SK_Broadcast: {
525 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
526 Instruction::InsertElement);
527 if (LT.second.getScalarSizeInBits() == 1) {
528 if (HasScalar) {
529 // Example sequence:
530 // andi a0, a0, 1
531 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
532 // vmv.v.x v8, a0
533 // vmsne.vi v0, v8, 0
534 return LT.first *
535 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
536 LT.second, CostKind));
537 }
538 // Example sequence:
539 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
540 // vmv.v.i v8, 0
541 // vmerge.vim v8, v8, 1, v0
542 // vmv.x.s a0, v8
543 // andi a0, a0, 1
544 // vmv.v.x v8, a0
545 // vmsne.vi v0, v8, 0
546
547 return LT.first *
548 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
549 RISCV::VMV_X_S, RISCV::VMV_V_X,
550 RISCV::VMSNE_VI},
551 LT.second, CostKind));
552 }
553
554 if (HasScalar) {
555 // Example sequence:
556 // vmv.v.x v8, a0
557 return LT.first *
558 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
559 }
560
561 // Example sequence:
562 // vrgather.vi v9, v8, 0
563 return LT.first *
564 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
565 }
566 case TTI::SK_Splice: {
567 // vslidedown+vslideup.
568 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
569 // of similar code, but I think we expand through memory.
570 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
571 if (Index >= 0 && Index < 32)
572 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
573 else if (Index < 0 && Index > -32)
574 Opcodes[1] = RISCV::VSLIDEUP_VI;
575 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
576 }
577 case TTI::SK_Reverse: {
578 // TODO: Cases to improve here:
579 // * Illegal vector types
580 // * i64 on RV32
581 // * i1 vector
582 // At low LMUL, most of the cost is producing the vrgather index register.
583 // At high LMUL, the cost of the vrgather itself will dominate.
584 // Example sequence:
585 // csrr a0, vlenb
586 // srli a0, a0, 3
587 // addi a0, a0, -1
588 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
589 // vid.v v9
590 // vrsub.vx v10, v9, a0
591 // vrgather.vv v9, v8, v10
592 InstructionCost LenCost = 3;
593 if (LT.second.isFixedLengthVector())
594 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
595 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
596 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
597 if (LT.second.isFixedLengthVector() &&
598 isInt<5>(LT.second.getVectorNumElements() - 1))
599 Opcodes[1] = RISCV::VRSUB_VI;
600 InstructionCost GatherCost =
601 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
602 // Mask operation additionally required extend and truncate
603 InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
604 return LT.first * (LenCost + GatherCost + ExtendCost);
605 }
606 }
607 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
608}
609
611RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
612 unsigned AddressSpace,
614 if (!isLegalMaskedLoadStore(Src, Alignment) ||
616 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
617 CostKind);
618
619 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
620}
621
623 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
624 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
625 bool UseMaskForCond, bool UseMaskForGaps) {
626 if (isa<ScalableVectorType>(VecTy) && Factor != 2)
628
629 // The interleaved memory access pass will lower interleaved memory ops (i.e
630 // a load and store followed by a specific shuffle) to vlseg/vsseg
631 // intrinsics. In those cases then we can treat it as if it's just one (legal)
632 // memory op
633 if (!UseMaskForCond && !UseMaskForGaps &&
634 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
635 auto *VTy = cast<VectorType>(VecTy);
636 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
637 // Need to make sure type has't been scalarized
638 if (LT.second.isVector()) {
639 auto *SubVecTy =
640 VectorType::get(VTy->getElementType(),
641 VTy->getElementCount().divideCoefficientBy(Factor));
642
643 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
644 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
645 AddressSpace, DL)) {
646 // FIXME: We use the memory op cost of the *legalized* type here,
647 // because it's getMemoryOpCost returns a really expensive cost for
648 // types like <6 x i8>, which show up when doing interleaves of
649 // Factor=3 etc. Should the memory op cost of these be cheaper?
650 auto *LegalVTy = VectorType::get(VTy->getElementType(),
651 LT.second.getVectorElementCount());
652 InstructionCost LegalMemCost = getMemoryOpCost(
653 Opcode, LegalVTy, Alignment, AddressSpace, CostKind);
654 return LT.first + LegalMemCost;
655 }
656 }
657 }
658
659 // TODO: Return the cost of interleaved accesses for scalable vector when
660 // unable to convert to segment accesses instructions.
661 if (isa<ScalableVectorType>(VecTy))
663
664 auto *FVTy = cast<FixedVectorType>(VecTy);
665 InstructionCost MemCost =
666 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
667 unsigned VF = FVTy->getNumElements() / Factor;
668
669 // An interleaved load will look like this for Factor=3:
670 // %wide.vec = load <12 x i32>, ptr %3, align 4
671 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
672 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
673 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
674 if (Opcode == Instruction::Load) {
675 InstructionCost Cost = MemCost;
676 for (unsigned Index : Indices) {
677 FixedVectorType *SubVecTy =
678 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
679 auto Mask = createStrideMask(Index, Factor, VF);
680 InstructionCost ShuffleCost =
682 CostKind, 0, nullptr, {});
683 Cost += ShuffleCost;
684 }
685 return Cost;
686 }
687
688 // TODO: Model for NF > 2
689 // We'll need to enhance getShuffleCost to model shuffles that are just
690 // inserts and extracts into subvectors, since they won't have the full cost
691 // of a vrgather.
692 // An interleaved store for 3 vectors of 4 lanes will look like
693 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
694 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
695 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
696 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
697 // store <12 x i32> %interleaved.vec, ptr %10, align 4
698 if (Factor != 2)
699 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
700 Alignment, AddressSpace, CostKind,
701 UseMaskForCond, UseMaskForGaps);
702
703 assert(Opcode == Instruction::Store && "Opcode must be a store");
704 // For an interleaving store of 2 vectors, we perform one large interleaving
705 // shuffle that goes into the wide store
706 auto Mask = createInterleaveMask(VF, Factor);
707 InstructionCost ShuffleCost =
709 CostKind, 0, nullptr, {});
710 return MemCost + ShuffleCost;
711}
712
714 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
715 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
717 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
718 Alignment, CostKind, I);
719
720 if ((Opcode == Instruction::Load &&
721 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
722 (Opcode == Instruction::Store &&
723 !isLegalMaskedScatter(DataTy, Align(Alignment))))
724 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
725 Alignment, CostKind, I);
726
727 // Cost is proportional to the number of memory operations implied. For
728 // scalable vectors, we use an estimate on that number since we don't
729 // know exactly what VL will be.
730 auto &VTy = *cast<VectorType>(DataTy);
731 InstructionCost MemOpCost =
732 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
733 {TTI::OK_AnyValue, TTI::OP_None}, I);
734 unsigned NumLoads = getEstimatedVLFor(&VTy);
735 return NumLoads * MemOpCost;
736}
737
739 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
740 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
741 if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
742 !isLegalStridedLoadStore(DataTy, Alignment)) ||
743 (Opcode != Instruction::Load && Opcode != Instruction::Store))
744 return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
745 Alignment, CostKind, I);
746
748 return TTI::TCC_Basic;
749
750 // Cost is proportional to the number of memory operations implied. For
751 // scalable vectors, we use an estimate on that number since we don't
752 // know exactly what VL will be.
753 auto &VTy = *cast<VectorType>(DataTy);
754 InstructionCost MemOpCost =
755 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
756 {TTI::OK_AnyValue, TTI::OP_None}, I);
757 unsigned NumLoads = getEstimatedVLFor(&VTy);
758 return NumLoads * MemOpCost;
759}
760
761// Currently, these represent both throughput and codesize costs
762// for the respective intrinsics. The costs in this table are simply
763// instruction counts with the following adjustments made:
764// * One vsetvli is considered free.
766 {Intrinsic::floor, MVT::f32, 9},
767 {Intrinsic::floor, MVT::f64, 9},
768 {Intrinsic::ceil, MVT::f32, 9},
769 {Intrinsic::ceil, MVT::f64, 9},
770 {Intrinsic::trunc, MVT::f32, 7},
771 {Intrinsic::trunc, MVT::f64, 7},
772 {Intrinsic::round, MVT::f32, 9},
773 {Intrinsic::round, MVT::f64, 9},
774 {Intrinsic::roundeven, MVT::f32, 9},
775 {Intrinsic::roundeven, MVT::f64, 9},
776 {Intrinsic::rint, MVT::f32, 7},
777 {Intrinsic::rint, MVT::f64, 7},
778 {Intrinsic::lrint, MVT::i32, 1},
779 {Intrinsic::lrint, MVT::i64, 1},
780 {Intrinsic::llrint, MVT::i64, 1},
781 {Intrinsic::nearbyint, MVT::f32, 9},
782 {Intrinsic::nearbyint, MVT::f64, 9},
783 {Intrinsic::bswap, MVT::i16, 3},
784 {Intrinsic::bswap, MVT::i32, 12},
785 {Intrinsic::bswap, MVT::i64, 31},
786 {Intrinsic::vp_bswap, MVT::i16, 3},
787 {Intrinsic::vp_bswap, MVT::i32, 12},
788 {Intrinsic::vp_bswap, MVT::i64, 31},
789 {Intrinsic::vp_fshl, MVT::i8, 7},
790 {Intrinsic::vp_fshl, MVT::i16, 7},
791 {Intrinsic::vp_fshl, MVT::i32, 7},
792 {Intrinsic::vp_fshl, MVT::i64, 7},
793 {Intrinsic::vp_fshr, MVT::i8, 7},
794 {Intrinsic::vp_fshr, MVT::i16, 7},
795 {Intrinsic::vp_fshr, MVT::i32, 7},
796 {Intrinsic::vp_fshr, MVT::i64, 7},
797 {Intrinsic::bitreverse, MVT::i8, 17},
798 {Intrinsic::bitreverse, MVT::i16, 24},
799 {Intrinsic::bitreverse, MVT::i32, 33},
800 {Intrinsic::bitreverse, MVT::i64, 52},
801 {Intrinsic::vp_bitreverse, MVT::i8, 17},
802 {Intrinsic::vp_bitreverse, MVT::i16, 24},
803 {Intrinsic::vp_bitreverse, MVT::i32, 33},
804 {Intrinsic::vp_bitreverse, MVT::i64, 52},
805 {Intrinsic::ctpop, MVT::i8, 12},
806 {Intrinsic::ctpop, MVT::i16, 19},
807 {Intrinsic::ctpop, MVT::i32, 20},
808 {Intrinsic::ctpop, MVT::i64, 21},
809 {Intrinsic::vp_ctpop, MVT::i8, 12},
810 {Intrinsic::vp_ctpop, MVT::i16, 19},
811 {Intrinsic::vp_ctpop, MVT::i32, 20},
812 {Intrinsic::vp_ctpop, MVT::i64, 21},
813 {Intrinsic::vp_ctlz, MVT::i8, 19},
814 {Intrinsic::vp_ctlz, MVT::i16, 28},
815 {Intrinsic::vp_ctlz, MVT::i32, 31},
816 {Intrinsic::vp_ctlz, MVT::i64, 35},
817 {Intrinsic::vp_cttz, MVT::i8, 16},
818 {Intrinsic::vp_cttz, MVT::i16, 23},
819 {Intrinsic::vp_cttz, MVT::i32, 24},
820 {Intrinsic::vp_cttz, MVT::i64, 25},
821};
822
824 switch (ID) {
825#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
826 case Intrinsic::VPID: \
827 return ISD::VPSD;
828#include "llvm/IR/VPIntrinsics.def"
829#undef HELPER_MAP_VPID_TO_VPSD
830 }
831 return ISD::DELETED_NODE;
832}
833
837 auto *RetTy = ICA.getReturnType();
838 switch (ICA.getID()) {
839 case Intrinsic::ceil:
840 case Intrinsic::floor:
841 case Intrinsic::trunc:
842 case Intrinsic::rint:
843 case Intrinsic::lrint:
844 case Intrinsic::llrint:
845 case Intrinsic::round:
846 case Intrinsic::roundeven: {
847 // These all use the same code.
849 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
850 return LT.first * 8;
851 break;
852 }
853 case Intrinsic::umin:
854 case Intrinsic::umax:
855 case Intrinsic::smin:
856 case Intrinsic::smax: {
858 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
859 return LT.first;
860
861 if (ST->hasVInstructions() && LT.second.isVector()) {
862 unsigned Op;
863 switch (ICA.getID()) {
864 case Intrinsic::umin:
865 Op = RISCV::VMINU_VV;
866 break;
867 case Intrinsic::umax:
868 Op = RISCV::VMAXU_VV;
869 break;
870 case Intrinsic::smin:
871 Op = RISCV::VMIN_VV;
872 break;
873 case Intrinsic::smax:
874 Op = RISCV::VMAX_VV;
875 break;
876 }
877 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
878 }
879 break;
880 }
881 case Intrinsic::sadd_sat:
882 case Intrinsic::ssub_sat:
883 case Intrinsic::uadd_sat:
884 case Intrinsic::usub_sat:
885 case Intrinsic::fabs:
886 case Intrinsic::sqrt: {
888 if (ST->hasVInstructions() && LT.second.isVector())
889 return LT.first;
890 break;
891 }
892 case Intrinsic::ctpop: {
894 if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector())
895 return LT.first;
896 break;
897 }
898 case Intrinsic::abs: {
900 if (ST->hasVInstructions() && LT.second.isVector()) {
901 // vrsub.vi v10, v8, 0
902 // vmax.vv v8, v8, v10
903 return LT.first * 2;
904 }
905 break;
906 }
907 case Intrinsic::get_active_lane_mask: {
908 if (ST->hasVInstructions()) {
909 Type *ExpRetTy = VectorType::get(
910 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
911 auto LT = getTypeLegalizationCost(ExpRetTy);
912
913 // vid.v v8 // considered hoisted
914 // vsaddu.vx v8, v8, a0
915 // vmsltu.vx v0, v8, a1
916 return LT.first *
917 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
918 LT.second, CostKind);
919 }
920 break;
921 }
922 // TODO: add more intrinsic
923 case Intrinsic::experimental_stepvector: {
925 // Legalisation of illegal types involves an `index' instruction plus
926 // (LT.first - 1) vector adds.
927 if (ST->hasVInstructions())
928 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
929 (LT.first - 1) *
930 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
931 return 1 + (LT.first - 1);
932 }
933 case Intrinsic::experimental_cttz_elts: {
934 Type *ArgTy = ICA.getArgTypes()[0];
935 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
936 if (getTLI()->shouldExpandCttzElements(ArgType))
937 break;
938 InstructionCost Cost = getRISCVInstructionCost(
939 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
940
941 // If zero_is_poison is false, then we will generate additional
942 // cmp + select instructions to convert -1 to EVL.
943 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
944 if (ICA.getArgs().size() > 1 &&
945 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
946 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
948 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
950
951 return Cost;
952 }
953 case Intrinsic::vp_rint: {
954 // RISC-V target uses at least 5 instructions to lower rounding intrinsics.
955 unsigned Cost = 5;
957 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
958 return Cost * LT.first;
959 break;
960 }
961 case Intrinsic::vp_nearbyint: {
962 // More one read and one write for fflags than vp_rint.
963 unsigned Cost = 7;
965 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
966 return Cost * LT.first;
967 break;
968 }
969 case Intrinsic::vp_ceil:
970 case Intrinsic::vp_floor:
971 case Intrinsic::vp_round:
972 case Intrinsic::vp_roundeven:
973 case Intrinsic::vp_roundtozero: {
974 // Rounding with static rounding mode needs two more instructions to
975 // swap/write FRM than vp_rint.
976 unsigned Cost = 7;
978 unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
979 if (TLI->isOperationCustom(VPISD, LT.second))
980 return Cost * LT.first;
981 break;
982 }
983 // vp integer arithmetic ops.
984 case Intrinsic::vp_add:
985 case Intrinsic::vp_and:
986 case Intrinsic::vp_ashr:
987 case Intrinsic::vp_lshr:
988 case Intrinsic::vp_mul:
989 case Intrinsic::vp_or:
990 case Intrinsic::vp_sdiv:
991 case Intrinsic::vp_shl:
992 case Intrinsic::vp_srem:
993 case Intrinsic::vp_sub:
994 case Intrinsic::vp_udiv:
995 case Intrinsic::vp_urem:
996 case Intrinsic::vp_xor:
997 // vp float arithmetic ops.
998 case Intrinsic::vp_fadd:
999 case Intrinsic::vp_fsub:
1000 case Intrinsic::vp_fmul:
1001 case Intrinsic::vp_fdiv:
1002 case Intrinsic::vp_frem: {
1003 std::optional<unsigned> FOp =
1005 if (FOp)
1006 return getArithmeticInstrCost(*FOp, ICA.getReturnType(), CostKind);
1007 break;
1008 }
1009 }
1010
1011 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1012 if (auto LT = getTypeLegalizationCost(RetTy);
1013 LT.second.isVector()) {
1014 MVT EltTy = LT.second.getVectorElementType();
1015 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1016 ICA.getID(), EltTy))
1017 return LT.first * Entry->Cost;
1018 }
1019 }
1020
1022}
1023
1025 Type *Src,
1028 const Instruction *I) {
1029 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1030 if (!IsVectorType)
1031 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1032
1033 // FIXME: Need to compute legalizing cost for illegal types. The current
1034 // code handles only legal types and those which can be trivially
1035 // promoted to legal.
1036 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1037 Dst->getScalarSizeInBits() > ST->getELen())
1038 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1039
1040 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1041 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1042
1043 // Our actual lowering for the case where a wider legal type is available
1044 // uses promotion to the wider type. This is reflected in the result of
1045 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1046 // scalarized if the legalized Src and Dst are not equal sized.
1047 const DataLayout &DL = this->getDataLayout();
1048 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1050 SrcLT.second.getSizeInBits()) ||
1052 DstLT.second.getSizeInBits()))
1053 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1054
1055 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1056 assert(ISD && "Invalid opcode");
1057
1058 int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
1059 (int)Log2_32(Src->getScalarSizeInBits());
1060 switch (ISD) {
1061 case ISD::SIGN_EXTEND:
1062 case ISD::ZERO_EXTEND: {
1063 const unsigned SrcEltSize = Src->getScalarSizeInBits();
1064 if (SrcEltSize == 1) {
1065 // We do not use vsext/vzext to extend from mask vector.
1066 // Instead we use the following instructions to extend from mask vector:
1067 // vmv.v.i v8, 0
1068 // vmerge.vim v8, v8, -1, v0
1069 return getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM},
1070 DstLT.second, CostKind);
1071 }
1072 if ((PowDiff < 1) || (PowDiff > 3))
1073 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1074 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1075 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1076 unsigned Op =
1077 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1078 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1079 }
1080 case ISD::TRUNCATE:
1081 if (Dst->getScalarSizeInBits() == 1) {
1082 // We do not use several vncvt to truncate to mask vector. So we could
1083 // not use PowDiff to calculate it.
1084 // Instead we use the following instructions to truncate to mask vector:
1085 // vand.vi v8, v8, 1
1086 // vmsne.vi v0, v8, 0
1087 return getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1088 SrcLT.second, CostKind);
1089 }
1090 [[fallthrough]];
1091 case ISD::FP_EXTEND:
1092 case ISD::FP_ROUND: {
1093 // Counts of narrow/widen instructions.
1094 unsigned SrcEltSize = Src->getScalarSizeInBits();
1095 unsigned DstEltSize = Dst->getScalarSizeInBits();
1096
1097 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1098 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1099 : RISCV::VFNCVT_F_F_W;
1101 for (; SrcEltSize != DstEltSize;) {
1102 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1103 ? MVT::getIntegerVT(DstEltSize)
1104 : MVT::getFloatingPointVT(DstEltSize);
1105 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1106 DstEltSize =
1107 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1108 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1109 }
1110 return Cost;
1111 }
1112 case ISD::FP_TO_SINT:
1113 case ISD::FP_TO_UINT:
1114 // For fp vector to mask, we use:
1115 // vfncvt.rtz.x.f.w v9, v8
1116 // vand.vi v8, v9, 1
1117 // vmsne.vi v0, v8, 0
1118 if (Dst->getScalarSizeInBits() == 1)
1119 return 3;
1120
1121 if (std::abs(PowDiff) <= 1)
1122 return 1;
1123
1124 // Counts of narrow/widen instructions.
1125 return std::abs(PowDiff);
1126
1127 case ISD::SINT_TO_FP:
1128 case ISD::UINT_TO_FP:
1129 // For mask vector to fp, we should use the following instructions:
1130 // vmv.v.i v8, 0
1131 // vmerge.vim v8, v8, -1, v0
1132 // vfcvt.f.x.v v8, v8
1133 if (Src->getScalarSizeInBits() == 1)
1134 return 3;
1135
1136 if (std::abs(PowDiff) <= 1)
1137 return 1;
1138 // Backend could lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
1139 // so it only need two conversion.
1140 return 2;
1141 }
1142 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1143}
1144
1145unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
1146 if (isa<ScalableVectorType>(Ty)) {
1147 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1148 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1149 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1150 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1151 }
1152 return cast<FixedVectorType>(Ty)->getNumElements();
1153}
1154
1157 FastMathFlags FMF,
1159 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1160 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1161
1162 // Skip if scalar size of Ty is bigger than ELEN.
1163 if (Ty->getScalarSizeInBits() > ST->getELen())
1164 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1165
1166 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1167 if (Ty->getElementType()->isIntegerTy(1)) {
1168 // SelectionDAGBuilder does following transforms:
1169 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1170 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
1171 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1172 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1173 else
1174 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1175 }
1176
1177 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1179 InstructionCost ExtraCost = 0;
1180 switch (IID) {
1181 case Intrinsic::maximum:
1182 if (FMF.noNaNs()) {
1183 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1184 } else {
1185 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1186 RISCV::VFMV_F_S};
1187 // Cost of Canonical Nan + branch
1188 // lui a0, 523264
1189 // fmv.w.x fa0, a0
1190 Type *DstTy = Ty->getScalarType();
1191 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1192 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1193 ExtraCost = 1 +
1194 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1196 getCFInstrCost(Instruction::Br, CostKind);
1197 }
1198 break;
1199
1200 case Intrinsic::minimum:
1201 if (FMF.noNaNs()) {
1202 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1203 } else {
1204 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1205 RISCV::VFMV_F_S};
1206 // Cost of Canonical Nan + branch
1207 // lui a0, 523264
1208 // fmv.w.x fa0, a0
1209 Type *DstTy = Ty->getScalarType();
1210 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1211 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1212 ExtraCost = 1 +
1213 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1215 getCFInstrCost(Instruction::Br, CostKind);
1216 }
1217 break;
1218 }
1219 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1220 }
1221
1222 // IR Reduction is composed by two vmv and one rvv reduction instruction.
1223 unsigned SplitOp;
1225 switch (IID) {
1226 default:
1227 llvm_unreachable("Unsupported intrinsic");
1228 case Intrinsic::smax:
1229 SplitOp = RISCV::VMAX_VV;
1230 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1231 break;
1232 case Intrinsic::smin:
1233 SplitOp = RISCV::VMIN_VV;
1234 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMIN_VS, RISCV::VMV_X_S};
1235 break;
1236 case Intrinsic::umax:
1237 SplitOp = RISCV::VMAXU_VV;
1238 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1239 break;
1240 case Intrinsic::umin:
1241 SplitOp = RISCV::VMINU_VV;
1242 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMINU_VS, RISCV::VMV_X_S};
1243 break;
1244 case Intrinsic::maxnum:
1245 SplitOp = RISCV::VFMAX_VV;
1246 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1247 break;
1248 case Intrinsic::minnum:
1249 SplitOp = RISCV::VFMIN_VV;
1250 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1251 break;
1252 }
1253 // Add a cost for data larger than LMUL8
1254 InstructionCost SplitCost =
1255 (LT.first > 1) ? (LT.first - 1) *
1256 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1257 : 0;
1258 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1259}
1260
1263 std::optional<FastMathFlags> FMF,
1265 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1266 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1267
1268 // Skip if scalar size of Ty is bigger than ELEN.
1269 if (Ty->getScalarSizeInBits() > ST->getELen())
1270 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1271
1272 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1273 assert(ISD && "Invalid opcode");
1274
1275 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1276 ISD != ISD::FADD)
1277 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1278
1279 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1281 Type *ElementTy = Ty->getElementType();
1282 if (ElementTy->isIntegerTy(1)) {
1283 if (ISD == ISD::AND) {
1284 // Example sequences:
1285 // vsetvli a0, zero, e8, mf8, ta, ma
1286 // vmnot.m v8, v0
1287 // vcpop.m a0, v8
1288 // seqz a0, a0
1289 Opcodes = {RISCV::VMNAND_MM, RISCV::VCPOP_M};
1290 return (LT.first - 1) +
1291 getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
1292 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1294 } else {
1295 // Example sequences:
1296 // vsetvli a0, zero, e8, mf8, ta, ma
1297 // vcpop.m a0, v0
1298 // snez a0, a0
1299 Opcodes = {RISCV::VCPOP_M};
1300 return (LT.first - 1) +
1301 getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
1302 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1304 }
1305 }
1306
1307 // IR Reduction is composed by two vmv and one rvv reduction instruction.
1309 Opcodes.push_back(RISCV::VFMV_S_F);
1310 for (unsigned i = 0; i < LT.first.getValue(); i++)
1311 Opcodes.push_back(RISCV::VFREDOSUM_VS);
1312 Opcodes.push_back(RISCV::VFMV_F_S);
1313 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1314 }
1315 unsigned SplitOp;
1316 switch (ISD) {
1317 case ISD::ADD:
1318 SplitOp = RISCV::VADD_VV;
1319 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
1320 break;
1321 case ISD::OR:
1322 SplitOp = RISCV::VOR_VV;
1323 Opcodes = {RISCV::VMV_S_X, RISCV::VREDOR_VS, RISCV::VMV_X_S};
1324 break;
1325 case ISD::XOR:
1326 SplitOp = RISCV::VXOR_VV;
1327 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
1328 break;
1329 case ISD::AND:
1330 SplitOp = RISCV::VAND_VV;
1331 Opcodes = {RISCV::VMV_S_X, RISCV::VREDAND_VS, RISCV::VMV_X_S};
1332 break;
1333 case ISD::FADD:
1334 SplitOp = RISCV::VFADD_VV;
1335 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
1336 break;
1337 }
1338 // Add a cost for data larger than LMUL8
1339 InstructionCost SplitCost =
1340 (LT.first > 1) ? (LT.first - 1) *
1341 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1342 : 0;
1343 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1344}
1345
1347 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1349 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1350 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1351 FMF, CostKind);
1352
1353 // Skip if scalar size of ResTy is bigger than ELEN.
1354 if (ResTy->getScalarSizeInBits() > ST->getELen())
1355 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1356 FMF, CostKind);
1357
1358 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1359 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1360 FMF, CostKind);
1361
1362 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1363
1364 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1365 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1366 FMF, CostKind);
1367
1368 return (LT.first - 1) +
1369 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1370}
1371
1373 TTI::OperandValueInfo OpInfo,
1375 assert(OpInfo.isConstant() && "non constant operand?");
1376 if (!isa<VectorType>(Ty))
1377 // FIXME: We need to account for immediate materialization here, but doing
1378 // a decent job requires more knowledge about the immediate than we
1379 // currently have here.
1380 return 0;
1381
1382 if (OpInfo.isUniform())
1383 // vmv.x.i, vmv.v.x, or vfmv.v.f
1384 // We ignore the cost of the scalar constant materialization to be consistent
1385 // with how we treat scalar constants themselves just above.
1386 return 1;
1387
1388 return getConstantPoolLoadCost(Ty, CostKind);
1389}
1390
1391
1393 MaybeAlign Alignment,
1394 unsigned AddressSpace,
1396 TTI::OperandValueInfo OpInfo,
1397 const Instruction *I) {
1398 EVT VT = TLI->getValueType(DL, Src, true);
1399 // Type legalization can't handle structs
1400 if (VT == MVT::Other)
1401 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1402 CostKind, OpInfo, I);
1403
1405 if (Opcode == Instruction::Store && OpInfo.isConstant())
1406 Cost += getStoreImmCost(Src, OpInfo, CostKind);
1407
1408 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1409
1410 InstructionCost BaseCost = [&]() {
1411 InstructionCost Cost = LT.first;
1413 return Cost;
1414
1415 // Our actual lowering for the case where a wider legal type is available
1416 // uses the a VL predicated load on the wider type. This is reflected in
1417 // the result of getTypeLegalizationCost, but BasicTTI assumes the
1418 // widened cases are scalarized.
1419 const DataLayout &DL = this->getDataLayout();
1420 if (Src->isVectorTy() && LT.second.isVector() &&
1422 LT.second.getSizeInBits()))
1423 return Cost;
1424
1425 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1426 CostKind, OpInfo, I);
1427 }();
1428
1429 // Assume memory ops cost scale with the number of vector registers
1430 // possible accessed by the instruction. Note that BasicTTI already
1431 // handles the LT.first term for us.
1432 if (LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
1433 BaseCost *= TLI->getLMULCost(LT.second);
1434 return Cost + BaseCost;
1435
1436}
1437
1439 Type *CondTy,
1440 CmpInst::Predicate VecPred,
1442 const Instruction *I) {
1444 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1445 I);
1446
1447 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1448 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1449 I);
1450
1451 // Skip if scalar size of ValTy is bigger than ELEN.
1452 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
1453 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1454 I);
1455
1456 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1457 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1458 if (CondTy->isVectorTy()) {
1459 if (ValTy->getScalarSizeInBits() == 1) {
1460 // vmandn.mm v8, v8, v9
1461 // vmand.mm v9, v0, v9
1462 // vmor.mm v0, v9, v8
1463 return LT.first *
1464 getRISCVInstructionCost(
1465 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1466 LT.second, CostKind);
1467 }
1468 // vselect and max/min are supported natively.
1469 return LT.first *
1470 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
1471 }
1472
1473 if (ValTy->getScalarSizeInBits() == 1) {
1474 // vmv.v.x v9, a0
1475 // vmsne.vi v9, v9, 0
1476 // vmandn.mm v8, v8, v9
1477 // vmand.mm v9, v0, v9
1478 // vmor.mm v0, v9, v8
1479 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
1480 return LT.first *
1481 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
1482 InterimVT, CostKind) +
1483 LT.first * getRISCVInstructionCost(
1484 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1485 LT.second, CostKind);
1486 }
1487
1488 // vmv.v.x v10, a0
1489 // vmsne.vi v0, v10, 0
1490 // vmerge.vvm v8, v9, v8, v0
1491 return LT.first * getRISCVInstructionCost(
1492 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
1493 LT.second, CostKind);
1494 }
1495
1496 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
1497 CmpInst::isIntPredicate(VecPred)) {
1498 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
1499 // provided they incur the same cost across all implementations
1500 return LT.first *
1501 getRISCVInstructionCost(RISCV::VMSLT_VV, LT.second, CostKind);
1502 }
1503
1504 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
1505 CmpInst::isFPPredicate(VecPred)) {
1506
1507 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
1508 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
1509 return getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
1510
1511 // If we do not support the input floating point vector type, use the base
1512 // one which will calculate as:
1513 // ScalarizeCost + Num * Cost for fixed vector,
1514 // InvalidCost for scalable vector.
1515 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
1516 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
1517 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
1518 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1519 I);
1520
1521 // Assuming vector fp compare and mask instructions are all the same cost
1522 // until a need arises to differentiate them.
1523 switch (VecPred) {
1524 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
1525 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
1526 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
1527 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
1528 return LT.first * getRISCVInstructionCost(
1529 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
1530 LT.second, CostKind);
1531
1532 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
1533 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
1534 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
1535 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
1536 return LT.first *
1537 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
1538 LT.second, CostKind);
1539
1540 case CmpInst::FCMP_OEQ: // vmfeq.vv
1541 case CmpInst::FCMP_OGT: // vmflt.vv
1542 case CmpInst::FCMP_OGE: // vmfle.vv
1543 case CmpInst::FCMP_OLT: // vmflt.vv
1544 case CmpInst::FCMP_OLE: // vmfle.vv
1545 case CmpInst::FCMP_UNE: // vmfne.vv
1546 return LT.first *
1547 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
1548 default:
1549 break;
1550 }
1551 }
1552
1553 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
1554 // instructions will lower to SELECT_CC and lower to PseudoCCMOVGPR which will
1555 // generate a conditional branch + mv. The cost of scalar (icmp + select) will
1556 // be (0 + select instr cost).
1557 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
1558 ValTy->isIntegerTy() && !I->user_empty()) {
1559 if (all_of(I->users(), [&](const User *U) {
1560 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
1561 U->getType()->isIntegerTy() &&
1562 !isa<ConstantData>(U->getOperand(1)) &&
1563 !isa<ConstantData>(U->getOperand(2));
1564 }))
1565 return 0;
1566 }
1567
1568 // TODO: Add cost for scalar type.
1569
1570 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1571}
1572
1575 const Instruction *I) {
1577 return Opcode == Instruction::PHI ? 0 : 1;
1578 // Branches are assumed to be predicted.
1579 return 0;
1580}
1581
1584 unsigned Index, Value *Op0,
1585 Value *Op1) {
1586 assert(Val->isVectorTy() && "This must be a vector type");
1587
1588 if (Opcode != Instruction::ExtractElement &&
1589 Opcode != Instruction::InsertElement)
1590 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1591
1592 // Legalize the type.
1593 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
1594
1595 // This type is legalized to a scalar type.
1596 if (!LT.second.isVector()) {
1597 auto *FixedVecTy = cast<FixedVectorType>(Val);
1598 // If Index is a known constant, cost is zero.
1599 if (Index != -1U)
1600 return 0;
1601 // Extract/InsertElement with non-constant index is very costly when
1602 // scalarized; estimate cost of loads/stores sequence via the stack:
1603 // ExtractElement cost: store vector to stack, load scalar;
1604 // InsertElement cost: store vector to stack, store scalar, load vector.
1605 Type *ElemTy = FixedVecTy->getElementType();
1606 auto NumElems = FixedVecTy->getNumElements();
1607 auto Align = DL.getPrefTypeAlign(ElemTy);
1608 InstructionCost LoadCost =
1609 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
1610 InstructionCost StoreCost =
1611 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
1612 return Opcode == Instruction::ExtractElement
1613 ? StoreCost * NumElems + LoadCost
1614 : (StoreCost + LoadCost) * NumElems + StoreCost;
1615 }
1616
1617 // For unsupported scalable vector.
1618 if (LT.second.isScalableVector() && !LT.first.isValid())
1619 return LT.first;
1620
1621 if (!isTypeLegal(Val))
1622 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1623
1624 // Mask vector extract/insert is expanded via e8.
1625 if (Val->getScalarSizeInBits() == 1) {
1626 VectorType *WideTy =
1628 cast<VectorType>(Val)->getElementCount());
1629 if (Opcode == Instruction::ExtractElement) {
1630 InstructionCost ExtendCost
1631 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1633 InstructionCost ExtractCost
1634 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1635 return ExtendCost + ExtractCost;
1636 }
1637 InstructionCost ExtendCost
1638 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1640 InstructionCost InsertCost
1641 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1642 InstructionCost TruncCost
1643 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
1645 return ExtendCost + InsertCost + TruncCost;
1646 }
1647
1648
1649 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
1650 // and vslideup + vmv.s.x to insert element to vector.
1651 unsigned BaseCost = 1;
1652 // When insertelement we should add the index with 1 as the input of vslideup.
1653 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
1654
1655 if (Index != -1U) {
1656 // The type may be split. For fixed-width vectors we can normalize the
1657 // index to the new type.
1658 if (LT.second.isFixedLengthVector()) {
1659 unsigned Width = LT.second.getVectorNumElements();
1660 Index = Index % Width;
1661 }
1662
1663 // We could extract/insert the first element without vslidedown/vslideup.
1664 if (Index == 0)
1665 SlideCost = 0;
1666 else if (Opcode == Instruction::InsertElement)
1667 SlideCost = 1; // With a constant index, we do not need to use addi.
1668 }
1669
1670  // Extracting an i64 on a target with XLEN=32 needs more instructions.
1671 if (Val->getScalarType()->isIntegerTy() &&
1672 ST->getXLen() < Val->getScalarSizeInBits()) {
1673 // For extractelement, we need the following instructions:
1674    // vsetivli zero, 1, e64, m1, ta, mu (not counted)
1675 // vslidedown.vx v8, v8, a0
1676 // vmv.x.s a0, v8
1677 // li a1, 32
1678 // vsrl.vx v8, v8, a1
1679 // vmv.x.s a1, v8
1680
1681 // For insertelement, we need the following instructions:
1682    // vsetivli zero, 2, e32, m4, ta, mu (not counted)
1683 // vmv.v.i v12, 0
1684 // vslide1up.vx v16, v12, a1
1685 // vslide1up.vx v12, v16, a0
1686 // addi a0, a2, 1
1687 // vsetvli zero, a0, e64, m4, tu, mu (not count)
1688 // vslideup.vx v8, v12, a2
1689
1690 // TODO: should we count these special vsetvlis?
1691 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
1692 }
1693 return BaseCost + SlideCost;
1694}
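// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM sources): the two
// cost formulas computed above, evaluated for hypothetical inputs. The unit
// load/store costs and element counts below are made-up example values.
namespace vector_instr_cost_sketch {

// Scalar-legalized type with an unknown index: round-trip through the stack.
constexpr unsigned scalarizedExtract(unsigned NumElems, unsigned StoreCost,
                                     unsigned LoadCost) {
  return StoreCost * NumElems + LoadCost; // store the vector, load one scalar
}
constexpr unsigned scalarizedInsert(unsigned NumElems, unsigned StoreCost,
                                    unsigned LoadCost) {
  // Store the vector, store the new scalar, reload the whole vector.
  return (StoreCost + LoadCost) * NumElems + StoreCost;
}
static_assert(scalarizedExtract(4, 1, 1) == 5, "4 stores + 1 load");
static_assert(scalarizedInsert(4, 1, 1) == 9, "4 stores + 4 loads + 1 store");

// Legal vector type: one vmv.{x.s,s.x} plus an optional slide (and an addi
// when an insert uses an unknown index).
constexpr unsigned slideCost(bool IsInsert, bool KnownIndex, unsigned Index) {
  if (!KnownIndex)
    return IsInsert ? 2 : 1; // addi + vslideup.vx vs. vslidedown.vx alone
  if (Index == 0)
    return 0;                // the first element needs no slide at all
  return 1;                  // constant index: a single vslide{up,down}.vi
}
static_assert(1 + slideCost(false, true, 0) == 1, "extract element 0");
static_assert(1 + slideCost(true, false, 0) == 3, "insert at unknown index");

} // namespace vector_instr_cost_sketch
// ---------------------------------------------------------------------------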
1695
1696InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
1697    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1698    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1699    ArrayRef<const Value *> Args, const Instruction *CxtI) {
1700
1701 // TODO: Handle more cost kinds.
1702  if (CostKind != TTI::TCK_RecipThroughput)
1703    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1704 Args, CxtI);
1705
1706 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1707 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1708 Args, CxtI);
1709
1710 // Skip if scalar size of Ty is bigger than ELEN.
1711 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
1712 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1713 Args, CxtI);
1714
1715 // Legalize the type.
1716 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1717
1718 // TODO: Handle scalar type.
1719 if (!LT.second.isVector())
1720 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1721 Args, CxtI);
1722
1723 auto getConstantMatCost =
1724 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
1725 if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
1726 // Two sub-cases:
1727 // * Has a 5 bit immediate operand which can be splatted.
1728    // * Has a larger immediate which must be materialized in a scalar register.
1729 // We return 0 for both as we currently ignore the cost of materializing
1730 // scalar constants in GPRs.
1731 return 0;
1732
1733 return getConstantPoolLoadCost(Ty, CostKind);
1734 };
1735
1736 // Add the cost of materializing any constant vectors required.
1737 InstructionCost ConstantMatCost = 0;
1738 if (Op1Info.isConstant())
1739 ConstantMatCost += getConstantMatCost(0, Op1Info);
1740 if (Op2Info.isConstant())
1741 ConstantMatCost += getConstantMatCost(1, Op2Info);
1742
1743 unsigned Op;
1744 switch (TLI->InstructionOpcodeToISD(Opcode)) {
1745 case ISD::ADD:
1746 case ISD::SUB:
1747 Op = RISCV::VADD_VV;
1748 break;
1749 case ISD::SHL:
1750 case ISD::SRL:
1751 case ISD::SRA:
1752 Op = RISCV::VSLL_VV;
1753 break;
1754 case ISD::AND:
1755 case ISD::OR:
1756 case ISD::XOR:
1757 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
1758 break;
1759 case ISD::MUL:
1760 case ISD::MULHS:
1761 case ISD::MULHU:
1762 Op = RISCV::VMUL_VV;
1763 break;
1764 case ISD::SDIV:
1765 case ISD::UDIV:
1766 Op = RISCV::VDIV_VV;
1767 break;
1768 case ISD::SREM:
1769 case ISD::UREM:
1770 Op = RISCV::VREM_VV;
1771 break;
1772 case ISD::FADD:
1773 case ISD::FSUB:
1774 // TODO: Address FP16 with VFHMIN
1775 Op = RISCV::VFADD_VV;
1776 break;
1777 case ISD::FMUL:
1778 // TODO: Address FP16 with VFHMIN
1779 Op = RISCV::VFMUL_VV;
1780 break;
1781 case ISD::FDIV:
1782 Op = RISCV::VFDIV_VV;
1783 break;
1784 case ISD::FNEG:
1785 Op = RISCV::VFSGNJN_VV;
1786 break;
1787 default:
1788 // Assuming all other instructions have the same cost until a need arises to
1789 // differentiate them.
1790 return ConstantMatCost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
1791 Op1Info, Op2Info,
1792 Args, CxtI);
1793 }
1794
1795 InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
1796 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
1797 // ops are twice as expensive as integer ops. Do the same for vectors so
1798 // scalar floating point ops aren't cheaper than their vector equivalents.
1799 if (Ty->isFPOrFPVectorTy())
1800 InstrCost *= 2;
1801 return ConstantMatCost + LT.first * InstrCost;
1802}
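// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM sources): how the
// pieces above combine into the returned cost. "RVVOpCost" stands in for
// getRISCVInstructionCost on the selected opcode; all numbers are examples.
namespace arith_cost_sketch {

constexpr unsigned arithmeticCost(unsigned ConstantMatCost, unsigned NumParts,
                                  unsigned RVVOpCost, bool IsFloatingPoint) {
  unsigned InstrCost = RVVOpCost;
  if (IsFloatingPoint)
    InstrCost *= 2; // keep vector FP in line with BasicTTIImpl's scalar model,
                    // which treats FP ops as twice the cost of integer ops
  return ConstantMatCost + NumParts * InstrCost;
}

// An integer add whose type legalizes into two parts, with a splattable
// constant operand (materialization cost 0):
static_assert(arithmeticCost(0, 2, 1, false) == 2, "two VADD_VV parts");
// A single-part FP multiply with a constant-pool operand costing 2 to load:
static_assert(arithmeticCost(2, 1, 1, true) == 4, "load + doubled FP op");

} // namespace arith_cost_sketch
// ---------------------------------------------------------------------------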
1803
1804// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
1805InstructionCost RISCVTTIImpl::getPointersChainCost(
1806    ArrayRef<const Value *> Ptrs, const Value *Base,
1807    const TTI::PointersChainInfo &Info, Type *AccessTy,
1808    TTI::TargetCostKind CostKind) {
1809  InstructionCost Cost = TTI::TCC_Free;
1810  // In the basic model we only take GEP instructions into account (although a
1811  // pointer here may also be an alloca, a plain value, a constant and/or a
1812  // constant expression, a PHI, a bitcast ... whatever is allowed to be used
1813  // as a pointer). Typically, if Base is not a GEP instruction and all the
1814  // pointers are relative to the same base address, all the rest are either
1815  // GEP instructions, PHIs, bitcasts or constants. When we have the same base,
1816  // we just cost each non-Base GEP as an ADD operation if any of its indices
1817  // is non-constant.
1818  // If there are no known dependencies between the pointers, the cost is the
1819  // sum of the costs of the GEP instructions. (A sketch follows the function.)
1820 for (auto [I, V] : enumerate(Ptrs)) {
1821 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
1822 if (!GEP)
1823 continue;
1824 if (Info.isSameBase() && V != Base) {
1825 if (GEP->hasAllConstantIndices())
1826 continue;
1827 // If the chain is unit-stride and BaseReg + stride*i is a legal
1828 // addressing mode, then presume the base GEP is sitting around in a
1829 // register somewhere and check if we can fold the offset relative to
1830 // it.
1831 unsigned Stride = DL.getTypeStoreSize(AccessTy);
1832 if (Info.isUnitStride() &&
1833 isLegalAddressingMode(AccessTy,
1834 /* BaseGV */ nullptr,
1835 /* BaseOffset */ Stride * I,
1836 /* HasBaseReg */ true,
1837 /* Scale */ 0,
1838 GEP->getType()->getPointerAddressSpace()))
1839 continue;
1840 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
1841 {TTI::OK_AnyValue, TTI::OP_None},
1842 {TTI::OK_AnyValue, TTI::OP_None},
1843 std::nullopt);
1844 } else {
1845 SmallVector<const Value *> Indices(GEP->indices());
1846 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
1847 Indices, AccessTy, CostKind);
1848 }
1849 }
1850 return Cost;
1851}
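// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM sources): the
// per-pointer decision made in the loop above, with the isLegalAddressingMode
// query reduced to a hypothetical "offset fits in a 12-bit immediate" check.
namespace pointers_chain_sketch {

constexpr bool offsetFitsSimm12(long long Offset) {
  return Offset >= -2048 && Offset <= 2047; // RISC-V load/store immediate range
}
constexpr unsigned chainedGEPCost(bool AllConstantIndices, bool UnitStride,
                                  unsigned StrideBytes, unsigned Position,
                                  unsigned AddCost) {
  if (AllConstantIndices)
    return 0; // folded away relative to the shared base
  if (UnitStride &&
      offsetFitsSimm12(static_cast<long long>(StrideBytes) * Position))
    return 0; // BaseReg + Stride * i is presumed to be a legal addressing mode
  return AddCost; // otherwise pay for an add to form the address
}
// A unit-stride i32 chain: the 100th pointer still folds (offset 400),
// while the 1000th does not (offset 4000 exceeds simm12).
static_assert(chainedGEPCost(false, true, 4, 100, 1) == 0, "folds");
static_assert(chainedGEPCost(false, true, 4, 1000, 1) == 1, "needs an add");

} // namespace pointers_chain_sketch
// ---------------------------------------------------------------------------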
1852
1853void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1854                                           TTI::UnrollingPreferences &UP,
1855                                           OptimizationRemarkEmitter *ORE) {
1856  // TODO: More tuning on benchmarks and metrics, with changes as needed, would
1857  // apply to all of the settings below to improve performance.
1858
1859
1860 if (ST->enableDefaultUnroll())
1861 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
1862
1863  // Enable upper-bound unrolling universally; it is not dependent on the
1864  // conditions below.
1865 UP.UpperBound = true;
1866
1867 // Disable loop unrolling for Oz and Os.
1868  UP.OptSizeThreshold = 0;
1869  UP.PartialOptSizeThreshold = 0;
1870 if (L->getHeader()->getParent()->hasOptSize())
1871 return;
1872
1873 SmallVector<BasicBlock *, 4> ExitingBlocks;
1874 L->getExitingBlocks(ExitingBlocks);
1875 LLVM_DEBUG(dbgs() << "Loop has:\n"
1876 << "Blocks: " << L->getNumBlocks() << "\n"
1877 << "Exit blocks: " << ExitingBlocks.size() << "\n");
1878
1879  // Only allow one exit other than the latch. This acts as an early exit, as
1880  // it mirrors the profitability calculation of the runtime unroller.
1881 if (ExitingBlocks.size() > 2)
1882 return;
1883
1884 // Limit the CFG of the loop body for targets with a branch predictor.
1885 // Allowing 4 blocks permits if-then-else diamonds in the body.
1886 if (L->getNumBlocks() > 4)
1887 return;
1888
1889 // Don't unroll vectorized loops, including the remainder loop
1890 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
1891 return;
1892
1893 // Scan the loop: don't unroll loops with calls as this could prevent
1894 // inlining.
1895  InstructionCost Cost = 0;
1896  for (auto *BB : L->getBlocks()) {
1897 for (auto &I : *BB) {
1898 // Initial setting - Don't unroll loops containing vectorized
1899 // instructions.
1900 if (I.getType()->isVectorTy())
1901 return;
1902
1903 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1904 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1905 if (!isLoweredToCall(F))
1906 continue;
1907 }
1908 return;
1909 }
1910
1911      SmallVector<const Value *> Operands(I.operand_values());
1912      Cost += getInstructionCost(&I, Operands,
1913                                 TargetTransformInfo::TCK_SizeAndLatency);
1914 }
1915 }
1916
1917 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1918
1919 UP.Partial = true;
1920 UP.Runtime = true;
1921 UP.UnrollRemainder = true;
1922  UP.UnrollAndJam = true;
1923  UP.UnrollAndJamInnerLoopThreshold = 60;
1924
1925  // Forcing the unrolling of small loops can be very useful because of the
1926  // branch-taken cost of the backedge. (A sketch follows this function.)
1927 if (Cost < 12)
1928 UP.Force = true;
1929}
1930
1930
1931void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1932                                         TTI::PeelingPreferences &PP) {
1933  BaseT::getPeelingPreferences(L, SE, PP);
1934}
1935
1936unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
1937  TypeSize Size = DL.getTypeSizeInBits(Ty);
1938  if (Ty->isVectorTy()) {
1939 if (Size.isScalable() && ST->hasVInstructions())
1940 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
1941
1942    if (ST->useRVVForFixedLengthVectors())
1943      return divideCeil(Size, ST->getRealMinVLen());
1944 }
1945
1946 return BaseT::getRegUsageForType(Ty);
1947}
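// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM sources): the
// register counts computed above, assuming RVVBitsPerBlock == 64 and a
// hypothetical real minimum VLEN of 128.
namespace reg_usage_sketch {

constexpr unsigned divideCeil(unsigned Num, unsigned Den) {
  return (Num + Den - 1) / Den;
}
constexpr unsigned RVVBitsPerBlock = 64; // one LMUL=1 register's worth of bits
constexpr unsigned RealMinVLen = 128;    // hypothetical minimum VLEN

// <vscale x 4 x i64>: known-min size 256 bits -> 256 / 64 = 4 registers.
static_assert(divideCeil(256, RVVBitsPerBlock) == 4, "scalable case");
// Fixed <8 x i32> (256 bits) against a 128-bit minimum VLEN -> 2 registers.
static_assert(divideCeil(256, RealMinVLen) == 2, "fixed-length case");

} // namespace reg_usage_sketch
// ---------------------------------------------------------------------------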
1948
1949unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1950 if (SLPMaxVF.getNumOccurrences())
1951 return SLPMaxVF;
1952
1953  // Return how many elements can fit in getRegisterBitWidth. This is the
1954 // same routine as used in LoopVectorizer. We should probably be
1955 // accounting for whether we actually have instructions with the right
1956 // lane type, but we don't have enough information to do that without
1957 // some additional plumbing which hasn't been justified yet.
1958  TypeSize RegWidth =
1959      getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
1960 // If no vector registers, or absurd element widths, disable
1961 // vectorization by returning 1.
1962 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
1963}
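// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM sources): the SLP
// maximum VF derived above, for a hypothetical 128-bit fixed register width.
namespace slp_max_vf_sketch {

constexpr unsigned maximumVF(unsigned RegWidthBits, unsigned ElemWidthBits) {
  unsigned VF = RegWidthBits / ElemWidthBits;
  return VF < 1 ? 1 : VF; // never return 0; a VF of 1 disables vectorization
}
static_assert(maximumVF(128, 32) == 4, "four i32 lanes per 128-bit register");
static_assert(maximumVF(128, 256) == 1, "absurd element width -> disabled");

} // namespace slp_max_vf_sketch
// ---------------------------------------------------------------------------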
1964
1965bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
1966                                 const TargetTransformInfo::LSRCost &C2) {
1967  // The RISC-V-specific part here is that instruction count gets first priority.
1968 // If we need to emit adds inside the loop to add up base registers, then
1969 // we need at least one extra temporary register.
1970 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
1971 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
1972 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
1973 C1.NumIVMuls, C1.NumBaseAdds,
1974 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
1975 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
1976 C2.NumIVMuls, C2.NumBaseAdds,
1977 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
1978}
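// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM sources): the
// lexicographic tie-break used above, with a local stand-in for LSRCost.
#include <tuple>

namespace lsr_cost_sketch {

struct Cost {
  unsigned Insns, NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
      ImmCost, SetupCost;
};
inline bool isLSRCostLess(const Cost &C1, const Cost &C2) {
  // A formula that needs base-register adds also needs one extra temporary.
  unsigned C1Regs = C1.NumRegs + (C1.NumBaseAdds != 0);
  unsigned C2Regs = C2.NumRegs + (C2.NumBaseAdds != 0);
  // Instruction count is compared first, then the adjusted register count.
  return std::tie(C1.Insns, C1Regs, C1.AddRecCost, C1.NumIVMuls,
                  C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2Regs, C2.AddRecCost, C2.NumIVMuls,
                  C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}
// Example: a formula with {Insns=3, NumRegs=4} beats one with
// {Insns=4, NumRegs=3}, because the instruction count is compared first.

} // namespace lsr_cost_sketch
// ---------------------------------------------------------------------------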
1979
1980bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
1981  auto *VTy = dyn_cast<VectorType>(DataTy);
1982 if (!VTy || VTy->isScalableTy())
1983 return false;
1984
1985 if (!isLegalMaskedLoadStore(DataTy, Alignment))
1986 return false;
1987 return true;
1988}
1989
1990bool RISCVTTIImpl::areInlineCompatible(const Function *Caller,
1991                                       const Function *Callee) const {
1992 const TargetMachine &TM = getTLI()->getTargetMachine();
1993
1994 const FeatureBitset &CallerBits =
1995 TM.getSubtargetImpl(*Caller)->getFeatureBits();
1996 const FeatureBitset &CalleeBits =
1997 TM.getSubtargetImpl(*Callee)->getFeatureBits();
1998
1999  // Inline a callee if its target features are a subset of the caller's
2000  // target features. (A standalone sketch of this test follows the function.)
2001 return (CallerBits & CalleeBits) == CalleeBits;
2002}
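// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM sources): the
// feature-subset test above, using a plain bitmask in place of FeatureBitset.
namespace inline_compat_sketch {

using FeatureMask = unsigned long long; // stand-in for llvm::FeatureBitset

constexpr bool calleeIsSubsetOfCaller(FeatureMask CallerBits,
                                      FeatureMask CalleeBits) {
  return (CallerBits & CalleeBits) == CalleeBits;
}
// Caller compiled with three features (bits 0..2); callee only requires bit 1.
static_assert(calleeIsSubsetOfCaller(0b111, 0b010), "inlinable");
// Callee requires a feature (bit 2) that the caller lacks.
static_assert(!calleeIsSubsetOfCaller(0b011, 0b100), "not inlinable");

} // namespace inline_compat_sketch
// ---------------------------------------------------------------------------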
2003
2004/// See if \p I should be considered for address type promotion. We check if \p
2005/// I is a sext with the right type and is used in memory accesses. If it is
2006/// used in a "complex" getelementptr, we allow it to be promoted without
2007/// finding other sext instructions that sign-extended the same initial value.
2008/// A getelementptr is considered "complex" if it has more than 2 operands.
2009bool RISCVTTIImpl::shouldConsiderAddressTypePromotion(
2010    const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
2011 bool Considerable = false;
2012 AllowPromotionWithoutCommonHeader = false;
2013 if (!isa<SExtInst>(&I))
2014 return false;
2015 Type *ConsideredSExtType =
2016 Type::getInt64Ty(I.getParent()->getParent()->getContext());
2017 if (I.getType() != ConsideredSExtType)
2018 return false;
2019 // See if the sext is the one with the right type and used in at least one
2020 // GetElementPtrInst.
2021 for (const User *U : I.users()) {
2022 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
2023 Considerable = true;
2024      // A getelementptr is considered "complex" if it has more than 2
2025      // operands. We will promote a SExt used in such a complex GEP, as we
2026      // expect some of the computation to be merged if it is done on 64 bits.
2027 if (GEPInst->getNumOperands() > 2) {
2028 AllowPromotionWithoutCommonHeader = true;
2029 break;
2030 }
2031 }
2032 }
2033 return Considerable;
2034}