LLVM 19.0.0git
RISCVTargetTransformInfo.cpp
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
17#include <cmath>
18#include <optional>
19using namespace llvm;
20
21#define DEBUG_TYPE "riscvtti"
22
23static cl::opt<unsigned> RVVRegisterWidthLMUL(
24    "riscv-v-register-bit-width-lmul",
25    cl::desc(
26        "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
27        "by autovectorized code. Fractional LMULs are not supported."),
29
30static cl::opt<unsigned> SLPMaxVF(
31    "riscv-v-slp-max-vf",
32    cl::desc(
33        "Overrides result used for getMaximumVF query which is used "
34        "exclusively by SLP vectorizer."),
36
37InstructionCost
38RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
39                                      TTI::TargetCostKind CostKind) {
40  // Check if the type is valid for all CostKind
41  if (!VT.isVector())
42    return InstructionCost::getInvalid();
43  size_t NumInstr = OpCodes.size();
44  if (CostKind == TTI::TCK_CodeSize)
45    return NumInstr;
46  InstructionCost LMULCost = TLI->getLMULCost(VT);
47  if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
48    return LMULCost * NumInstr;
49  InstructionCost Cost = 0;
50 for (auto Op : OpCodes) {
51 switch (Op) {
52 case RISCV::VRGATHER_VI:
53 Cost += TLI->getVRGatherVICost(VT);
54 break;
55 case RISCV::VRGATHER_VV:
56 Cost += TLI->getVRGatherVVCost(VT);
57 break;
58 case RISCV::VSLIDEUP_VI:
59 case RISCV::VSLIDEDOWN_VI:
60 Cost += TLI->getVSlideVICost(VT);
61 break;
62 case RISCV::VSLIDEUP_VX:
63 case RISCV::VSLIDEDOWN_VX:
64 Cost += TLI->getVSlideVXCost(VT);
65 break;
66 case RISCV::VREDMAX_VS:
67 case RISCV::VREDMIN_VS:
68 case RISCV::VREDMAXU_VS:
69 case RISCV::VREDMINU_VS:
70 case RISCV::VREDSUM_VS:
71 case RISCV::VREDAND_VS:
72 case RISCV::VREDOR_VS:
73 case RISCV::VREDXOR_VS:
74 case RISCV::VFREDMAX_VS:
75 case RISCV::VFREDMIN_VS:
76 case RISCV::VFREDUSUM_VS: {
77 unsigned VL = VT.getVectorMinNumElements();
78 if (!VT.isFixedLengthVector())
79 VL *= *getVScaleForTuning();
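      // Illustrative note: unordered reductions are modeled as a log2(VL)-deep
      // reduction tree, e.g. an (estimated) VL of 16 adds ceil(log2(16)) = 4.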
80 Cost += Log2_32_Ceil(VL);
81 break;
82 }
83 case RISCV::VFREDOSUM_VS: {
84 unsigned VL = VT.getVectorMinNumElements();
85 if (!VT.isFixedLengthVector())
86 VL *= *getVScaleForTuning();
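      // Illustrative note: the ordered reduction (vfredosum) is serial, so its
      // cost scales linearly with the estimated VL rather than logarithmically.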
87 Cost += VL;
88 break;
89 }
90 case RISCV::VMV_X_S:
91 case RISCV::VMV_S_X:
92 case RISCV::VFMV_F_S:
93 case RISCV::VFMV_S_F:
94 case RISCV::VMNAND_MM:
95 case RISCV::VCPOP_M:
96 Cost += 1;
97 break;
98 default:
99 Cost += LMULCost;
100 }
101 }
102 return Cost;
103}
104
105InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
106                                            TTI::TargetCostKind CostKind) {
107  assert(Ty->isIntegerTy() &&
108 "getIntImmCost can only estimate cost of materialising integers");
109
110 // We have a Zero register, so 0 is always free.
111 if (Imm == 0)
112 return TTI::TCC_Free;
113
114 // Otherwise, we check how many instructions it will take to materialise.
115 const DataLayout &DL = getDataLayout();
116 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *getST());
117}
118
119// Look for patterns of shift followed by AND that can be turned into a pair of
120// shifts. We won't need to materialize an immediate for the AND so these can
121// be considered free.
122static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
123 uint64_t Mask = Imm.getZExtValue();
124 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
125 if (!BO || !BO->hasOneUse())
126 return false;
127
128 if (BO->getOpcode() != Instruction::Shl)
129 return false;
130
131 if (!isa<ConstantInt>(BO->getOperand(1)))
132 return false;
133
134 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
135 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
136 // is a mask shifted by c2 bits with c3 leading zeros.
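  // For example, (and (shl x, 4), 0xff0) has c1 = 0xff0, a mask shifted by
  // c2 = 4 with c3 = 52 leading zeros on RV64, so it can be rewritten as
  // (srli (slli x, 56), 52) and the 0xff0 immediate never has to be
  // materialized.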
137 if (isShiftedMask_64(Mask)) {
138 unsigned Trailing = llvm::countr_zero(Mask);
139 if (ShAmt == Trailing)
140 return true;
141 }
142
143 return false;
144}
145
146InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
147                                                const APInt &Imm, Type *Ty,
148                                                TTI::TargetCostKind CostKind,
149                                                Instruction *Inst) {
150 assert(Ty->isIntegerTy() &&
151 "getIntImmCost can only estimate cost of materialising integers");
152
153 // We have a Zero register, so 0 is always free.
154 if (Imm == 0)
155 return TTI::TCC_Free;
156
157 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
158 // commutative, in others the immediate comes from a specific argument index.
159 bool Takes12BitImm = false;
160 unsigned ImmArgIdx = ~0U;
161
162 switch (Opcode) {
163 case Instruction::GetElementPtr:
164 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
165 // split up large offsets in GEP into better parts than ConstantHoisting
166 // can.
167 return TTI::TCC_Free;
168 case Instruction::Store:
169 // If the address is a constant, use the materialization cost.
170 if (Idx == 1)
171 return getIntImmCost(Imm, Ty, CostKind);
172 return TTI::TCC_Free;
173 case Instruction::Load:
174 // If the address is a constant, use the materialization cost.
175 return getIntImmCost(Imm, Ty, CostKind);
176 case Instruction::And:
177 // zext.h
178 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
179 return TTI::TCC_Free;
180 // zext.w
181 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
182 return TTI::TCC_Free;
183 // bclri
184 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
185 return TTI::TCC_Free;
186 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
187 canUseShiftPair(Inst, Imm))
188 return TTI::TCC_Free;
189 Takes12BitImm = true;
190 break;
191 case Instruction::Add:
192 Takes12BitImm = true;
193 break;
194 case Instruction::Or:
195 case Instruction::Xor:
196 // bseti/binvi
197 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
198 return TTI::TCC_Free;
199 Takes12BitImm = true;
200 break;
201 case Instruction::Mul:
202 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
203 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
204 return TTI::TCC_Free;
205 // One more or less than a power of 2 can use SLLI+ADD/SUB.
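    // For example, x * 9 can be lowered as (slli x, 3) + x and x * 7 as
    // (slli x, 3) - x, so no multiply immediate is materialized.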
206 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
207 return TTI::TCC_Free;
208 // FIXME: There is no MULI instruction.
209 Takes12BitImm = true;
210 break;
211 case Instruction::Sub:
212 case Instruction::Shl:
213 case Instruction::LShr:
214 case Instruction::AShr:
215 Takes12BitImm = true;
216 ImmArgIdx = 1;
217 break;
218 default:
219 break;
220 }
221
222 if (Takes12BitImm) {
223 // Check immediate is the correct argument...
224 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
225 // ... and fits into the 12-bit immediate.
226 if (Imm.getSignificantBits() <= 64 &&
227 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
228 return TTI::TCC_Free;
229 }
230 }
231
232 // Otherwise, use the full materialisation cost.
233 return getIntImmCost(Imm, Ty, CostKind);
234 }
235
236 // By default, prevent hoisting.
237 return TTI::TCC_Free;
238}
239
240InstructionCost RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID,
241                                                  unsigned Idx,
242                                                  const APInt &Imm, Type *Ty,
243                                                  TTI::TargetCostKind CostKind) {
244 // Prevent hoisting in unknown cases.
245 return TTI::TCC_Free;
246}
247
248bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
249 return ST->hasVInstructions();
250}
251
252TargetTransformInfo::PopcntSupportKind
253RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
254  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
255  return ST->hasStdExtZbb() || ST->hasVendorXCVbitmanip()
256             ? TTI::PSK_FastHardware
257             : TTI::PSK_Software;
258}
259
260bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
261  // Currently, the ExpandReductions pass can't expand scalable-vector
262 // reductions, but we still request expansion as RVV doesn't support certain
263 // reductions and the SelectionDAG can't legalize them either.
264 switch (II->getIntrinsicID()) {
265 default:
266 return false;
267 // These reductions have no equivalent in RVV
268 case Intrinsic::vector_reduce_mul:
269 case Intrinsic::vector_reduce_fmul:
270 return true;
271 }
272}
273
274std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
275  if (ST->hasVInstructions())
276    return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
277  return BaseT::getMaxVScale();
278}
279
280std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
281 if (ST->hasVInstructions())
282 if (unsigned MinVLen = ST->getRealMinVLen();
283 MinVLen >= RISCV::RVVBitsPerBlock)
284 return MinVLen / RISCV::RVVBitsPerBlock;
285  return BaseT::getVScaleForTuning();
286}
287
288TypeSize
289RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
290  unsigned LMUL =
291      llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
292  switch (K) {
293  case TargetTransformInfo::RGK_Scalar:
294    return TypeSize::getFixed(ST->getXLen());
295  case TargetTransformInfo::RGK_FixedWidthVector:
296    return TypeSize::getFixed(
297        ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
298  case TargetTransformInfo::RGK_ScalableVector:
299    return TypeSize::getScalable(
300        (ST->hasVInstructions() &&
301         ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
302            ? LMUL * RISCV::RVVBitsPerBlock
303            : 0);
304  }
305
306 llvm_unreachable("Unsupported register kind");
307}
308
309InstructionCost
310RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
311 // Add a cost of address generation + the cost of the load. The address
312 // is expected to be a PC relative offset to a constant pool entry
313 // using auipc/addi.
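  // For example (roughly):
  //   auipc a0, %pcrel_hi(.LCPI0_0)
  //   addi a0, a0, %pcrel_lo(...)
  // followed by the actual load from (a0).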
314 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
315 /*AddressSpace=*/0, CostKind);
316}
317
318static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
319                                        LLVMContext &C) {
320 assert((DataVT.getScalarSizeInBits() != 8 ||
321 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
322 MVT IndexVT = DataVT.changeTypeToInteger();
323 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
324 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
325 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
326}
327
328InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
329                                             VectorType *Tp, ArrayRef<int> Mask,
330                                             TTI::TargetCostKind CostKind,
331                                             int Index, VectorType *SubTp,
332                                             ArrayRef<const Value *> Args,
333                                             const Instruction *CxtI) {
334 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
335
336 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
337
338 // First, handle cases where having a fixed length vector enables us to
339 // give a more accurate cost than falling back to generic scalable codegen.
340 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
341 if (isa<FixedVectorType>(Tp)) {
342 switch (Kind) {
343 default:
344 break;
345    case TTI::SK_PermuteSingleSrc: {
346      if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
347 MVT EltTp = LT.second.getVectorElementType();
348 // If the size of the element is < ELEN then shuffles of interleaves and
349 // deinterleaves of 2 vectors can be lowered into the following
350 // sequences
351 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
352 // Example sequence:
353 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
354 // vwaddu.vv v10, v8, v9
355 // li a0, -1 (ignored)
356 // vwmaccu.vx v10, a0, v9
357 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
358 return 2 * LT.first * TLI->getLMULCost(LT.second);
359
360 if (Mask[0] == 0 || Mask[0] == 1) {
361 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
362 // Example sequence:
363 // vnsrl.wi v10, v8, 0
364 if (equal(DeinterleaveMask, Mask))
365 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
366 LT.second, CostKind);
367 }
368 }
369 }
370 // vrgather + cost of generating the mask constant.
371 // We model this for an unknown mask with a single vrgather.
372 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
373 (LT.second.getScalarSizeInBits() != 8 ||
374 LT.second.getVectorNumElements() <= 256)) {
375 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());
376 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
377 return IndexCost +
378 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
379 }
380 [[fallthrough]];
381 }
382    case TTI::SK_Transpose:
383    case TTI::SK_PermuteTwoSrc: {
384      // 2 x (vrgather + cost of generating the mask constant) + cost of mask
385 // register for the second vrgather. We model this for an unknown
386 // (shuffle) mask.
387 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
388 (LT.second.getScalarSizeInBits() != 8 ||
389 LT.second.getVectorNumElements() <= 256)) {
390 auto &C = Tp->getContext();
391 auto EC = Tp->getElementCount();
392 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
393        VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
394        InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
395 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
396 return 2 * IndexCost +
397 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
398 LT.second, CostKind) +
399 MaskCost;
400 }
401 [[fallthrough]];
402 }
403 case TTI::SK_Select: {
404 // We are going to permute multiple sources and the result will be in
405 // multiple destinations. Providing an accurate cost only for splits where
406 // the element type remains the same.
407 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
408 LT.second.isFixedLengthVector() &&
409 LT.second.getVectorElementType().getSizeInBits() ==
411 LT.second.getVectorNumElements() <
412 cast<FixedVectorType>(Tp)->getNumElements() &&
413 divideCeil(Mask.size(),
414 cast<FixedVectorType>(Tp)->getNumElements()) ==
415 static_cast<unsigned>(*LT.first.getValue())) {
416 unsigned NumRegs = *LT.first.getValue();
417 unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
418 unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
419 auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);
420
422 for (unsigned I = 0; I < NumRegs; ++I) {
423 bool IsSingleVector = true;
424 SmallVector<int> SubMask(SubVF, PoisonMaskElem);
425 transform(Mask.slice(I * SubVF,
426 I == NumRegs - 1 ? Mask.size() % SubVF : SubVF),
427 SubMask.begin(), [&](int I) {
428 bool SingleSubVector = I / VF == 0;
429 IsSingleVector &= SingleSubVector;
430 return (SingleSubVector ? 0 : 1) * SubVF + I % VF;
431 });
434 SubVecTy, SubMask, CostKind, 0, nullptr);
435 return Cost;
436 }
437 }
438 break;
439 }
440 }
441 };
442
443 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
444 switch (Kind) {
445 default:
446 // Fallthrough to generic handling.
447 // TODO: Most of these cases will return getInvalid in generic code, and
448 // must be implemented here.
449 break;
450  case TTI::SK_ExtractSubvector:
451    // Extract at zero is always a subregister extract
452 if (Index == 0)
453 return TTI::TCC_Free;
454
455 // If we're extracting a subvector of at most m1 size at a sub-register
456 // boundary - which unfortunately we need exact vlen to identify - this is
457 // a subregister extract at worst and thus won't require a vslidedown.
458 // TODO: Extend for aligned m2, m4 subvector extracts
459    // TODO: Extend for misaligned (but contained) extracts
460 // TODO: Extend for scalable subvector types
461 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
462 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
463 const unsigned MinVLen = ST->getRealMinVLen();
464 const unsigned MaxVLen = ST->getRealMaxVLen();
465 if (MinVLen == MaxVLen &&
466 SubLT.second.getScalarSizeInBits() * Index % MinVLen == 0 &&
467 SubLT.second.getSizeInBits() <= MinVLen)
468 return TTI::TCC_Free;
469 }
470
471 // Example sequence:
472 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
473 // vslidedown.vi v8, v9, 2
474 return LT.first *
475 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
476  case TTI::SK_InsertSubvector:
477    // Example sequence:
478 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
479 // vslideup.vi v8, v9, 2
480 return LT.first *
481 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
482 case TTI::SK_Select: {
483 // Example sequence:
484 // li a0, 90
485 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
486 // vmv.s.x v0, a0
487 // vmerge.vvm v8, v9, v8, v0
488 // We use 2 for the cost of the mask materialization as this is the true
489 // cost for small masks and most shuffles are small. At worst, this cost
490 // should be a very small constant for the constant pool load. As such,
491    // we may bias towards large selects slightly more than truly warranted.
492 return LT.first *
493 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
494 LT.second, CostKind));
495 }
496 case TTI::SK_Broadcast: {
497 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
498 Instruction::InsertElement);
499 if (LT.second.getScalarSizeInBits() == 1) {
500 if (HasScalar) {
501 // Example sequence:
502 // andi a0, a0, 1
503 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
504 // vmv.v.x v8, a0
505 // vmsne.vi v0, v8, 0
506 return LT.first *
507 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
508 LT.second, CostKind));
509 }
510 // Example sequence:
511 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
512 // vmv.v.i v8, 0
513 // vmerge.vim v8, v8, 1, v0
514 // vmv.x.s a0, v8
515 // andi a0, a0, 1
516 // vmv.v.x v8, a0
517 // vmsne.vi v0, v8, 0
518
519 return LT.first *
520 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
521 RISCV::VMV_X_S, RISCV::VMV_V_X,
522 RISCV::VMSNE_VI},
523 LT.second, CostKind));
524 }
525
526 if (HasScalar) {
527 // Example sequence:
528 // vmv.v.x v8, a0
529 return LT.first *
530 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
531 }
532
533 // Example sequence:
534 // vrgather.vi v9, v8, 0
535 return LT.first *
536 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
537 }
538 case TTI::SK_Splice: {
539 // vslidedown+vslideup.
540 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
541 // of similar code, but I think we expand through memory.
542 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
543 if (Index >= 0 && Index < 32)
544 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
545 else if (Index < 0 && Index > -32)
546 Opcodes[1] = RISCV::VSLIDEUP_VI;
547 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
548 }
549 case TTI::SK_Reverse: {
550 // TODO: Cases to improve here:
551 // * Illegal vector types
552 // * i64 on RV32
553 // * i1 vector
554 // At low LMUL, most of the cost is producing the vrgather index register.
555 // At high LMUL, the cost of the vrgather itself will dominate.
556 // Example sequence:
557 // csrr a0, vlenb
558 // srli a0, a0, 3
559 // addi a0, a0, -1
560 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
561 // vid.v v9
562 // vrsub.vx v10, v9, a0
563 // vrgather.vv v9, v8, v10
564 InstructionCost LenCost = 3;
565 if (LT.second.isFixedLengthVector())
566 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
567 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
568 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
569 if (LT.second.isFixedLengthVector() &&
570 isInt<5>(LT.second.getVectorNumElements() - 1))
571 Opcodes[1] = RISCV::VRSUB_VI;
572 InstructionCost GatherCost =
573 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
574    // An i1 mask vector additionally requires an extend and a truncate
575 InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
576 return LT.first * (LenCost + GatherCost + ExtendCost);
577 }
578 }
579 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
580}
581
582InstructionCost
583RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
584                                    unsigned AddressSpace,
585                                    TTI::TargetCostKind CostKind) {
586  if (!isLegalMaskedLoadStore(Src, Alignment) ||
587      CostKind != TTI::TCK_RecipThroughput)
588    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
589 CostKind);
590
591 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
592}
593
594InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
595    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
596 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
597 bool UseMaskForCond, bool UseMaskForGaps) {
598 if (isa<ScalableVectorType>(VecTy))
599    return InstructionCost::getInvalid();
600  auto *FVTy = cast<FixedVectorType>(VecTy);
601 InstructionCost MemCost =
602 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
603 unsigned VF = FVTy->getNumElements() / Factor;
604
605  // The interleaved memory access pass will lower interleaved memory ops (i.e.
606  // a load or store combined with a specific shuffle) to vlseg/vsseg
607  // intrinsics. In those cases we can treat it as if it's just one (legal)
608  // memory op.
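  // For example, a factor-2 deinterleaving load of <8 x i32> becomes a single
  // vlseg2e32.v, and the matching interleaving store becomes a vsseg2e32.v.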
609 if (!UseMaskForCond && !UseMaskForGaps &&
610 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
611 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
612    // Need to make sure the type hasn't been scalarized
613 if (LT.second.isFixedLengthVector()) {
614 auto *LegalFVTy = FixedVectorType::get(FVTy->getElementType(),
615 LT.second.getVectorNumElements());
616      // FIXME: We use the memory op cost of the *legalized* type here, because
617      // getMemoryOpCost returns a really expensive cost for types like
618 // <6 x i8>, which show up when doing interleaves of Factor=3 etc.
619 // Should the memory op cost of these be cheaper?
620 if (TLI->isLegalInterleavedAccessType(LegalFVTy, Factor, Alignment,
621 AddressSpace, DL)) {
622 InstructionCost LegalMemCost = getMemoryOpCost(
623 Opcode, LegalFVTy, Alignment, AddressSpace, CostKind);
624 return LT.first + LegalMemCost;
625 }
626 }
627 }
628
629 // An interleaved load will look like this for Factor=3:
630 // %wide.vec = load <12 x i32>, ptr %3, align 4
631 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
632 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
633 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
634 if (Opcode == Instruction::Load) {
635 InstructionCost Cost = MemCost;
636 for (unsigned Index : Indices) {
637 FixedVectorType *SubVecTy =
638 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
639 auto Mask = createStrideMask(Index, Factor, VF);
640 InstructionCost ShuffleCost =
641          getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask,
642                         CostKind, 0, nullptr, {});
643 Cost += ShuffleCost;
644 }
645 return Cost;
646 }
647
648 // TODO: Model for NF > 2
649 // We'll need to enhance getShuffleCost to model shuffles that are just
650 // inserts and extracts into subvectors, since they won't have the full cost
651 // of a vrgather.
652 // An interleaved store for 3 vectors of 4 lanes will look like
653 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
654 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
655 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
656 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
657 // store <12 x i32> %interleaved.vec, ptr %10, align 4
658 if (Factor != 2)
659 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
660 Alignment, AddressSpace, CostKind,
661 UseMaskForCond, UseMaskForGaps);
662
663 assert(Opcode == Instruction::Store && "Opcode must be a store");
664 // For an interleaving store of 2 vectors, we perform one large interleaving
665 // shuffle that goes into the wide store
666 auto Mask = createInterleaveMask(VF, Factor);
667 InstructionCost ShuffleCost =
668      getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask,
669                     CostKind, 0, nullptr, {});
670 return MemCost + ShuffleCost;
671}
672
673InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
674    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
675 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
676  if (CostKind != TTI::TCK_RecipThroughput)
677    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
678 Alignment, CostKind, I);
679
680 if ((Opcode == Instruction::Load &&
681 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
682 (Opcode == Instruction::Store &&
683 !isLegalMaskedScatter(DataTy, Align(Alignment))))
684 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
685 Alignment, CostKind, I);
686
687 // Cost is proportional to the number of memory operations implied. For
688 // scalable vectors, we use an estimate on that number since we don't
689 // know exactly what VL will be.
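  // For example, a gather from <vscale x 4 x i32> with an estimated VL of 8 is
  // costed as 8 element-sized loads.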
690 auto &VTy = *cast<VectorType>(DataTy);
691 InstructionCost MemOpCost =
692 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
693 {TTI::OK_AnyValue, TTI::OP_None}, I);
694 unsigned NumLoads = getEstimatedVLFor(&VTy);
695 return NumLoads * MemOpCost;
696}
697
698InstructionCost RISCVTTIImpl::getStridedMemoryOpCost(
699    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
700 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
701 if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
702 !isLegalStridedLoadStore(DataTy, Alignment)) ||
703 (Opcode != Instruction::Load && Opcode != Instruction::Store))
704 return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
705 Alignment, CostKind, I);
706
707  if (CostKind == TTI::TCK_CodeSize)
708    return TTI::TCC_Basic;
709
710 // Cost is proportional to the number of memory operations implied. For
711 // scalable vectors, we use an estimate on that number since we don't
712 // know exactly what VL will be.
713 auto &VTy = *cast<VectorType>(DataTy);
714 InstructionCost MemOpCost =
715 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
716 {TTI::OK_AnyValue, TTI::OP_None}, I);
717 unsigned NumLoads = getEstimatedVLFor(&VTy);
718 return NumLoads * MemOpCost;
719}
720
721// Currently, these represent both throughput and codesize costs
722// for the respective intrinsics. The costs in this table are simply
723// instruction counts with the following adjustments made:
724// * One vsetvli is considered free.
725static const CostTblEntry VectorIntrinsicCostTable[]{
726    {Intrinsic::floor, MVT::f32, 9},
727 {Intrinsic::floor, MVT::f64, 9},
728 {Intrinsic::ceil, MVT::f32, 9},
729 {Intrinsic::ceil, MVT::f64, 9},
730 {Intrinsic::trunc, MVT::f32, 7},
731 {Intrinsic::trunc, MVT::f64, 7},
732 {Intrinsic::round, MVT::f32, 9},
733 {Intrinsic::round, MVT::f64, 9},
734 {Intrinsic::roundeven, MVT::f32, 9},
735 {Intrinsic::roundeven, MVT::f64, 9},
736 {Intrinsic::rint, MVT::f32, 7},
737 {Intrinsic::rint, MVT::f64, 7},
738 {Intrinsic::lrint, MVT::i32, 1},
739 {Intrinsic::lrint, MVT::i64, 1},
740 {Intrinsic::llrint, MVT::i64, 1},
741 {Intrinsic::nearbyint, MVT::f32, 9},
742 {Intrinsic::nearbyint, MVT::f64, 9},
743 {Intrinsic::bswap, MVT::i16, 3},
744 {Intrinsic::bswap, MVT::i32, 12},
745 {Intrinsic::bswap, MVT::i64, 31},
746 {Intrinsic::vp_bswap, MVT::i16, 3},
747 {Intrinsic::vp_bswap, MVT::i32, 12},
748 {Intrinsic::vp_bswap, MVT::i64, 31},
749 {Intrinsic::vp_fshl, MVT::i8, 7},
750 {Intrinsic::vp_fshl, MVT::i16, 7},
751 {Intrinsic::vp_fshl, MVT::i32, 7},
752 {Intrinsic::vp_fshl, MVT::i64, 7},
753 {Intrinsic::vp_fshr, MVT::i8, 7},
754 {Intrinsic::vp_fshr, MVT::i16, 7},
755 {Intrinsic::vp_fshr, MVT::i32, 7},
756 {Intrinsic::vp_fshr, MVT::i64, 7},
757 {Intrinsic::bitreverse, MVT::i8, 17},
758 {Intrinsic::bitreverse, MVT::i16, 24},
759 {Intrinsic::bitreverse, MVT::i32, 33},
760 {Intrinsic::bitreverse, MVT::i64, 52},
761 {Intrinsic::vp_bitreverse, MVT::i8, 17},
762 {Intrinsic::vp_bitreverse, MVT::i16, 24},
763 {Intrinsic::vp_bitreverse, MVT::i32, 33},
764 {Intrinsic::vp_bitreverse, MVT::i64, 52},
765 {Intrinsic::ctpop, MVT::i8, 12},
766 {Intrinsic::ctpop, MVT::i16, 19},
767 {Intrinsic::ctpop, MVT::i32, 20},
768 {Intrinsic::ctpop, MVT::i64, 21},
769 {Intrinsic::vp_ctpop, MVT::i8, 12},
770 {Intrinsic::vp_ctpop, MVT::i16, 19},
771 {Intrinsic::vp_ctpop, MVT::i32, 20},
772 {Intrinsic::vp_ctpop, MVT::i64, 21},
773 {Intrinsic::vp_ctlz, MVT::i8, 19},
774 {Intrinsic::vp_ctlz, MVT::i16, 28},
775 {Intrinsic::vp_ctlz, MVT::i32, 31},
776 {Intrinsic::vp_ctlz, MVT::i64, 35},
777 {Intrinsic::vp_cttz, MVT::i8, 16},
778 {Intrinsic::vp_cttz, MVT::i16, 23},
779 {Intrinsic::vp_cttz, MVT::i32, 24},
780 {Intrinsic::vp_cttz, MVT::i64, 25},
781};
782
783static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
784  switch (ID) {
785#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
786 case Intrinsic::VPID: \
787 return ISD::VPSD;
788#include "llvm/IR/VPIntrinsics.def"
789#undef HELPER_MAP_VPID_TO_VPSD
790 }
791 return ISD::DELETED_NODE;
792}
793
794InstructionCost
795RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
796                                    TTI::TargetCostKind CostKind) {
797  auto *RetTy = ICA.getReturnType();
798 switch (ICA.getID()) {
799 case Intrinsic::ceil:
800 case Intrinsic::floor:
801 case Intrinsic::trunc:
802 case Intrinsic::rint:
803 case Intrinsic::lrint:
804 case Intrinsic::llrint:
805 case Intrinsic::round:
806 case Intrinsic::roundeven: {
807 // These all use the same code.
808    auto LT = getTypeLegalizationCost(RetTy);
809    if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
810 return LT.first * 8;
811 break;
812 }
813 case Intrinsic::umin:
814 case Intrinsic::umax:
815 case Intrinsic::smin:
816 case Intrinsic::smax: {
817    auto LT = getTypeLegalizationCost(RetTy);
818    if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
819 return LT.first;
820
821 if (ST->hasVInstructions() && LT.second.isVector()) {
822 unsigned Op;
823 switch (ICA.getID()) {
824 case Intrinsic::umin:
825 Op = RISCV::VMINU_VV;
826 break;
827 case Intrinsic::umax:
828 Op = RISCV::VMAXU_VV;
829 break;
830 case Intrinsic::smin:
831 Op = RISCV::VMIN_VV;
832 break;
833 case Intrinsic::smax:
834 Op = RISCV::VMAX_VV;
835 break;
836 }
837 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
838 }
839 break;
840 }
841 case Intrinsic::sadd_sat:
842 case Intrinsic::ssub_sat:
843 case Intrinsic::uadd_sat:
844 case Intrinsic::usub_sat:
845 case Intrinsic::fabs:
846 case Intrinsic::sqrt: {
847    auto LT = getTypeLegalizationCost(RetTy);
848    if (ST->hasVInstructions() && LT.second.isVector())
849 return LT.first;
850 break;
851 }
852 case Intrinsic::ctpop: {
853    auto LT = getTypeLegalizationCost(RetTy);
854    if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector())
855 return LT.first;
856 break;
857 }
858 case Intrinsic::abs: {
859    auto LT = getTypeLegalizationCost(RetTy);
860    if (ST->hasVInstructions() && LT.second.isVector()) {
861 // vrsub.vi v10, v8, 0
862 // vmax.vv v8, v8, v10
863 return LT.first * 2;
864 }
865 break;
866 }
867 case Intrinsic::get_active_lane_mask: {
868 if (ST->hasVInstructions()) {
869 Type *ExpRetTy = VectorType::get(
870 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
871 auto LT = getTypeLegalizationCost(ExpRetTy);
872
873 // vid.v v8 // considered hoisted
874 // vsaddu.vx v8, v8, a0
875 // vmsltu.vx v0, v8, a1
876 return LT.first *
877 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
878 LT.second, CostKind);
879 }
880 break;
881 }
882  // TODO: add more intrinsics
883 case Intrinsic::experimental_stepvector: {
884    auto LT = getTypeLegalizationCost(RetTy);
885    // Legalisation of illegal types involves an `index' instruction plus
886 // (LT.first - 1) vector adds.
887 if (ST->hasVInstructions())
888 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
889 (LT.first - 1) *
890 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
891 return 1 + (LT.first - 1);
892 }
893 case Intrinsic::vp_rint: {
894 // RISC-V target uses at least 5 instructions to lower rounding intrinsics.
895 unsigned Cost = 5;
896    auto LT = getTypeLegalizationCost(RetTy);
897    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
898 return Cost * LT.first;
899 break;
900 }
901 case Intrinsic::vp_nearbyint: {
902    // One more read and one more write of fflags than vp_rint.
903    unsigned Cost = 7;
904    auto LT = getTypeLegalizationCost(RetTy);
905    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
906 return Cost * LT.first;
907 break;
908 }
909 case Intrinsic::vp_ceil:
910 case Intrinsic::vp_floor:
911 case Intrinsic::vp_round:
912 case Intrinsic::vp_roundeven:
913 case Intrinsic::vp_roundtozero: {
914 // Rounding with static rounding mode needs two more instructions to
915 // swap/write FRM than vp_rint.
916 unsigned Cost = 7;
917    auto LT = getTypeLegalizationCost(RetTy);
918    unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
919 if (TLI->isOperationCustom(VPISD, LT.second))
920 return Cost * LT.first;
921 break;
922 }
923 }
924
925 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
926 if (auto LT = getTypeLegalizationCost(RetTy);
927 LT.second.isVector()) {
928 MVT EltTy = LT.second.getVectorElementType();
929 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
930 ICA.getID(), EltTy))
931 return LT.first * Entry->Cost;
932 }
933 }
934
935  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
936}
937
938InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
939                                               Type *Src,
940                                               TTI::CastContextHint CCH,
941                                               TTI::TargetCostKind CostKind,
942                                               const Instruction *I) {
943 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
944 if (!IsVectorType)
945 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
946
947 bool IsTypeLegal = isTypeLegal(Src) && isTypeLegal(Dst) &&
948 (Src->getScalarSizeInBits() <= ST->getELen()) &&
949 (Dst->getScalarSizeInBits() <= ST->getELen());
950
951 // FIXME: Need to compute legalizing cost for illegal types.
952 if (!IsTypeLegal)
953 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
954
955 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
956 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
957
958 int ISD = TLI->InstructionOpcodeToISD(Opcode);
959 assert(ISD && "Invalid opcode");
960
961 int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
962 (int)Log2_32(Src->getScalarSizeInBits());
963 switch (ISD) {
964 case ISD::SIGN_EXTEND:
965 case ISD::ZERO_EXTEND: {
966 const unsigned SrcEltSize = Src->getScalarSizeInBits();
967 if (SrcEltSize == 1) {
968 // We do not use vsext/vzext to extend from mask vector.
969 // Instead we use the following instructions to extend from mask vector:
970 // vmv.v.i v8, 0
971 // vmerge.vim v8, v8, -1, v0
972 return getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM},
973 DstLT.second, CostKind);
974 }
975 if ((PowDiff < 1) || (PowDiff > 3))
976 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
977 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
978 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
979 unsigned Op =
980 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
981 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
982 }
983 case ISD::TRUNCATE:
984 if (Dst->getScalarSizeInBits() == 1) {
985      // We do not use several vncvt instructions to truncate to a mask
986      // vector, so we cannot use PowDiff to calculate the cost.
987 // Instead we use the following instructions to truncate to mask vector:
988 // vand.vi v8, v8, 1
989 // vmsne.vi v0, v8, 0
990 return getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
991 SrcLT.second, CostKind);
992 }
993 [[fallthrough]];
994 case ISD::FP_EXTEND:
995 case ISD::FP_ROUND: {
996 // Counts of narrow/widen instructions.
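    // For example, an f16 -> f64 fpext is modeled as two widening converts
    // (vfwcvt.f.f.v): f16 -> f32, then f32 -> f64.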
997 unsigned SrcEltSize = Src->getScalarSizeInBits();
998 unsigned DstEltSize = Dst->getScalarSizeInBits();
999
1000 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1001 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1002 : RISCV::VFNCVT_F_F_W;
1003    InstructionCost Cost = 0;
1004    for (; SrcEltSize != DstEltSize;) {
1005 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1006 ? MVT::getIntegerVT(DstEltSize)
1007 : MVT::getFloatingPointVT(DstEltSize);
1008 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1009 DstEltSize =
1010 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1011 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1012 }
1013 return Cost;
1014 }
1015 case ISD::FP_TO_SINT:
1016 case ISD::FP_TO_UINT:
1017 case ISD::SINT_TO_FP:
1018 case ISD::UINT_TO_FP:
1019 if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
1020 // The cost of convert from or to mask vector is different from other
1021 // cases. We could not use PowDiff to calculate it.
1022 // For mask vector to fp, we should use the following instructions:
1023 // vmv.v.i v8, 0
1024 // vmerge.vim v8, v8, -1, v0
1025 // vfcvt.f.x.v v8, v8
1026
1027 // And for fp vector to mask, we use:
1028 // vfncvt.rtz.x.f.w v9, v8
1029 // vand.vi v8, v9, 1
1030 // vmsne.vi v0, v8, 0
1031 return 3;
1032 }
1033 if (std::abs(PowDiff) <= 1)
1034 return 1;
1035 // Backend could lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
1036    // so it only needs two conversions.
1037 if (Src->isIntOrIntVectorTy())
1038 return 2;
1039 // Counts of narrow/widen instructions.
1040 return std::abs(PowDiff);
1041 }
1042 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1043}
1044
1045unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
1046 if (isa<ScalableVectorType>(Ty)) {
1047 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1048 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1049 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
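    // For example, with a tuning vscale of 2 (VLEN = 128), a <vscale x 4 x i32>
    // gives an estimated VL of 8 lanes.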
1050 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1051 }
1052 return cast<FixedVectorType>(Ty)->getNumElements();
1053}
1054
1055InstructionCost
1056RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1057                                     FastMathFlags FMF,
1058                                     TTI::TargetCostKind CostKind) {
1059  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1060 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1061
1062 // Skip if scalar size of Ty is bigger than ELEN.
1063 if (Ty->getScalarSizeInBits() > ST->getELen())
1064 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1065
1066 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1067 if (Ty->getElementType()->isIntegerTy(1)) {
1068 // SelectionDAGBuilder does following transforms:
1069 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1070 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
1071 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1072 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1073 else
1074 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1075 }
1076
1077 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1078    SmallVector<unsigned, 3> Opcodes;
1079    InstructionCost ExtraCost = 0;
1080 switch (IID) {
1081 case Intrinsic::maximum:
1082 if (FMF.noNaNs()) {
1083 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1084 } else {
1085 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1086 RISCV::VFMV_F_S};
1087 // Cost of Canonical Nan + branch
1088 // lui a0, 523264
1089 // fmv.w.x fa0, a0
1090 Type *DstTy = Ty->getScalarType();
1091 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1092 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1093 ExtraCost = 1 +
1094 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1095                                     TTI::CastContextHint::None, CostKind) +
1096                    getCFInstrCost(Instruction::Br, CostKind);
1097 }
1098 break;
1099
1100 case Intrinsic::minimum:
1101 if (FMF.noNaNs()) {
1102 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1103 } else {
1104 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1105 RISCV::VFMV_F_S};
1106 // Cost of Canonical Nan + branch
1107 // lui a0, 523264
1108 // fmv.w.x fa0, a0
1109 Type *DstTy = Ty->getScalarType();
1110 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1111 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1112 ExtraCost = 1 +
1113 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1114                                     TTI::CastContextHint::None, CostKind) +
1115                    getCFInstrCost(Instruction::Br, CostKind);
1116 }
1117 break;
1118 }
1119 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1120 }
1121
1122  // An IR reduction is composed of two vmv instructions and one RVV reduction instruction.
1123  unsigned SplitOp;
1124  SmallVector<unsigned, 3> Opcodes;
1125  switch (IID) {
1126 default:
1127 llvm_unreachable("Unsupported intrinsic");
1128 case Intrinsic::smax:
1129 SplitOp = RISCV::VMAX_VV;
1130 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1131 break;
1132 case Intrinsic::smin:
1133 SplitOp = RISCV::VMIN_VV;
1134 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMIN_VS, RISCV::VMV_X_S};
1135 break;
1136 case Intrinsic::umax:
1137 SplitOp = RISCV::VMAXU_VV;
1138 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1139 break;
1140 case Intrinsic::umin:
1141 SplitOp = RISCV::VMINU_VV;
1142 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMINU_VS, RISCV::VMV_X_S};
1143 break;
1144 case Intrinsic::maxnum:
1145 SplitOp = RISCV::VFMAX_VV;
1146 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1147 break;
1148 case Intrinsic::minnum:
1149 SplitOp = RISCV::VFMIN_VV;
1150 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1151 break;
1152 }
1153 // Add a cost for data larger than LMUL8
1154 InstructionCost SplitCost =
1155 (LT.first > 1) ? (LT.first - 1) *
1156 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1157 : 0;
1158 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1159}
1160
1161InstructionCost
1162RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
1163                                         std::optional<FastMathFlags> FMF,
1164                                         TTI::TargetCostKind CostKind) {
1165  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1166 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1167
1168 // Skip if scalar size of Ty is bigger than ELEN.
1169 if (Ty->getScalarSizeInBits() > ST->getELen())
1170 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1171
1172 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1173 assert(ISD && "Invalid opcode");
1174
1175 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1176 ISD != ISD::FADD)
1177 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1178
1179 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1180  SmallVector<unsigned, 3> Opcodes;
1181  Type *ElementTy = Ty->getElementType();
1182 if (ElementTy->isIntegerTy(1)) {
1183 if (ISD == ISD::AND) {
1184 // Example sequences:
1185 // vsetvli a0, zero, e8, mf8, ta, ma
1186 // vmnot.m v8, v0
1187 // vcpop.m a0, v8
1188 // seqz a0, a0
1189 Opcodes = {RISCV::VMNAND_MM, RISCV::VCPOP_M};
1190 return (LT.first - 1) +
1191 getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
1192 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1193                                CmpInst::ICMP_EQ, CostKind);
1194    } else {
1195 // Example sequences:
1196 // vsetvli a0, zero, e8, mf8, ta, ma
1197 // vcpop.m a0, v0
1198 // snez a0, a0
1199 Opcodes = {RISCV::VCPOP_M};
1200 return (LT.first - 1) +
1201 getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
1202 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1203                                CmpInst::ICMP_NE, CostKind);
1204    }
1205 }
1206
1207  // An IR reduction is composed of two vmv instructions and one RVV reduction instruction.
1208  if (TTI::requiresOrderedReduction(FMF)) {
1209    Opcodes.push_back(RISCV::VFMV_S_F);
1210 for (unsigned i = 0; i < LT.first.getValue(); i++)
1211 Opcodes.push_back(RISCV::VFREDOSUM_VS);
1212 Opcodes.push_back(RISCV::VFMV_F_S);
1213 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1214 }
1215 unsigned SplitOp;
1216 switch (ISD) {
1217 case ISD::ADD:
1218 SplitOp = RISCV::VADD_VV;
1219 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
1220 break;
1221 case ISD::OR:
1222 SplitOp = RISCV::VOR_VV;
1223 Opcodes = {RISCV::VMV_S_X, RISCV::VREDOR_VS, RISCV::VMV_X_S};
1224 break;
1225 case ISD::XOR:
1226 SplitOp = RISCV::VXOR_VV;
1227 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
1228 break;
1229 case ISD::AND:
1230 SplitOp = RISCV::VAND_VV;
1231 Opcodes = {RISCV::VMV_S_X, RISCV::VREDAND_VS, RISCV::VMV_X_S};
1232 break;
1233 case ISD::FADD:
1234 SplitOp = RISCV::VFADD_VV;
1235 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
1236 break;
1237 }
1238 // Add a cost for data larger than LMUL8
1239 InstructionCost SplitCost =
1240 (LT.first > 1) ? (LT.first - 1) *
1241 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1242 : 0;
1243 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1244}
1245
1246InstructionCost RISCVTTIImpl::getExtendedReductionCost(
1247    unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1248    FastMathFlags FMF, TTI::TargetCostKind CostKind) {
1249 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1250 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1251 FMF, CostKind);
1252
1253 // Skip if scalar size of ResTy is bigger than ELEN.
1254 if (ResTy->getScalarSizeInBits() > ST->getELen())
1255 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1256 FMF, CostKind);
1257
1258 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1259 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1260 FMF, CostKind);
1261
1262 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1263
1264 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1265 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1266 FMF, CostKind);
1267
1268 return (LT.first - 1) +
1269 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1270}
1271
1272InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
1273                                              TTI::OperandValueInfo OpInfo,
1274                                              TTI::TargetCostKind CostKind) {
1275 assert(OpInfo.isConstant() && "non constant operand?");
1276 if (!isa<VectorType>(Ty))
1277 // FIXME: We need to account for immediate materialization here, but doing
1278 // a decent job requires more knowledge about the immediate than we
1279 // currently have here.
1280 return 0;
1281
1282 if (OpInfo.isUniform())
1283 // vmv.x.i, vmv.v.x, or vfmv.v.f
1284 // We ignore the cost of the scalar constant materialization to be consistent
1285 // with how we treat scalar constants themselves just above.
1286 return 1;
1287
1288 return getConstantPoolLoadCost(Ty, CostKind);
1289}
1290
1291
1292InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1293                                              MaybeAlign Alignment,
1294                                              unsigned AddressSpace,
1295                                              TTI::TargetCostKind CostKind,
1296                                              TTI::OperandValueInfo OpInfo,
1297 const Instruction *I) {
1298 EVT VT = TLI->getValueType(DL, Src, true);
1299 // Type legalization can't handle structs
1300 if (VT == MVT::Other)
1301 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1302 CostKind, OpInfo, I);
1303
1304  InstructionCost Cost = 0;
1305  if (Opcode == Instruction::Store && OpInfo.isConstant())
1306 Cost += getStoreImmCost(Src, OpInfo, CostKind);
1307 InstructionCost BaseCost =
1308 BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1309 CostKind, OpInfo, I);
1310 // Assume memory ops cost scale with the number of vector registers
1311  // possibly accessed by the instruction. Note that BasicTTI already
1312 // handles the LT.first term for us.
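  // For example, a load whose legalized type occupies LMUL=4 is costed at
  // roughly 4x an LMUL=1 load here.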
1313 if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1314 LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
1315 BaseCost *= TLI->getLMULCost(LT.second);
1316 return Cost + BaseCost;
1317
1318}
1319
1320InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
1321                                                 Type *CondTy,
1322                                                 CmpInst::Predicate VecPred,
1323                                                 TTI::TargetCostKind CostKind,
1324                                                 const Instruction *I) {
1325  if (CostKind != TTI::TCK_RecipThroughput)
1326    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1327 I);
1328
1329 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1330 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1331 I);
1332
1333 // Skip if scalar size of ValTy is bigger than ELEN.
1334 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
1335 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1336 I);
1337
1338 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1339 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1340 if (CondTy->isVectorTy()) {
1341 if (ValTy->getScalarSizeInBits() == 1) {
1342 // vmandn.mm v8, v8, v9
1343 // vmand.mm v9, v0, v9
1344 // vmor.mm v0, v9, v8
1345 return LT.first *
1346 getRISCVInstructionCost(
1347 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1348 LT.second, CostKind);
1349 }
1350 // vselect and max/min are supported natively.
1351 return LT.first *
1352 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
1353 }
1354
1355 if (ValTy->getScalarSizeInBits() == 1) {
1356 // vmv.v.x v9, a0
1357 // vmsne.vi v9, v9, 0
1358 // vmandn.mm v8, v8, v9
1359 // vmand.mm v9, v0, v9
1360 // vmor.mm v0, v9, v8
1361 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
1362 return LT.first *
1363 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
1364 InterimVT, CostKind) +
1365 LT.first * getRISCVInstructionCost(
1366 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1367 LT.second, CostKind);
1368 }
1369
1370 // vmv.v.x v10, a0
1371 // vmsne.vi v0, v10, 0
1372 // vmerge.vvm v8, v9, v8, v0
1373 return LT.first * getRISCVInstructionCost(
1374 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
1375 LT.second, CostKind);
1376 }
1377
1378 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1379 ValTy->isVectorTy()) {
1380 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1381
1382 // Support natively.
1383 if (CmpInst::isIntPredicate(VecPred))
1384 return LT.first * 1;
1385
1386 // If we do not support the input floating point vector type, use the base
1387 // one which will calculate as:
1388 // ScalarizeCost + Num * Cost for fixed vector,
1389 // InvalidCost for scalable vector.
1390 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
1391 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
1392 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
1393 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1394 I);
1395 switch (VecPred) {
1396 // Support natively.
1397 case CmpInst::FCMP_OEQ:
1398 case CmpInst::FCMP_OGT:
1399 case CmpInst::FCMP_OGE:
1400 case CmpInst::FCMP_OLT:
1401 case CmpInst::FCMP_OLE:
1402 case CmpInst::FCMP_UNE:
1403 return LT.first * 1;
1404 // TODO: Other comparisons?
1405 default:
1406 break;
1407 }
1408 }
1409
1410 // TODO: Add cost for scalar type.
1411
1412 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1413}
1414
1415InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
1416                                             TTI::TargetCostKind CostKind,
1417                                             const Instruction *I) {
1418  if (CostKind != TTI::TCK_RecipThroughput)
1419    return Opcode == Instruction::PHI ? 0 : 1;
1420 // Branches are assumed to be predicted.
1421 return 0;
1422}
1423
1424InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1425                                                 TTI::TargetCostKind CostKind,
1426                                                 unsigned Index, Value *Op0,
1427 Value *Op1) {
1428 assert(Val->isVectorTy() && "This must be a vector type");
1429
1430 if (Opcode != Instruction::ExtractElement &&
1431 Opcode != Instruction::InsertElement)
1432 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1433
1434 // Legalize the type.
1435 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
1436
1437 // This type is legalized to a scalar type.
1438 if (!LT.second.isVector()) {
1439 auto *FixedVecTy = cast<FixedVectorType>(Val);
1440 // If Index is a known constant, cost is zero.
1441 if (Index != -1U)
1442 return 0;
1443 // Extract/InsertElement with non-constant index is very costly when
1444 // scalarized; estimate cost of loads/stores sequence via the stack:
1445 // ExtractElement cost: store vector to stack, load scalar;
1446 // InsertElement cost: store vector to stack, store scalar, load vector.
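    // For example, extracting from a <4 x i32> at a variable index is costed
    // as 4 scalar stores plus 1 scalar load.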
1447 Type *ElemTy = FixedVecTy->getElementType();
1448 auto NumElems = FixedVecTy->getNumElements();
1449 auto Align = DL.getPrefTypeAlign(ElemTy);
1450 InstructionCost LoadCost =
1451 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
1452 InstructionCost StoreCost =
1453 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
1454 return Opcode == Instruction::ExtractElement
1455 ? StoreCost * NumElems + LoadCost
1456 : (StoreCost + LoadCost) * NumElems + StoreCost;
1457 }
1458
1459 // For unsupported scalable vector.
1460 if (LT.second.isScalableVector() && !LT.first.isValid())
1461 return LT.first;
1462
1463 if (!isTypeLegal(Val))
1464 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1465
1466 // Mask vector extract/insert is expanded via e8.
1467 if (Val->getScalarSizeInBits() == 1) {
1468 VectorType *WideTy =
1469        VectorType::get(IntegerType::get(Val->getContext(), 8),
1470                        cast<VectorType>(Val)->getElementCount());
1471 if (Opcode == Instruction::ExtractElement) {
1472 InstructionCost ExtendCost
1473 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1475 InstructionCost ExtractCost
1476 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1477 return ExtendCost + ExtractCost;
1478 }
1479 InstructionCost ExtendCost
1480 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1482 InstructionCost InsertCost
1483 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1484 InstructionCost TruncCost
1485 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
1487 return ExtendCost + InsertCost + TruncCost;
1488 }
1489
1490
1491 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
1492 // and vslideup + vmv.s.x to insert element to vector.
1493 unsigned BaseCost = 1;
1494  // For insertelement we additionally need to add 1 to the index as the input of vslideup.
1495 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
1496
1497 if (Index != -1U) {
1498 // The type may be split. For fixed-width vectors we can normalize the
1499 // index to the new type.
1500 if (LT.second.isFixedLengthVector()) {
1501 unsigned Width = LT.second.getVectorNumElements();
1502 Index = Index % Width;
1503 }
1504
1505 // We could extract/insert the first element without vslidedown/vslideup.
1506 if (Index == 0)
1507 SlideCost = 0;
1508 else if (Opcode == Instruction::InsertElement)
1509 SlideCost = 1; // With a constant index, we do not need to use addi.
1510 }
1511
1512  // Extracting an i64 element on a target with XLEN=32 needs extra instructions.
1513 if (Val->getScalarType()->isIntegerTy() &&
1514 ST->getXLen() < Val->getScalarSizeInBits()) {
1515 // For extractelement, we need the following instructions:
1516 // vsetivli zero, 1, e64, m1, ta, mu (not count)
1517 // vslidedown.vx v8, v8, a0
1518 // vmv.x.s a0, v8
1519 // li a1, 32
1520 // vsrl.vx v8, v8, a1
1521 // vmv.x.s a1, v8
1522
1523 // For insertelement, we need the following instructions:
1524 // vsetivli zero, 2, e32, m4, ta, mu (not count)
1525 // vmv.v.i v12, 0
1526 // vslide1up.vx v16, v12, a1
1527 // vslide1up.vx v12, v16, a0
1528 // addi a0, a2, 1
1529 // vsetvli zero, a0, e64, m4, tu, mu (not count)
1530 // vslideup.vx v8, v12, a2
1531
1532 // TODO: should we count these special vsetvlis?
1533 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
1534 }
1535 return BaseCost + SlideCost;
1536}
1537
1538InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
1539    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1540    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1541    ArrayRef<const Value *> Args, const Instruction *CxtI) {
1542
1543 // TODO: Handle more cost kinds.
1544  if (CostKind != TTI::TCK_RecipThroughput)
1545    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1546 Args, CxtI);
1547
1548 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1549 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1550 Args, CxtI);
1551
1552 // Skip if scalar size of Ty is bigger than ELEN.
1553 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
1554 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1555 Args, CxtI);
1556
1557 // Legalize the type.
1558 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1559
1560 // TODO: Handle scalar type.
1561 if (!LT.second.isVector())
1562 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1563 Args, CxtI);
1564
1565
1566 auto getConstantMatCost =
1567 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
1568 if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
1569 // Two sub-cases:
1570 // * Has a 5 bit immediate operand which can be splatted.
1571 // * Has a larger immediate which must be materialized in scalar register
1572 // We return 0 for both as we currently ignore the cost of materializing
1573 // scalar constants in GPRs.
1574 return 0;
1575
1576 return getConstantPoolLoadCost(Ty, CostKind);
1577 };
1578
1579 // Add the cost of materializing any constant vectors required.
1580 InstructionCost ConstantMatCost = 0;
1581 if (Op1Info.isConstant())
1582 ConstantMatCost += getConstantMatCost(0, Op1Info);
1583 if (Op2Info.isConstant())
1584 ConstantMatCost += getConstantMatCost(1, Op2Info);
1585
1586 switch (TLI->InstructionOpcodeToISD(Opcode)) {
1587 case ISD::ADD:
1588 case ISD::SUB:
1589 case ISD::AND:
1590 case ISD::OR:
1591 case ISD::XOR:
1592 case ISD::SHL:
1593 case ISD::SRL:
1594 case ISD::SRA:
1595 case ISD::MUL:
1596 case ISD::MULHS:
1597 case ISD::MULHU:
1598 case ISD::FADD:
1599 case ISD::FSUB:
1600 case ISD::FMUL:
1601 case ISD::FNEG: {
1602 return ConstantMatCost + TLI->getLMULCost(LT.second) * LT.first * 1;
1603 }
1604 default:
1605 return ConstantMatCost +
1606 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1607 Args, CxtI);
1608 }
1609}
1610
1611// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
1612InstructionCost RISCVTTIImpl::getPointersChainCost(
1613    ArrayRef<const Value *> Ptrs, const Value *Base,
1614    const TTI::PointersChainInfo &Info, Type *AccessTy,
1615    TTI::TargetCostKind CostKind) {
1616  InstructionCost Cost = TTI::TCC_Free;
1617  // In the basic model we take into account GEP instructions only
1618 // (although here can come alloca instruction, a value, constants and/or
1619 // constant expressions, PHIs, bitcasts ... whatever allowed to be used as a
1620  // pointer). Typically, if Base is not a GEP instruction and all the
1621  // pointers are relative to the same base address, all the rest are
1622  // either GEP instructions, PHIs, bitcasts or constants. When we have the
1623  // same base, we just calculate the cost of each non-Base GEP as an ADD
1624  // operation if any of its indices is a non-constant.
1625  // If there are no known dependencies between the pointers, the cost is
1626  // calculated as the sum of the costs of the GEP instructions.
1627 for (auto [I, V] : enumerate(Ptrs)) {
1628 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
1629 if (!GEP)
1630 continue;
1631 if (Info.isSameBase() && V != Base) {
1632 if (GEP->hasAllConstantIndices())
1633 continue;
1634 // If the chain is unit-stride and BaseReg + stride*i is a legal
1635 // addressing mode, then presume the base GEP is sitting around in a
1636 // register somewhere and check if we can fold the offset relative to
1637 // it.
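      // For example, consecutive i32 accesses off one base produce offsets
      // 0, 4, 8, ... which fit in the 12-bit immediate of RISC-V loads and
      // stores, so such GEPs add no cost.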
1638 unsigned Stride = DL.getTypeStoreSize(AccessTy);
1639 if (Info.isUnitStride() &&
1640 isLegalAddressingMode(AccessTy,
1641 /* BaseGV */ nullptr,
1642 /* BaseOffset */ Stride * I,
1643 /* HasBaseReg */ true,
1644 /* Scale */ 0,
1645 GEP->getType()->getPointerAddressSpace()))
1646 continue;
1647 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
1648 {TTI::OK_AnyValue, TTI::OP_None},
1649 {TTI::OK_AnyValue, TTI::OP_None},
1650 std::nullopt);
1651 } else {
1652 SmallVector<const Value *> Indices(GEP->indices());
1653 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
1654 Indices, AccessTy, CostKind);
1655 }
1656 }
1657 return Cost;
1658}
1659
1660void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1661                                           TTI::UnrollingPreferences &UP,
1662                                           OptimizationRemarkEmitter *ORE) {
1663  // TODO: More tuning on benchmarks and metrics with changes as needed
1664 // would apply to all settings below to enable performance.
1665
1666
1667 if (ST->enableDefaultUnroll())
1668 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
1669
1670  // Enable upper-bound unrolling universally, independent of the conditions
1671  // below.
1672 UP.UpperBound = true;
1673
1674 // Disable loop unrolling for Oz and Os.
1675 UP.OptSizeThreshold = 0;
1676 UP.PartialOptSizeThreshold = 0;
1677 if (L->getHeader()->getParent()->hasOptSize())
1678 return;
1679
1680 SmallVector<BasicBlock *, 4> ExitingBlocks;
1681 L->getExitingBlocks(ExitingBlocks);
1682 LLVM_DEBUG(dbgs() << "Loop has:\n"
1683 << "Blocks: " << L->getNumBlocks() << "\n"
1684 << "Exit blocks: " << ExitingBlocks.size() << "\n");
1685
1686 // Allow at most one exit other than the latch. This acts as an early exit,
1687 // as it mirrors the profitability calculation of the runtime unroller.
1688 if (ExitingBlocks.size() > 2)
1689 return;
1690
1691 // Limit the CFG of the loop body for targets with a branch predictor.
1692 // Allowing 4 blocks permits if-then-else diamonds in the body.
1693 if (L->getNumBlocks() > 4)
1694 return;
1695
1696 // Don't unroll vectorized loops, including the remainder loop
1697 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
1698 return;
1699
1700 // Scan the loop: don't unroll loops with calls as this could prevent
1701 // inlining.
1702 InstructionCost Cost = 0;
1703 for (auto *BB : L->getBlocks()) {
1704 for (auto &I : *BB) {
1705 // Initial setting - Don't unroll loops containing vectorized
1706 // instructions.
1707 if (I.getType()->isVectorTy())
1708 return;
1709
1710 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1711 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1712 if (!isLoweredToCall(F))
1713 continue;
1714 }
1715 return;
1716 }
1717
1718 SmallVector<const Value *> Operands(I.operand_values());
1719 Cost += getInstructionCost(&I, Operands,
1720 TargetTransformInfo::TCK_SizeAndLatency);
1721 }
1722 }
1723
1724 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1725
1726 UP.Partial = true;
1727 UP.Runtime = true;
1728 UP.UnrollRemainder = true;
1729 UP.UnrollAndJam = true;
1730 UP.UnrollAndJamInnerLoopThreshold = 60;
1731
1732 // Forcing unrolling of small loops can be very useful because of the
1733 // branch-taken cost of the backedge.
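// (Cost here is accumulated in size-and-latency units, so the threshold of
// 12 corresponds roughly to a dozen simple instructions.)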
1734 if (Cost < 12)
1735 UP.Force = true;
1736}
1737
1738void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1739 TTI::PeelingPreferences &PP) {
1740 BaseT::getPeelingPreferences(L, SE, PP);
1741}
1742
1743unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
1744 TypeSize Size = DL.getTypeSizeInBits(Ty);
1745 if (Ty->isVectorTy()) {
1746 if (Size.isScalable() && ST->hasVInstructions())
1747 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
1748
1749 if (ST->useRVVForFixedLengthVectors())
1750 return divideCeil(Size, ST->getRealMinVLen());
1751 }
1752
1753 return BaseT::getRegUsageForType(Ty);
1754}
1755
1756unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1757 if (SLPMaxVF.getNumOccurrences())
1758 return SLPMaxVF;
1759
1760 // Return how many elements can fit in a register of getRegisterBitWidth
1761 // bits. This is the same routine as used in the LoopVectorizer. We should
1762 // probably be accounting for whether we actually have instructions with the
1763 // right lane type, but we don't have enough information to do that without
1764 // some additional plumbing which hasn't been justified yet.
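// Illustrative example: with a 256-bit fixed-width register result and
// ElemWidth == 32, the SLP vectorizer is capped at VF = 256 / 32 = 8.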
1765 TypeSize RegWidth =
1766 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
1767 // If no vector registers, or absurd element widths, disable
1768 // vectorization by returning 1.
1769 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
1770}
1771
1772bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
1773 const TargetTransformInfo::LSRCost &C2) {
1774 // The RISC-V specific preference here is "instruction count first".
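// Illustrative example: a solution needing 3 instructions and 5 registers is
// preferred over one needing 4 instructions and 3 registers, since Insns is
// compared before NumRegs in the lexicographic comparison below.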
1775 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
1776 C1.NumIVMuls, C1.NumBaseAdds,
1777 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
1778 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
1779 C2.NumIVMuls, C2.NumBaseAdds,
1780 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
1781}
1782
1783bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
1784 auto *VTy = dyn_cast<VectorType>(DataTy);
1785 if (!VTy || VTy->isScalableTy())
1786 return false;
1787
1788 if (!isLegalMaskedLoadStore(DataTy, Alignment))
1789 return false;
1790 return true;
1791}
1792
1793bool RISCVTTIImpl::areInlineCompatible(const Function *Caller,
1794 const Function *Callee) const {
1795 const TargetMachine &TM = getTLI()->getTargetMachine();
1796
1797 const FeatureBitset &CallerBits =
1798 TM.getSubtargetImpl(*Caller)->getFeatureBits();
1799 const FeatureBitset &CalleeBits =
1800 TM.getSubtargetImpl(*Callee)->getFeatureBits();
1801
1802 // Inline a callee if its target-features are a subset of the caller's
1803 // target-features.
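// Illustrative example: a callee compiled for rv64gc may be inlined into an
// rv64gcv caller (its features are a subset), but an rv64gcv callee may not
// be inlined into a plain rv64gc caller.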
1804 return (CallerBits & CalleeBits) == CalleeBits;
1805}