RISCVTargetTransformInfo.cpp
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
18#include "llvm/IR/IntrinsicsRISCV.h"
20#include <cmath>
21#include <optional>
22using namespace llvm;
23using namespace llvm::PatternMatch;
24
25#define DEBUG_TYPE "riscvtti"
26
28 "riscv-v-register-bit-width-lmul",
30 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
31 "by autovectorized code. Fractional LMULs are not supported."),
33
35 "riscv-v-slp-max-vf",
37 "Overrides result used for getMaximumVF query which is used "
38 "exclusively by SLP vectorizer."),
40
42 RVVMinTripCount("riscv-v-min-trip-count",
43 cl::desc("Set the lower bound of a trip count to decide on "
44 "vectorization while tail-folding."),
46
47InstructionCost
48RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
49 TTI::TargetCostKind CostKind) const {
50 // Check if the type is valid for all CostKind
51 if (!VT.isVector())
52 return InstructionCost::getInvalid();
53 size_t NumInstr = OpCodes.size();
54 if (CostKind == TTI::TCK_CodeSize)
55 return NumInstr;
56 InstructionCost LMULCost = TLI->getLMULCost(VT);
57 if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
58 return LMULCost * NumInstr;
59 InstructionCost Cost = 0;
60 for (auto Op : OpCodes) {
61 switch (Op) {
62 case RISCV::VRGATHER_VI:
63 Cost += TLI->getVRGatherVICost(VT);
64 break;
65 case RISCV::VRGATHER_VV:
66 Cost += TLI->getVRGatherVVCost(VT);
67 break;
68 case RISCV::VSLIDEUP_VI:
69 case RISCV::VSLIDEDOWN_VI:
70 Cost += TLI->getVSlideVICost(VT);
71 break;
72 case RISCV::VSLIDEUP_VX:
73 case RISCV::VSLIDEDOWN_VX:
74 Cost += TLI->getVSlideVXCost(VT);
75 break;
76 case RISCV::VREDMAX_VS:
77 case RISCV::VREDMIN_VS:
78 case RISCV::VREDMAXU_VS:
79 case RISCV::VREDMINU_VS:
80 case RISCV::VREDSUM_VS:
81 case RISCV::VREDAND_VS:
82 case RISCV::VREDOR_VS:
83 case RISCV::VREDXOR_VS:
84 case RISCV::VFREDMAX_VS:
85 case RISCV::VFREDMIN_VS:
86 case RISCV::VFREDUSUM_VS: {
87 unsigned VL = VT.getVectorMinNumElements();
88 if (!VT.isFixedLengthVector())
89 VL *= *getVScaleForTuning();
90 Cost += Log2_32_Ceil(VL);
91 break;
92 }
93 case RISCV::VFREDOSUM_VS: {
94 unsigned VL = VT.getVectorMinNumElements();
95 if (!VT.isFixedLengthVector())
96 VL *= *getVScaleForTuning();
97 Cost += VL;
98 break;
99 }
100 case RISCV::VMV_X_S:
101 case RISCV::VMV_S_X:
102 case RISCV::VFMV_F_S:
103 case RISCV::VFMV_S_F:
104 case RISCV::VMOR_MM:
105 case RISCV::VMXOR_MM:
106 case RISCV::VMAND_MM:
107 case RISCV::VMANDN_MM:
108 case RISCV::VMNAND_MM:
109 case RISCV::VCPOP_M:
110 case RISCV::VFIRST_M:
111 Cost += 1;
112 break;
113 default:
114 Cost += LMULCost;
115 }
116 }
117 return Cost;
118}
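// For illustration: a query such as
//   getRISCVInstructionCost({RISCV::VRGATHER_VV}, MVT::nxv4i32,
//                           TTI::TCK_RecipThroughput)
// is answered by the per-opcode TLI hooks above, while unordered reductions
// such as RISCV::VREDSUM_VS are costed as roughly log2(VL) to model the
// reduction tree, and the ordered RISCV::VFREDOSUM_VS as VL since it is
// inherently serial.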
119
120static InstructionCost getIntImmCostImpl(const DataLayout &DL,
121 const RISCVSubtarget *ST,
122 const APInt &Imm, Type *Ty,
123 TTI::TargetCostKind CostKind,
124 bool FreeZeroes) {
125 assert(Ty->isIntegerTy() &&
126 "getIntImmCost can only estimate cost of materialising integers");
127
128 // We have a Zero register, so 0 is always free.
129 if (Imm == 0)
130 return TTI::TCC_Free;
131
132 // Otherwise, we check how many instructions it will take to materialise.
133 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
134 /*CompressionCost=*/false, FreeZeroes);
135}
136
138InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
139 TTI::TargetCostKind CostKind) const {
140 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
141}
142
143// Look for patterns of shift followed by AND that can be turned into a pair of
144// shifts. We won't need to materialize an immediate for the AND so these can
145// be considered free.
146static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
147 uint64_t Mask = Imm.getZExtValue();
148 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
149 if (!BO || !BO->hasOneUse())
150 return false;
151
152 if (BO->getOpcode() != Instruction::Shl)
153 return false;
154
155 if (!isa<ConstantInt>(BO->getOperand(1)))
156 return false;
157
158 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
159 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
160 // is a mask shifted by c2 bits with c3 leading zeros.
161 if (isShiftedMask_64(Mask)) {
162 unsigned Trailing = llvm::countr_zero(Mask);
163 if (ShAmt == Trailing)
164 return true;
165 }
166
167 return false;
168}
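// Example of the pattern recognized above (values chosen for illustration):
//   %s = shl i64 %x, 3
//   %a = and i64 %s, 2040      ; 2040 == 0x7f8, a mask shifted by 3
// Here the mask's trailing zero count equals the shift amount, so the AND is
// foldable into (srli (slli %x, 56), 53) on RV64 and the 0x7f8 immediate is
// reported as free.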
169
170// If this i64 AND is part of ((X & -(1 << C1) & 0xffffffff) == C2 << C1),
171// DAGCombiner can convert this to (sraiw X, C1) == sext(C2) for RV64. On RV32,
172// the type will be split so only the lower 32 bits need to be compared using
173// (srai/srli X, C) == C2.
174static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm) {
175 if (!Inst->hasOneUse())
176 return false;
177
178 // Look for equality comparison.
179 auto *Cmp = dyn_cast<ICmpInst>(*Inst->user_begin());
180 if (!Cmp || !Cmp->isEquality())
181 return false;
182
183 // Right hand side of comparison should be a constant.
184 auto *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
185 if (!C)
186 return false;
187
188 uint64_t Mask = Imm.getZExtValue();
189
190 // Mask should be of the form -(1 << C) in the lower 32 bits.
191 if (!isUInt<32>(Mask) || !isPowerOf2_32(-uint32_t(Mask)))
192 return false;
193
194 // Comparison constant should be a subset of Mask.
195 uint64_t CmpC = C->getZExtValue();
196 if ((CmpC & Mask) != CmpC)
197 return false;
198
199 // We'll need to sign extend the comparison constant and shift it right. Make
200 // sure the new constant can use addi/xori+seqz/snez.
201 unsigned ShiftBits = llvm::countr_zero(Mask);
202 int64_t NewCmpC = SignExtend64<32>(CmpC) >> ShiftBits;
203 return NewCmpC >= -2048 && NewCmpC <= 2048;
204}
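// Example of the pattern recognized above (values chosen for illustration):
//   %m = and i64 %x, 4294967040   ; 0xffffff00 == -(1 << 8) in the low 32 bits
//   %c = icmp eq i64 %m, 17664    ; 0x4500, a subset of the mask
// The shifted, sign-extended comparison constant is 0x45, which fits
// addi/xori, so the AND immediate is reported as free; DAGCombiner turns the
// compare into (sraiw %x, 8) == 0x45 on RV64.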
205
207 const APInt &Imm, Type *Ty,
209 Instruction *Inst) const {
210 assert(Ty->isIntegerTy() &&
211 "getIntImmCost can only estimate cost of materialising integers");
212
213 // We have a Zero register, so 0 is always free.
214 if (Imm == 0)
215 return TTI::TCC_Free;
216
217 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
218 // commutative, in others the immediate comes from a specific argument index.
219 bool Takes12BitImm = false;
220 unsigned ImmArgIdx = ~0U;
221
222 switch (Opcode) {
223 case Instruction::GetElementPtr:
224 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
225 // split up large offsets in GEP into better parts than ConstantHoisting
226 // can.
227 return TTI::TCC_Free;
228 case Instruction::Store: {
229 // Use the materialization cost regardless of whether it is the address or
230 // the value that is constant, except when the store is misaligned and
231 // misaligned accesses are not legal (experience shows constant hoisting
232 // can sometimes be harmful in such cases).
233 if (Idx == 1 || !Inst)
234 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
235 /*FreeZeroes=*/true);
236
237 StoreInst *ST = cast<StoreInst>(Inst);
238 if (!getTLI()->allowsMemoryAccessForAlignment(
239 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
240 ST->getPointerAddressSpace(), ST->getAlign()))
241 return TTI::TCC_Free;
242
243 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
244 /*FreeZeroes=*/true);
245 }
246 case Instruction::Load:
247 // If the address is a constant, use the materialization cost.
248 return getIntImmCost(Imm, Ty, CostKind);
249 case Instruction::And:
250 // zext.h
251 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
252 return TTI::TCC_Free;
253 // zext.w
254 if (Imm == UINT64_C(0xffffffff) &&
255 ((ST->hasStdExtZba() && ST->isRV64()) || ST->isRV32()))
256 return TTI::TCC_Free;
257 // bclri
258 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
259 return TTI::TCC_Free;
260 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
261 canUseShiftPair(Inst, Imm))
262 return TTI::TCC_Free;
263 if (Inst && Idx == 1 && Imm.getBitWidth() == 64 &&
264 canUseShiftCmp(Inst, Imm))
265 return TTI::TCC_Free;
266 Takes12BitImm = true;
267 break;
268 case Instruction::Add:
269 Takes12BitImm = true;
270 break;
271 case Instruction::Or:
272 case Instruction::Xor:
273 // bseti/binvi
274 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
275 return TTI::TCC_Free;
276 Takes12BitImm = true;
277 break;
278 case Instruction::Mul:
279 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
280 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
281 return TTI::TCC_Free;
282 // One more or less than a power of 2 can use SLLI+ADD/SUB.
283 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
284 return TTI::TCC_Free;
285 // FIXME: There is no MULI instruction.
286 Takes12BitImm = true;
287 break;
288 case Instruction::Sub:
289 case Instruction::Shl:
290 case Instruction::LShr:
291 case Instruction::AShr:
292 Takes12BitImm = true;
293 ImmArgIdx = 1;
294 break;
295 default:
296 break;
297 }
298
299 if (Takes12BitImm) {
300 // Check immediate is the correct argument...
301 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
302 // ... and fits into the 12-bit immediate.
303 if (Imm.getSignificantBits() <= 64 &&
304 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
305 return TTI::TCC_Free;
306 }
307 }
308
309 // Otherwise, use the full materialisation cost.
310 return getIntImmCost(Imm, Ty, CostKind);
311 }
312
313 // By default, prevent hoisting.
314 return TTI::TCC_Free;
315}
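// For illustration of the rules above: `add i64 %x, 2047` is TCC_Free (the
// constant fits the 12-bit addi immediate), `and i64 %x, 65535` is free with
// Zbb via zext.h, while `add i64 %x, 4096` falls back to the RISCVMatInt
// materialization cost and so remains a candidate for constant hoisting.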
316
319 const APInt &Imm, Type *Ty,
321 // Prevent hoisting in unknown cases.
322 return TTI::TCC_Free;
323}
324
326 return ST->hasVInstructions();
327}
328
330RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
331 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
332 return ST->hasCPOPLike() ? TTI::PSK_FastHardware : TTI::PSK_Software;
333}
334
335InstructionCost RISCVTTIImpl::getPartialReductionCost(
336 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
337 ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
338 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
339 TTI::TargetCostKind CostKind) const {
340
341 // zve32x is broken for partial_reduce_umla, but let's make sure we
342 // don't generate them.
343 if (!ST->hasStdExtZvqdotq() || ST->getELen() < 64 ||
344 Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
345 InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
346 !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4))
347 return InstructionCost::getInvalid();
348
349 Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
350 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
351 // Note: Assuming all vqdot* variants have equal cost
352 return LT.first *
353 getRISCVInstructionCost(RISCV::VQDOT_VV, LT.second, CostKind);
354}
355
357 // Currently, the ExpandReductions pass can't expand scalable-vector
358 // reductions, but we still request expansion as RVV doesn't support certain
359 // reductions and the SelectionDAG can't legalize them either.
360 switch (II->getIntrinsicID()) {
361 default:
362 return false;
363 // These reductions have no equivalent in RVV
364 case Intrinsic::vector_reduce_mul:
365 case Intrinsic::vector_reduce_fmul:
366 return true;
367 }
368}
369
370std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
371 if (ST->hasVInstructions())
372 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
373 return BaseT::getMaxVScale();
374}
375
376std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
377 if (ST->hasVInstructions())
378 if (unsigned MinVLen = ST->getRealMinVLen();
379 MinVLen >= RISCV::RVVBitsPerBlock)
380 return MinVLen / RISCV::RVVBitsPerBlock;
382}
383
386 unsigned LMUL =
387 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
388 switch (K) {
390 return TypeSize::getFixed(ST->getXLen());
392 return TypeSize::getFixed(
393 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
396 (ST->hasVInstructions() &&
397 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
399 : 0);
400 }
401
402 llvm_unreachable("Unsupported register kind");
403}
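// Example: with -riscv-v-register-bit-width-lmul=2 and a minimum VLEN of 128,
// the fixed-length vector register width reported here is
// bit_floor(2) * 128 = 256 bits (assuming useRVVForFixedLengthVectors()),
// while the scalable answer is given in multiples of RVVBitsPerBlock.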
404
406RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty,
408 // Add a cost of address generation + the cost of the load. The address
409 // is expected to be a PC relative offset to a constant pool entry
410 // using auipc/addi.
411 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
412 /*AddressSpace=*/0, CostKind);
413}
414
415static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
416 unsigned Size = Mask.size();
417 if (!isPowerOf2_32(Size))
418 return false;
419 for (unsigned I = 0; I != Size; ++I) {
420 if (static_cast<unsigned>(Mask[I]) == I)
421 continue;
422 if (Mask[I] != 0)
423 return false;
424 if (Size % I != 0)
425 return false;
426 for (unsigned J = I + 1; J != Size; ++J)
427 // Check the pattern is repeated.
428 if (static_cast<unsigned>(Mask[J]) != J % I)
429 return false;
430 SubVectorSize = I;
431 return true;
432 }
433 // That means Mask is <0, 1, 2, 3>. This is not a concatenation.
434 return false;
435}
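// Example: the mask <0, 1, 0, 1, 0, 1, 0, 1> is the 2-element prefix repeated
// four times, so SubVectorSize is set to 2 and the function returns true; the
// identity mask <0, 1, 2, 3> falls out of the loop and is rejected by the
// final return, as noted in the comment above.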
436
438 LLVMContext &C) {
439 assert((DataVT.getScalarSizeInBits() != 8 ||
440 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
441 MVT IndexVT = DataVT.changeTypeToInteger();
442 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
443 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
444 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
445}
446
447/// Attempt to approximate the cost of a shuffle which will require splitting
448/// during legalization. Note that processShuffleMasks is not an exact proxy
449/// for the algorithm used in LegalizeVectorTypes, but hopefully it's a
450 /// reasonably close upper bound.
452 MVT LegalVT, VectorType *Tp,
453 ArrayRef<int> Mask,
455 assert(LegalVT.isFixedLengthVector() && !Mask.empty() &&
456 "Expected fixed vector type and non-empty mask");
457 unsigned LegalNumElts = LegalVT.getVectorNumElements();
458 // Number of destination vectors after legalization:
459 unsigned NumOfDests = divideCeil(Mask.size(), LegalNumElts);
460 // We are going to permute multiple sources and the result will be in
461 // multiple destinations. Providing an accurate cost only for splits where
462 // the element type remains the same.
463 if (NumOfDests <= 1 ||
465 Tp->getElementType()->getPrimitiveSizeInBits() ||
466 LegalNumElts >= Tp->getElementCount().getFixedValue())
468
469 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
470 unsigned LegalVTSize = LegalVT.getStoreSize();
471 // Number of source vectors after legalization:
472 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
473
474 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(), LegalNumElts);
475
476 unsigned NormalizedVF = LegalNumElts * std::max(NumOfSrcs, NumOfDests);
477 unsigned NumOfSrcRegs = NormalizedVF / LegalNumElts;
478 unsigned NumOfDestRegs = NormalizedVF / LegalNumElts;
479 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
480 assert(NormalizedVF >= Mask.size() &&
481 "Normalized mask expected to be not shorter than original mask.");
482 copy(Mask, NormalizedMask.begin());
483 InstructionCost Cost = 0;
484 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
486 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
487 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
488 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
489 return;
490 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
491 .second)
492 return;
493 Cost += TTI.getShuffleCost(
495 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
496 SingleOpTy, RegMask, CostKind, 0, nullptr);
497 },
498 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
499 Cost += TTI.getShuffleCost(
501 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
502 SingleOpTy, RegMask, CostKind, 0, nullptr);
503 });
504 return Cost;
505}
506
507/// Try to perform better estimation of the permutation.
508/// 1. Split the source/destination vectors into real registers.
509/// 2. Do the mask analysis to identify which real registers are
510/// permuted. If more than 1 source registers are used for the
511/// destination register building, the cost for this destination register
512/// is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
513/// source register is used, build mask and calculate the cost as a cost
514/// of PermuteSingleSrc.
515/// Also, for the single register permute we try to identify if the
516/// destination register is just a copy of the source register or the
517/// copy of the previous destination register (the cost is
518/// TTI::TCC_Basic). If the source register is just reused, the cost for
519/// this operation is 0.
520static InstructionCost
522 std::optional<unsigned> VLen, VectorType *Tp,
524 assert(LegalVT.isFixedLengthVector());
525 if (!VLen || Mask.empty())
527 MVT ElemVT = LegalVT.getVectorElementType();
528 unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
529 LegalVT = TTI.getTypeLegalizationCost(
530 FixedVectorType::get(Tp->getElementType(), ElemsPerVReg))
531 .second;
532 // Number of destination vectors after legalization:
533 InstructionCost NumOfDests =
534 divideCeil(Mask.size(), LegalVT.getVectorNumElements());
535 if (NumOfDests <= 1 ||
537 Tp->getElementType()->getPrimitiveSizeInBits() ||
538 LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue())
540
541 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
542 unsigned LegalVTSize = LegalVT.getStoreSize();
543 // Number of source vectors after legalization:
544 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
545
546 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
547 LegalVT.getVectorNumElements());
548
549 unsigned E = NumOfDests.getValue();
550 unsigned NormalizedVF =
551 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
552 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
553 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
554 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
555 assert(NormalizedVF >= Mask.size() &&
556 "Normalized mask expected to be not shorter than original mask.");
557 copy(Mask, NormalizedMask.begin());
558 InstructionCost Cost = 0;
559 int NumShuffles = 0;
560 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
562 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
563 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
564 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
565 return;
566 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
567 .second)
568 return;
569 ++NumShuffles;
570 Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
571 SingleOpTy, RegMask, CostKind, 0, nullptr);
572 },
573 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
574 Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
575 SingleOpTy, RegMask, CostKind, 0, nullptr);
576 NumShuffles += 2;
577 });
578 // Note: check that we do not emit too many shuffles here to prevent code
579 // size explosion.
580 // TODO: investigate, if it can be improved by extra analysis of the masks
581 // to check if the code is more profitable.
582 if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) ||
583 (NumOfDestRegs <= 2 && NumShuffles < 4))
584 return Cost;
585 return InstructionCost::getInvalid();
586}
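// For illustration (assuming VLEN=128): a v16i32 shuffle spans LMUL=4, i.e.
// four registers of 4 x i32 (ElemsPerVReg = 128 / 32), so the mask is
// re-expressed as per-register 4-element masks. Identity chunks and repeated
// single-source chunks are free; the remaining chunks are costed as m1-sized
// single- or two-source shuffles, and the estimate is only returned when the
// number of emitted shuffles stays small, per the check above.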
587
588InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
589 ArrayRef<int> Mask,
591 // Avoid missing masks and length changing shuffles
592 if (Mask.size() <= 2 || Mask.size() != Tp->getNumElements())
594
595 int NumElts = Tp->getNumElements();
596 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
597 // Avoid scalarization cases
598 if (!LT.second.isFixedLengthVector())
600
601 // Requires moving elements between parts, which requires additional
602 // unmodeled instructions.
603 if (LT.first != 1)
605
606 auto GetSlideOpcode = [&](int SlideAmt) {
607 assert(SlideAmt != 0);
608 bool IsVI = isUInt<5>(std::abs(SlideAmt));
609 if (SlideAmt < 0)
610 return IsVI ? RISCV::VSLIDEDOWN_VI : RISCV::VSLIDEDOWN_VX;
611 return IsVI ? RISCV::VSLIDEUP_VI : RISCV::VSLIDEUP_VX;
612 };
613
614 std::array<std::pair<int, int>, 2> SrcInfo;
615 if (!isMaskedSlidePair(Mask, NumElts, SrcInfo))
617
618 if (SrcInfo[1].second == 0)
619 std::swap(SrcInfo[0], SrcInfo[1]);
620
621 InstructionCost FirstSlideCost = 0;
622 if (SrcInfo[0].second != 0) {
623 unsigned Opcode = GetSlideOpcode(SrcInfo[0].second);
624 FirstSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
625 }
626
627 if (SrcInfo[1].first == -1)
628 return FirstSlideCost;
629
630 InstructionCost SecondSlideCost = 0;
631 if (SrcInfo[1].second != 0) {
632 unsigned Opcode = GetSlideOpcode(SrcInfo[1].second);
633 SecondSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
634 } else {
635 SecondSlideCost =
636 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
637 }
638
639 auto EC = Tp->getElementCount();
640 VectorType *MaskTy =
642 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
643 return FirstSlideCost + SecondSlideCost + MaskCost;
644}
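// Example: for a 4-element fixed vector that fits one register, the mask
// <1, 2, 3, -1> is a single slide of the first source down by one element, so
// the cost is one vslidedown.vi; a mask that also draws elements from a
// second source adds a second slide (or a vmerge.vvm when that source is not
// shifted) plus the constant pool load for the select mask.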
645
648 VectorType *SrcTy, ArrayRef<int> Mask,
649 TTI::TargetCostKind CostKind, int Index,
651 const Instruction *CxtI) const {
652 assert((Mask.empty() || DstTy->isScalableTy() ||
653 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
654 "Expected the Mask to match the return size if given");
655 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
656 "Expected the same scalar types");
657
658 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
659 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
660
661 // First, handle cases where having a fixed length vector enables us to
662 // give a more accurate cost than falling back to generic scalable codegen.
663 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
664 if (auto *FVTp = dyn_cast<FixedVectorType>(SrcTy);
665 FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) {
667 *this, LT.second, ST->getRealVLen(),
668 Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind);
669 if (VRegSplittingCost.isValid())
670 return VRegSplittingCost;
671 switch (Kind) {
672 default:
673 break;
675 if (Mask.size() >= 2) {
676 MVT EltTp = LT.second.getVectorElementType();
677 // If the size of the element is < ELEN then shuffles of interleaves and
678 // deinterleaves of 2 vectors can be lowered into the following
679 // sequences
680 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
681 // Example sequence:
682 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
683 // vwaddu.vv v10, v8, v9
684 // li a0, -1 (ignored)
685 // vwmaccu.vx v10, a0, v9
686 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
687 return 2 * LT.first * TLI->getLMULCost(LT.second);
688
689 if (Mask[0] == 0 || Mask[0] == 1) {
690 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
691 // Example sequence:
692 // vnsrl.wi v10, v8, 0
693 if (equal(DeinterleaveMask, Mask))
694 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
695 LT.second, CostKind);
696 }
697 }
698 int SubVectorSize;
699 if (LT.second.getScalarSizeInBits() != 1 &&
700 isRepeatedConcatMask(Mask, SubVectorSize)) {
702 unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
703 // The cost of extraction from a subvector is 0 if the index is 0.
704 for (unsigned I = 0; I != NumSlides; ++I) {
705 unsigned InsertIndex = SubVectorSize * (1 << I);
706 FixedVectorType *SubTp =
707 FixedVectorType::get(SrcTy->getElementType(), InsertIndex);
708 FixedVectorType *DestTp =
710 std::pair<InstructionCost, MVT> DestLT =
712 // Add the cost of whole vector register move because the
713 // destination vector register group for vslideup cannot overlap the
714 // source.
715 Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
716 Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {},
717 CostKind, InsertIndex, SubTp);
718 }
719 return Cost;
720 }
721 }
722
723 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
724 SlideCost.isValid())
725 return SlideCost;
726
727 // vrgather + cost of generating the mask constant.
728 // We model this for an unknown mask with a single vrgather.
729 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
730 LT.second.getVectorNumElements() <= 256)) {
731 VectorType *IdxTy =
732 getVRGatherIndexType(LT.second, *ST, SrcTy->getContext());
733 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
734 return IndexCost +
735 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
736 }
737 break;
738 }
741
742 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
743 SlideCost.isValid())
744 return SlideCost;
745
746 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
747 // register for the second vrgather. We model this for an unknown
748 // (shuffle) mask.
749 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
750 LT.second.getVectorNumElements() <= 256)) {
751 auto &C = SrcTy->getContext();
752 auto EC = SrcTy->getElementCount();
753 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
755 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
756 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
757 return 2 * IndexCost +
758 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
759 LT.second, CostKind) +
760 MaskCost;
761 }
762 break;
763 }
764 }
765
766 auto shouldSplit = [](TTI::ShuffleKind Kind) {
767 switch (Kind) {
768 default:
769 return false;
773 return true;
774 }
775 };
776
777 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
778 shouldSplit(Kind)) {
779 InstructionCost SplitCost =
780 costShuffleViaSplitting(*this, LT.second, FVTp, Mask, CostKind);
781 if (SplitCost.isValid())
782 return SplitCost;
783 }
784 }
785
786 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
787 switch (Kind) {
788 default:
789 // Fallthrough to generic handling.
790 // TODO: Most of these cases will return getInvalid in generic code, and
791 // must be implemented here.
792 break;
794 // Extract at zero is always a subregister extract
795 if (Index == 0)
796 return TTI::TCC_Free;
797
798 // If we're extracting a subvector of at most m1 size at a sub-register
799 // boundary - which unfortunately we need exact vlen to identify - this is
800 // a subregister extract at worst and thus won't require a vslidedown.
801 // TODO: Extend for aligned m2, m4 subvector extracts
802 // TODO: Extend for misaligned (but contained) extracts
803 // TODO: Extend for scalable subvector types
804 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
805 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
806 if (std::optional<unsigned> VLen = ST->getRealVLen();
807 VLen && SubLT.second.getScalarSizeInBits() * Index % *VLen == 0 &&
808 SubLT.second.getSizeInBits() <= *VLen)
809 return TTI::TCC_Free;
810 }
811
812 // Example sequence:
813 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
814 // vslidedown.vi v8, v9, 2
815 return LT.first *
816 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
818 // Example sequence:
819 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
820 // vslideup.vi v8, v9, 2
821 LT = getTypeLegalizationCost(DstTy);
822 return LT.first *
823 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
824 case TTI::SK_Select: {
825 // Example sequence:
826 // li a0, 90
827 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
828 // vmv.s.x v0, a0
829 // vmerge.vvm v8, v9, v8, v0
830 // We use 2 for the cost of the mask materialization as this is the true
831 // cost for small masks and most shuffles are small. At worst, this cost
832 // should be a very small constant for the constant pool load. As such,
833 // we may bias towards large selects slightly more than truly warranted.
834 return LT.first *
835 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
836 LT.second, CostKind));
837 }
838 case TTI::SK_Broadcast: {
839 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
840 Instruction::InsertElement);
841 if (LT.second.getScalarSizeInBits() == 1) {
842 if (HasScalar) {
843 // Example sequence:
844 // andi a0, a0, 1
845 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
846 // vmv.v.x v8, a0
847 // vmsne.vi v0, v8, 0
848 return LT.first *
849 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
850 LT.second, CostKind));
851 }
852 // Example sequence:
853 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
854 // vmv.v.i v8, 0
855 // vmerge.vim v8, v8, 1, v0
856 // vmv.x.s a0, v8
857 // andi a0, a0, 1
858 // vmv.v.x v8, a0
859 // vmsne.vi v0, v8, 0
860
861 return LT.first *
862 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
863 RISCV::VMV_X_S, RISCV::VMV_V_X,
864 RISCV::VMSNE_VI},
865 LT.second, CostKind));
866 }
867
868 if (HasScalar) {
869 // Example sequence:
870 // vmv.v.x v8, a0
871 return LT.first *
872 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
873 }
874
875 // Example sequence:
876 // vrgather.vi v9, v8, 0
877 return LT.first *
878 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
879 }
880 case TTI::SK_Splice: {
881 // vslidedown+vslideup.
882 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
883 // of similar code, but I think we expand through memory.
884 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
885 if (Index >= 0 && Index < 32)
886 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
887 else if (Index < 0 && Index > -32)
888 Opcodes[1] = RISCV::VSLIDEUP_VI;
889 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
890 }
891 case TTI::SK_Reverse: {
892
893 if (!LT.second.isVector())
895
896 // TODO: Cases to improve here:
897 // * Illegal vector types
898 // * i64 on RV32
899 if (SrcTy->getElementType()->isIntegerTy(1)) {
900 VectorType *WideTy =
901 VectorType::get(IntegerType::get(SrcTy->getContext(), 8),
902 cast<VectorType>(SrcTy)->getElementCount());
903 return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy,
905 getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0,
906 nullptr) +
907 getCastInstrCost(Instruction::Trunc, SrcTy, WideTy,
909 }
910
911 MVT ContainerVT = LT.second;
912 if (LT.second.isFixedLengthVector())
913 ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
914 MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
915 if (ContainerVT.bitsLE(M1VT)) {
916 // Example sequence:
917 // csrr a0, vlenb
918 // srli a0, a0, 3
919 // addi a0, a0, -1
920 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
921 // vid.v v9
922 // vrsub.vx v10, v9, a0
923 // vrgather.vv v9, v8, v10
924 InstructionCost LenCost = 3;
925 if (LT.second.isFixedLengthVector())
926 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
927 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
928 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
929 if (LT.second.isFixedLengthVector() &&
930 isInt<5>(LT.second.getVectorNumElements() - 1))
931 Opcodes[1] = RISCV::VRSUB_VI;
932 InstructionCost GatherCost =
933 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
934 return LT.first * (LenCost + GatherCost);
935 }
936
937 // At high LMUL, we split into a series of M1 reverses (see
938 // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate
939 // the resulting gap at the bottom (for fixed vectors only). The important
940 // bit is that the cost scales linearly, not quadratically with LMUL.
941 unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX};
942 InstructionCost FixedCost =
943 getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3;
944 unsigned Ratio =
946 InstructionCost GatherCost =
947 getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio;
948 InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 0 :
949 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind);
950 return FixedCost + LT.first * (GatherCost + SlideCost);
951 }
952 }
953 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
954 SubTp);
955}
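// For illustration of the SK_Reverse costing above (assuming VLEN=128): an
// m1-sized reverse such as <vscale x 2 x i32> is costed as the csrr/srli/addi
// length computation (LenCost = 3 for scalable types) plus a
// vid.v/vrsub.vx/vrgather.vv sequence, while larger LMULs are modeled as a
// linear number of m1 reverses plus one trailing slide for fixed vectors.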
956
957static bool isM1OrSmaller(MVT VT) {
958 RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
959 return (LMUL == RISCVVType::VLMUL::LMUL_F8 ||
960 LMUL == RISCVVType::VLMUL::LMUL_F4 ||
961 LMUL == RISCVVType::VLMUL::LMUL_F2 ||
962 LMUL == RISCVVType::VLMUL::LMUL_1);
963}
964
966 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
967 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
968 ArrayRef<Value *> VL) const {
971
972 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
973 // For now, skip all fixed vector cost analysis when P extension is available
974 // to avoid crashes in getMinRVVVectorSizeInBits()
975 if (ST->enablePExtCodeGen() && isa<FixedVectorType>(Ty)) {
976 return 1; // Treat as single instruction cost for now
977 }
978
979 // A build_vector (which is m1 sized or smaller) can be done in no
980 // worse than one vslide1down.vx per element in the type. We could
981 // in theory do an explode_vector in the inverse manner, but our
982 // lowering today does not have a first class node for this pattern.
984 Ty, DemandedElts, Insert, Extract, CostKind);
985 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
986 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
987 if (Ty->getScalarSizeInBits() == 1) {
988 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
989 // Note: Implicit scalar anyextend is assumed to be free since the i1
990 // must be stored in a GPR.
991 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
992 CostKind) +
993 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
995 }
996
997 assert(LT.second.isFixedLengthVector());
998 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
999 if (isM1OrSmaller(ContainerVT)) {
1000 InstructionCost BV =
1001 cast<FixedVectorType>(Ty)->getNumElements() *
1002 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
1003 if (BV < Cost)
1004 Cost = BV;
1005 }
1006 }
1007 return Cost;
1008}
1009
1013 unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load
1014 : Instruction::Store;
1015 Type *Src = MICA.getDataType();
1016 Align Alignment = MICA.getAlignment();
1017 unsigned AddressSpace = MICA.getAddressSpace();
1018
1019 if (!isLegalMaskedLoadStore(Src, Alignment) ||
1022
1023 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
1024}
1025
1027 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1028 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1029 bool UseMaskForCond, bool UseMaskForGaps) const {
1030
1031 // The interleaved memory access pass will lower (de)interleave ops combined
1032 // with an adjacent appropriate memory to vlseg/vsseg intrinsics. vlseg/vsseg
1033 // only support masking per-iteration (i.e. condition), not per-segment (i.e.
1034 // gap).
1035 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
1036 auto *VTy = cast<VectorType>(VecTy);
1037 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
1038 // Need to make sure the type hasn't been scalarized
1039 if (LT.second.isVector()) {
1040 auto *SubVecTy =
1041 VectorType::get(VTy->getElementType(),
1042 VTy->getElementCount().divideCoefficientBy(Factor));
1043 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
1044 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
1045 AddressSpace, DL)) {
1046
1047 // Some processors optimize segment loads/stores as one wide memory op +
1048 // Factor * LMUL shuffle ops.
1049 if (ST->hasOptimizedSegmentLoadStore(Factor)) {
1051 getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
1052 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
1053 Cost += Factor * TLI->getLMULCost(SubVecVT);
1054 return LT.first * Cost;
1055 }
1056
1057 // Otherwise, the cost is proportional to the number of elements (VL *
1058 // Factor ops).
1059 InstructionCost MemOpCost =
1060 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
1061 CostKind, {TTI::OK_AnyValue, TTI::OP_None});
1062 unsigned NumLoads = getEstimatedVLFor(VTy);
1063 return NumLoads * MemOpCost;
1064 }
1065 }
1066 }
1067
1068 // TODO: Return the cost of interleaved accesses for scalable vector when
1069 // unable to convert to segment accesses instructions.
1070 if (isa<ScalableVectorType>(VecTy))
1072
1073 auto *FVTy = cast<FixedVectorType>(VecTy);
1074 InstructionCost MemCost =
1075 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
1076 unsigned VF = FVTy->getNumElements() / Factor;
1077
1078 // An interleaved load will look like this for Factor=3:
1079 // %wide.vec = load <12 x i32>, ptr %3, align 4
1080 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1081 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1082 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1083 if (Opcode == Instruction::Load) {
1084 InstructionCost Cost = MemCost;
1085 for (unsigned Index : Indices) {
1086 FixedVectorType *VecTy =
1087 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
1088 auto Mask = createStrideMask(Index, Factor, VF);
1089 Mask.resize(VF * Factor, -1);
1090 InstructionCost ShuffleCost =
1092 Mask, CostKind, 0, nullptr, {});
1093 Cost += ShuffleCost;
1094 }
1095 return Cost;
1096 }
1097
1098 // TODO: Model for NF > 2
1099 // We'll need to enhance getShuffleCost to model shuffles that are just
1100 // inserts and extracts into subvectors, since they won't have the full cost
1101 // of a vrgather.
1102 // An interleaved store for 3 vectors of 4 lanes will look like
1103 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
1104 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
1105 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
1106 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
1107 // store <12 x i32> %interleaved.vec, ptr %10, align 4
1108 if (Factor != 2)
1109 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1110 Alignment, AddressSpace, CostKind,
1111 UseMaskForCond, UseMaskForGaps);
1112
1113 assert(Opcode == Instruction::Store && "Opcode must be a store");
1114 // For an interleaving store of 2 vectors, we perform one large interleaving
1115 // shuffle that goes into the wide store
1116 auto Mask = createInterleaveMask(VF, Factor);
1117 InstructionCost ShuffleCost =
1119 CostKind, 0, nullptr, {});
1120 return MemCost + ShuffleCost;
1121}
1122
1124 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1125 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
1127 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1128 Alignment, CostKind, I);
1129
1130 if ((Opcode == Instruction::Load &&
1131 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
1132 (Opcode == Instruction::Store &&
1133 !isLegalMaskedScatter(DataTy, Align(Alignment))))
1134 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1135 Alignment, CostKind, I);
1136
1137 // Cost is proportional to the number of memory operations implied. For
1138 // scalable vectors, we use an estimate on that number since we don't
1139 // know exactly what VL will be.
1140 auto &VTy = *cast<VectorType>(DataTy);
1141 InstructionCost MemOpCost =
1142 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1143 {TTI::OK_AnyValue, TTI::OP_None}, I);
1144 unsigned NumLoads = getEstimatedVLFor(&VTy);
1145 return NumLoads * MemOpCost;
1146}
1147
1149 unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment,
1150 TTI::TargetCostKind CostKind, const Instruction *I) const {
1151 bool IsLegal = (Opcode == Instruction::Store &&
1152 isLegalMaskedCompressStore(DataTy, Alignment)) ||
1153 (Opcode == Instruction::Load &&
1154 isLegalMaskedExpandLoad(DataTy, Alignment));
1155 if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
1156 return BaseT::getExpandCompressMemoryOpCost(Opcode, DataTy, VariableMask,
1157 Alignment, CostKind, I);
1158 // Example compressstore sequence:
1159 // vsetivli zero, 8, e32, m2, ta, ma (ignored)
1160 // vcompress.vm v10, v8, v0
1161 // vcpop.m a1, v0
1162 // vsetvli zero, a1, e32, m2, ta, ma
1163 // vse32.v v10, (a0)
1164 // Example expandload sequence:
1165 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
1166 // vcpop.m a1, v0
1167 // vsetvli zero, a1, e32, m2, ta, ma
1168 // vle32.v v10, (a0)
1169 // vsetivli zero, 8, e32, m2, ta, ma
1170 // viota.m v12, v0
1171 // vrgather.vv v8, v10, v12, v0.t
1172 auto MemOpCost =
1173 getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
1174 auto LT = getTypeLegalizationCost(DataTy);
1175 SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
1176 if (VariableMask)
1177 Opcodes.push_back(RISCV::VCPOP_M);
1178 if (Opcode == Instruction::Store)
1179 Opcodes.append({RISCV::VCOMPRESS_VM});
1180 else
1181 Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
1182 return MemOpCost +
1183 LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1184}
1185
1187 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1188 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
1189 if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1190 !isLegalStridedLoadStore(DataTy, Alignment)) ||
1191 (Opcode != Instruction::Load && Opcode != Instruction::Store))
1192 return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
1193 Alignment, CostKind, I);
1194
1196 return TTI::TCC_Basic;
1197
1198 // Cost is proportional to the number of memory operations implied. For
1199 // scalable vectors, we use an estimate on that number since we don't
1200 // know exactly what VL will be.
1201 auto &VTy = *cast<VectorType>(DataTy);
1202 InstructionCost MemOpCost =
1203 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1204 {TTI::OK_AnyValue, TTI::OP_None}, I);
1205 unsigned NumLoads = getEstimatedVLFor(&VTy);
1206 return NumLoads * MemOpCost;
1207}
1208
1211 // FIXME: This is a property of the default vector convention, not
1212 // all possible calling conventions. Fixing that will require
1213 // some TTI API and SLP rework.
1216 for (auto *Ty : Tys) {
1217 if (!Ty->isVectorTy())
1218 continue;
1219 Align A = DL.getPrefTypeAlign(Ty);
1220 Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
1221 getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
1222 }
1223 return Cost;
1224}
1225
1226// Currently, these represent both throughput and codesize costs
1227// for the respective intrinsics. The costs in this table are simply
1228// instruction counts with the following adjustments made:
1229// * One vsetvli is considered free.
1230static const CostTblEntry VectorIntrinsicCostTable[]{
1231 {Intrinsic::floor, MVT::f32, 9},
1232 {Intrinsic::floor, MVT::f64, 9},
1233 {Intrinsic::ceil, MVT::f32, 9},
1234 {Intrinsic::ceil, MVT::f64, 9},
1235 {Intrinsic::trunc, MVT::f32, 7},
1236 {Intrinsic::trunc, MVT::f64, 7},
1237 {Intrinsic::round, MVT::f32, 9},
1238 {Intrinsic::round, MVT::f64, 9},
1239 {Intrinsic::roundeven, MVT::f32, 9},
1240 {Intrinsic::roundeven, MVT::f64, 9},
1241 {Intrinsic::rint, MVT::f32, 7},
1242 {Intrinsic::rint, MVT::f64, 7},
1243 {Intrinsic::nearbyint, MVT::f32, 9},
1244 {Intrinsic::nearbyint, MVT::f64, 9},
1245 {Intrinsic::bswap, MVT::i16, 3},
1246 {Intrinsic::bswap, MVT::i32, 12},
1247 {Intrinsic::bswap, MVT::i64, 31},
1248 {Intrinsic::vp_bswap, MVT::i16, 3},
1249 {Intrinsic::vp_bswap, MVT::i32, 12},
1250 {Intrinsic::vp_bswap, MVT::i64, 31},
1251 {Intrinsic::vp_fshl, MVT::i8, 7},
1252 {Intrinsic::vp_fshl, MVT::i16, 7},
1253 {Intrinsic::vp_fshl, MVT::i32, 7},
1254 {Intrinsic::vp_fshl, MVT::i64, 7},
1255 {Intrinsic::vp_fshr, MVT::i8, 7},
1256 {Intrinsic::vp_fshr, MVT::i16, 7},
1257 {Intrinsic::vp_fshr, MVT::i32, 7},
1258 {Intrinsic::vp_fshr, MVT::i64, 7},
1259 {Intrinsic::bitreverse, MVT::i8, 17},
1260 {Intrinsic::bitreverse, MVT::i16, 24},
1261 {Intrinsic::bitreverse, MVT::i32, 33},
1262 {Intrinsic::bitreverse, MVT::i64, 52},
1263 {Intrinsic::vp_bitreverse, MVT::i8, 17},
1264 {Intrinsic::vp_bitreverse, MVT::i16, 24},
1265 {Intrinsic::vp_bitreverse, MVT::i32, 33},
1266 {Intrinsic::vp_bitreverse, MVT::i64, 52},
1267 {Intrinsic::ctpop, MVT::i8, 12},
1268 {Intrinsic::ctpop, MVT::i16, 19},
1269 {Intrinsic::ctpop, MVT::i32, 20},
1270 {Intrinsic::ctpop, MVT::i64, 21},
1271 {Intrinsic::ctlz, MVT::i8, 19},
1272 {Intrinsic::ctlz, MVT::i16, 28},
1273 {Intrinsic::ctlz, MVT::i32, 31},
1274 {Intrinsic::ctlz, MVT::i64, 35},
1275 {Intrinsic::cttz, MVT::i8, 16},
1276 {Intrinsic::cttz, MVT::i16, 23},
1277 {Intrinsic::cttz, MVT::i32, 24},
1278 {Intrinsic::cttz, MVT::i64, 25},
1279 {Intrinsic::vp_ctpop, MVT::i8, 12},
1280 {Intrinsic::vp_ctpop, MVT::i16, 19},
1281 {Intrinsic::vp_ctpop, MVT::i32, 20},
1282 {Intrinsic::vp_ctpop, MVT::i64, 21},
1283 {Intrinsic::vp_ctlz, MVT::i8, 19},
1284 {Intrinsic::vp_ctlz, MVT::i16, 28},
1285 {Intrinsic::vp_ctlz, MVT::i32, 31},
1286 {Intrinsic::vp_ctlz, MVT::i64, 35},
1287 {Intrinsic::vp_cttz, MVT::i8, 16},
1288 {Intrinsic::vp_cttz, MVT::i16, 23},
1289 {Intrinsic::vp_cttz, MVT::i32, 24},
1290 {Intrinsic::vp_cttz, MVT::i64, 25},
1291};
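// The table above is consulted at the end of getIntrinsicInstrCost via
// CostTableLookup on the legalized element type and scaled by the number of
// legalized registers; e.g. a <vscale x 4 x i32> bitreverse (absent a cheaper
// Zvbb path) uses the i32 entry and costs LT.first * 33.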
1292
1296 auto *RetTy = ICA.getReturnType();
1297 switch (ICA.getID()) {
1298 case Intrinsic::lrint:
1299 case Intrinsic::llrint:
1300 case Intrinsic::lround:
1301 case Intrinsic::llround: {
1302 auto LT = getTypeLegalizationCost(RetTy);
1303 Type *SrcTy = ICA.getArgTypes().front();
1304 auto SrcLT = getTypeLegalizationCost(SrcTy);
1305 if (ST->hasVInstructions() && LT.second.isVector()) {
1307 unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType());
1308 unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType());
1309 if (LT.second.getVectorElementType() == MVT::bf16) {
1310 if (!ST->hasVInstructionsBF16Minimal())
1312 if (DstEltSz == 32)
1313 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
1314 else
1315 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
1316 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1317 !ST->hasVInstructionsF16()) {
1318 if (!ST->hasVInstructionsF16Minimal())
1320 if (DstEltSz == 32)
1321 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
1322 else
1323 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};
1324
1325 } else if (SrcEltSz > DstEltSz) {
1326 Ops = {RISCV::VFNCVT_X_F_W};
1327 } else if (SrcEltSz < DstEltSz) {
1328 Ops = {RISCV::VFWCVT_X_F_V};
1329 } else {
1330 Ops = {RISCV::VFCVT_X_F_V};
1331 }
1332
1333 // We need to use the source LMUL in the case of a narrowing op, and the
1334 // destination LMUL otherwise.
1335 if (SrcEltSz > DstEltSz)
1336 return SrcLT.first *
1337 getRISCVInstructionCost(Ops, SrcLT.second, CostKind);
1338 return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind);
1339 }
1340 break;
1341 }
1342 case Intrinsic::ceil:
1343 case Intrinsic::floor:
1344 case Intrinsic::trunc:
1345 case Intrinsic::rint:
1346 case Intrinsic::round:
1347 case Intrinsic::roundeven: {
1348 // These all use the same code.
1349 auto LT = getTypeLegalizationCost(RetTy);
1350 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1351 return LT.first * 8;
1352 break;
1353 }
1354 case Intrinsic::umin:
1355 case Intrinsic::umax:
1356 case Intrinsic::smin:
1357 case Intrinsic::smax: {
1358 auto LT = getTypeLegalizationCost(RetTy);
1359 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
1360 return LT.first;
1361
1362 if (ST->hasVInstructions() && LT.second.isVector()) {
1363 unsigned Op;
1364 switch (ICA.getID()) {
1365 case Intrinsic::umin:
1366 Op = RISCV::VMINU_VV;
1367 break;
1368 case Intrinsic::umax:
1369 Op = RISCV::VMAXU_VV;
1370 break;
1371 case Intrinsic::smin:
1372 Op = RISCV::VMIN_VV;
1373 break;
1374 case Intrinsic::smax:
1375 Op = RISCV::VMAX_VV;
1376 break;
1377 }
1378 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1379 }
1380 break;
1381 }
1382 case Intrinsic::sadd_sat:
1383 case Intrinsic::ssub_sat:
1384 case Intrinsic::uadd_sat:
1385 case Intrinsic::usub_sat: {
1386 auto LT = getTypeLegalizationCost(RetTy);
1387 if (ST->hasVInstructions() && LT.second.isVector()) {
1388 unsigned Op;
1389 switch (ICA.getID()) {
1390 case Intrinsic::sadd_sat:
1391 Op = RISCV::VSADD_VV;
1392 break;
1393 case Intrinsic::ssub_sat:
1394 Op = RISCV::VSSUB_VV;
1395 break;
1396 case Intrinsic::uadd_sat:
1397 Op = RISCV::VSADDU_VV;
1398 break;
1399 case Intrinsic::usub_sat:
1400 Op = RISCV::VSSUBU_VV;
1401 break;
1402 }
1403 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1404 }
1405 break;
1406 }
1407 case Intrinsic::fma:
1408 case Intrinsic::fmuladd: {
1409 // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin
1410 auto LT = getTypeLegalizationCost(RetTy);
1411 if (ST->hasVInstructions() && LT.second.isVector())
1412 return LT.first *
1413 getRISCVInstructionCost(RISCV::VFMADD_VV, LT.second, CostKind);
1414 break;
1415 }
1416 case Intrinsic::fabs: {
1417 auto LT = getTypeLegalizationCost(RetTy);
1418 if (ST->hasVInstructions() && LT.second.isVector()) {
1419 // lui a0, 8
1420 // addi a0, a0, -1
1421 // vsetvli a1, zero, e16, m1, ta, ma
1422 // vand.vx v8, v8, a0
1423 // f16 with zvfhmin and bf16 with zvfbfmin
1424 if (LT.second.getVectorElementType() == MVT::bf16 ||
1425 (LT.second.getVectorElementType() == MVT::f16 &&
1426 !ST->hasVInstructionsF16()))
1427 return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
1428 CostKind) +
1429 2;
1430 else
1431 return LT.first *
1432 getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
1433 }
1434 break;
1435 }
1436 case Intrinsic::sqrt: {
1437 auto LT = getTypeLegalizationCost(RetTy);
1438 if (ST->hasVInstructions() && LT.second.isVector()) {
1441 MVT ConvType = LT.second;
1442 MVT FsqrtType = LT.second;
1443 // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
1444 // will be split.
1445 if (LT.second.getVectorElementType() == MVT::bf16) {
1446 if (LT.second == MVT::nxv32bf16) {
1447 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
1448 RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
1449 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1450 ConvType = MVT::nxv16f16;
1451 FsqrtType = MVT::nxv16f32;
1452 } else {
1453 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
1454 FsqrtOp = {RISCV::VFSQRT_V};
1455 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1456 }
1457 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1458 !ST->hasVInstructionsF16()) {
1459 if (LT.second == MVT::nxv32f16) {
1460 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
1461 RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
1462 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1463 ConvType = MVT::nxv16f16;
1464 FsqrtType = MVT::nxv16f32;
1465 } else {
1466 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
1467 FsqrtOp = {RISCV::VFSQRT_V};
1468 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1469 }
1470 } else {
1471 FsqrtOp = {RISCV::VFSQRT_V};
1472 }
1473
1474 return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) +
1475 getRISCVInstructionCost(ConvOp, ConvType, CostKind));
1476 }
1477 break;
1478 }
1479 case Intrinsic::cttz:
1480 case Intrinsic::ctlz:
1481 case Intrinsic::ctpop: {
1482 auto LT = getTypeLegalizationCost(RetTy);
1483 if (ST->hasStdExtZvbb() && LT.second.isVector()) {
1484 unsigned Op;
1485 switch (ICA.getID()) {
1486 case Intrinsic::cttz:
1487 Op = RISCV::VCTZ_V;
1488 break;
1489 case Intrinsic::ctlz:
1490 Op = RISCV::VCLZ_V;
1491 break;
1492 case Intrinsic::ctpop:
1493 Op = RISCV::VCPOP_V;
1494 break;
1495 }
1496 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1497 }
1498 break;
1499 }
1500 case Intrinsic::abs: {
1501 auto LT = getTypeLegalizationCost(RetTy);
1502 if (ST->hasVInstructions() && LT.second.isVector()) {
1503 // vrsub.vi v10, v8, 0
1504 // vmax.vv v8, v8, v10
1505 return LT.first *
1506 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
1507 LT.second, CostKind);
1508 }
1509 break;
1510 }
1511 case Intrinsic::get_active_lane_mask: {
1512 if (ST->hasVInstructions()) {
1513 Type *ExpRetTy = VectorType::get(
1514 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1515 auto LT = getTypeLegalizationCost(ExpRetTy);
1516
1517 // vid.v v8 // considered hoisted
1518 // vsaddu.vx v8, v8, a0
1519 // vmsltu.vx v0, v8, a1
1520 return LT.first *
1521 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
1522 LT.second, CostKind);
1523 }
1524 break;
1525 }
1526 // TODO: add more intrinsic
1527 case Intrinsic::stepvector: {
1528 auto LT = getTypeLegalizationCost(RetTy);
1529 // Legalisation of illegal types involves an `index' instruction plus
1530 // (LT.first - 1) vector adds.
1531 if (ST->hasVInstructions())
1532 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
1533 (LT.first - 1) *
1534 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
1535 return 1 + (LT.first - 1);
1536 }
1537 case Intrinsic::experimental_cttz_elts: {
1538 Type *ArgTy = ICA.getArgTypes()[0];
1539 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
1540 if (getTLI()->shouldExpandCttzElements(ArgType))
1541 break;
1542 InstructionCost Cost = getRISCVInstructionCost(
1543 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
1544
1545 // If zero_is_poison is false, then we will generate additional
1546 // cmp + select instructions to convert -1 to EVL.
1547 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
1548 if (ICA.getArgs().size() > 1 &&
1549 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
1550 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
1552 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
1554
1555 return Cost;
1556 }
1557 case Intrinsic::experimental_vp_splat: {
1558 auto LT = getTypeLegalizationCost(RetTy);
1559 // TODO: Lower i1 experimental_vp_splat
1560 if (!ST->hasVInstructions() || LT.second.getScalarType() == MVT::i1)
1562 return LT.first * getRISCVInstructionCost(LT.second.isFloatingPoint()
1563 ? RISCV::VFMV_V_F
1564 : RISCV::VMV_V_X,
1565 LT.second, CostKind);
1566 }
1567 case Intrinsic::experimental_vp_splice: {
1568 // To support type-based query from vectorizer, set the index to 0.
1569 // Note that the index only changes the cost from vslide.vx to vslide.vi,
1570 // and in the current implementation they have the same cost.
1572 cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
1574 }
1575 case Intrinsic::fptoui_sat:
1576 case Intrinsic::fptosi_sat: {
1578 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1579 Type *SrcTy = ICA.getArgTypes()[0];
1580
1581 auto SrcLT = getTypeLegalizationCost(SrcTy);
1582 auto DstLT = getTypeLegalizationCost(RetTy);
1583 if (!SrcTy->isVectorTy())
1584 break;
1585
1586 if (!SrcLT.first.isValid() || !DstLT.first.isValid())
1588
1589 Cost +=
1590 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
1591 RetTy, SrcTy, TTI::CastContextHint::None, CostKind);
1592
1593 // Handle NaN.
1594 // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1.
1595 // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0.
1596 Type *CondTy = RetTy->getWithNewBitWidth(1);
1597 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, SrcTy, CondTy,
1599 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1601 return Cost;
1602 }
1603 }
1604
1605 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1606 if (auto LT = getTypeLegalizationCost(RetTy);
1607 LT.second.isVector()) {
1608 MVT EltTy = LT.second.getVectorElementType();
1609 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1610 ICA.getID(), EltTy))
1611 return LT.first * Entry->Cost;
1612 }
1613 }
1614
1615 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1616}
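// Example of the switch above: smin on <vscale x 4 x i32> costs
// LT.first * cost(vmin.vv) when vector instructions are present, scalar smin
// with Zbb is a single instruction (LT.first), and anything left unhandled
// falls through to the vector-intrinsic cost table or the base implementation.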
1617
1620 const SCEV *Ptr,
1622 // Address computations for vector indexed load/store likely require an offset
1623 // and/or scaling.
1624 if (ST->hasVInstructions() && PtrTy->isVectorTy())
1625 return getArithmeticInstrCost(Instruction::Add, PtrTy, CostKind);
1626
1627 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1628}
1629
1631 Type *Src,
1634 const Instruction *I) const {
1635 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1636 if (!IsVectorType)
1637 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1638
1639 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1640 // For now, skip all fixed vector cost analysis when P extension is available
1641 // to avoid crashes in getMinRVVVectorSizeInBits()
1642 if (ST->enablePExtCodeGen() &&
1644 return 1; // Treat as single instruction cost for now
1645 }
1646
1647 // FIXME: Need to compute legalizing cost for illegal types. The current
1648 // code handles only legal types and those which can be trivially
1649 // promoted to legal.
1650 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1651 Dst->getScalarSizeInBits() > ST->getELen())
1652 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1653
1654 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1655 assert(ISD && "Invalid opcode");
1656 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1657 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1658
1659 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1660 // The shared implementation doesn't model vector widening during legalization
1661 // and instead assumes scalarization. In order to scalarize an <N x i1>
1662 // vector, we need to extend/trunc to/from i8. If we don't special case
1663 // this, we can get an infinite recursion cycle.
1664 switch (ISD) {
1665 default:
1666 break;
1667 case ISD::SIGN_EXTEND:
1668 case ISD::ZERO_EXTEND:
1669 if (Src->getScalarSizeInBits() == 1) {
1670 // We do not use vsext/vzext to extend from mask vector.
1671 // Instead we use the following instructions to extend from mask vector:
1672 // vmv.v.i v8, 0
1673 // vmerge.vim v8, v8, -1, v0 (repeated per split)
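// Illustration (assuming default legalization): zero-extending
// <vscale x 32 x i1> to <vscale x 32 x i32> legalizes the destination into
// two nxv16i32 (m8) halves, so DstLT.first == 2 and the estimate below is
// cost(VMV_V_I) + 2 * cost(VMERGE_VIM) + (2 - 1).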
1674 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1675 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1676 DstLT.second, CostKind) +
1677 DstLT.first - 1;
1678 }
1679 break;
1680 case ISD::TRUNCATE:
1681 if (Dst->getScalarSizeInBits() == 1) {
1682 // We do not use a sequence of vncvt instructions to truncate to a mask
1683 // vector, so we cannot use PowDiff to calculate the cost.
1684 // Instead we use the following instructions to truncate to mask vector:
1685 // vand.vi v8, v8, 1
1686 // vmsne.vi v0, v8, 0
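// Illustration (assuming default legalization): truncating
// <vscale x 32 x i32> to <vscale x 32 x i1> splits the source into two
// nxv16i32 (m8) halves, so the estimate below is
// 2 * (cost(VAND_VI) + cost(VMSNE_VI)) + (2 - 1).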
1687 return SrcLT.first *
1688 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1689 SrcLT.second, CostKind) +
1690 SrcLT.first - 1;
1691 }
1692 break;
1693 };
1694
1695 // Our actual lowering for the case where a wider legal type is available
1696 // uses promotion to the wider type. This is reflected in the result of
1697 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1698 // scalarized if the legalized Src and Dst are not equal sized.
1699 const DataLayout &DL = this->getDataLayout();
1700 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1701 !SrcLT.first.isValid() || !DstLT.first.isValid() ||
1702 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
1703 SrcLT.second.getSizeInBits()) ||
1704 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
1705 DstLT.second.getSizeInBits()) ||
1706 SrcLT.first > 1 || DstLT.first > 1)
1707 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1708
1709 // The split cost is handled by the base getCastInstrCost
1710 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1711
1712 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1713 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
1714 switch (ISD) {
1715 case ISD::SIGN_EXTEND:
1716 case ISD::ZERO_EXTEND: {
1717 if ((PowDiff < 1) || (PowDiff > 3))
1718 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1719 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1720 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1721 unsigned Op =
1722 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1723 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1724 }
1725 case ISD::TRUNCATE:
1726 case ISD::FP_EXTEND:
1727 case ISD::FP_ROUND: {
1728 // Counts of narrow/widen instructions.
1729 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1730 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1731
1732 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1733 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1734 : RISCV::VFNCVT_F_F_W;
1735 InstructionCost Cost = 0;
1736 for (; SrcEltSize != DstEltSize;) {
1737 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1738 ? MVT::getIntegerVT(DstEltSize)
1739 : MVT::getFloatingPointVT(DstEltSize);
1740 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1741 DstEltSize =
1742 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1743 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1744 }
1745 return Cost;
1746 }
1747 case ISD::FP_TO_SINT:
1748 case ISD::FP_TO_UINT: {
1749 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1750 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1751 unsigned FWCVT =
1752 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1753 unsigned FNCVT =
1754 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1755 unsigned SrcEltSize = Src->getScalarSizeInBits();
1756 unsigned DstEltSize = Dst->getScalarSizeInBits();
1757 InstructionCost Cost = 0;
1758 if ((SrcEltSize == 16) &&
1759 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1760 // If the target only supports zvfhmin, or this is an fp16-to-i64
1761 // conversion, pre-widen to f32 and then convert f32 to integer.
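// For example, fptosi <vscale x 4 x half> to <vscale x 4 x i64> with only
// zvfhmin is costed as VFWCVT_F_F_V (f16 -> f32) plus the recursively
// computed cost of the f32 -> i64 conversion.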
1762 VectorType *VecF32Ty =
1763 VectorType::get(Type::getFloatTy(Dst->getContext()),
1764 cast<VectorType>(Dst)->getElementCount());
1765 std::pair<InstructionCost, MVT> VecF32LT =
1766 getTypeLegalizationCost(VecF32Ty);
1767 Cost +=
1768 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1769 VecF32LT.second, CostKind);
1770 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1771 return Cost;
1772 }
1773 if (DstEltSize == SrcEltSize)
1774 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1775 else if (DstEltSize > SrcEltSize)
1776 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1777 else { // (SrcEltSize > DstEltSize)
1778 // First do a narrowing conversion to an integer half the size, then
1779 // truncate if needed.
1780 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1781 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1782 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1783 if ((SrcEltSize / 2) > DstEltSize) {
1784 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1785 Cost +=
1786 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1787 }
1788 }
1789 return Cost;
1790 }
1791 case ISD::SINT_TO_FP:
1792 case ISD::UINT_TO_FP: {
1793 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
1794 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
1795 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
1796 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
1797 unsigned SrcEltSize = Src->getScalarSizeInBits();
1798 unsigned DstEltSize = Dst->getScalarSizeInBits();
1799
1800 InstructionCost Cost = 0;
1801 if ((DstEltSize == 16) &&
1802 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
1803 // If the target only supports zvfhmin, or this is an i64-to-fp16
1804 // conversion, the value is converted to f32 and then to f16.
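// For example, sitofp <vscale x 4 x i64> to <vscale x 4 x half> with only
// zvfhmin is costed as the recursively computed i64 -> f32 conversion plus
// VFNCVT_F_F_W for the final f32 -> f16 narrowing.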
1805 VectorType *VecF32Ty =
1806 VectorType::get(Type::getFloatTy(Dst->getContext()),
1807 cast<VectorType>(Dst)->getElementCount());
1808 std::pair<InstructionCost, MVT> VecF32LT =
1809 getTypeLegalizationCost(VecF32Ty);
1810 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
1811 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
1812 DstLT.second, CostKind);
1813 return Cost;
1814 }
1815
1816 if (DstEltSize == SrcEltSize)
1817 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1818 else if (DstEltSize > SrcEltSize) {
1819 if ((DstEltSize / 2) > SrcEltSize) {
1820 VectorType *VecTy =
1821 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
1822 cast<VectorType>(Dst)->getElementCount());
1823 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
1824 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
1825 }
1826 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1827 } else
1828 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
1829 return Cost;
1830 }
1831 }
1832 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1833}
1834
1835unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) const {
1836 if (isa<ScalableVectorType>(Ty)) {
1837 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1838 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1839 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1840 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1841 }
1842 return cast<FixedVectorType>(Ty)->getNumElements();
1843}
1844
1845InstructionCost
1846RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1847 FastMathFlags FMF,
1848 TTI::TargetCostKind CostKind) const {
1849 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1850 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1851
1852 // Skip if scalar size of Ty is bigger than ELEN.
1853 if (Ty->getScalarSizeInBits() > ST->getELen())
1854 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1855
1856 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1857 if (Ty->getElementType()->isIntegerTy(1)) {
1858 // SelectionDAGBuilder does following transforms:
1859 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1860 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
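// For example, an i1 vector_reduce_smin is costed as vector_reduce_or,
// which is modelled below as one vmor.mm per split + vcpop.m + snez.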
1861 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1862 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1863 else
1864 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1865 }
1866
1867 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1868 SmallVector<unsigned, 3> Opcodes;
1869 InstructionCost ExtraCost = 0;
1870 switch (IID) {
1871 case Intrinsic::maximum:
1872 if (FMF.noNaNs()) {
1873 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1874 } else {
1875 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1876 RISCV::VFMV_F_S};
1877 // Cost of canonical NaN + branch
1878 // lui a0, 523264
1879 // fmv.w.x fa0, a0
1880 Type *DstTy = Ty->getScalarType();
1881 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1882 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1883 ExtraCost = 1 +
1884 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1885 TTI::CastContextHint::None, CostKind) +
1886 getCFInstrCost(Instruction::Br, CostKind);
1887 }
1888 break;
1889
1890 case Intrinsic::minimum:
1891 if (FMF.noNaNs()) {
1892 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1893 } else {
1894 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1895 RISCV::VFMV_F_S};
1896 // Cost of canonical NaN + branch
1897 // lui a0, 523264
1898 // fmv.w.x fa0, a0
1899 Type *DstTy = Ty->getScalarType();
1900 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1901 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1902 ExtraCost = 1 +
1903 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1904 TTI::CastContextHint::None, CostKind) +
1905 getCFInstrCost(Instruction::Br, CostKind);
1906 }
1907 break;
1908 }
1909 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1910 }
1911
1912 // An IR reduction is composed of one RVV reduction instruction and a vmv.
1913 unsigned SplitOp;
1914 SmallVector<unsigned, 3> Opcodes;
1915 switch (IID) {
1916 default:
1917 llvm_unreachable("Unsupported intrinsic");
1918 case Intrinsic::smax:
1919 SplitOp = RISCV::VMAX_VV;
1920 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1921 break;
1922 case Intrinsic::smin:
1923 SplitOp = RISCV::VMIN_VV;
1924 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
1925 break;
1926 case Intrinsic::umax:
1927 SplitOp = RISCV::VMAXU_VV;
1928 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1929 break;
1930 case Intrinsic::umin:
1931 SplitOp = RISCV::VMINU_VV;
1932 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
1933 break;
1934 case Intrinsic::maxnum:
1935 SplitOp = RISCV::VFMAX_VV;
1936 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1937 break;
1938 case Intrinsic::minnum:
1939 SplitOp = RISCV::VFMIN_VV;
1940 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1941 break;
1942 }
1943 // Add a cost for data larger than LMUL8
1944 InstructionCost SplitCost =
1945 (LT.first > 1) ? (LT.first - 1) *
1946 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1947 : 0;
1948 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1949}
1950
1951InstructionCost
1952RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
1953 std::optional<FastMathFlags> FMF,
1954 TTI::TargetCostKind CostKind) const {
1955 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1956 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1957
1958 // Skip if scalar size of Ty is bigger than ELEN.
1959 if (Ty->getScalarSizeInBits() > ST->getELen())
1960 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1961
1962 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1963 assert(ISD && "Invalid opcode");
1964
1965 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1966 ISD != ISD::FADD)
1967 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1968
1969 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1970 Type *ElementTy = Ty->getElementType();
1971 if (ElementTy->isIntegerTy(1)) {
1972 // Example sequences:
1973 // vfirst.m a0, v0
1974 // seqz a0, a0
1975 if (LT.second == MVT::v1i1)
1976 return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
1977 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1978 CmpInst::ICMP_EQ, CostKind);
1979
1980 if (ISD == ISD::AND) {
1981 // Example sequences:
1982 // vmand.mm v8, v9, v8 ; needed every time type is split
1983 // vmnot.m v8, v0 ; alias for vmnand
1984 // vcpop.m a0, v8
1985 // seqz a0, a0
1986
1987 // See the discussion: https://github.com/llvm/llvm-project/pull/119160
1988 // For LMUL <= 8, there is no splitting,
1989 // the sequences are vmnot, vcpop and seqz.
1990 // When LMUL > 8 and split = 1,
1991 // the sequences are vmnand, vcpop and seqz.
1992 // When LMUL > 8 and split > 1,
1993 // the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
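// e.g. with LT.first == 4 the estimate below is
// 2 * cost(VMAND_MM) + cost(VMNAND_MM) + cost(VCPOP_M) + cost(seqz).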
1994 return ((LT.first > 2) ? (LT.first - 2) : 0) *
1995 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
1996 getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
1997 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
1998 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1999 CmpInst::ICMP_EQ, CostKind);
2000 } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
2001 // Example sequences:
2002 // vsetvli a0, zero, e8, mf8, ta, ma
2003 // vmxor.mm v8, v0, v8 ; needed every time type is split
2004 // vcpop.m a0, v8
2005 // andi a0, a0, 1
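// e.g. with LT.first == 2 the estimate below is
// cost(VMXOR_MM) + cost(VCPOP_M) + 1.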
2006 return (LT.first - 1) *
2007 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
2008 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
2009 } else {
2010 assert(ISD == ISD::OR);
2011 // Example sequences:
2012 // vsetvli a0, zero, e8, mf8, ta, ma
2013 // vmor.mm v8, v9, v8 ; needed every time type is split
2014 // vcpop.m a0, v0
2015 // snez a0, a0
2016 return (LT.first - 1) *
2017 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
2018 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2019 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2020 CmpInst::ICMP_NE, CostKind);
2021 }
2022 }
2023
2024 // An IR or/and reduction is composed of one vmv and one RVV reduction
2025 // instruction; the other reductions are composed of two vmv and one RVV
2026 // reduction instruction.
2027 unsigned SplitOp;
2028 SmallVector<unsigned, 3> Opcodes;
2029 switch (ISD) {
2030 case ISD::ADD:
2031 SplitOp = RISCV::VADD_VV;
2032 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
2033 break;
2034 case ISD::OR:
2035 SplitOp = RISCV::VOR_VV;
2036 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
2037 break;
2038 case ISD::XOR:
2039 SplitOp = RISCV::VXOR_VV;
2040 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
2041 break;
2042 case ISD::AND:
2043 SplitOp = RISCV::VAND_VV;
2044 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
2045 break;
2046 case ISD::FADD:
2047 // We can't promote f16/bf16 fadd reductions.
2048 if ((LT.second.getScalarType() == MVT::f16 && !ST->hasVInstructionsF16()) ||
2049 LT.second.getScalarType() == MVT::bf16)
2050 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2051 if (TTI::requiresOrderedReduction(FMF)) {
2052 Opcodes.push_back(RISCV::VFMV_S_F);
2053 for (unsigned i = 0; i < LT.first.getValue(); i++)
2054 Opcodes.push_back(RISCV::VFREDOSUM_VS);
2055 Opcodes.push_back(RISCV::VFMV_F_S);
2056 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2057 }
2058 SplitOp = RISCV::VFADD_VV;
2059 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
2060 break;
2061 }
2062 // Add a cost for data larger than LMUL8
2063 InstructionCost SplitCost =
2064 (LT.first > 1) ? (LT.first - 1) *
2065 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2066 : 0;
2067 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2068}
2069
2070InstructionCost RISCVTTIImpl::getExtendedReductionCost(
2071 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
2072 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
2073 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2074 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2075 FMF, CostKind);
2076
2077 // Skip if scalar size of ResTy is bigger than ELEN.
2078 if (ResTy->getScalarSizeInBits() > ST->getELen())
2079 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2080 FMF, CostKind);
2081
2082 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
2083 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2084 FMF, CostKind);
2085
2086 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2087
2088 if (IsUnsigned && Opcode == Instruction::Add &&
2089 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
2090 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2091 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
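// For example, vector_reduce_add(zext <64 x i1> %m to <64 x i8>) is costed
// as LT.first * cost(VCPOP_M) over the legalized mask type.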
2092 return LT.first *
2093 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
2094 }
2095
2096 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
2097 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2098 FMF, CostKind);
2099
2100 return (LT.first - 1) +
2101 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2102}
2103
2104InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
2105 TTI::OperandValueInfo OpInfo,
2106 TTI::TargetCostKind CostKind) const {
2107 assert(OpInfo.isConstant() && "non constant operand?");
2108 if (!isa<VectorType>(Ty))
2109 // FIXME: We need to account for immediate materialization here, but doing
2110 // a decent job requires more knowledge about the immediate than we
2111 // currently have here.
2112 return 0;
2113
2114 if (OpInfo.isUniform())
2115 // vmv.v.i, vmv.v.x, or vfmv.v.f
2116 // We ignore the cost of the scalar constant materialization to be consistent
2117 // with how we treat scalar constants themselves just above.
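// e.g. storing a splat of the constant 7 is costed as a single vmv.v.i,
// while a non-uniform constant vector falls through to the constant-pool
// load cost below.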
2118 return 1;
2119
2120 return getConstantPoolLoadCost(Ty, CostKind);
2121}
2122
2123InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
2124 Align Alignment,
2125 unsigned AddressSpace,
2126 TTI::TargetCostKind CostKind,
2127 TTI::OperandValueInfo OpInfo,
2128 const Instruction *I) const {
2129 EVT VT = TLI->getValueType(DL, Src, true);
2130 // Type legalization can't handle structs
2131 if (VT == MVT::Other)
2132 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2133 CostKind, OpInfo, I);
2134
2135 InstructionCost Cost = 0;
2136 if (Opcode == Instruction::Store && OpInfo.isConstant())
2137 Cost += getStoreImmCost(Src, OpInfo, CostKind);
2138
2139 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
2140
2141 InstructionCost BaseCost = [&]() {
2142 InstructionCost Cost = LT.first;
2143 if (CostKind != TTI::TCK_RecipThroughput)
2144 return Cost;
2145
2146 // Our actual lowering for the case where a wider legal type is available
2147 // uses a VL-predicated load on the wider type. This is reflected in
2148 // the result of getTypeLegalizationCost, but BasicTTI assumes the
2149 // widened cases are scalarized.
2150 const DataLayout &DL = this->getDataLayout();
2151 if (Src->isVectorTy() && LT.second.isVector() &&
2152 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
2153 LT.second.getSizeInBits()))
2154 return Cost;
2155
2156 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2157 CostKind, OpInfo, I);
2158 }();
2159
2160 // Assume the cost of memory ops scales with the number of vector registers
2161 // possibly accessed by the instruction. Note that BasicTTI already
2162 // handles the LT.first term for us.
2163 if (ST->hasVInstructions() && LT.second.isVector() &&
2164 CostKind != TTI::TCK_CodeSize)
2165 BaseCost *= TLI->getLMULCost(LT.second);
2166 return Cost + BaseCost;
2167}
2168
2169InstructionCost RISCVTTIImpl::getCmpSelInstrCost(
2170 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
2171 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
2172 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
2173 if (CostKind != TTI::TCK_RecipThroughput)
2174 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2175 Op1Info, Op2Info, I);
2176
2177 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2178 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2179 Op1Info, Op2Info, I);
2180
2181 // Skip if scalar size of ValTy is bigger than ELEN.
2182 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
2183 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2184 Op1Info, Op2Info, I);
2185
2186 auto GetConstantMatCost =
2187 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
2188 if (OpInfo.isUniform())
2189 // We return 0 because we currently ignore the cost of materializing
2190 // scalar constants in GPRs.
2191 return 0;
2192
2193 return getConstantPoolLoadCost(ValTy, CostKind);
2194 };
2195
2196 InstructionCost ConstantMatCost;
2197 if (Op1Info.isConstant())
2198 ConstantMatCost += GetConstantMatCost(Op1Info);
2199 if (Op2Info.isConstant())
2200 ConstantMatCost += GetConstantMatCost(Op2Info);
2201
2202 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2203 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
2204 if (CondTy->isVectorTy()) {
2205 if (ValTy->getScalarSizeInBits() == 1) {
2206 // vmandn.mm v8, v8, v9
2207 // vmand.mm v9, v0, v9
2208 // vmor.mm v0, v9, v8
2209 return ConstantMatCost +
2210 LT.first *
2211 getRISCVInstructionCost(
2212 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2213 LT.second, CostKind);
2214 }
2215 // vselect and max/min are supported natively.
2216 return ConstantMatCost +
2217 LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
2218 CostKind);
2219 }
2220
2221 if (ValTy->getScalarSizeInBits() == 1) {
2222 // vmv.v.x v9, a0
2223 // vmsne.vi v9, v9, 0
2224 // vmandn.mm v8, v8, v9
2225 // vmand.mm v9, v0, v9
2226 // vmor.mm v0, v9, v8
2227 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
2228 return ConstantMatCost +
2229 LT.first *
2230 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
2231 InterimVT, CostKind) +
2232 LT.first * getRISCVInstructionCost(
2233 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2234 LT.second, CostKind);
2235 }
2236
2237 // vmv.v.x v10, a0
2238 // vmsne.vi v0, v10, 0
2239 // vmerge.vvm v8, v9, v8, v0
2240 return ConstantMatCost +
2241 LT.first * getRISCVInstructionCost(
2242 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
2243 LT.second, CostKind);
2244 }
2245
2246 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
2247 CmpInst::isIntPredicate(VecPred)) {
2248 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
2249 // provided they incur the same cost across all implementations
2250 return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
2251 LT.second,
2252 CostKind);
2253 }
2254
2255 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
2256 CmpInst::isFPPredicate(VecPred)) {
2257
2258 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
2259 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
2260 return ConstantMatCost +
2261 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
2262
2263 // If we do not support the input floating point vector type, use the base
2264 // one which will calculate as:
2265 // ScalarizeCost + Num * Cost for fixed vector,
2266 // InvalidCost for scalable vector.
2267 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
2268 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
2269 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
2270 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2271 Op1Info, Op2Info, I);
2272
2273 // Assuming vector fp compare and mask instructions are all the same cost
2274 // until a need arises to differentiate them.
2275 switch (VecPred) {
2276 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
2277 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
2278 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
2279 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
2280 return ConstantMatCost +
2281 LT.first * getRISCVInstructionCost(
2282 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
2283 LT.second, CostKind);
2284
2285 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
2286 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
2287 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
2288 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
2289 return ConstantMatCost +
2290 LT.first *
2291 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
2292 LT.second, CostKind);
2293
2294 case CmpInst::FCMP_OEQ: // vmfeq.vv
2295 case CmpInst::FCMP_OGT: // vmflt.vv
2296 case CmpInst::FCMP_OGE: // vmfle.vv
2297 case CmpInst::FCMP_OLT: // vmflt.vv
2298 case CmpInst::FCMP_OLE: // vmfle.vv
2299 case CmpInst::FCMP_UNE: // vmfne.vv
2300 return ConstantMatCost +
2301 LT.first *
2302 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
2303 default:
2304 break;
2305 }
2306 }
2307
2308 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
2309 // instructions will lower to SELECT_CC and lower to PseudoCCMOVGPR which will
2310 // generate a conditional branch + mv. The cost of scalar (icmp + select) will
2311 // be (0 + select instr cost).
2312 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
2313 ValTy->isIntegerTy() && !I->user_empty()) {
2314 if (all_of(I->users(), [&](const User *U) {
2315 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
2316 U->getType()->isIntegerTy() &&
2317 !isa<ConstantData>(U->getOperand(1)) &&
2318 !isa<ConstantData>(U->getOperand(2));
2319 }))
2320 return 0;
2321 }
2322
2323 // TODO: Add cost for scalar type.
2324
2325 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2326 Op1Info, Op2Info, I);
2327}
2328
2329InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
2330 TTI::TargetCostKind CostKind,
2331 const Instruction *I) const {
2332 if (CostKind != TTI::TCK_RecipThroughput)
2333 return Opcode == Instruction::PHI ? 0 : 1;
2334 // Branches are assumed to be predicted.
2335 return 0;
2336}
2337
2338InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
2339 TTI::TargetCostKind CostKind,
2340 unsigned Index,
2341 const Value *Op0,
2342 const Value *Op1) const {
2343 assert(Val->isVectorTy() && "This must be a vector type");
2344
2345 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
2346 // For now, skip all fixed vector cost analysis when P extension is available
2347 // to avoid crashes in getMinRVVVectorSizeInBits()
2348 if (ST->enablePExtCodeGen() && isa<FixedVectorType>(Val)) {
2349 return 1; // Treat as single instruction cost for now
2350 }
2351
2352 if (Opcode != Instruction::ExtractElement &&
2353 Opcode != Instruction::InsertElement)
2354 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
2355
2356 // Legalize the type.
2357 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2358
2359 // This type is legalized to a scalar type.
2360 if (!LT.second.isVector()) {
2361 auto *FixedVecTy = cast<FixedVectorType>(Val);
2362 // If Index is a known constant, cost is zero.
2363 if (Index != -1U)
2364 return 0;
2365 // Extract/InsertElement with non-constant index is very costly when
2366 // scalarized; estimate cost of loads/stores sequence via the stack:
2367 // ExtractElement cost: store vector to stack, load scalar;
2368 // InsertElement cost: store vector to stack, store scalar, load vector.
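// e.g. if <4 x i32> legalizes to scalars (no RVV fixed-length support), a
// variable-index extract is estimated as 4 stores + 1 load and an insert as
// 4 * (store + load) + 1 store, per the formulas below.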
2369 Type *ElemTy = FixedVecTy->getElementType();
2370 auto NumElems = FixedVecTy->getNumElements();
2371 auto Align = DL.getPrefTypeAlign(ElemTy);
2372 InstructionCost LoadCost =
2373 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
2374 InstructionCost StoreCost =
2375 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
2376 return Opcode == Instruction::ExtractElement
2377 ? StoreCost * NumElems + LoadCost
2378 : (StoreCost + LoadCost) * NumElems + StoreCost;
2379 }
2380
2381 // For unsupported scalable vector.
2382 if (LT.second.isScalableVector() && !LT.first.isValid())
2383 return LT.first;
2384
2385 // Mask vector extract/insert is expanded via e8.
2386 if (Val->getScalarSizeInBits() == 1) {
2387 VectorType *WideTy =
2388 VectorType::get(Type::getInt8Ty(Val->getContext()),
2389 cast<VectorType>(Val)->getElementCount());
2390 if (Opcode == Instruction::ExtractElement) {
2391 InstructionCost ExtendCost
2392 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2393 TTI::CastContextHint::None, CostKind);
2394 InstructionCost ExtractCost
2395 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2396 return ExtendCost + ExtractCost;
2397 }
2398 InstructionCost ExtendCost
2399 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2400 TTI::CastContextHint::None, CostKind);
2401 InstructionCost InsertCost
2402 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2403 InstructionCost TruncCost
2404 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
2405 TTI::CastContextHint::None, CostKind);
2406 return ExtendCost + InsertCost + TruncCost;
2407 }
2408
2409
2410 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
2411 // and vslideup + vmv.s.x to insert element to vector.
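// e.g. for a general (non-zero, possibly unknown) index, an extract is
// modelled as vslidedown.vx + vmv.x.s (BaseCost 1 + SlideCost 1) and an
// insert as addi + vslideup.vx + vmv.s.x (BaseCost 1 + SlideCost 2).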
2412 unsigned BaseCost = 1;
2413 // For insertelement we need an extra addi to add 1 to the index used by vslideup.
2414 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
2415
2416 if (Index != -1U) {
2417 // The type may be split. For fixed-width vectors we can normalize the
2418 // index to the new type.
2419 if (LT.second.isFixedLengthVector()) {
2420 unsigned Width = LT.second.getVectorNumElements();
2421 Index = Index % Width;
2422 }
2423
2424 // If exact VLEN is known, we will insert/extract into the appropriate
2425 // subvector with no additional subvector insert/extract cost.
2426 if (auto VLEN = ST->getRealVLen()) {
2427 unsigned EltSize = LT.second.getScalarSizeInBits();
2428 unsigned M1Max = *VLEN / EltSize;
2429 Index = Index % M1Max;
2430 }
2431
2432 if (Index == 0)
2433 // We can extract/insert the first element without vslidedown/vslideup.
2434 SlideCost = 0;
2435 else if (ST->hasVendorXRivosVisni() && isUInt<5>(Index) &&
2436 Val->getScalarType()->isIntegerTy())
2437 SlideCost = 0; // With ri.vinsert/ri.vextract there is no slide needed
2438 else if (Opcode == Instruction::InsertElement)
2439 SlideCost = 1; // With a constant index, we do not need to use addi.
2440 }
2441
2442 // When the vector needs to be split into multiple register groups and the
2443 // index exceeds a single vector register group, we need to insert/extract
2444 // the element via the stack.
2445 if (LT.first > 1 &&
2446 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
2447 LT.second.isScalableVector()))) {
2448 Type *ScalarType = Val->getScalarType();
2449 Align VecAlign = DL.getPrefTypeAlign(Val);
2450 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
2451 // Extra addi for unknown index.
2452 InstructionCost IdxCost = Index == -1U ? 1 : 0;
2453
2454 // Store all split vectors into stack and load the target element.
2455 if (Opcode == Instruction::ExtractElement)
2456 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2457 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
2458 CostKind) +
2459 IdxCost;
2460
2461 // Store all split vectors into stack and store the target element and load
2462 // vectors back.
2463 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2464 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2465 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2466 CostKind) +
2467 IdxCost;
2468 }
2469
2470 // Extracting an i64 element on a target with XLEN=32 needs more instructions.
2471 if (Val->getScalarType()->isIntegerTy() &&
2472 ST->getXLen() < Val->getScalarSizeInBits()) {
2473 // For extractelement, we need the following instructions:
2474 // vsetivli zero, 1, e64, m1, ta, mu (not count)
2475 // vslidedown.vx v8, v8, a0
2476 // vmv.x.s a0, v8
2477 // li a1, 32
2478 // vsrl.vx v8, v8, a1
2479 // vmv.x.s a1, v8
2480
2481 // For insertelement, we need the following instructions:
2482 // vsetivli zero, 2, e32, m4, ta, mu (not count)
2483 // vmv.v.i v12, 0
2484 // vslide1up.vx v16, v12, a1
2485 // vslide1up.vx v12, v16, a0
2486 // addi a0, a2, 1
2487 // vsetvli zero, a0, e64, m4, tu, mu (not count)
2488 // vslideup.vx v8, v12, a2
2489
2490 // TODO: should we count these special vsetvlis?
2491 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2492 }
2493 return BaseCost + SlideCost;
2494}
2495
2499 unsigned Index) const {
2500 if (isa<FixedVectorType>(Val))
2502 Index);
2503
2504 // TODO: This code replicates what LoopVectorize.cpp used to do when asking
2505 // for the cost of extracting the last lane of a scalable vector. It probably
2506 // needs a more accurate cost.
2507 ElementCount EC = cast<VectorType>(Val)->getElementCount();
2508 assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
2509 return getVectorInstrCost(Opcode, Val, CostKind,
2510 EC.getKnownMinValue() - 1 - Index, nullptr,
2511 nullptr);
2512}
2513
2514InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
2515 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2516 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2517 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
2518
2519 // TODO: Handle more cost kinds.
2520 if (CostKind != TTI::TCK_RecipThroughput)
2521 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2522 Args, CxtI);
2523
2524 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2525 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2526 Args, CxtI);
2527
2528 // Skip if scalar size of Ty is bigger than ELEN.
2529 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
2530 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2531 Args, CxtI);
2532
2533 // Legalize the type.
2534 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2535
2536 // TODO: Handle scalar type.
2537 if (!LT.second.isVector())
2538 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2539 Args, CxtI);
2540
2541 // f16 with zvfhmin and bf16 will be promoted to f32.
2542 // FIXME: nxv32[b]f16 will be custom lowered and split.
2543 unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
2544 InstructionCost CastCost = 0;
2545 if ((LT.second.getVectorElementType() == MVT::f16 ||
2546 LT.second.getVectorElementType() == MVT::bf16) &&
2547 TLI->getOperationAction(ISDOpcode, LT.second) ==
2548 TargetLoweringBase::LegalizeAction::Promote) {
2549 MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
2550 Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
2551 Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2552 // Add cost of extending arguments
2553 CastCost += LT.first * Args.size() *
2554 getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
2555 TTI::CastContextHint::None, CostKind);
2556 // Add cost of truncating result
2557 CastCost +=
2558 LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
2559 TTI::CastContextHint::None, CostKind);
2560 // Compute cost of op in promoted type
2561 LT.second = PromotedVT;
2562 }
2563
2564 auto getConstantMatCost =
2565 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
2566 if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
2567 // Two sub-cases:
2568 // * Has a 5 bit immediate operand which can be splatted.
2569 // * Has a larger immediate which must be materialized in scalar register
2570 // We return 0 for both as we currently ignore the cost of materializing
2571 // scalar constants in GPRs.
2572 return 0;
2573
2574 return getConstantPoolLoadCost(Ty, CostKind);
2575 };
2576
2577 // Add the cost of materializing any constant vectors required.
2578 InstructionCost ConstantMatCost = 0;
2579 if (Op1Info.isConstant())
2580 ConstantMatCost += getConstantMatCost(0, Op1Info);
2581 if (Op2Info.isConstant())
2582 ConstantMatCost += getConstantMatCost(1, Op2Info);
2583
2584 unsigned Op;
2585 switch (ISDOpcode) {
2586 case ISD::ADD:
2587 case ISD::SUB:
2588 Op = RISCV::VADD_VV;
2589 break;
2590 case ISD::SHL:
2591 case ISD::SRL:
2592 case ISD::SRA:
2593 Op = RISCV::VSLL_VV;
2594 break;
2595 case ISD::AND:
2596 case ISD::OR:
2597 case ISD::XOR:
2598 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
2599 break;
2600 case ISD::MUL:
2601 case ISD::MULHS:
2602 case ISD::MULHU:
2603 Op = RISCV::VMUL_VV;
2604 break;
2605 case ISD::SDIV:
2606 case ISD::UDIV:
2607 Op = RISCV::VDIV_VV;
2608 break;
2609 case ISD::SREM:
2610 case ISD::UREM:
2611 Op = RISCV::VREM_VV;
2612 break;
2613 case ISD::FADD:
2614 case ISD::FSUB:
2615 Op = RISCV::VFADD_VV;
2616 break;
2617 case ISD::FMUL:
2618 Op = RISCV::VFMUL_VV;
2619 break;
2620 case ISD::FDIV:
2621 Op = RISCV::VFDIV_VV;
2622 break;
2623 case ISD::FNEG:
2624 Op = RISCV::VFSGNJN_VV;
2625 break;
2626 default:
2627 // Assuming all other instructions have the same cost until a need arises to
2628 // differentiate them.
2629 return CastCost + ConstantMatCost +
2630 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2631 Args, CxtI);
2632 }
2633
2634 InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
2635 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
2636 // ops are twice as expensive as integer ops. Do the same for vectors so
2637 // scalar floating point ops aren't cheaper than their vector equivalents.
2638 if (Ty->isFPOrFPVectorTy())
2639 InstrCost *= 2;
2640 return CastCost + ConstantMatCost + LT.first * InstrCost;
2641}
2642
2643// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
2644InstructionCost RISCVTTIImpl::getPointersChainCost(
2645 ArrayRef<const Value *> Ptrs, const Value *Base,
2646 const TTI::PointersChainInfo &Info, Type *AccessTy,
2647 TTI::TargetCostKind CostKind) const {
2648 InstructionCost Cost = TTI::TCC_Free;
2649 // In the basic model we take into account GEP instructions only
2650 // (although here can come alloca instruction, a value, constants and/or
2651 // constant expressions, PHIs, bitcasts ... whatever allowed to be used as a
2652 // pointer). Typically, if Base is not a GEP instruction and all the
2653 // pointers are relative to the same base address, all the rest are
2654 // either GEP instructions, PHIs, bitcasts or constants. When we have the same
2655 // base, we just calculate the cost of each non-Base GEP as an ADD operation
2656 // if any of their indices is non-constant.
2657 // If there are no known dependencies between the pointers, the cost is the
2658 // sum of the costs of the GEP instructions.
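// For example, for a unit-stride chain p, p+4, p+8 over i32 with a legal
// reg+imm addressing mode, the non-base GEPs are assumed to fold into the
// memory accesses and add no cost; otherwise each GEP with a non-constant
// index is costed as an ADD.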
2659 for (auto [I, V] : enumerate(Ptrs)) {
2660 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
2661 if (!GEP)
2662 continue;
2663 if (Info.isSameBase() && V != Base) {
2664 if (GEP->hasAllConstantIndices())
2665 continue;
2666 // If the chain is unit-stride and BaseReg + stride*i is a legal
2667 // addressing mode, then presume the base GEP is sitting around in a
2668 // register somewhere and check if we can fold the offset relative to
2669 // it.
2670 unsigned Stride = DL.getTypeStoreSize(AccessTy);
2671 if (Info.isUnitStride() &&
2672 isLegalAddressingMode(AccessTy,
2673 /* BaseGV */ nullptr,
2674 /* BaseOffset */ Stride * I,
2675 /* HasBaseReg */ true,
2676 /* Scale */ 0,
2677 GEP->getType()->getPointerAddressSpace()))
2678 continue;
2679 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
2680 {TTI::OK_AnyValue, TTI::OP_None},
2681 {TTI::OK_AnyValue, TTI::OP_None}, {});
2682 } else {
2683 SmallVector<const Value *> Indices(GEP->indices());
2684 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
2685 Indices, AccessTy, CostKind);
2686 }
2687 }
2688 return Cost;
2689}
2690
2691void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2692 TTI::UnrollingPreferences &UP,
2693 OptimizationRemarkEmitter *ORE) const {
2694 // TODO: More tuning on benchmarks and metrics with changes as needed
2695 // would apply to all settings below to enable performance.
2696
2697
2698 if (ST->enableDefaultUnroll())
2699 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2700
2701 // Enable Upper bound unrolling universally, not dependent upon the conditions
2702 // below.
2703 UP.UpperBound = true;
2704
2705 // Disable loop unrolling for Oz and Os.
2706 UP.OptSizeThreshold = 0;
2707 UP.PartialOptSizeThreshold = 0;
2708 if (L->getHeader()->getParent()->hasOptSize())
2709 return;
2710
2711 SmallVector<BasicBlock *, 4> ExitingBlocks;
2712 L->getExitingBlocks(ExitingBlocks);
2713 LLVM_DEBUG(dbgs() << "Loop has:\n"
2714 << "Blocks: " << L->getNumBlocks() << "\n"
2715 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2716
2717 // Only allow another exit other than the latch. This acts as an early exit
2718 // as it mirrors the profitability calculation of the runtime unroller.
2719 if (ExitingBlocks.size() > 2)
2720 return;
2721
2722 // Limit the CFG of the loop body for targets with a branch predictor.
2723 // Allowing 4 blocks permits if-then-else diamonds in the body.
2724 if (L->getNumBlocks() > 4)
2725 return;
2726
2727 // Scan the loop: don't unroll loops with calls as this could prevent
2728 // inlining. Don't unroll auto-vectorized loops either, though do allow
2729 // unrolling of the scalar remainder.
2730 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
2731 InstructionCost Cost = 0;
2732 for (auto *BB : L->getBlocks()) {
2733 for (auto &I : *BB) {
2734 // Both auto-vectorized loops and the scalar remainder have the
2735 // isvectorized attribute, so differentiate between them by the presence
2736 // of vector instructions.
2737 if (IsVectorized && I.getType()->isVectorTy())
2738 return;
2739
2740 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2741 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2742 if (!isLoweredToCall(F))
2743 continue;
2744 }
2745 return;
2746 }
2747
2748 SmallVector<const Value *> Operands(I.operand_values());
2749 Cost += getInstructionCost(&I, Operands,
2750 TargetTransformInfo::TCK_SizeAndLatency);
2751 }
2752 }
2753
2754 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2755
2756 UP.Partial = true;
2757 UP.Runtime = true;
2758 UP.UnrollRemainder = true;
2759 UP.UnrollAndJam = true;
2760
2761 // Force-unrolling small loops can be very useful because of the
2762 // branch-taken cost of the backedge.
2763 if (Cost < 12)
2764 UP.Force = true;
2765}
2766
2771
2772bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
2773 MemIntrinsicInfo &Info) const {
2774 const DataLayout &DL = getDataLayout();
2775 Intrinsic::ID IID = Inst->getIntrinsicID();
2776 LLVMContext &C = Inst->getContext();
2777 bool HasMask = false;
2778
2779 auto getSegNum = [](const IntrinsicInst *II, unsigned PtrOperandNo,
2780 bool IsWrite) -> int64_t {
2781 if (auto *TarExtTy =
2782 dyn_cast<TargetExtType>(II->getArgOperand(0)->getType()))
2783 return TarExtTy->getIntParameter(0);
2784
2785 return 1;
2786 };
2787
2788 switch (IID) {
2789 case Intrinsic::riscv_vle_mask:
2790 case Intrinsic::riscv_vse_mask:
2791 case Intrinsic::riscv_vlseg2_mask:
2792 case Intrinsic::riscv_vlseg3_mask:
2793 case Intrinsic::riscv_vlseg4_mask:
2794 case Intrinsic::riscv_vlseg5_mask:
2795 case Intrinsic::riscv_vlseg6_mask:
2796 case Intrinsic::riscv_vlseg7_mask:
2797 case Intrinsic::riscv_vlseg8_mask:
2798 case Intrinsic::riscv_vsseg2_mask:
2799 case Intrinsic::riscv_vsseg3_mask:
2800 case Intrinsic::riscv_vsseg4_mask:
2801 case Intrinsic::riscv_vsseg5_mask:
2802 case Intrinsic::riscv_vsseg6_mask:
2803 case Intrinsic::riscv_vsseg7_mask:
2804 case Intrinsic::riscv_vsseg8_mask:
2805 HasMask = true;
2806 [[fallthrough]];
2807 case Intrinsic::riscv_vle:
2808 case Intrinsic::riscv_vse:
2809 case Intrinsic::riscv_vlseg2:
2810 case Intrinsic::riscv_vlseg3:
2811 case Intrinsic::riscv_vlseg4:
2812 case Intrinsic::riscv_vlseg5:
2813 case Intrinsic::riscv_vlseg6:
2814 case Intrinsic::riscv_vlseg7:
2815 case Intrinsic::riscv_vlseg8:
2816 case Intrinsic::riscv_vsseg2:
2817 case Intrinsic::riscv_vsseg3:
2818 case Intrinsic::riscv_vsseg4:
2819 case Intrinsic::riscv_vsseg5:
2820 case Intrinsic::riscv_vsseg6:
2821 case Intrinsic::riscv_vsseg7:
2822 case Intrinsic::riscv_vsseg8: {
2823 // Intrinsic interface:
2824 // riscv_vle(merge, ptr, vl)
2825 // riscv_vle_mask(merge, ptr, mask, vl, policy)
2826 // riscv_vse(val, ptr, vl)
2827 // riscv_vse_mask(val, ptr, mask, vl, policy)
2828 // riscv_vlseg#(merge, ptr, vl, sew)
2829 // riscv_vlseg#_mask(merge, ptr, mask, vl, policy, sew)
2830 // riscv_vsseg#(val, ptr, vl, sew)
2831 // riscv_vsseg#_mask(val, ptr, mask, vl, sew)
2832 bool IsWrite = Inst->getType()->isVoidTy();
2833 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
2834 // The results of segment loads are TargetExtType.
2835 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
2836 unsigned SEW =
2837 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
2838 ->getZExtValue();
2839 Ty = TarExtTy->getTypeParameter(0U);
2840 Ty = ScalableVectorType::get(
2841 IntegerType::get(C, SEW),
2842 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
2843 }
2844 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
2845 unsigned VLIndex = RVVIInfo->VLOperand;
2846 unsigned PtrOperandNo = VLIndex - 1 - HasMask;
2847 MaybeAlign Alignment =
2848 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
2849 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
2850 Value *Mask = ConstantInt::getTrue(MaskType);
2851 if (HasMask)
2852 Mask = Inst->getArgOperand(VLIndex - 1);
2853 Value *EVL = Inst->getArgOperand(VLIndex);
2854 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
2855 // RVV uses contiguous elements as a segment.
2856 if (SegNum > 1) {
2857 unsigned ElemSize = Ty->getScalarSizeInBits();
2858 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
2859 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
2860 }
2861 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
2862 Alignment, Mask, EVL);
2863 return true;
2864 }
2865 case Intrinsic::riscv_vlse_mask:
2866 case Intrinsic::riscv_vsse_mask:
2867 case Intrinsic::riscv_vlsseg2_mask:
2868 case Intrinsic::riscv_vlsseg3_mask:
2869 case Intrinsic::riscv_vlsseg4_mask:
2870 case Intrinsic::riscv_vlsseg5_mask:
2871 case Intrinsic::riscv_vlsseg6_mask:
2872 case Intrinsic::riscv_vlsseg7_mask:
2873 case Intrinsic::riscv_vlsseg8_mask:
2874 case Intrinsic::riscv_vssseg2_mask:
2875 case Intrinsic::riscv_vssseg3_mask:
2876 case Intrinsic::riscv_vssseg4_mask:
2877 case Intrinsic::riscv_vssseg5_mask:
2878 case Intrinsic::riscv_vssseg6_mask:
2879 case Intrinsic::riscv_vssseg7_mask:
2880 case Intrinsic::riscv_vssseg8_mask:
2881 HasMask = true;
2882 [[fallthrough]];
2883 case Intrinsic::riscv_vlse:
2884 case Intrinsic::riscv_vsse:
2885 case Intrinsic::riscv_vlsseg2:
2886 case Intrinsic::riscv_vlsseg3:
2887 case Intrinsic::riscv_vlsseg4:
2888 case Intrinsic::riscv_vlsseg5:
2889 case Intrinsic::riscv_vlsseg6:
2890 case Intrinsic::riscv_vlsseg7:
2891 case Intrinsic::riscv_vlsseg8:
2892 case Intrinsic::riscv_vssseg2:
2893 case Intrinsic::riscv_vssseg3:
2894 case Intrinsic::riscv_vssseg4:
2895 case Intrinsic::riscv_vssseg5:
2896 case Intrinsic::riscv_vssseg6:
2897 case Intrinsic::riscv_vssseg7:
2898 case Intrinsic::riscv_vssseg8: {
2899 // Intrinsic interface:
2900 // riscv_vlse(merge, ptr, stride, vl)
2901 // riscv_vlse_mask(merge, ptr, stride, mask, vl, policy)
2902 // riscv_vsse(val, ptr, stride, vl)
2903 // riscv_vsse_mask(val, ptr, stride, mask, vl, policy)
2904 // riscv_vlsseg#(merge, ptr, offset, vl, sew)
2905 // riscv_vlsseg#_mask(merge, ptr, offset, mask, vl, policy, sew)
2906 // riscv_vssseg#(val, ptr, offset, vl, sew)
2907 // riscv_vssseg#_mask(val, ptr, offset, mask, vl, sew)
2908 bool IsWrite = Inst->getType()->isVoidTy();
2909 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
2910 // The results of segment loads are TargetExtType.
2911 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
2912 unsigned SEW =
2913 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
2914 ->getZExtValue();
2915 Ty = TarExtTy->getTypeParameter(0U);
2916 Ty = ScalableVectorType::get(
2917 IntegerType::get(C, SEW),
2918 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
2919 }
2920 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
2921 unsigned VLIndex = RVVIInfo->VLOperand;
2922 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
2923 MaybeAlign Alignment =
2924 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
2925
2926 Value *Stride = Inst->getArgOperand(PtrOperandNo + 1);
2927 // Use the pointer alignment as the element alignment if the stride is a
2928 // multiple of the pointer alignment. Otherwise, the element alignment
2929 // should be the greatest common divisor of pointer alignment and stride.
2930 // For simplicity, just treat the elements as unaligned in that case.
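// e.g. an 8-byte-aligned pointer with a constant stride of 24 keeps the
// 8-byte element alignment, while a stride of 12 or an unknown stride falls
// back to Align(1).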
2931 unsigned PointerAlign = Alignment.valueOrOne().value();
2932 if (!isa<ConstantInt>(Stride) ||
2933 cast<ConstantInt>(Stride)->getZExtValue() % PointerAlign != 0)
2934 Alignment = Align(1);
2935
2936 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
2937 Value *Mask = ConstantInt::getTrue(MaskType);
2938 if (HasMask)
2939 Mask = Inst->getArgOperand(VLIndex - 1);
2940 Value *EVL = Inst->getArgOperand(VLIndex);
2941 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
2942 // RVV uses contiguous elements as a segment.
2943 if (SegNum > 1) {
2944 unsigned ElemSize = Ty->getScalarSizeInBits();
2945 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
2946 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
2947 }
2948 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
2949 Alignment, Mask, EVL, Stride);
2950 return true;
2951 }
2952 case Intrinsic::riscv_vloxei_mask:
2953 case Intrinsic::riscv_vluxei_mask:
2954 case Intrinsic::riscv_vsoxei_mask:
2955 case Intrinsic::riscv_vsuxei_mask:
2956 case Intrinsic::riscv_vloxseg2_mask:
2957 case Intrinsic::riscv_vloxseg3_mask:
2958 case Intrinsic::riscv_vloxseg4_mask:
2959 case Intrinsic::riscv_vloxseg5_mask:
2960 case Intrinsic::riscv_vloxseg6_mask:
2961 case Intrinsic::riscv_vloxseg7_mask:
2962 case Intrinsic::riscv_vloxseg8_mask:
2963 case Intrinsic::riscv_vluxseg2_mask:
2964 case Intrinsic::riscv_vluxseg3_mask:
2965 case Intrinsic::riscv_vluxseg4_mask:
2966 case Intrinsic::riscv_vluxseg5_mask:
2967 case Intrinsic::riscv_vluxseg6_mask:
2968 case Intrinsic::riscv_vluxseg7_mask:
2969 case Intrinsic::riscv_vluxseg8_mask:
2970 case Intrinsic::riscv_vsoxseg2_mask:
2971 case Intrinsic::riscv_vsoxseg3_mask:
2972 case Intrinsic::riscv_vsoxseg4_mask:
2973 case Intrinsic::riscv_vsoxseg5_mask:
2974 case Intrinsic::riscv_vsoxseg6_mask:
2975 case Intrinsic::riscv_vsoxseg7_mask:
2976 case Intrinsic::riscv_vsoxseg8_mask:
2977 case Intrinsic::riscv_vsuxseg2_mask:
2978 case Intrinsic::riscv_vsuxseg3_mask:
2979 case Intrinsic::riscv_vsuxseg4_mask:
2980 case Intrinsic::riscv_vsuxseg5_mask:
2981 case Intrinsic::riscv_vsuxseg6_mask:
2982 case Intrinsic::riscv_vsuxseg7_mask:
2983 case Intrinsic::riscv_vsuxseg8_mask:
2984 HasMask = true;
2985 [[fallthrough]];
2986 case Intrinsic::riscv_vloxei:
2987 case Intrinsic::riscv_vluxei:
2988 case Intrinsic::riscv_vsoxei:
2989 case Intrinsic::riscv_vsuxei:
2990 case Intrinsic::riscv_vloxseg2:
2991 case Intrinsic::riscv_vloxseg3:
2992 case Intrinsic::riscv_vloxseg4:
2993 case Intrinsic::riscv_vloxseg5:
2994 case Intrinsic::riscv_vloxseg6:
2995 case Intrinsic::riscv_vloxseg7:
2996 case Intrinsic::riscv_vloxseg8:
2997 case Intrinsic::riscv_vluxseg2:
2998 case Intrinsic::riscv_vluxseg3:
2999 case Intrinsic::riscv_vluxseg4:
3000 case Intrinsic::riscv_vluxseg5:
3001 case Intrinsic::riscv_vluxseg6:
3002 case Intrinsic::riscv_vluxseg7:
3003 case Intrinsic::riscv_vluxseg8:
3004 case Intrinsic::riscv_vsoxseg2:
3005 case Intrinsic::riscv_vsoxseg3:
3006 case Intrinsic::riscv_vsoxseg4:
3007 case Intrinsic::riscv_vsoxseg5:
3008 case Intrinsic::riscv_vsoxseg6:
3009 case Intrinsic::riscv_vsoxseg7:
3010 case Intrinsic::riscv_vsoxseg8:
3011 case Intrinsic::riscv_vsuxseg2:
3012 case Intrinsic::riscv_vsuxseg3:
3013 case Intrinsic::riscv_vsuxseg4:
3014 case Intrinsic::riscv_vsuxseg5:
3015 case Intrinsic::riscv_vsuxseg6:
3016 case Intrinsic::riscv_vsuxseg7:
3017 case Intrinsic::riscv_vsuxseg8: {
3018 // Intrinsic interface (only listed ordered version):
3019 // riscv_vloxei(merge, ptr, index, vl)
3020 // riscv_vloxei_mask(merge, ptr, index, mask, vl, policy)
3021 // riscv_vsoxei(val, ptr, index, vl)
3022 // riscv_vsoxei_mask(val, ptr, index, mask, vl, policy)
3023 // riscv_vloxseg#(merge, ptr, index, vl, sew)
3024 // riscv_vloxseg#_mask(merge, ptr, index, mask, vl, policy, sew)
3025 // riscv_vsoxseg#(val, ptr, index, vl, sew)
3026 // riscv_vsoxseg#_mask(val, ptr, index, mask, vl, sew)
3027 bool IsWrite = Inst->getType()->isVoidTy();
3028 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
3029 // The results of segment loads are TargetExtType.
3030 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
3031 unsigned SEW =
3032 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
3033 ->getZExtValue();
3034 Ty = TarExtTy->getTypeParameter(0U);
3035 Ty = ScalableVectorType::get(
3036 IntegerType::get(C, SEW),
3037 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
3038 }
3039 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
3040 unsigned VLIndex = RVVIInfo->VLOperand;
3041 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
3042 Value *Mask;
3043 if (HasMask) {
3044 Mask = Inst->getArgOperand(VLIndex - 1);
3045 } else {
3046 // Mask cannot be nullptr here: vector GEP produces <vscale x N x ptr>,
3047 // and casting that to scalar i64 triggers a vector/scalar mismatch
3048 // assertion in CreatePointerCast. Use an all-true mask so ASan lowers it
3049 // via extractelement instead.
3050 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3051 Mask = ConstantInt::getTrue(MaskType);
3052 }
3053 Value *EVL = Inst->getArgOperand(VLIndex);
3054 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3055 // RVV uses contiguous elements as a segment.
3056 if (SegNum > 1) {
3057 unsigned ElemSize = Ty->getScalarSizeInBits();
3058 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3059 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3060 }
3061 Value *OffsetOp = Inst->getArgOperand(PtrOperandNo + 1);
3062 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3063 Align(1), Mask, EVL,
3064 /* Stride */ nullptr, OffsetOp);
3065 return true;
3066 }
3067 }
3068 return false;
3069}
3070
3072 if (Ty->isVectorTy()) {
3073 // f16 with only zvfhmin and bf16 will be promoted to f32
3074 Type *EltTy = cast<VectorType>(Ty)->getElementType();
3075 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
3076 EltTy->isBFloatTy())
3077 Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
3078 cast<VectorType>(Ty));
3079
3080 TypeSize Size = DL.getTypeSizeInBits(Ty);
3081 if (Size.isScalable() && ST->hasVInstructions())
3082 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
3083
3084 if (ST->useRVVForFixedLengthVectors())
3085 return divideCeil(Size, ST->getRealMinVLen());
3086 }
3087
3088 return BaseT::getRegUsageForType(Ty);
3089}
3090
3091unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
3092 if (SLPMaxVF.getNumOccurrences())
3093 return SLPMaxVF;
3094
3095 // Return how many elements can fit in getRegisterBitWidth. This is the
3096 // same routine as used in LoopVectorizer. We should probably be
3097 // accounting for whether we actually have instructions with the right
3098 // lane type, but we don't have enough information to do that without
3099 // some additional plumbing which hasn't been justified yet.
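// For example, with a 128-bit fixed vector register width and 32-bit
// elements this returns 4; SLPMaxVF (checked above) overrides the result.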
3100 TypeSize RegWidth =
3101 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
3102 // If no vector registers, or absurd element widths, disable
3103 // vectorization by returning 1.
3104 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
3105}
3106
3110
3112 return ST->enableUnalignedVectorMem();
3113}
3114
3115TTI::AddressingModeKind
3116RISCVTTIImpl::getPreferredAddressingMode(const Loop *L,
3117 ScalarEvolution *SE) const {
3118 if (ST->hasVendorXCVmem() && !ST->is64Bit())
3119 return TTI::AMK_PostIndexed;
3120
3121 return BaseT::getPreferredAddressingMode(L, SE);
3122}
3123
3124bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
3125 const TargetTransformInfo::LSRCost &C2) const {
3126 // The RISC-V-specific behavior here is to give the instruction count first priority.
3127 // If we need to emit adds inside the loop to add up base registers, then
3128 // we need at least one extra temporary register.
3129 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
3130 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
3131 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
3132 C1.NumIVMuls, C1.NumBaseAdds,
3133 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
3134 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
3135 C2.NumIVMuls, C2.NumBaseAdds,
3136 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
3137}
3138
3139bool RISCVTTIImpl::isLegalMaskedExpandLoad(Type *DataTy,
3140 Align Alignment) const {
3141 auto *VTy = dyn_cast<VectorType>(DataTy);
3142 if (!VTy || VTy->isScalableTy())
3143 return false;
3144
3145 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3146 return false;
3147
3148 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
3149 // scalarize these types with LMUL >= maximum fixed-length LMUL.
3150 if (VTy->getElementType()->isIntegerTy(8))
3151 if (VTy->getElementCount().getFixedValue() > 256)
3152 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
3153 ST->getMaxLMULForFixedLengthVectors();
3154 return true;
3155}
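// Worked example for the i8 clause above (assuming VLEN=512 and a maximum
// fixed-length LMUL of 8): a 512 x i8 vector is 4096 bits, and 4096 / 512 = 8
// is not below the LMUL limit, so it is rejected; a 320 x i8 vector is 2560
// bits, 2560 / 512 = 5 < 8, so it is still accepted even though it has more
// than 256 elements.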
3156
3158 Align Alignment) const {
3159 auto *VTy = dyn_cast<VectorType>(DataTy);
3160 if (!VTy || VTy->isScalableTy())
3161 return false;
3162
3163 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3164 return false;
3165 return true;
3166}
3167
3168/// See if \p I should be considered for address type promotion. We check if
3169/// \p I is a sext with the right type that is used in memory accesses. If it
3170/// is used in a "complex" getelementptr, we allow it to be promoted without
3171/// finding other sext instructions that sign extended the same initial value.
3172/// A getelementptr is considered "complex" if it has more than 2 operands.
3173bool RISCVTTIImpl::shouldConsiderAddressTypePromotion(
3174 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
3175 bool Considerable = false;
3176 AllowPromotionWithoutCommonHeader = false;
3177 if (!isa<SExtInst>(&I))
3178 return false;
3179 Type *ConsideredSExtType =
3180 Type::getInt64Ty(I.getParent()->getParent()->getContext());
3181 if (I.getType() != ConsideredSExtType)
3182 return false;
3183 // See if the sext is the one with the right type and used in at least one
3184 // GetElementPtrInst.
3185 for (const User *U : I.users()) {
3186 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
3187 Considerable = true;
3188 // A getelementptr is considered "complex" if it has more than 2
3189 // operands. We will promote a SExt used in such a complex GEP, as we
3190 // expect some computation to be merged when it is done on 64 bits.
3191 if (GEPInst->getNumOperands() > 2) {
3192 AllowPromotionWithoutCommonHeader = true;
3193 break;
3194 }
3195 }
3196 }
3197 return Considerable;
3198}
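// Illustrative IR for the "complex" GEP case (a hedged sketch):
//   %idx.ext = sext i32 %idx to i64
//   %p = getelementptr inbounds [16 x i32], ptr %base, i64 0, i64 %idx.ext
// The GEP has three operands, so the sext is reported as considerable and
// AllowPromotionWithoutCommonHeader is set, even if no other sext of %idx
// exists.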
3199
3200bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
3201 switch (Opcode) {
3202 case Instruction::Add:
3203 case Instruction::Sub:
3204 case Instruction::Mul:
3205 case Instruction::And:
3206 case Instruction::Or:
3207 case Instruction::Xor:
3208 case Instruction::FAdd:
3209 case Instruction::FSub:
3210 case Instruction::FMul:
3211 case Instruction::FDiv:
3212 case Instruction::ICmp:
3213 case Instruction::FCmp:
3214 return true;
3215 case Instruction::Shl:
3216 case Instruction::LShr:
3217 case Instruction::AShr:
3218 case Instruction::UDiv:
3219 case Instruction::SDiv:
3220 case Instruction::URem:
3221 case Instruction::SRem:
3222 case Instruction::Select:
3223 return Operand == 1;
3224 default:
3225 return false;
3226 }
3227}
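// Worked example: a vector shl can only keep operand 1 (the shift amount) as a
// scalar splat because vsll.vx takes the amount in a GPR and there is no form
// with a scalar shifted value, while commutative ops such as add or and accept
// a splat in either position via vadd.vx / vand.vx plus operand swapping.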
3228
3229bool RISCVTTIImpl::canSplatOperand(Instruction *I, int Operand) const {
3230 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3231 return false;
3232
3233 if (canSplatOperand(I->getOpcode(), Operand))
3234 return true;
3235
3236 auto *II = dyn_cast<IntrinsicInst>(I);
3237 if (!II)
3238 return false;
3239
3240 switch (II->getIntrinsicID()) {
3241 case Intrinsic::fma:
3242 case Intrinsic::vp_fma:
3243 case Intrinsic::fmuladd:
3244 case Intrinsic::vp_fmuladd:
3245 return Operand == 0 || Operand == 1;
3246 case Intrinsic::vp_shl:
3247 case Intrinsic::vp_lshr:
3248 case Intrinsic::vp_ashr:
3249 case Intrinsic::vp_udiv:
3250 case Intrinsic::vp_sdiv:
3251 case Intrinsic::vp_urem:
3252 case Intrinsic::vp_srem:
3253 case Intrinsic::ssub_sat:
3254 case Intrinsic::vp_ssub_sat:
3255 case Intrinsic::usub_sat:
3256 case Intrinsic::vp_usub_sat:
3257 case Intrinsic::vp_select:
3258 return Operand == 1;
3259 // These intrinsics are commutative.
3260 case Intrinsic::vp_add:
3261 case Intrinsic::vp_mul:
3262 case Intrinsic::vp_and:
3263 case Intrinsic::vp_or:
3264 case Intrinsic::vp_xor:
3265 case Intrinsic::vp_fadd:
3266 case Intrinsic::vp_fmul:
3267 case Intrinsic::vp_icmp:
3268 case Intrinsic::vp_fcmp:
3269 case Intrinsic::smin:
3270 case Intrinsic::vp_smin:
3271 case Intrinsic::umin:
3272 case Intrinsic::vp_umin:
3273 case Intrinsic::smax:
3274 case Intrinsic::vp_smax:
3275 case Intrinsic::umax:
3276 case Intrinsic::vp_umax:
3277 case Intrinsic::sadd_sat:
3278 case Intrinsic::vp_sadd_sat:
3279 case Intrinsic::uadd_sat:
3280 case Intrinsic::vp_uadd_sat:
3281 // These intrinsics have 'vr' versions.
3282 case Intrinsic::vp_sub:
3283 case Intrinsic::vp_fsub:
3284 case Intrinsic::vp_fdiv:
3285 return Operand == 0 || Operand == 1;
3286 default:
3287 return false;
3288 }
3289}
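// Worked example: vp.sub can splat either operand, since a scalar left-hand
// side maps to vrsub.vx and a scalar right-hand side to vsub.vx, whereas
// vp.udiv only has vdivu.vx and so qualifies for the divisor (operand 1) only.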
3290
3291/// Check if sinking \p I's operands to I's basic block is profitable, because
3292/// the operands can be folded into a target instruction, e.g.
3293/// splats of scalars can fold into vector instructions.
3294bool RISCVTTIImpl::isProfitableToSinkOperands(
3295 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
3296 using namespace llvm::PatternMatch;
3297
3298 if (I->isBitwiseLogicOp()) {
3299 if (!I->getType()->isVectorTy()) {
3300 if (ST->hasStdExtZbb() || ST->hasStdExtZbkb()) {
3301 for (auto &Op : I->operands()) {
3302 // (and/or/xor X, (not Y)) -> (andn/orn/xnor X, Y)
3303 if (match(Op.get(), m_Not(m_Value()))) {
3304 Ops.push_back(&Op);
3305 return true;
3306 }
3307 }
3308 }
3309 } else if (I->getOpcode() == Instruction::And && ST->hasStdExtZvkb()) {
3310 for (auto &Op : I->operands()) {
3311 // (and X, (not Y)) -> (vandn.vv X, Y)
3312 if (match(Op.get(), m_Not(m_Value()))) {
3313 Ops.push_back(&Op);
3314 return true;
3315 }
3316 // (and X, (splat (not Y))) -> (vandn.vx X, Y)
3317 if (match(Op.get(), m_Shuffle(m_InsertElt(m_Value(), m_Not(m_Value()),
3318 m_ZeroInt()),
3319 m_Value(), m_ZeroMask()))) {
3320 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
3321 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
3322 Ops.push_back(&Not);
3323 Ops.push_back(&InsertElt);
3324 Ops.push_back(&Op);
3325 return true;
3326 }
3327 }
3328 }
3329 }
3330
3331 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3332 return false;
3333
3334 // Only sink splat operands if the target prefers it. Some targets require
3335 // S2V transfer buffers and we can run out of them copying the same value
3336 // repeatedly.
3337 // FIXME: It could still be worth doing if it would improve vector register
3338 // pressure and prevent a vector spill.
3339 if (!ST->sinkSplatOperands())
3340 return false;
3341
3342 for (auto OpIdx : enumerate(I->operands())) {
3343 if (!canSplatOperand(I, OpIdx.index()))
3344 continue;
3345
3346 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
3347 // Make sure we are not already sinking this operand
3348 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
3349 continue;
3350
3351 // We are looking for a splat/vp.splat that can be sunk.
3352 bool IsVPSplat = match(Op, m_Intrinsic<Intrinsic::experimental_vp_splat>(
3353 m_Value(), m_Value(), m_Value()));
3354 if (!IsVPSplat &&
3355 !match(Op, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
3356 m_Value(), m_ZeroMask())))
3357 continue;
3358
3359 // Don't sink i1 splats.
3360 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
3361 continue;
3362
3363 // All uses of the shuffle should be sunk to avoid duplicating it across
3364 // GPR and vector registers.
3365 for (Use &U : Op->uses()) {
3366 Instruction *Insn = cast<Instruction>(U.getUser());
3367 if (!canSplatOperand(Insn, U.getOperandNo()))
3368 return false;
3369 }
3370
3371 // Sink any fpexts since they might be used in a widening fp pattern.
3372 if (IsVPSplat) {
3373 if (isa<FPExtInst>(Op->getOperand(0)))
3374 Ops.push_back(&Op->getOperandUse(0));
3375 } else {
3376 Use *InsertEltUse = &Op->getOperandUse(0);
3377 auto *InsertElt = cast<InsertElementInst>(InsertEltUse);
3378 if (isa<FPExtInst>(InsertElt->getOperand(1)))
3379 Ops.push_back(&InsertElt->getOperandUse(1));
3380 Ops.push_back(InsertEltUse);
3381 }
3382 Ops.push_back(&OpIdx.value());
3383 }
3384 return true;
3385}
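// Illustrative IR (a hedged sketch): a splat built outside a loop,
//   %ins   = insertelement <vscale x 4 x i32> poison, i32 %x, i64 0
//   %splat = shufflevector <vscale x 4 x i32> %ins,
//                          <vscale x 4 x i32> poison,
//                          <vscale x 4 x i32> zeroinitializer
// that feeds an in-loop add is reported here so CodeGenPrepare can sink %ins
// and %splat next to the add, letting instruction selection fold the whole
// sequence into a single vadd.vx inside the loop.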
3386
3387TTI::MemCmpExpansionOptions
3388RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3389 TTI::MemCmpExpansionOptions Options;
3390 // TODO: Enable expansion when unaligned access is not supported after we fix
3391 // issues in ExpandMemcmp.
3392 if (!ST->enableUnalignedScalarMem())
3393 return Options;
3394
3395 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
3396 return Options;
3397
3398 Options.AllowOverlappingLoads = true;
3399 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3400 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3401 if (ST->is64Bit()) {
3402 Options.LoadSizes = {8, 4, 2, 1};
3403 Options.AllowedTailExpansions = {3, 5, 6};
3404 } else {
3405 Options.LoadSizes = {4, 2, 1};
3406 Options.AllowedTailExpansions = {3};
3407 }
3408
3409 if (IsZeroCmp && ST->hasVInstructions()) {
3410 unsigned VLenB = ST->getRealMinVLen() / 8;
3411 // The minimum size should be `XLen / 8 + 1`, and the maximum size should be
3412 // `VLenB * MaxLMUL` so that it fits in a single register group.
3413 unsigned MinSize = ST->getXLen() / 8 + 1;
3414 unsigned MaxSize = VLenB * ST->getMaxLMULForFixedLengthVectors();
3415 for (unsigned Size = MinSize; Size <= MaxSize; Size++)
3416 Options.LoadSizes.insert(Options.LoadSizes.begin(), Size);
3417 }
3418 return Options;
3419}
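// Worked example (assuming RV64 with Zbb, unaligned scalar access, and enough
// MaxNumLoads budget): a 31-byte memcmp can be expanded inline as three 8-byte
// loads per operand plus one overlapping 8-byte load covering the tail,
// instead of a library call. For equality-only compares with V and VLEN=128
// (and a maximum fixed-length LMUL of 8), whole-object vector loads of 9 to
// 128 bytes are also offered as load sizes.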