RISCVTargetTransformInfo.cpp
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
18#include "llvm/IR/IntrinsicsRISCV.h"
20#include <cmath>
21#include <optional>
22using namespace llvm;
23using namespace llvm::PatternMatch;
24
25#define DEBUG_TYPE "riscvtti"
26
28 "riscv-v-register-bit-width-lmul",
30 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
31 "by autovectorized code. Fractional LMULs are not supported."),
33
35 "riscv-v-slp-max-vf",
37 "Overrides result used for getMaximumVF query which is used "
38 "exclusively by SLP vectorizer."),
40
42 RVVMinTripCount("riscv-v-min-trip-count",
43 cl::desc("Set the lower bound of a trip count to decide on "
44 "vectorization while tail-folding."),
46
 47InstructionCost
 48RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
 49 TTI::TargetCostKind CostKind) const {
 50 // Check if the type is valid for all CostKind
 51 if (!VT.isVector())
 52 return InstructionCost::getInvalid();
 53 size_t NumInstr = OpCodes.size();
 54 if (CostKind == TTI::TCK_CodeSize)
 55 return NumInstr;
 56 InstructionCost LMULCost = TLI->getLMULCost(VT);
 57 if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
 58 return LMULCost * NumInstr;
59 InstructionCost Cost = 0;
60 for (auto Op : OpCodes) {
61 switch (Op) {
62 case RISCV::VRGATHER_VI:
63 Cost += TLI->getVRGatherVICost(VT);
64 break;
65 case RISCV::VRGATHER_VV:
66 Cost += TLI->getVRGatherVVCost(VT);
67 break;
68 case RISCV::VSLIDEUP_VI:
69 case RISCV::VSLIDEDOWN_VI:
70 Cost += TLI->getVSlideVICost(VT);
71 break;
72 case RISCV::VSLIDEUP_VX:
73 case RISCV::VSLIDEDOWN_VX:
74 Cost += TLI->getVSlideVXCost(VT);
75 break;
76 case RISCV::VREDMAX_VS:
77 case RISCV::VREDMIN_VS:
78 case RISCV::VREDMAXU_VS:
79 case RISCV::VREDMINU_VS:
80 case RISCV::VREDSUM_VS:
81 case RISCV::VREDAND_VS:
82 case RISCV::VREDOR_VS:
83 case RISCV::VREDXOR_VS:
84 case RISCV::VFREDMAX_VS:
85 case RISCV::VFREDMIN_VS:
86 case RISCV::VFREDUSUM_VS: {
87 unsigned VL = VT.getVectorMinNumElements();
88 if (!VT.isFixedLengthVector())
89 VL *= *getVScaleForTuning();
90 Cost += Log2_32_Ceil(VL);
91 break;
92 }
93 case RISCV::VFREDOSUM_VS: {
94 unsigned VL = VT.getVectorMinNumElements();
95 if (!VT.isFixedLengthVector())
96 VL *= *getVScaleForTuning();
97 Cost += VL;
98 break;
99 }
100 case RISCV::VMV_X_S:
101 case RISCV::VMV_S_X:
102 case RISCV::VFMV_F_S:
103 case RISCV::VFMV_S_F:
104 case RISCV::VMOR_MM:
105 case RISCV::VMXOR_MM:
106 case RISCV::VMAND_MM:
107 case RISCV::VMANDN_MM:
108 case RISCV::VMNAND_MM:
109 case RISCV::VCPOP_M:
110 case RISCV::VFIRST_M:
111 Cost += 1;
112 break;
113 default:
114 Cost += LMULCost;
115 }
116 }
117 return Cost;
118}
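// [Added commentary, not part of the upstream source] A sketch of how the
// per-opcode model above composes: an unordered reduction such as
// RISCV::VREDSUM_VS over 8 elements is charged Log2_32_Ceil(8) == 3, while the
// ordered RISCV::VFREDOSUM_VS over 8 elements is charged the full VL of 8.
// A hypothetical query such as
//   getRISCVInstructionCost({RISCV::VSLIDEDOWN_VI, RISCV::VREDSUM_VS},
//                           MVT::v8i32, CostKind)
// therefore returns getVSlideVICost(v8i32) + 3 under this model.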
119
121 const RISCVSubtarget *ST,
122 const APInt &Imm, Type *Ty,
124 bool FreeZeroes) {
125 assert(Ty->isIntegerTy() &&
126 "getIntImmCost can only estimate cost of materialising integers");
127
128 // We have a Zero register, so 0 is always free.
129 if (Imm == 0)
130 return TTI::TCC_Free;
131
132 // Otherwise, we check how many instructions it will take to materialise.
133 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
134 /*CompressionCost=*/false, FreeZeroes);
135}
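// [Added commentary, not part of the upstream source] Worked examples of the
// materialisation cost returned above on RV64: 0 is free (zero register),
// 2047 fits a single addi (cost 1), 0x12345 needs lui+addi (cost 2), and an
// arbitrary 64-bit constant may need a longer lui/addi/slli chain;
// RISCVMatInt::getIntMatCost reports the length of that sequence.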
136
140 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
141}
142
143// Look for patterns of shift followed by AND that can be turned into a pair of
144// shifts. We won't need to materialize an immediate for the AND so these can
145// be considered free.
146static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
147 uint64_t Mask = Imm.getZExtValue();
148 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
149 if (!BO || !BO->hasOneUse())
150 return false;
151
152 if (BO->getOpcode() != Instruction::Shl)
153 return false;
154
155 if (!isa<ConstantInt>(BO->getOperand(1)))
156 return false;
157
158 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
159 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
160 // is a mask shifted by c2 bits with c3 leading zeros.
161 if (isShiftedMask_64(Mask)) {
162 unsigned Trailing = llvm::countr_zero(Mask);
163 if (ShAmt == Trailing)
164 return true;
165 }
166
167 return false;
168}
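// [Added commentary, not part of the upstream source] Illustrative instance of
// the pattern above, assuming RV64 with %x in a0:
//   %s = shl i64 %x, 4
//   %a = and i64 %s, 4080          ; 0xFF0: an 8-bit mask shifted left by 4
// The mask's trailing-zero count (4) equals the shift amount, so the AND
// immediate never needs to be materialised; the backend can select
//   slli a0, a0, 56                ; 4 + the mask's 52 leading zeros
//   srli a0, a0, 52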
169
 170// If this i64 AND is part of ((X & -(1 << C1) & 0xffffffff) == (C2 << C1)),
171// DAGCombiner can convert this to (sraiw X, C1) == sext(C2) for RV64. On RV32,
172// the type will be split so only the lower 32 bits need to be compared using
173// (srai/srli X, C) == C2.
174static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm) {
175 if (!Inst->hasOneUse())
176 return false;
177
178 // Look for equality comparison.
179 auto *Cmp = dyn_cast<ICmpInst>(*Inst->user_begin());
180 if (!Cmp || !Cmp->isEquality())
181 return false;
182
183 // Right hand side of comparison should be a constant.
184 auto *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
185 if (!C)
186 return false;
187
188 uint64_t Mask = Imm.getZExtValue();
189
190 // Mask should be of the form -(1 << C) in the lower 32 bits.
191 if (!isUInt<32>(Mask) || !isPowerOf2_32(-uint32_t(Mask)))
192 return false;
193
194 // Comparison constant should be a subset of Mask.
195 uint64_t CmpC = C->getZExtValue();
196 if ((CmpC & Mask) != CmpC)
197 return false;
198
199 // We'll need to sign extend the comparison constant and shift it right. Make
200 // sure the new constant can use addi/xori+seqz/snez.
201 unsigned ShiftBits = llvm::countr_zero(Mask);
202 int64_t NewCmpC = SignExtend64<32>(CmpC) >> ShiftBits;
203 return NewCmpC >= -2048 && NewCmpC <= 2048;
204}
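// [Added commentary, not part of the upstream source] Illustrative instance of
// the pattern above:
//   %a = and i64 %x, 4294967040    ; 0xffffff00 == -(1 << 8) in the low 32 bits
//   %c = icmp eq i64 %a, 4864      ; 0x1300, a subset of the mask
// ShiftBits is 8 and the shifted, sign-extended comparison constant is 0x13,
// which fits addi/xori+seqz/snez, so the AND immediate is considered free.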
205
207 const APInt &Imm, Type *Ty,
209 Instruction *Inst) const {
210 assert(Ty->isIntegerTy() &&
211 "getIntImmCost can only estimate cost of materialising integers");
212
213 // We have a Zero register, so 0 is always free.
214 if (Imm == 0)
215 return TTI::TCC_Free;
216
 217 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
 218 // commutative; in others the immediate comes from a specific argument index.
219 bool Takes12BitImm = false;
220 unsigned ImmArgIdx = ~0U;
221
222 switch (Opcode) {
223 case Instruction::GetElementPtr:
224 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
225 // split up large offsets in GEP into better parts than ConstantHoisting
226 // can.
227 return TTI::TCC_Free;
228 case Instruction::Store: {
229 // Use the materialization cost regardless of if it's the address or the
230 // value that is constant, except for if the store is misaligned and
231 // misaligned accesses are not legal (experience shows constant hoisting
232 // can sometimes be harmful in such cases).
233 if (Idx == 1 || !Inst)
234 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
235 /*FreeZeroes=*/true);
236
237 StoreInst *ST = cast<StoreInst>(Inst);
238 if (!getTLI()->allowsMemoryAccessForAlignment(
239 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
240 ST->getPointerAddressSpace(), ST->getAlign()))
241 return TTI::TCC_Free;
242
243 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
244 /*FreeZeroes=*/true);
245 }
246 case Instruction::Load:
247 // If the address is a constant, use the materialization cost.
248 return getIntImmCost(Imm, Ty, CostKind);
249 case Instruction::And:
250 // zext.h
251 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
252 return TTI::TCC_Free;
253 // zext.w
254 if (Imm == UINT64_C(0xffffffff) &&
255 ((ST->hasStdExtZba() && ST->isRV64()) || ST->isRV32()))
256 return TTI::TCC_Free;
257 // bclri
258 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
259 return TTI::TCC_Free;
260 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
261 canUseShiftPair(Inst, Imm))
262 return TTI::TCC_Free;
263 if (Inst && Idx == 1 && Imm.getBitWidth() == 64 &&
264 canUseShiftCmp(Inst, Imm))
265 return TTI::TCC_Free;
266 Takes12BitImm = true;
267 break;
268 case Instruction::Add:
269 Takes12BitImm = true;
270 break;
271 case Instruction::Or:
272 case Instruction::Xor:
273 // bseti/binvi
274 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
275 return TTI::TCC_Free;
276 Takes12BitImm = true;
277 break;
278 case Instruction::Mul:
279 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
280 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
281 return TTI::TCC_Free;
282 // One more or less than a power of 2 can use SLLI+ADD/SUB.
283 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
284 return TTI::TCC_Free;
285 // FIXME: There is no MULI instruction.
286 Takes12BitImm = true;
287 break;
288 case Instruction::Sub:
289 case Instruction::Shl:
290 case Instruction::LShr:
291 case Instruction::AShr:
292 Takes12BitImm = true;
293 ImmArgIdx = 1;
294 break;
295 default:
296 break;
297 }
298
299 if (Takes12BitImm) {
300 // Check immediate is the correct argument...
301 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
302 // ... and fits into the 12-bit immediate.
303 if (Imm.getSignificantBits() <= 64 &&
304 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
305 return TTI::TCC_Free;
306 }
307 }
308
309 // Otherwise, use the full materialisation cost.
310 return getIntImmCost(Imm, Ty, CostKind);
311 }
312
313 // By default, prevent hoisting.
314 return TTI::TCC_Free;
315}
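// [Added commentary, not part of the upstream source] Examples of immediates
// the logic above treats as free: (and X, 0xffff) with Zbb (zext.h),
// (and X, ~4) with Zbs (bclri), (or X, 16) with Zbs (bseti),
// (mul X, 64), (mul X, 63) and (mul X, 65) (a shift, or a shift plus add/sub),
// and any add/and/or/xor whose immediate fits the signed 12-bit range.
// Everything else is priced at the full materialisation cost so that
// ConstantHoisting can decide whether hoisting the constant pays off.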
316
319 const APInt &Imm, Type *Ty,
321 // Prevent hoisting in unknown cases.
322 return TTI::TCC_Free;
323}
324
326 return ST->hasVInstructions();
327}
328
330RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
331 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
332 return ST->hasCPOPLike() ? TTI::PSK_FastHardware : TTI::PSK_Software;
333}
334
336 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
338 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
340
341 // zve32x is broken for partial_reduce_umla, but let's make sure we
342 // don't generate them.
343 if (!ST->hasStdExtZvqdotq() || ST->getELen() < 64 ||
344 Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
345 InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
346 !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4))
348
349 Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
350 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
 351 // Note: Assuming all vqdot* variants have equal cost
352 return LT.first *
353 getRISCVInstructionCost(RISCV::VQDOT_VV, LT.second, CostKind);
354}
355
357 // Currently, the ExpandReductions pass can't expand scalable-vector
358 // reductions, but we still request expansion as RVV doesn't support certain
359 // reductions and the SelectionDAG can't legalize them either.
360 switch (II->getIntrinsicID()) {
361 default:
362 return false;
363 // These reductions have no equivalent in RVV
364 case Intrinsic::vector_reduce_mul:
365 case Intrinsic::vector_reduce_fmul:
366 return true;
367 }
368}
369
370std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
371 if (ST->hasVInstructions())
372 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
373 return BaseT::getMaxVScale();
374}
375
376std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
377 if (ST->hasVInstructions())
378 if (unsigned MinVLen = ST->getRealMinVLen();
379 MinVLen >= RISCV::RVVBitsPerBlock)
380 return MinVLen / RISCV::RVVBitsPerBlock;
 381 return BaseT::getVScaleForTuning();
 382}
383
386 unsigned LMUL =
387 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
388 switch (K) {
390 return TypeSize::getFixed(ST->getXLen());
392 return TypeSize::getFixed(
393 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
396 (ST->hasVInstructions() &&
397 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
399 : 0);
400 }
401
402 llvm_unreachable("Unsupported register kind");
403}
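// [Added commentary, not part of the upstream source] Worked example: with
// -riscv-v-register-bit-width-lmul=4 the option is clamped to [1, 8] and
// floored to a power of two, so LMUL stays 4; on a Zvl128b core that reports
// fixed-length vector registers as 4 * 128 = 512 bits (when RVV is used for
// fixed-length vectors), while the scalar register width is always XLEN.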
404
406RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty,
408 // Add a cost of address generation + the cost of the load. The address
409 // is expected to be a PC relative offset to a constant pool entry
410 // using auipc/addi.
411 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
412 /*AddressSpace=*/0, CostKind);
413}
414
415static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
416 unsigned Size = Mask.size();
417 if (!isPowerOf2_32(Size))
418 return false;
419 for (unsigned I = 0; I != Size; ++I) {
420 if (static_cast<unsigned>(Mask[I]) == I)
421 continue;
422 if (Mask[I] != 0)
423 return false;
424 if (Size % I != 0)
425 return false;
426 for (unsigned J = I + 1; J != Size; ++J)
427 // Check the pattern is repeated.
428 if (static_cast<unsigned>(Mask[J]) != J % I)
429 return false;
430 SubVectorSize = I;
431 return true;
432 }
 433 // Reaching here means Mask is the identity (e.g. <0, 1, 2, 3>), not a concatenation.
434 return false;
435}
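// [Added commentary, not part of the upstream source] Example: the mask
// <0, 1, 0, 1, 0, 1, 0, 1> repeats its 2-element prefix, so SubVectorSize is
// set to 2 and the shuffle is treated as a concatenation of copies of the low
// subvector, whereas the identity mask <0, 1, 2, 3> falls through and the
// function returns false.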
436
438 LLVMContext &C) {
439 assert((DataVT.getScalarSizeInBits() != 8 ||
440 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
441 MVT IndexVT = DataVT.changeTypeToInteger();
442 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
443 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
444 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
445}
446
447/// Attempt to approximate the cost of a shuffle which will require splitting
448/// during legalization. Note that processShuffleMasks is not an exact proxy
449/// for the algorithm used in LegalizeVectorTypes, but hopefully it's a
 450/// reasonably close upper bound.
452 MVT LegalVT, VectorType *Tp,
453 ArrayRef<int> Mask,
455 assert(LegalVT.isFixedLengthVector() && !Mask.empty() &&
456 "Expected fixed vector type and non-empty mask");
457 unsigned LegalNumElts = LegalVT.getVectorNumElements();
458 // Number of destination vectors after legalization:
459 unsigned NumOfDests = divideCeil(Mask.size(), LegalNumElts);
460 // We are going to permute multiple sources and the result will be in
461 // multiple destinations. Providing an accurate cost only for splits where
462 // the element type remains the same.
463 if (NumOfDests <= 1 ||
465 Tp->getElementType()->getPrimitiveSizeInBits() ||
466 LegalNumElts >= Tp->getElementCount().getFixedValue())
468
469 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
470 unsigned LegalVTSize = LegalVT.getStoreSize();
471 // Number of source vectors after legalization:
472 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
473
474 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(), LegalNumElts);
475
476 unsigned NormalizedVF = LegalNumElts * std::max(NumOfSrcs, NumOfDests);
477 unsigned NumOfSrcRegs = NormalizedVF / LegalNumElts;
478 unsigned NumOfDestRegs = NormalizedVF / LegalNumElts;
479 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
480 assert(NormalizedVF >= Mask.size() &&
481 "Normalized mask expected to be not shorter than original mask.");
482 copy(Mask, NormalizedMask.begin());
483 InstructionCost Cost = 0;
484 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
486 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
487 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
488 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
489 return;
490 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
491 .second)
492 return;
493 Cost += TTI.getShuffleCost(
495 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
496 SingleOpTy, RegMask, CostKind, 0, nullptr);
497 },
498 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
499 Cost += TTI.getShuffleCost(
501 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
502 SingleOpTy, RegMask, CostKind, 0, nullptr);
503 });
504 return Cost;
505}
506
507/// Try to perform better estimation of the permutation.
508/// 1. Split the source/destination vectors into real registers.
509/// 2. Do the mask analysis to identify which real registers are
 510/// permuted. If more than one source register is used to build a
 511/// destination register, the cost for this destination register
 512/// is (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one
 513/// source register is used, build the mask and calculate the cost as the cost
514/// of PermuteSingleSrc.
515/// Also, for the single register permute we try to identify if the
516/// destination register is just a copy of the source register or the
517/// copy of the previous destination register (the cost is
518/// TTI::TCC_Basic). If the source register is just reused, the cost for
519/// this operation is 0.
520static InstructionCost
522 std::optional<unsigned> VLen, VectorType *Tp,
524 assert(LegalVT.isFixedLengthVector());
525 if (!VLen || Mask.empty())
527 MVT ElemVT = LegalVT.getVectorElementType();
528 unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
529 LegalVT = TTI.getTypeLegalizationCost(
530 FixedVectorType::get(Tp->getElementType(), ElemsPerVReg))
531 .second;
532 // Number of destination vectors after legalization:
533 InstructionCost NumOfDests =
534 divideCeil(Mask.size(), LegalVT.getVectorNumElements());
535 if (NumOfDests <= 1 ||
537 Tp->getElementType()->getPrimitiveSizeInBits() ||
538 LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue())
540
541 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
542 unsigned LegalVTSize = LegalVT.getStoreSize();
543 // Number of source vectors after legalization:
544 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
545
546 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
547 LegalVT.getVectorNumElements());
548
549 unsigned E = NumOfDests.getValue();
550 unsigned NormalizedVF =
551 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
552 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
553 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
554 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
555 assert(NormalizedVF >= Mask.size() &&
556 "Normalized mask expected to be not shorter than original mask.");
557 copy(Mask, NormalizedMask.begin());
558 InstructionCost Cost = 0;
559 int NumShuffles = 0;
560 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
562 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
563 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
564 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
565 return;
566 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
567 .second)
568 return;
569 ++NumShuffles;
570 Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
571 SingleOpTy, RegMask, CostKind, 0, nullptr);
572 },
573 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
574 Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
575 SingleOpTy, RegMask, CostKind, 0, nullptr);
576 NumShuffles += 2;
577 });
578 // Note: check that we do not emit too many shuffles here to prevent code
579 // size explosion.
 580 // TODO: investigate whether it can be improved by extra analysis of the masks
581 // to check if the code is more profitable.
582 if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) ||
583 (NumOfDestRegs <= 2 && NumShuffles < 4))
584 return Cost;
586}
587
588InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
589 ArrayRef<int> Mask,
591 // Avoid missing masks and length changing shuffles
592 if (Mask.size() <= 2 || Mask.size() != Tp->getNumElements())
594
595 int NumElts = Tp->getNumElements();
596 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
597 // Avoid scalarization cases
598 if (!LT.second.isFixedLengthVector())
600
601 // Requires moving elements between parts, which requires additional
602 // unmodeled instructions.
603 if (LT.first != 1)
605
606 auto GetSlideOpcode = [&](int SlideAmt) {
607 assert(SlideAmt != 0);
608 bool IsVI = isUInt<5>(std::abs(SlideAmt));
609 if (SlideAmt < 0)
610 return IsVI ? RISCV::VSLIDEDOWN_VI : RISCV::VSLIDEDOWN_VX;
611 return IsVI ? RISCV::VSLIDEUP_VI : RISCV::VSLIDEUP_VX;
612 };
613
614 std::array<std::pair<int, int>, 2> SrcInfo;
615 if (!isMaskedSlidePair(Mask, NumElts, SrcInfo))
617
618 if (SrcInfo[1].second == 0)
619 std::swap(SrcInfo[0], SrcInfo[1]);
620
621 InstructionCost FirstSlideCost = 0;
622 if (SrcInfo[0].second != 0) {
623 unsigned Opcode = GetSlideOpcode(SrcInfo[0].second);
624 FirstSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
625 }
626
627 if (SrcInfo[1].first == -1)
628 return FirstSlideCost;
629
630 InstructionCost SecondSlideCost = 0;
631 if (SrcInfo[1].second != 0) {
632 unsigned Opcode = GetSlideOpcode(SrcInfo[1].second);
633 SecondSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
634 } else {
635 SecondSlideCost =
636 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
637 }
638
639 auto EC = Tp->getElementCount();
640 VectorType *MaskTy =
642 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
643 return FirstSlideCost + SecondSlideCost + MaskCost;
644}
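// [Added commentary, not part of the upstream source] Example of the slide
// decomposition above: for two <4 x i32> sources, the mask <2, 3, 4, 5>
// resolves to a slide of each source (a vslidedown of the first and a masked
// vslideup of the second), so the reported cost is two slides plus the
// constant-pool load of the merge mask; when only one source contributes
// (SrcInfo[1].first == -1) only the first slide is charged.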
645
648 VectorType *SrcTy, ArrayRef<int> Mask,
649 TTI::TargetCostKind CostKind, int Index,
651 const Instruction *CxtI) const {
652 assert((Mask.empty() || DstTy->isScalableTy() ||
653 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
654 "Expected the Mask to match the return size if given");
655 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
656 "Expected the same scalar types");
657
658 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
659 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
660
661 // First, handle cases where having a fixed length vector enables us to
662 // give a more accurate cost than falling back to generic scalable codegen.
663 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
664 if (auto *FVTp = dyn_cast<FixedVectorType>(SrcTy);
665 FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) {
667 *this, LT.second, ST->getRealVLen(),
668 Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind);
669 if (VRegSplittingCost.isValid())
670 return VRegSplittingCost;
671 switch (Kind) {
672 default:
673 break;
675 if (Mask.size() >= 2) {
676 MVT EltTp = LT.second.getVectorElementType();
677 // If the size of the element is < ELEN then shuffles of interleaves and
678 // deinterleaves of 2 vectors can be lowered into the following
679 // sequences
680 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
681 // Example sequence:
682 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
683 // vwaddu.vv v10, v8, v9
684 // li a0, -1 (ignored)
685 // vwmaccu.vx v10, a0, v9
686 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
687 return 2 * LT.first * TLI->getLMULCost(LT.second);
688
689 if (Mask[0] == 0 || Mask[0] == 1) {
690 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
691 // Example sequence:
692 // vnsrl.wi v10, v8, 0
693 if (equal(DeinterleaveMask, Mask))
694 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
695 LT.second, CostKind);
696 }
697 }
698 int SubVectorSize;
699 if (LT.second.getScalarSizeInBits() != 1 &&
700 isRepeatedConcatMask(Mask, SubVectorSize)) {
702 unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
703 // The cost of extraction from a subvector is 0 if the index is 0.
704 for (unsigned I = 0; I != NumSlides; ++I) {
705 unsigned InsertIndex = SubVectorSize * (1 << I);
706 FixedVectorType *SubTp =
707 FixedVectorType::get(SrcTy->getElementType(), InsertIndex);
708 FixedVectorType *DestTp =
710 std::pair<InstructionCost, MVT> DestLT =
712 // Add the cost of whole vector register move because the
713 // destination vector register group for vslideup cannot overlap the
714 // source.
715 Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
716 Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {},
717 CostKind, InsertIndex, SubTp);
718 }
719 return Cost;
720 }
721 }
722
723 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
724 SlideCost.isValid())
725 return SlideCost;
726
727 // vrgather + cost of generating the mask constant.
728 // We model this for an unknown mask with a single vrgather.
729 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
730 LT.second.getVectorNumElements() <= 256)) {
731 VectorType *IdxTy =
732 getVRGatherIndexType(LT.second, *ST, SrcTy->getContext());
733 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
734 return IndexCost +
735 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
736 }
737 break;
738 }
741
742 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
743 SlideCost.isValid())
744 return SlideCost;
745
746 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
747 // register for the second vrgather. We model this for an unknown
748 // (shuffle) mask.
749 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
750 LT.second.getVectorNumElements() <= 256)) {
751 auto &C = SrcTy->getContext();
752 auto EC = SrcTy->getElementCount();
753 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
755 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
756 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
757 return 2 * IndexCost +
758 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
759 LT.second, CostKind) +
760 MaskCost;
761 }
762 break;
763 }
764 }
765
766 auto shouldSplit = [](TTI::ShuffleKind Kind) {
767 switch (Kind) {
768 default:
769 return false;
773 return true;
774 }
775 };
776
777 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
778 shouldSplit(Kind)) {
779 InstructionCost SplitCost =
780 costShuffleViaSplitting(*this, LT.second, FVTp, Mask, CostKind);
781 if (SplitCost.isValid())
782 return SplitCost;
783 }
784 }
785
786 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
787 switch (Kind) {
788 default:
789 // Fallthrough to generic handling.
790 // TODO: Most of these cases will return getInvalid in generic code, and
791 // must be implemented here.
792 break;
794 // Extract at zero is always a subregister extract
795 if (Index == 0)
796 return TTI::TCC_Free;
797
798 // If we're extracting a subvector of at most m1 size at a sub-register
799 // boundary - which unfortunately we need exact vlen to identify - this is
800 // a subregister extract at worst and thus won't require a vslidedown.
801 // TODO: Extend for aligned m2, m4 subvector extracts
 802 // TODO: Extend for misaligned (but contained) extracts
803 // TODO: Extend for scalable subvector types
804 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
805 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
806 if (std::optional<unsigned> VLen = ST->getRealVLen();
807 VLen && SubLT.second.getScalarSizeInBits() * Index % *VLen == 0 &&
808 SubLT.second.getSizeInBits() <= *VLen)
809 return TTI::TCC_Free;
810 }
811
812 // Example sequence:
813 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
814 // vslidedown.vi v8, v9, 2
815 return LT.first *
816 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
818 // Example sequence:
819 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
820 // vslideup.vi v8, v9, 2
821 LT = getTypeLegalizationCost(DstTy);
822 return LT.first *
823 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
824 case TTI::SK_Select: {
825 // Example sequence:
826 // li a0, 90
827 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
828 // vmv.s.x v0, a0
829 // vmerge.vvm v8, v9, v8, v0
830 // We use 2 for the cost of the mask materialization as this is the true
831 // cost for small masks and most shuffles are small. At worst, this cost
832 // should be a very small constant for the constant pool load. As such,
833 // we may bias towards large selects slightly more than truly warranted.
834 return LT.first *
835 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
836 LT.second, CostKind));
837 }
838 case TTI::SK_Broadcast: {
839 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
840 Instruction::InsertElement);
841 if (LT.second.getScalarSizeInBits() == 1) {
842 if (HasScalar) {
843 // Example sequence:
844 // andi a0, a0, 1
845 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
846 // vmv.v.x v8, a0
847 // vmsne.vi v0, v8, 0
848 return LT.first *
849 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
850 LT.second, CostKind));
851 }
852 // Example sequence:
853 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
854 // vmv.v.i v8, 0
855 // vmerge.vim v8, v8, 1, v0
856 // vmv.x.s a0, v8
857 // andi a0, a0, 1
858 // vmv.v.x v8, a0
859 // vmsne.vi v0, v8, 0
860
861 return LT.first *
862 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
863 RISCV::VMV_X_S, RISCV::VMV_V_X,
864 RISCV::VMSNE_VI},
865 LT.second, CostKind));
866 }
867
868 if (HasScalar) {
869 // Example sequence:
870 // vmv.v.x v8, a0
871 return LT.first *
872 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
873 }
874
875 // Example sequence:
876 // vrgather.vi v9, v8, 0
877 return LT.first *
878 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
879 }
880 case TTI::SK_Splice: {
881 // vslidedown+vslideup.
882 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
883 // of similar code, but I think we expand through memory.
884 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
885 if (Index >= 0 && Index < 32)
886 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
887 else if (Index < 0 && Index > -32)
888 Opcodes[1] = RISCV::VSLIDEUP_VI;
889 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
890 }
891 case TTI::SK_Reverse: {
892
893 if (!LT.second.isVector())
895
896 // TODO: Cases to improve here:
897 // * Illegal vector types
898 // * i64 on RV32
899 if (SrcTy->getElementType()->isIntegerTy(1)) {
900 VectorType *WideTy =
901 VectorType::get(IntegerType::get(SrcTy->getContext(), 8),
902 cast<VectorType>(SrcTy)->getElementCount());
903 return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy,
905 getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0,
906 nullptr) +
907 getCastInstrCost(Instruction::Trunc, SrcTy, WideTy,
909 }
910
911 MVT ContainerVT = LT.second;
912 if (LT.second.isFixedLengthVector())
913 ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
914 MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
915 if (ContainerVT.bitsLE(M1VT)) {
916 // Example sequence:
917 // csrr a0, vlenb
918 // srli a0, a0, 3
919 // addi a0, a0, -1
920 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
921 // vid.v v9
922 // vrsub.vx v10, v9, a0
923 // vrgather.vv v9, v8, v10
924 InstructionCost LenCost = 3;
925 if (LT.second.isFixedLengthVector())
926 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
927 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
928 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
929 if (LT.second.isFixedLengthVector() &&
930 isInt<5>(LT.second.getVectorNumElements() - 1))
931 Opcodes[1] = RISCV::VRSUB_VI;
932 InstructionCost GatherCost =
933 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
934 return LT.first * (LenCost + GatherCost);
935 }
936
937 // At high LMUL, we split into a series of M1 reverses (see
938 // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate
939 // the resulting gap at the bottom (for fixed vectors only). The important
940 // bit is that the cost scales linearly, not quadratically with LMUL.
941 unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX};
942 InstructionCost FixedCost =
943 getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3;
944 unsigned Ratio =
946 InstructionCost GatherCost =
947 getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio;
948 InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 0 :
949 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind);
950 return FixedCost + LT.first * (GatherCost + SlideCost);
951 }
952 }
953 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
954 SubTp);
955}
956
 957static unsigned isM1OrSmaller(MVT VT) {
 958 RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
 959 return (LMUL == RISCVVType::VLMUL::LMUL_F8 ||
 960 LMUL == RISCVVType::VLMUL::LMUL_F4 ||
 961 LMUL == RISCVVType::VLMUL::LMUL_F2 ||
 962 LMUL == RISCVVType::VLMUL::LMUL_1);
 963}
964
966 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
967 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
968 ArrayRef<Value *> VL) const {
971
972 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
973 // For now, skip all fixed vector cost analysis when P extension is available
974 // to avoid crashes in getMinRVVVectorSizeInBits()
975 if (ST->enablePExtCodeGen() && isa<FixedVectorType>(Ty)) {
976 return 1; // Treat as single instruction cost for now
977 }
978
979 // A build_vector (which is m1 sized or smaller) can be done in no
980 // worse than one vslide1down.vx per element in the type. We could
981 // in theory do an explode_vector in the inverse manner, but our
982 // lowering today does not have a first class node for this pattern.
984 Ty, DemandedElts, Insert, Extract, CostKind);
985 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
986 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
987 if (Ty->getScalarSizeInBits() == 1) {
988 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
989 // Note: Implicit scalar anyextend is assumed to be free since the i1
990 // must be stored in a GPR.
991 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
992 CostKind) +
993 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
995 }
996
997 assert(LT.second.isFixedLengthVector());
998 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
999 if (isM1OrSmaller(ContainerVT)) {
1000 InstructionCost BV =
1001 cast<FixedVectorType>(Ty)->getNumElements() *
1002 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
1003 if (BV < Cost)
1004 Cost = BV;
1005 }
1006 }
1007 return Cost;
1008}
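// [Added commentary, not part of the upstream source] Example: building a
// <4 x i32> from scalars on a VLEN >= 128 target keeps the container at m1 or
// smaller, so the override above caps the insertion cost at
// 4 * cost(vslide1down.vx) instead of the generic per-element estimate; for
// i1 vectors the elements are first costed as an i8 build plus a truncate
// back to the mask type.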
1009
1011RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1012 unsigned AddressSpace,
1014 if (!isLegalMaskedLoadStore(Src, Alignment) ||
1016 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1017 CostKind);
1018
1019 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
1020}
1021
1023 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1024 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1025 bool UseMaskForCond, bool UseMaskForGaps) const {
1026
1027 // The interleaved memory access pass will lower (de)interleave ops combined
1028 // with an adjacent appropriate memory to vlseg/vsseg intrinsics. vlseg/vsseg
1029 // only support masking per-iteration (i.e. condition), not per-segment (i.e.
1030 // gap).
1031 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
1032 auto *VTy = cast<VectorType>(VecTy);
1033 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
 1034 // Need to make sure the type hasn't been scalarized
1035 if (LT.second.isVector()) {
1036 auto *SubVecTy =
1037 VectorType::get(VTy->getElementType(),
1038 VTy->getElementCount().divideCoefficientBy(Factor));
1039 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
1040 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
1041 AddressSpace, DL)) {
1042
1043 // Some processors optimize segment loads/stores as one wide memory op +
1044 // Factor * LMUL shuffle ops.
1045 if (ST->hasOptimizedSegmentLoadStore(Factor)) {
1047 getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
1048 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
1049 Cost += Factor * TLI->getLMULCost(SubVecVT);
1050 return LT.first * Cost;
1051 }
1052
1053 // Otherwise, the cost is proportional to the number of elements (VL *
1054 // Factor ops).
1055 InstructionCost MemOpCost =
1056 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
1057 CostKind, {TTI::OK_AnyValue, TTI::OP_None});
1058 unsigned NumLoads = getEstimatedVLFor(VTy);
1059 return NumLoads * MemOpCost;
1060 }
1061 }
1062 }
1063
 1064 // TODO: Return the cost of interleaved accesses for scalable vectors when
 1065 // unable to convert to segment access instructions.
1066 if (isa<ScalableVectorType>(VecTy))
1068
1069 auto *FVTy = cast<FixedVectorType>(VecTy);
1070 InstructionCost MemCost =
1071 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
1072 unsigned VF = FVTy->getNumElements() / Factor;
1073
1074 // An interleaved load will look like this for Factor=3:
1075 // %wide.vec = load <12 x i32>, ptr %3, align 4
1076 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1077 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1078 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1079 if (Opcode == Instruction::Load) {
1080 InstructionCost Cost = MemCost;
1081 for (unsigned Index : Indices) {
1082 FixedVectorType *VecTy =
1083 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
1084 auto Mask = createStrideMask(Index, Factor, VF);
1085 Mask.resize(VF * Factor, -1);
1086 InstructionCost ShuffleCost =
1088 Mask, CostKind, 0, nullptr, {});
1089 Cost += ShuffleCost;
1090 }
1091 return Cost;
1092 }
1093
1094 // TODO: Model for NF > 2
1095 // We'll need to enhance getShuffleCost to model shuffles that are just
1096 // inserts and extracts into subvectors, since they won't have the full cost
1097 // of a vrgather.
1098 // An interleaved store for 3 vectors of 4 lanes will look like
1099 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
1100 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
1101 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
1102 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
1103 // store <12 x i32> %interleaved.vec, ptr %10, align 4
1104 if (Factor != 2)
1105 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1106 Alignment, AddressSpace, CostKind,
1107 UseMaskForCond, UseMaskForGaps);
1108
1109 assert(Opcode == Instruction::Store && "Opcode must be a store");
1110 // For an interleaving store of 2 vectors, we perform one large interleaving
1111 // shuffle that goes into the wide store
1112 auto Mask = createInterleaveMask(VF, Factor);
1113 InstructionCost ShuffleCost =
1115 CostKind, 0, nullptr, {});
1116 return MemCost + ShuffleCost;
1117}
1118
1120 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1121 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
1123 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1124 Alignment, CostKind, I);
1125
1126 if ((Opcode == Instruction::Load &&
1127 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
1128 (Opcode == Instruction::Store &&
1129 !isLegalMaskedScatter(DataTy, Align(Alignment))))
1130 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1131 Alignment, CostKind, I);
1132
1133 // Cost is proportional to the number of memory operations implied. For
1134 // scalable vectors, we use an estimate on that number since we don't
1135 // know exactly what VL will be.
1136 auto &VTy = *cast<VectorType>(DataTy);
1137 InstructionCost MemOpCost =
1138 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1139 {TTI::OK_AnyValue, TTI::OP_None}, I);
1140 unsigned NumLoads = getEstimatedVLFor(&VTy);
1141 return NumLoads * MemOpCost;
1142}
1143
1145 unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment,
1146 TTI::TargetCostKind CostKind, const Instruction *I) const {
1147 bool IsLegal = (Opcode == Instruction::Store &&
1148 isLegalMaskedCompressStore(DataTy, Alignment)) ||
1149 (Opcode == Instruction::Load &&
1150 isLegalMaskedExpandLoad(DataTy, Alignment));
1151 if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
1152 return BaseT::getExpandCompressMemoryOpCost(Opcode, DataTy, VariableMask,
1153 Alignment, CostKind, I);
1154 // Example compressstore sequence:
1155 // vsetivli zero, 8, e32, m2, ta, ma (ignored)
1156 // vcompress.vm v10, v8, v0
1157 // vcpop.m a1, v0
1158 // vsetvli zero, a1, e32, m2, ta, ma
1159 // vse32.v v10, (a0)
1160 // Example expandload sequence:
1161 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
1162 // vcpop.m a1, v0
1163 // vsetvli zero, a1, e32, m2, ta, ma
1164 // vle32.v v10, (a0)
1165 // vsetivli zero, 8, e32, m2, ta, ma
1166 // viota.m v12, v0
1167 // vrgather.vv v8, v10, v12, v0.t
1168 auto MemOpCost =
1169 getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
1170 auto LT = getTypeLegalizationCost(DataTy);
1171 SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
1172 if (VariableMask)
1173 Opcodes.push_back(RISCV::VCPOP_M);
1174 if (Opcode == Instruction::Store)
1175 Opcodes.append({RISCV::VCOMPRESS_VM});
1176 else
1177 Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
1178 return MemOpCost +
1179 LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1180}
1181
1183 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1184 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
1185 if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1186 !isLegalStridedLoadStore(DataTy, Alignment)) ||
1187 (Opcode != Instruction::Load && Opcode != Instruction::Store))
1188 return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
1189 Alignment, CostKind, I);
1190
1192 return TTI::TCC_Basic;
1193
1194 // Cost is proportional to the number of memory operations implied. For
1195 // scalable vectors, we use an estimate on that number since we don't
1196 // know exactly what VL will be.
1197 auto &VTy = *cast<VectorType>(DataTy);
1198 InstructionCost MemOpCost =
1199 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1200 {TTI::OK_AnyValue, TTI::OP_None}, I);
1201 unsigned NumLoads = getEstimatedVLFor(&VTy);
1202 return NumLoads * MemOpCost;
1203}
1204
1207 // FIXME: This is a property of the default vector convention, not
1208 // all possible calling conventions. Fixing that will require
1209 // some TTI API and SLP rework.
1212 for (auto *Ty : Tys) {
1213 if (!Ty->isVectorTy())
1214 continue;
1215 Align A = DL.getPrefTypeAlign(Ty);
1216 Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
1217 getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
1218 }
1219 return Cost;
1220}
1221
1222// Currently, these represent both throughput and codesize costs
1223// for the respective intrinsics. The costs in this table are simply
1224// instruction counts with the following adjustments made:
1225// * One vsetvli is considered free.
1227 {Intrinsic::floor, MVT::f32, 9},
1228 {Intrinsic::floor, MVT::f64, 9},
1229 {Intrinsic::ceil, MVT::f32, 9},
1230 {Intrinsic::ceil, MVT::f64, 9},
1231 {Intrinsic::trunc, MVT::f32, 7},
1232 {Intrinsic::trunc, MVT::f64, 7},
1233 {Intrinsic::round, MVT::f32, 9},
1234 {Intrinsic::round, MVT::f64, 9},
1235 {Intrinsic::roundeven, MVT::f32, 9},
1236 {Intrinsic::roundeven, MVT::f64, 9},
1237 {Intrinsic::rint, MVT::f32, 7},
1238 {Intrinsic::rint, MVT::f64, 7},
1239 {Intrinsic::nearbyint, MVT::f32, 9},
1240 {Intrinsic::nearbyint, MVT::f64, 9},
1241 {Intrinsic::bswap, MVT::i16, 3},
1242 {Intrinsic::bswap, MVT::i32, 12},
1243 {Intrinsic::bswap, MVT::i64, 31},
1244 {Intrinsic::vp_bswap, MVT::i16, 3},
1245 {Intrinsic::vp_bswap, MVT::i32, 12},
1246 {Intrinsic::vp_bswap, MVT::i64, 31},
1247 {Intrinsic::vp_fshl, MVT::i8, 7},
1248 {Intrinsic::vp_fshl, MVT::i16, 7},
1249 {Intrinsic::vp_fshl, MVT::i32, 7},
1250 {Intrinsic::vp_fshl, MVT::i64, 7},
1251 {Intrinsic::vp_fshr, MVT::i8, 7},
1252 {Intrinsic::vp_fshr, MVT::i16, 7},
1253 {Intrinsic::vp_fshr, MVT::i32, 7},
1254 {Intrinsic::vp_fshr, MVT::i64, 7},
1255 {Intrinsic::bitreverse, MVT::i8, 17},
1256 {Intrinsic::bitreverse, MVT::i16, 24},
1257 {Intrinsic::bitreverse, MVT::i32, 33},
1258 {Intrinsic::bitreverse, MVT::i64, 52},
1259 {Intrinsic::vp_bitreverse, MVT::i8, 17},
1260 {Intrinsic::vp_bitreverse, MVT::i16, 24},
1261 {Intrinsic::vp_bitreverse, MVT::i32, 33},
1262 {Intrinsic::vp_bitreverse, MVT::i64, 52},
1263 {Intrinsic::ctpop, MVT::i8, 12},
1264 {Intrinsic::ctpop, MVT::i16, 19},
1265 {Intrinsic::ctpop, MVT::i32, 20},
1266 {Intrinsic::ctpop, MVT::i64, 21},
1267 {Intrinsic::ctlz, MVT::i8, 19},
1268 {Intrinsic::ctlz, MVT::i16, 28},
1269 {Intrinsic::ctlz, MVT::i32, 31},
1270 {Intrinsic::ctlz, MVT::i64, 35},
1271 {Intrinsic::cttz, MVT::i8, 16},
1272 {Intrinsic::cttz, MVT::i16, 23},
1273 {Intrinsic::cttz, MVT::i32, 24},
1274 {Intrinsic::cttz, MVT::i64, 25},
1275 {Intrinsic::vp_ctpop, MVT::i8, 12},
1276 {Intrinsic::vp_ctpop, MVT::i16, 19},
1277 {Intrinsic::vp_ctpop, MVT::i32, 20},
1278 {Intrinsic::vp_ctpop, MVT::i64, 21},
1279 {Intrinsic::vp_ctlz, MVT::i8, 19},
1280 {Intrinsic::vp_ctlz, MVT::i16, 28},
1281 {Intrinsic::vp_ctlz, MVT::i32, 31},
1282 {Intrinsic::vp_ctlz, MVT::i64, 35},
1283 {Intrinsic::vp_cttz, MVT::i8, 16},
1284 {Intrinsic::vp_cttz, MVT::i16, 23},
1285 {Intrinsic::vp_cttz, MVT::i32, 24},
1286 {Intrinsic::vp_cttz, MVT::i64, 25},
1287};
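// [Added commentary, not part of the upstream source] Example lookup: a vector
// llvm.ctpop over i32 elements without Zvbb hits the {ctpop, MVT::i32, 20}
// entry, so getIntrinsicInstrCost reports LT.first * 20; with Zvbb the switch
// in getIntrinsicInstrCost instead costs it as a single vcpop.v per legalized
// part, bypassing this table.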
1288
1292 auto *RetTy = ICA.getReturnType();
1293 switch (ICA.getID()) {
1294 case Intrinsic::lrint:
1295 case Intrinsic::llrint:
1296 case Intrinsic::lround:
1297 case Intrinsic::llround: {
1298 auto LT = getTypeLegalizationCost(RetTy);
1299 Type *SrcTy = ICA.getArgTypes().front();
1300 auto SrcLT = getTypeLegalizationCost(SrcTy);
1301 if (ST->hasVInstructions() && LT.second.isVector()) {
1303 unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType());
1304 unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType());
1305 if (LT.second.getVectorElementType() == MVT::bf16) {
1306 if (!ST->hasVInstructionsBF16Minimal())
1308 if (DstEltSz == 32)
1309 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
1310 else
1311 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
1312 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1313 !ST->hasVInstructionsF16()) {
1314 if (!ST->hasVInstructionsF16Minimal())
1316 if (DstEltSz == 32)
1317 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
1318 else
1319 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};
1320
1321 } else if (SrcEltSz > DstEltSz) {
1322 Ops = {RISCV::VFNCVT_X_F_W};
1323 } else if (SrcEltSz < DstEltSz) {
1324 Ops = {RISCV::VFWCVT_X_F_V};
1325 } else {
1326 Ops = {RISCV::VFCVT_X_F_V};
1327 }
1328
1329 // We need to use the source LMUL in the case of a narrowing op, and the
1330 // destination LMUL otherwise.
1331 if (SrcEltSz > DstEltSz)
1332 return SrcLT.first *
1333 getRISCVInstructionCost(Ops, SrcLT.second, CostKind);
1334 return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind);
1335 }
1336 break;
1337 }
1338 case Intrinsic::ceil:
1339 case Intrinsic::floor:
1340 case Intrinsic::trunc:
1341 case Intrinsic::rint:
1342 case Intrinsic::round:
1343 case Intrinsic::roundeven: {
1344 // These all use the same code.
1345 auto LT = getTypeLegalizationCost(RetTy);
1346 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1347 return LT.first * 8;
1348 break;
1349 }
1350 case Intrinsic::umin:
1351 case Intrinsic::umax:
1352 case Intrinsic::smin:
1353 case Intrinsic::smax: {
1354 auto LT = getTypeLegalizationCost(RetTy);
1355 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
1356 return LT.first;
1357
1358 if (ST->hasVInstructions() && LT.second.isVector()) {
1359 unsigned Op;
1360 switch (ICA.getID()) {
1361 case Intrinsic::umin:
1362 Op = RISCV::VMINU_VV;
1363 break;
1364 case Intrinsic::umax:
1365 Op = RISCV::VMAXU_VV;
1366 break;
1367 case Intrinsic::smin:
1368 Op = RISCV::VMIN_VV;
1369 break;
1370 case Intrinsic::smax:
1371 Op = RISCV::VMAX_VV;
1372 break;
1373 }
1374 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1375 }
1376 break;
1377 }
1378 case Intrinsic::sadd_sat:
1379 case Intrinsic::ssub_sat:
1380 case Intrinsic::uadd_sat:
1381 case Intrinsic::usub_sat: {
1382 auto LT = getTypeLegalizationCost(RetTy);
1383 if (ST->hasVInstructions() && LT.second.isVector()) {
1384 unsigned Op;
1385 switch (ICA.getID()) {
1386 case Intrinsic::sadd_sat:
1387 Op = RISCV::VSADD_VV;
1388 break;
1389 case Intrinsic::ssub_sat:
1390 Op = RISCV::VSSUBU_VV;
1391 break;
1392 case Intrinsic::uadd_sat:
1393 Op = RISCV::VSADDU_VV;
1394 break;
1395 case Intrinsic::usub_sat:
1396 Op = RISCV::VSSUBU_VV;
1397 break;
1398 }
1399 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1400 }
1401 break;
1402 }
1403 case Intrinsic::fma:
1404 case Intrinsic::fmuladd: {
1405 // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin
1406 auto LT = getTypeLegalizationCost(RetTy);
1407 if (ST->hasVInstructions() && LT.second.isVector())
1408 return LT.first *
1409 getRISCVInstructionCost(RISCV::VFMADD_VV, LT.second, CostKind);
1410 break;
1411 }
1412 case Intrinsic::fabs: {
1413 auto LT = getTypeLegalizationCost(RetTy);
1414 if (ST->hasVInstructions() && LT.second.isVector()) {
1415 // lui a0, 8
1416 // addi a0, a0, -1
1417 // vsetvli a1, zero, e16, m1, ta, ma
1418 // vand.vx v8, v8, a0
 1419 // f16 with zvfhmin and bf16 with zvfbfmin
1420 if (LT.second.getVectorElementType() == MVT::bf16 ||
1421 (LT.second.getVectorElementType() == MVT::f16 &&
1422 !ST->hasVInstructionsF16()))
1423 return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
1424 CostKind) +
1425 2;
1426 else
1427 return LT.first *
1428 getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
1429 }
1430 break;
1431 }
1432 case Intrinsic::sqrt: {
1433 auto LT = getTypeLegalizationCost(RetTy);
1434 if (ST->hasVInstructions() && LT.second.isVector()) {
1437 MVT ConvType = LT.second;
1438 MVT FsqrtType = LT.second;
1439 // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
 1440 // will be split.
1441 if (LT.second.getVectorElementType() == MVT::bf16) {
1442 if (LT.second == MVT::nxv32bf16) {
1443 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
1444 RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
1445 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1446 ConvType = MVT::nxv16f16;
1447 FsqrtType = MVT::nxv16f32;
1448 } else {
1449 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
1450 FsqrtOp = {RISCV::VFSQRT_V};
1451 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1452 }
1453 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1454 !ST->hasVInstructionsF16()) {
1455 if (LT.second == MVT::nxv32f16) {
1456 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
1457 RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
1458 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1459 ConvType = MVT::nxv16f16;
1460 FsqrtType = MVT::nxv16f32;
1461 } else {
1462 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
1463 FsqrtOp = {RISCV::VFSQRT_V};
1464 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1465 }
1466 } else {
1467 FsqrtOp = {RISCV::VFSQRT_V};
1468 }
1469
1470 return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) +
1471 getRISCVInstructionCost(ConvOp, ConvType, CostKind));
1472 }
1473 break;
1474 }
1475 case Intrinsic::cttz:
1476 case Intrinsic::ctlz:
1477 case Intrinsic::ctpop: {
1478 auto LT = getTypeLegalizationCost(RetTy);
1479 if (ST->hasStdExtZvbb() && LT.second.isVector()) {
1480 unsigned Op;
1481 switch (ICA.getID()) {
1482 case Intrinsic::cttz:
1483 Op = RISCV::VCTZ_V;
1484 break;
1485 case Intrinsic::ctlz:
1486 Op = RISCV::VCLZ_V;
1487 break;
1488 case Intrinsic::ctpop:
1489 Op = RISCV::VCPOP_V;
1490 break;
1491 }
1492 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1493 }
1494 break;
1495 }
1496 case Intrinsic::abs: {
1497 auto LT = getTypeLegalizationCost(RetTy);
1498 if (ST->hasVInstructions() && LT.second.isVector()) {
1499 // vrsub.vi v10, v8, 0
1500 // vmax.vv v8, v8, v10
1501 return LT.first *
1502 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
1503 LT.second, CostKind);
1504 }
1505 break;
1506 }
1507 case Intrinsic::get_active_lane_mask: {
1508 if (ST->hasVInstructions()) {
1509 Type *ExpRetTy = VectorType::get(
1510 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1511 auto LT = getTypeLegalizationCost(ExpRetTy);
1512
1513 // vid.v v8 // considered hoisted
1514 // vsaddu.vx v8, v8, a0
1515 // vmsltu.vx v0, v8, a1
1516 return LT.first *
1517 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
1518 LT.second, CostKind);
1519 }
1520 break;
1521 }
1522 // TODO: add more intrinsic
1523 case Intrinsic::stepvector: {
1524 auto LT = getTypeLegalizationCost(RetTy);
1525 // Legalisation of illegal types involves an `index' instruction plus
1526 // (LT.first - 1) vector adds.
1527 if (ST->hasVInstructions())
1528 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
1529 (LT.first - 1) *
1530 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
1531 return 1 + (LT.first - 1);
1532 }
1533 case Intrinsic::experimental_cttz_elts: {
1534 Type *ArgTy = ICA.getArgTypes()[0];
1535 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
1536 if (getTLI()->shouldExpandCttzElements(ArgType))
1537 break;
1538 InstructionCost Cost = getRISCVInstructionCost(
1539 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
1540
1541 // If zero_is_poison is false, then we will generate additional
1542 // cmp + select instructions to convert -1 to EVL.
1543 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
1544 if (ICA.getArgs().size() > 1 &&
1545 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
1546 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
1548 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
1550
1551 return Cost;
1552 }
1553 case Intrinsic::experimental_vp_splat: {
1554 auto LT = getTypeLegalizationCost(RetTy);
1555 // TODO: Lower i1 experimental_vp_splat
1556 if (!ST->hasVInstructions() || LT.second.getScalarType() == MVT::i1)
1558 return LT.first * getRISCVInstructionCost(LT.second.isFloatingPoint()
1559 ? RISCV::VFMV_V_F
1560 : RISCV::VMV_V_X,
1561 LT.second, CostKind);
1562 }
1563 case Intrinsic::experimental_vp_splice: {
1564 // To support type-based query from vectorizer, set the index to 0.
 1565 // Note that the index only changes the cost from vslide.vx to vslide.vi, and
 1566 // in the current implementation they have the same cost.
1568 cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
1570 }
1571 case Intrinsic::fptoui_sat:
1572 case Intrinsic::fptosi_sat: {
1574 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1575 Type *SrcTy = ICA.getArgTypes()[0];
1576
1577 auto SrcLT = getTypeLegalizationCost(SrcTy);
1578 auto DstLT = getTypeLegalizationCost(RetTy);
1579 if (!SrcTy->isVectorTy())
1580 break;
1581
1582 if (!SrcLT.first.isValid() || !DstLT.first.isValid())
1584
1585 Cost +=
1586 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
1587 RetTy, SrcTy, TTI::CastContextHint::None, CostKind);
1588
1589 // Handle NaN.
1590 // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1.
1591 // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0.
1592 Type *CondTy = RetTy->getWithNewBitWidth(1);
1593 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, SrcTy, CondTy,
1595 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1597 return Cost;
1598 }
1599 }
1600
1601 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1602 if (auto LT = getTypeLegalizationCost(RetTy);
1603 LT.second.isVector()) {
1604 MVT EltTy = LT.second.getVectorElementType();
1605 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1606 ICA.getID(), EltTy))
1607 return LT.first * Entry->Cost;
1608 }
1609 }
1610
1612}
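// [Added commentary, not part of the upstream source] Worked example: for
// llvm.smax on <8 x i32> with vector support, the min/max case above selects
// RISCV::VMAX_VV, giving LT.first * cost(vmax.vv); for a scalar i64 smax with
// Zbb the cost is just LT.first, since max/min are single instructions there.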
1613
1616 const SCEV *Ptr,
1618 // Address computations for vector indexed load/store likely require an offset
1619 // and/or scaling.
1620 if (ST->hasVInstructions() && PtrTy->isVectorTy())
1621 return getArithmeticInstrCost(Instruction::Add, PtrTy, CostKind);
1622
1623 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1624}
1625
1627 Type *Src,
1630 const Instruction *I) const {
1631 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1632 if (!IsVectorType)
1633 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1634
1635 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1636 // For now, skip all fixed vector cost analysis when P extension is available
1637 // to avoid crashes in getMinRVVVectorSizeInBits()
1638 if (ST->enablePExtCodeGen() &&
1640 return 1; // Treat as single instruction cost for now
1641 }
1642
1643 // FIXME: Need to compute legalizing cost for illegal types. The current
1644 // code handles only legal types and those which can be trivially
1645 // promoted to legal.
1646 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1647 Dst->getScalarSizeInBits() > ST->getELen())
1648 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1649
1650 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1651 assert(ISD && "Invalid opcode");
1652 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1653 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1654
1655 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1656 // The shared implementation doesn't model vector widening during legalization
1657 // and instead assumes scalarization. In order to scalarize an <N x i1>
1658 // vector, we need to extend/trunc to/from i8. If we don't special case
1659 // this, we can get an infinite recursion cycle.
1660 switch (ISD) {
1661 default:
1662 break;
1663 case ISD::SIGN_EXTEND:
1664 case ISD::ZERO_EXTEND:
1665 if (Src->getScalarSizeInBits() == 1) {
1666 // We do not use vsext/vzext to extend from a mask vector.
1667 // Instead we use the following instructions to extend from a mask vector:
1668 // vmv.v.i v8, 0
1669 // vmerge.vim v8, v8, -1, v0 (repeated per split)
1670 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1671 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1672 DstLT.second, CostKind) +
1673 DstLT.first - 1;
1674 }
1675 break;
1676 case ISD::TRUNCATE:
1677 if (Dst->getScalarSizeInBits() == 1) {
1678 // We do not use several vncvt instructions to truncate to a mask vector,
1679 // so we cannot use PowDiff to calculate the cost.
1680 // Instead we use the following instructions to truncate to a mask vector:
1681 // vand.vi v8, v8, 1
1682 // vmsne.vi v0, v8, 0
1683 return SrcLT.first *
1684 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1685 SrcLT.second, CostKind) +
1686 SrcLT.first - 1;
1687 }
1688 break;
1689 };
1690
1691 // Our actual lowering for the case where a wider legal type is available
1692 // uses promotion to the wider type. This is reflected in the result of
1693 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1694 // scalarized if the legalized Src and Dst are not equal sized.
1695 const DataLayout &DL = this->getDataLayout();
1696 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1697 !SrcLT.first.isValid() || !DstLT.first.isValid() ||
1698 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
1699 SrcLT.second.getSizeInBits()) ||
1700 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
1701 DstLT.second.getSizeInBits()) ||
1702 SrcLT.first > 1 || DstLT.first > 1)
1703 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1704
1705 // The split cost is handled by the base getCastInstrCost
1706 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1707
1708 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1709 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
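// For example, an i8 -> i32 extend has PowDiff = 5 - 3 = 2, which selects vsext.vf4/vzext.vf4 below.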
1710 switch (ISD) {
1711 case ISD::SIGN_EXTEND:
1712 case ISD::ZERO_EXTEND: {
1713 if ((PowDiff < 1) || (PowDiff > 3))
1714 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1715 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1716 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1717 unsigned Op =
1718 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1719 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1720 }
1721 case ISD::TRUNCATE:
1722 case ISD::FP_EXTEND:
1723 case ISD::FP_ROUND: {
1724 // Counts of narrow/widen instructions.
1725 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1726 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1727
1728 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1729 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1730 : RISCV::VFNCVT_F_F_W;
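// Each halving/doubling step below is charged one narrowing/widening instruction, e.g. an i64 -> i8 truncate is costed as three vnsrl.wi steps.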
1731 InstructionCost Cost = 0;
1732 for (; SrcEltSize != DstEltSize;) {
1733 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1734 ? MVT::getIntegerVT(DstEltSize)
1735 : MVT::getFloatingPointVT(DstEltSize);
1736 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1737 DstEltSize =
1738 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1739 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1740 }
1741 return Cost;
1742 }
1743 case ISD::FP_TO_SINT:
1744 case ISD::FP_TO_UINT: {
1745 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1746 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1747 unsigned FWCVT =
1748 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1749 unsigned FNCVT =
1750 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1751 unsigned SrcEltSize = Src->getScalarSizeInBits();
1752 unsigned DstEltSize = Dst->getScalarSizeInBits();
1753 InstructionCost Cost = 0;
1754 if ((SrcEltSize == 16) &&
1755 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1756 // If the target only supports zvfhmin or this is an fp16-to-i64 conversion,
1757 // pre-widen to f32 and then convert f32 to integer.
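// e.g. <vscale x 2 x half> -> <vscale x 2 x i64> is costed as a vfwcvt.f.f.v to f32 plus the cost of the f32 -> i64 conversion.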
1758 VectorType *VecF32Ty =
1759 VectorType::get(Type::getFloatTy(Dst->getContext()),
1760 cast<VectorType>(Dst)->getElementCount());
1761 std::pair<InstructionCost, MVT> VecF32LT =
1762 getTypeLegalizationCost(VecF32Ty);
1763 Cost +=
1764 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1765 VecF32LT.second, CostKind);
1766 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1767 return Cost;
1768 }
1769 if (DstEltSize == SrcEltSize)
1770 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1771 else if (DstEltSize > SrcEltSize)
1772 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1773 else { // (SrcEltSize > DstEltSize)
1774 // First do a narrowing conversion to an integer half the size, then
1775 // truncate if needed.
1776 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1777 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1778 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1779 if ((SrcEltSize / 2) > DstEltSize) {
1780 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1781 Cost +=
1782 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1783 }
1784 }
1785 return Cost;
1786 }
1787 case ISD::SINT_TO_FP:
1788 case ISD::UINT_TO_FP: {
1789 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
1790 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
1791 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
1792 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
1793 unsigned SrcEltSize = Src->getScalarSizeInBits();
1794 unsigned DstEltSize = Dst->getScalarSizeInBits();
1795 
1796 InstructionCost Cost = 0;
1797 if ((DstEltSize == 16) &&
1798 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
1799 // If the target only supports zvfhmin or this is an i64-to-fp16 conversion,
1800 // the value is first converted to f32 and then narrowed to f16.
1801 VectorType *VecF32Ty =
1802 VectorType::get(Type::getFloatTy(Dst->getContext()),
1803 cast<VectorType>(Dst)->getElementCount());
1804 std::pair<InstructionCost, MVT> VecF32LT =
1805 getTypeLegalizationCost(VecF32Ty);
1806 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
1807 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
1808 DstLT.second, CostKind);
1809 return Cost;
1810 }
1811
1812 if (DstEltSize == SrcEltSize)
1813 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1814 else if (DstEltSize > SrcEltSize) {
1815 if ((DstEltSize / 2) > SrcEltSize) {
1816 VectorType *VecTy =
1817 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
1818 cast<VectorType>(Dst)->getElementCount());
1819 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
1820 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
1821 }
1822 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1823 } else
1824 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
1825 return Cost;
1826 }
1827 }
1828 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1829}
1830
1831unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) const {
1832 if (isa<ScalableVectorType>(Ty)) {
1833 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1834 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1835 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1836 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1837 }
1838 return cast<FixedVectorType>(Ty)->getNumElements();
1839}
1840
1841 InstructionCost
1842 RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1843 FastMathFlags FMF,
1844 TTI::TargetCostKind CostKind) const {
1845 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1846 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1847
1848 // Skip if scalar size of Ty is bigger than ELEN.
1849 if (Ty->getScalarSizeInBits() > ST->getELen())
1850 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1851
1852 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1853 if (Ty->getElementType()->isIntegerTy(1)) {
1854 // SelectionDAGBuilder does following transforms:
1855 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1856 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
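// e.g. vector_reduce_umax(<8 x i1>) is therefore costed as vector_reduce_or(<8 x i1>).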
1857 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1858 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1859 else
1860 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1861 }
1862
1863 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1864 SmallVector<unsigned, 3> Opcodes;
1865 InstructionCost ExtraCost = 0;
1866 switch (IID) {
1867 case Intrinsic::maximum:
1868 if (FMF.noNaNs()) {
1869 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1870 } else {
1871 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1872 RISCV::VFMV_F_S};
1873 // Cost of canonical NaN + branch
1874 // lui a0, 523264
1875 // fmv.w.x fa0, a0
1876 Type *DstTy = Ty->getScalarType();
1877 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1878 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1879 ExtraCost = 1 +
1880 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1881 TTI::CastContextHint::None, CostKind) +
1882 getCFInstrCost(Instruction::Br, CostKind);
1883 }
1884 break;
1885
1886 case Intrinsic::minimum:
1887 if (FMF.noNaNs()) {
1888 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1889 } else {
1890 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1891 RISCV::VFMV_F_S};
1892 // Cost of canonical NaN + branch
1893 // lui a0, 523264
1894 // fmv.w.x fa0, a0
1895 Type *DstTy = Ty->getScalarType();
1896 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1897 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1898 ExtraCost = 1 +
1899 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1900 TTI::CastContextHint::None, CostKind) +
1901 getCFInstrCost(Instruction::Br, CostKind);
1902 }
1903 break;
1904 }
1905 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1906 }
1907
1908 // An IR reduction is composed of one RVV reduction instruction and a vmv
1909 unsigned SplitOp;
1910 SmallVector<unsigned, 3> Opcodes;
1911 switch (IID) {
1912 default:
1913 llvm_unreachable("Unsupported intrinsic");
1914 case Intrinsic::smax:
1915 SplitOp = RISCV::VMAX_VV;
1916 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1917 break;
1918 case Intrinsic::smin:
1919 SplitOp = RISCV::VMIN_VV;
1920 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
1921 break;
1922 case Intrinsic::umax:
1923 SplitOp = RISCV::VMAXU_VV;
1924 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1925 break;
1926 case Intrinsic::umin:
1927 SplitOp = RISCV::VMINU_VV;
1928 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
1929 break;
1930 case Intrinsic::maxnum:
1931 SplitOp = RISCV::VFMAX_VV;
1932 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1933 break;
1934 case Intrinsic::minnum:
1935 SplitOp = RISCV::VFMIN_VV;
1936 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1937 break;
1938 }
1939 // Add a cost for data larger than LMUL8
1940 InstructionCost SplitCost =
1941 (LT.first > 1) ? (LT.first - 1) *
1942 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1943 : 0;
1944 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1945}
1946
1947 InstructionCost
1948 RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
1949 std::optional<FastMathFlags> FMF,
1950 TTI::TargetCostKind CostKind) const {
1951 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1952 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1953
1954 // Skip if scalar size of Ty is bigger than ELEN.
1955 if (Ty->getScalarSizeInBits() > ST->getELen())
1956 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1957
1958 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1959 assert(ISD && "Invalid opcode");
1960
1961 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1962 ISD != ISD::FADD)
1963 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1964
1965 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1966 Type *ElementTy = Ty->getElementType();
1967 if (ElementTy->isIntegerTy(1)) {
1968 // Example sequences:
1969 // vfirst.m a0, v0
1970 // seqz a0, a0
1971 if (LT.second == MVT::v1i1)
1972 return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
1973 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1974 CmpInst::ICMP_EQ, CostKind);
1975 
1976 if (ISD == ISD::AND) {
1977 // Example sequences:
1978 // vmand.mm v8, v9, v8 ; needed every time type is split
1979 // vmnot.m v8, v0 ; alias for vmnand
1980 // vcpop.m a0, v8
1981 // seqz a0, a0
1982
1983 // See the discussion: https://github.com/llvm/llvm-project/pull/119160
1984 // For LMUL <= 8, there is no splitting,
1985 // the sequences are vmnot, vcpop and seqz.
1986 // When LMUL > 8 and split = 1,
1987 // the sequences are vmnand, vcpop and seqz.
1988 // When LMUL > 8 and split > 1,
1989 // the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
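// e.g. with LT.first == 3 this charges one vmand.mm, one vmnand.mm, one vcpop.m and the final seqz compare.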
1990 return ((LT.first > 2) ? (LT.first - 2) : 0) *
1991 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
1992 getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
1993 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
1994 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1995 CmpInst::ICMP_EQ, CostKind);
1996 } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
1997 // Example sequences:
1998 // vsetvli a0, zero, e8, mf8, ta, ma
1999 // vmxor.mm v8, v0, v8 ; needed every time type is split
2000 // vcpop.m a0, v8
2001 // andi a0, a0, 1
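// The trailing "+ 1" below accounts for the scalar andi shown above.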
2002 return (LT.first - 1) *
2003 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
2004 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
2005 } else {
2006 assert(ISD == ISD::OR);
2007 // Example sequences:
2008 // vsetvli a0, zero, e8, mf8, ta, ma
2009 // vmor.mm v8, v9, v8 ; needed every time type is split
2010 // vcpop.m a0, v0
2011 // snez a0, a0
2012 return (LT.first - 1) *
2013 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
2014 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2015 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2016 CmpInst::ICMP_NE, CostKind);
2017 }
2018 }
2019
2020 // An IR reduction of or/and is composed of one vmv and one RVV reduction
2021 // instruction; the others are composed of two vmv and one RVV reduction
2022 // instruction.
2023 unsigned SplitOp;
2024 SmallVector<unsigned, 3> Opcodes;
2025 switch (ISD) {
2026 case ISD::ADD:
2027 SplitOp = RISCV::VADD_VV;
2028 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
2029 break;
2030 case ISD::OR:
2031 SplitOp = RISCV::VOR_VV;
2032 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
2033 break;
2034 case ISD::XOR:
2035 SplitOp = RISCV::VXOR_VV;
2036 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
2037 break;
2038 case ISD::AND:
2039 SplitOp = RISCV::VAND_VV;
2040 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
2041 break;
2042 case ISD::FADD:
2043 // We can't promote f16/bf16 fadd reductions.
2044 if ((LT.second.getScalarType() == MVT::f16 && !ST->hasVInstructionsF16()) ||
2045 LT.second.getScalarType() == MVT::bf16)
2046 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2047 if (TTI::requiresOrderedReduction(FMF)) {
2048 Opcodes.push_back(RISCV::VFMV_S_F);
2049 for (unsigned i = 0; i < LT.first.getValue(); i++)
2050 Opcodes.push_back(RISCV::VFREDOSUM_VS);
2051 Opcodes.push_back(RISCV::VFMV_F_S);
2052 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2053 }
2054 SplitOp = RISCV::VFADD_VV;
2055 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
2056 break;
2057 }
2058 // Add a cost for data larger than LMUL8
2059 InstructionCost SplitCost =
2060 (LT.first > 1) ? (LT.first - 1) *
2061 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2062 : 0;
2063 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2064}
2065
2066 InstructionCost RISCVTTIImpl::getExtendedReductionCost(
2067 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
2068 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
2069 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2070 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2071 FMF, CostKind);
2072
2073 // Skip if scalar size of ResTy is bigger than ELEN.
2074 if (ResTy->getScalarSizeInBits() > ST->getELen())
2075 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2076 FMF, CostKind);
2077
2078 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
2079 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2080 FMF, CostKind);
2081
2082 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2083
2084 if (IsUnsigned && Opcode == Instruction::Add &&
2085 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
2086 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2087 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
2088 return LT.first *
2089 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
2090 }
2091
2092 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
2093 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2094 FMF, CostKind);
2095
2096 return (LT.first - 1) +
2097 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2098}
2099
2100 InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
2101 TTI::OperandValueInfo OpInfo,
2102 TTI::TargetCostKind CostKind) const {
2103 assert(OpInfo.isConstant() && "non constant operand?");
2104 if (!isa<VectorType>(Ty))
2105 // FIXME: We need to account for immediate materialization here, but doing
2106 // a decent job requires more knowledge about the immediate than we
2107 // currently have here.
2108 return 0;
2109
2110 if (OpInfo.isUniform())
2111 // vmv.v.i, vmv.v.x, or vfmv.v.f
2112 // We ignore the cost of the scalar constant materialization to be consistent
2113 // with how we treat scalar constants themselves just above.
2114 return 1;
2115
2116 return getConstantPoolLoadCost(Ty, CostKind);
2117}
2118
2119 InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
2120 Align Alignment,
2121 unsigned AddressSpace,
2122 TTI::TargetCostKind CostKind,
2123 TTI::OperandValueInfo OpInfo,
2124 const Instruction *I) const {
2125 EVT VT = TLI->getValueType(DL, Src, true);
2126 // Type legalization can't handle structs
2127 if (VT == MVT::Other)
2128 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2129 CostKind, OpInfo, I);
2130 
2131 InstructionCost Cost = 0;
2132 if (Opcode == Instruction::Store && OpInfo.isConstant())
2133 Cost += getStoreImmCost(Src, OpInfo, CostKind);
2134
2135 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
2136
2137 InstructionCost BaseCost = [&]() {
2138 InstructionCost Cost = LT.first;
2139 if (CostKind != TTI::TCK_RecipThroughput)
2140 return Cost;
2141
2142 // Our actual lowering for the case where a wider legal type is available
2143 // uses a VL predicated load on the wider type. This is reflected in
2144 // the result of getTypeLegalizationCost, but BasicTTI assumes the
2145 // widened cases are scalarized.
2146 const DataLayout &DL = this->getDataLayout();
2147 if (Src->isVectorTy() && LT.second.isVector() &&
2148 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
2149 LT.second.getSizeInBits()))
2150 return Cost;
2151
2152 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2153 CostKind, OpInfo, I);
2154 }();
2155
2156 // Assume memory ops cost scale with the number of vector registers
2157 // possible accessed by the instruction. Note that BasicTTI already
2158 // handles the LT.first term for us.
2159 if (ST->hasVInstructions() && LT.second.isVector() &&
2160 CostKind != TTI::TCK_CodeSize)
2161 BaseCost *= TLI->getLMULCost(LT.second);
2162 return Cost + BaseCost;
2163}
2164
2165 InstructionCost RISCVTTIImpl::getCmpSelInstrCost(
2166 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
2167 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
2168 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
2169 if (CostKind != TTI::TCK_RecipThroughput)
2170 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2171 Op1Info, Op2Info, I);
2172
2173 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2174 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2175 Op1Info, Op2Info, I);
2176
2177 // Skip if scalar size of ValTy is bigger than ELEN.
2178 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
2179 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2180 Op1Info, Op2Info, I);
2181
2182 auto GetConstantMatCost =
2183 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
2184 if (OpInfo.isUniform())
2185 // We return 0 because we currently ignore the cost of materializing
2186 // scalar constants in GPRs.
2187 return 0;
2188
2189 return getConstantPoolLoadCost(ValTy, CostKind);
2190 };
2191
2192 InstructionCost ConstantMatCost;
2193 if (Op1Info.isConstant())
2194 ConstantMatCost += GetConstantMatCost(Op1Info);
2195 if (Op2Info.isConstant())
2196 ConstantMatCost += GetConstantMatCost(Op2Info);
2197
2198 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2199 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
2200 if (CondTy->isVectorTy()) {
2201 if (ValTy->getScalarSizeInBits() == 1) {
2202 // vmandn.mm v8, v8, v9
2203 // vmand.mm v9, v0, v9
2204 // vmor.mm v0, v9, v8
2205 return ConstantMatCost +
2206 LT.first *
2207 getRISCVInstructionCost(
2208 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2209 LT.second, CostKind);
2210 }
2211 // vselect and max/min are supported natively.
2212 return ConstantMatCost +
2213 LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
2214 CostKind);
2215 }
2216
2217 if (ValTy->getScalarSizeInBits() == 1) {
2218 // vmv.v.x v9, a0
2219 // vmsne.vi v9, v9, 0
2220 // vmandn.mm v8, v8, v9
2221 // vmand.mm v9, v0, v9
2222 // vmor.mm v0, v9, v8
2223 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
2224 return ConstantMatCost +
2225 LT.first *
2226 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
2227 InterimVT, CostKind) +
2228 LT.first * getRISCVInstructionCost(
2229 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2230 LT.second, CostKind);
2231 }
2232
2233 // vmv.v.x v10, a0
2234 // vmsne.vi v0, v10, 0
2235 // vmerge.vvm v8, v9, v8, v0
2236 return ConstantMatCost +
2237 LT.first * getRISCVInstructionCost(
2238 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
2239 LT.second, CostKind);
2240 }
2241
2242 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
2243 CmpInst::isIntPredicate(VecPred)) {
2244 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
2245 // provided they incur the same cost across all implementations
2246 return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
2247 LT.second,
2248 CostKind);
2249 }
2250
2251 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
2252 CmpInst::isFPPredicate(VecPred)) {
2253
2254 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
2255 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
2256 return ConstantMatCost +
2257 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
2258
2259 // If we do not support the input floating point vector type, use the base
2260 // one which will calculate as:
2261 // ScalarizeCost + Num * Cost for fixed vector,
2262 // InvalidCost for scalable vector.
2263 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
2264 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
2265 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
2266 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2267 Op1Info, Op2Info, I);
2268
2269 // Assuming vector fp compare and mask instructions are all the same cost
2270 // until a need arises to differentiate them.
2271 switch (VecPred) {
2272 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
2273 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
2274 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
2275 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
2276 return ConstantMatCost +
2277 LT.first * getRISCVInstructionCost(
2278 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
2279 LT.second, CostKind);
2280
2281 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
2282 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
2283 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
2284 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
2285 return ConstantMatCost +
2286 LT.first *
2287 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
2288 LT.second, CostKind);
2289
2290 case CmpInst::FCMP_OEQ: // vmfeq.vv
2291 case CmpInst::FCMP_OGT: // vmflt.vv
2292 case CmpInst::FCMP_OGE: // vmfle.vv
2293 case CmpInst::FCMP_OLT: // vmflt.vv
2294 case CmpInst::FCMP_OLE: // vmfle.vv
2295 case CmpInst::FCMP_UNE: // vmfne.vv
2296 return ConstantMatCost +
2297 LT.first *
2298 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
2299 default:
2300 break;
2301 }
2302 }
2303
2304 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
2305 // instructions will lower to SELECT_CC and lower to PseudoCCMOVGPR which will
2306 // generate a conditional branch + mv. The cost of scalar (icmp + select) will
2307 // be (0 + select instr cost).
2308 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
2309 ValTy->isIntegerTy() && !I->user_empty()) {
2310 if (all_of(I->users(), [&](const User *U) {
2311 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
2312 U->getType()->isIntegerTy() &&
2313 !isa<ConstantData>(U->getOperand(1)) &&
2314 !isa<ConstantData>(U->getOperand(2));
2315 }))
2316 return 0;
2317 }
2318
2319 // TODO: Add cost for scalar type.
2320
2321 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2322 Op1Info, Op2Info, I);
2323}
2324
2325 InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
2326 TTI::TargetCostKind CostKind,
2327 const Instruction *I) const {
2328 if (CostKind != TTI::TCK_RecipThroughput)
2329 return Opcode == Instruction::PHI ? 0 : 1;
2330 // Branches are assumed to be predicted.
2331 return 0;
2332}
2333
2334 InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
2335 TTI::TargetCostKind CostKind,
2336 unsigned Index,
2337 const Value *Op0,
2338 const Value *Op1) const {
2339 assert(Val->isVectorTy() && "This must be a vector type");
2340
2341 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
2342 // For now, skip all fixed vector cost analysis when P extension is available
2343 // to avoid crashes in getMinRVVVectorSizeInBits()
2344 if (ST->enablePExtCodeGen() && isa<FixedVectorType>(Val)) {
2345 return 1; // Treat as single instruction cost for now
2346 }
2347
2348 if (Opcode != Instruction::ExtractElement &&
2349 Opcode != Instruction::InsertElement)
2350 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
2351
2352 // Legalize the type.
2353 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2354
2355 // This type is legalized to a scalar type.
2356 if (!LT.second.isVector()) {
2357 auto *FixedVecTy = cast<FixedVectorType>(Val);
2358 // If Index is a known constant, cost is zero.
2359 if (Index != -1U)
2360 return 0;
2361 // Extract/InsertElement with non-constant index is very costly when
2362 // scalarized; estimate cost of loads/stores sequence via the stack:
2363 // ExtractElement cost: store vector to stack, load scalar;
2364 // InsertElement cost: store vector to stack, store scalar, load vector.
2365 Type *ElemTy = FixedVecTy->getElementType();
2366 auto NumElems = FixedVecTy->getNumElements();
2367 auto Align = DL.getPrefTypeAlign(ElemTy);
2368 InstructionCost LoadCost =
2369 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
2370 InstructionCost StoreCost =
2371 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
2372 return Opcode == Instruction::ExtractElement
2373 ? StoreCost * NumElems + LoadCost
2374 : (StoreCost + LoadCost) * NumElems + StoreCost;
2375 }
2376
2377 // For unsupported scalable vector.
2378 if (LT.second.isScalableVector() && !LT.first.isValid())
2379 return LT.first;
2380
2381 // Mask vector extract/insert is expanded via e8.
2382 if (Val->getScalarSizeInBits() == 1) {
2383 VectorType *WideTy =
2384 VectorType::get(IntegerType::get(Val->getContext(), 8),
2385 cast<VectorType>(Val)->getElementCount());
2386 if (Opcode == Instruction::ExtractElement) {
2387 InstructionCost ExtendCost
2388 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2389 TTI::CastContextHint::None, CostKind);
2390 InstructionCost ExtractCost
2391 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2392 return ExtendCost + ExtractCost;
2393 }
2394 InstructionCost ExtendCost
2395 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2396 TTI::CastContextHint::None, CostKind);
2397 InstructionCost InsertCost
2398 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2399 InstructionCost TruncCost
2400 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
2401 TTI::CastContextHint::None, CostKind);
2402 return ExtendCost + InsertCost + TruncCost;
2403 }
2404
2405
2406 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
2407 // and vslideup + vmv.s.x to insert element to vector.
2408 unsigned BaseCost = 1;
2409 // For insertelement we also need to add 1 to the index as the input of vslideup.
2410 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
2411
2412 if (Index != -1U) {
2413 // The type may be split. For fixed-width vectors we can normalize the
2414 // index to the new type.
2415 if (LT.second.isFixedLengthVector()) {
2416 unsigned Width = LT.second.getVectorNumElements();
2417 Index = Index % Width;
2418 }
2419
2420 // If exact VLEN is known, we will insert/extract into the appropriate
2421 // subvector with no additional subvector insert/extract cost.
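// e.g. with VLEN=128 and e32 elements M1Max is 4, so lane 5 of a longer fixed vector becomes lane 1 of its m1 subvector.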
2422 if (auto VLEN = ST->getRealVLen()) {
2423 unsigned EltSize = LT.second.getScalarSizeInBits();
2424 unsigned M1Max = *VLEN / EltSize;
2425 Index = Index % M1Max;
2426 }
2427
2428 if (Index == 0)
2429 // We can extract/insert the first element without vslidedown/vslideup.
2430 SlideCost = 0;
2431 else if (ST->hasVendorXRivosVisni() && isUInt<5>(Index) &&
2432 Val->getScalarType()->isIntegerTy())
2433 SlideCost = 0; // With ri.vinsert/ri.vextract there is no slide needed
2434 else if (Opcode == Instruction::InsertElement)
2435 SlideCost = 1; // With a constant index, we do not need to use addi.
2436 }
2437
2438 // When the vector needs to be split into multiple register groups and the
2439 // index exceeds a single vector register group, we need to insert/extract
2440 // the element via the stack.
2441 if (LT.first > 1 &&
2442 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
2443 LT.second.isScalableVector()))) {
2444 Type *ScalarType = Val->getScalarType();
2445 Align VecAlign = DL.getPrefTypeAlign(Val);
2446 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
2447 // Extra addi for unknown index.
2448 InstructionCost IdxCost = Index == -1U ? 1 : 0;
2449
2450 // Store all split vectors into stack and load the target element.
2451 if (Opcode == Instruction::ExtractElement)
2452 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2453 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
2454 CostKind) +
2455 IdxCost;
2456
2457 // Store all split vectors into stack and store the target element and load
2458 // vectors back.
2459 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2460 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2461 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2462 CostKind) +
2463 IdxCost;
2464 }
2465
2466 // Extracting an i64 element on a target with XLEN=32 needs more instructions.
2467 if (Val->getScalarType()->isIntegerTy() &&
2468 ST->getXLen() < Val->getScalarSizeInBits()) {
2469 // For extractelement, we need the following instructions:
2470 // vsetivli zero, 1, e64, m1, ta, mu (not count)
2471 // vslidedown.vx v8, v8, a0
2472 // vmv.x.s a0, v8
2473 // li a1, 32
2474 // vsrl.vx v8, v8, a1
2475 // vmv.x.s a1, v8
2476
2477 // For insertelement, we need the following instructions:
2478 // vsetivli zero, 2, e32, m4, ta, mu (not count)
2479 // vmv.v.i v12, 0
2480 // vslide1up.vx v16, v12, a1
2481 // vslide1up.vx v12, v16, a0
2482 // addi a0, a2, 1
2483 // vsetvli zero, a0, e64, m4, tu, mu (not count)
2484 // vslideup.vx v8, v12, a2
2485
2486 // TODO: should we count these special vsetvlis?
2487 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2488 }
2489 return BaseCost + SlideCost;
2490}
2491
2495 unsigned Index) const {
2496 if (isa<FixedVectorType>(Val))
2498 Index);
2499
2500 // TODO: This code replicates what LoopVectorize.cpp used to do when asking
2501 // for the cost of extracting the last lane of a scalable vector. It probably
2502 // needs a more accurate cost.
2503 ElementCount EC = cast<VectorType>(Val)->getElementCount();
2504 assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
2505 return getVectorInstrCost(Opcode, Val, CostKind,
2506 EC.getKnownMinValue() - 1 - Index, nullptr,
2507 nullptr);
2508}
2509
2510 InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
2511 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2512 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2513 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
2514
2515 // TODO: Handle more cost kinds.
2516 if (CostKind != TTI::TCK_RecipThroughput)
2517 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2518 Args, CxtI);
2519
2520 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2521 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2522 Args, CxtI);
2523
2524 // Skip if scalar size of Ty is bigger than ELEN.
2525 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
2526 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2527 Args, CxtI);
2528
2529 // Legalize the type.
2530 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2531
2532 // TODO: Handle scalar type.
2533 if (!LT.second.isVector())
2534 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2535 Args, CxtI);
2536
2537 // f16 with zvfhmin and bf16 will be promoted to f32.
2538 // FIXME: nxv32[b]f16 will be custom lowered and split.
2539 unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
2540 InstructionCost CastCost = 0;
2541 if ((LT.second.getVectorElementType() == MVT::f16 ||
2542 LT.second.getVectorElementType() == MVT::bf16) &&
2543 TLI->getOperationAction(ISDOpcode, LT.second) ==
2544 TargetLoweringBase::LegalizeAction::Promote) {
2545 MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
2546 Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
2547 Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2548 // Add cost of extending arguments
2549 CastCost += LT.first * Args.size() *
2550 getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
2551 TTI::CastContextHint::None, CostKind);
2552 // Add cost of truncating result
2553 CastCost +=
2554 LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
2555 TTI::CastContextHint::None, CostKind);
2556 // Compute cost of op in promoted type
2557 LT.second = PromotedVT;
2558 }
2559
2560 auto getConstantMatCost =
2561 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
2562 if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
2563 // Two sub-cases:
2564 // * Has a 5 bit immediate operand which can be splatted.
2565 // * Has a larger immediate which must be materialized in scalar register
2566 // We return 0 for both as we currently ignore the cost of materializing
2567 // scalar constants in GPRs.
2568 return 0;
2569
2570 return getConstantPoolLoadCost(Ty, CostKind);
2571 };
2572
2573 // Add the cost of materializing any constant vectors required.
2574 InstructionCost ConstantMatCost = 0;
2575 if (Op1Info.isConstant())
2576 ConstantMatCost += getConstantMatCost(0, Op1Info);
2577 if (Op2Info.isConstant())
2578 ConstantMatCost += getConstantMatCost(1, Op2Info);
2579
2580 unsigned Op;
2581 switch (ISDOpcode) {
2582 case ISD::ADD:
2583 case ISD::SUB:
2584 Op = RISCV::VADD_VV;
2585 break;
2586 case ISD::SHL:
2587 case ISD::SRL:
2588 case ISD::SRA:
2589 Op = RISCV::VSLL_VV;
2590 break;
2591 case ISD::AND:
2592 case ISD::OR:
2593 case ISD::XOR:
2594 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
2595 break;
2596 case ISD::MUL:
2597 case ISD::MULHS:
2598 case ISD::MULHU:
2599 Op = RISCV::VMUL_VV;
2600 break;
2601 case ISD::SDIV:
2602 case ISD::UDIV:
2603 Op = RISCV::VDIV_VV;
2604 break;
2605 case ISD::SREM:
2606 case ISD::UREM:
2607 Op = RISCV::VREM_VV;
2608 break;
2609 case ISD::FADD:
2610 case ISD::FSUB:
2611 Op = RISCV::VFADD_VV;
2612 break;
2613 case ISD::FMUL:
2614 Op = RISCV::VFMUL_VV;
2615 break;
2616 case ISD::FDIV:
2617 Op = RISCV::VFDIV_VV;
2618 break;
2619 case ISD::FNEG:
2620 Op = RISCV::VFSGNJN_VV;
2621 break;
2622 default:
2623 // Assuming all other instructions have the same cost until a need arises to
2624 // differentiate them.
2625 return CastCost + ConstantMatCost +
2626 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2627 Args, CxtI);
2628 }
2629
2630 InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
2631 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
2632 // ops are twice as expensive as integer ops. Do the same for vectors so
2633 // scalar floating point ops aren't cheaper than their vector equivalents.
2634 if (Ty->isFPOrFPVectorTy())
2635 InstrCost *= 2;
2636 return CastCost + ConstantMatCost + LT.first * InstrCost;
2637}
2638
2639// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
2640 InstructionCost RISCVTTIImpl::getPointersChainCost(
2641 ArrayRef<const Value *> Ptrs, const Value *Base,
2642 const TTI::PointersChainInfo &Info, Type *AccessTy,
2643 TTI::TargetCostKind CostKind) const {
2644 InstructionCost Cost = TTI::TCC_Free;
2645 // In the basic model we take into account GEP instructions only
2646 // (although an alloca instruction, a value, constants and/or constant
2647 // expressions, PHIs, bitcasts ... whatever is allowed to be used as a
2648 // pointer can appear here). Typically, if Base is not a GEP instruction and
2649 // all the pointers are relative to the same base address, all the rest are
2650 // either GEP instructions, PHIs, bitcasts or constants. When we have the
2651 // same base, we just calculate the cost of each non-Base GEP as an ADD
2652 // operation if any of its indices is non-constant.
2653 // If there are no known dependencies between the pointers, the cost is
2654 // calculated as the sum of the costs of the GEP instructions.
2655 for (auto [I, V] : enumerate(Ptrs)) {
2656 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
2657 if (!GEP)
2658 continue;
2659 if (Info.isSameBase() && V != Base) {
2660 if (GEP->hasAllConstantIndices())
2661 continue;
2662 // If the chain is unit-stride and BaseReg + stride*i is a legal
2663 // addressing mode, then presume the base GEP is sitting around in a
2664 // register somewhere and check if we can fold the offset relative to
2665 // it.
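// e.g. successive accesses at Base + Stride*i that fit a legal reg+imm addressing mode add no extra cost here.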
2666 unsigned Stride = DL.getTypeStoreSize(AccessTy);
2667 if (Info.isUnitStride() &&
2668 isLegalAddressingMode(AccessTy,
2669 /* BaseGV */ nullptr,
2670 /* BaseOffset */ Stride * I,
2671 /* HasBaseReg */ true,
2672 /* Scale */ 0,
2673 GEP->getType()->getPointerAddressSpace()))
2674 continue;
2675 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
2676 {TTI::OK_AnyValue, TTI::OP_None},
2677 {TTI::OK_AnyValue, TTI::OP_None}, {});
2678 } else {
2679 SmallVector<const Value *> Indices(GEP->indices());
2680 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
2681 Indices, AccessTy, CostKind);
2682 }
2683 }
2684 return Cost;
2685}
2686
2687 void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2688 TTI::UnrollingPreferences &UP,
2689 OptimizationRemarkEmitter *ORE) const {
2690 // TODO: More tuning on benchmarks and metrics, with changes applied as
2691 // needed to all settings below, to improve performance.
2692
2693
2694 if (ST->enableDefaultUnroll())
2695 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2696
2697 // Enable Upper bound unrolling universally, not dependent upon the conditions
2698 // below.
2699 UP.UpperBound = true;
2700
2701 // Disable loop unrolling for Oz and Os.
2702 UP.OptSizeThreshold = 0;
2703 UP.PartialOptSizeThreshold = 0;
2704 if (L->getHeader()->getParent()->hasOptSize())
2705 return;
2706
2707 SmallVector<BasicBlock *, 4> ExitingBlocks;
2708 L->getExitingBlocks(ExitingBlocks);
2709 LLVM_DEBUG(dbgs() << "Loop has:\n"
2710 << "Blocks: " << L->getNumBlocks() << "\n"
2711 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2712
2713 // Only allow another exit other than the latch. This acts as an early exit
2714 // as it mirrors the profitability calculation of the runtime unroller.
2715 if (ExitingBlocks.size() > 2)
2716 return;
2717
2718 // Limit the CFG of the loop body for targets with a branch predictor.
2719 // Allowing 4 blocks permits if-then-else diamonds in the body.
2720 if (L->getNumBlocks() > 4)
2721 return;
2722
2723 // Scan the loop: don't unroll loops with calls as this could prevent
2724 // inlining. Don't unroll auto-vectorized loops either, though do allow
2725 // unrolling of the scalar remainder.
2726 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
2727 InstructionCost Cost = 0;
2728 for (auto *BB : L->getBlocks()) {
2729 for (auto &I : *BB) {
2730 // Both auto-vectorized loops and the scalar remainder have the
2731 // isvectorized attribute, so differentiate between them by the presence
2732 // of vector instructions.
2733 if (IsVectorized && I.getType()->isVectorTy())
2734 return;
2735
2736 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2737 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2738 if (!isLoweredToCall(F))
2739 continue;
2740 }
2741 return;
2742 }
2743
2744 SmallVector<const Value *> Operands(I.operand_values());
2745 Cost += getInstructionCost(&I, Operands,
2746 TargetTransformInfo::TCK_SizeAndLatency);
2747 }
2748 }
2749
2750 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2751
2752 UP.Partial = true;
2753 UP.Runtime = true;
2754 UP.UnrollRemainder = true;
2755 UP.UnrollAndJam = true;
2756
2757 // Force unrolling small loops can be very useful because of the branch
2758 // taken cost of the backedge.
2759 if (Cost < 12)
2760 UP.Force = true;
2761}
2762
2767
2768 bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
2769 MemIntrinsicInfo &Info) const {
2770 const DataLayout &DL = getDataLayout();
2771 Intrinsic::ID IID = Inst->getIntrinsicID();
2772 LLVMContext &C = Inst->getContext();
2773 bool HasMask = false;
2774
2775 auto getSegNum = [](const IntrinsicInst *II, unsigned PtrOperandNo,
2776 bool IsWrite) -> int64_t {
2777 if (auto *TarExtTy =
2778 dyn_cast<TargetExtType>(II->getArgOperand(0)->getType()))
2779 return TarExtTy->getIntParameter(0);
2780
2781 return 1;
2782 };
2783
2784 switch (IID) {
2785 case Intrinsic::riscv_vle_mask:
2786 case Intrinsic::riscv_vse_mask:
2787 case Intrinsic::riscv_vlseg2_mask:
2788 case Intrinsic::riscv_vlseg3_mask:
2789 case Intrinsic::riscv_vlseg4_mask:
2790 case Intrinsic::riscv_vlseg5_mask:
2791 case Intrinsic::riscv_vlseg6_mask:
2792 case Intrinsic::riscv_vlseg7_mask:
2793 case Intrinsic::riscv_vlseg8_mask:
2794 case Intrinsic::riscv_vsseg2_mask:
2795 case Intrinsic::riscv_vsseg3_mask:
2796 case Intrinsic::riscv_vsseg4_mask:
2797 case Intrinsic::riscv_vsseg5_mask:
2798 case Intrinsic::riscv_vsseg6_mask:
2799 case Intrinsic::riscv_vsseg7_mask:
2800 case Intrinsic::riscv_vsseg8_mask:
2801 HasMask = true;
2802 [[fallthrough]];
2803 case Intrinsic::riscv_vle:
2804 case Intrinsic::riscv_vse:
2805 case Intrinsic::riscv_vlseg2:
2806 case Intrinsic::riscv_vlseg3:
2807 case Intrinsic::riscv_vlseg4:
2808 case Intrinsic::riscv_vlseg5:
2809 case Intrinsic::riscv_vlseg6:
2810 case Intrinsic::riscv_vlseg7:
2811 case Intrinsic::riscv_vlseg8:
2812 case Intrinsic::riscv_vsseg2:
2813 case Intrinsic::riscv_vsseg3:
2814 case Intrinsic::riscv_vsseg4:
2815 case Intrinsic::riscv_vsseg5:
2816 case Intrinsic::riscv_vsseg6:
2817 case Intrinsic::riscv_vsseg7:
2818 case Intrinsic::riscv_vsseg8: {
2819 // Intrinsic interface:
2820 // riscv_vle(merge, ptr, vl)
2821 // riscv_vle_mask(merge, ptr, mask, vl, policy)
2822 // riscv_vse(val, ptr, vl)
2823 // riscv_vse_mask(val, ptr, mask, vl, policy)
2824 // riscv_vlseg#(merge, ptr, vl, sew)
2825 // riscv_vlseg#_mask(merge, ptr, mask, vl, policy, sew)
2826 // riscv_vsseg#(val, ptr, vl, sew)
2827 // riscv_vsseg#_mask(val, ptr, mask, vl, sew)
2828 bool IsWrite = Inst->getType()->isVoidTy();
2829 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
2830 // The results of segment loads are TargetExtType.
2831 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
2832 unsigned SEW =
2833 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
2834 ->getZExtValue();
2835 Ty = TarExtTy->getTypeParameter(0U);
2836 Ty = ScalableVectorType::get(
2837 IntegerType::get(C, SEW),
2838 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
2839 }
2840 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
2841 unsigned VLIndex = RVVIInfo->VLOperand;
2842 unsigned PtrOperandNo = VLIndex - 1 - HasMask;
2843 MaybeAlign Alignment =
2844 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
2845 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
2846 Value *Mask = ConstantInt::getTrue(MaskType);
2847 if (HasMask)
2848 Mask = Inst->getArgOperand(VLIndex - 1);
2849 Value *EVL = Inst->getArgOperand(VLIndex);
2850 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
2851 // RVV uses contiguous elements as a segment.
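// e.g. a 2-field segment of i32 elements is modelled as a vector of i64 elements for this access.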
2852 if (SegNum > 1) {
2853 unsigned ElemSize = Ty->getScalarSizeInBits();
2854 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
2855 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
2856 }
2857 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
2858 Alignment, Mask, EVL);
2859 return true;
2860 }
2861 case Intrinsic::riscv_vlse_mask:
2862 case Intrinsic::riscv_vsse_mask:
2863 case Intrinsic::riscv_vlsseg2_mask:
2864 case Intrinsic::riscv_vlsseg3_mask:
2865 case Intrinsic::riscv_vlsseg4_mask:
2866 case Intrinsic::riscv_vlsseg5_mask:
2867 case Intrinsic::riscv_vlsseg6_mask:
2868 case Intrinsic::riscv_vlsseg7_mask:
2869 case Intrinsic::riscv_vlsseg8_mask:
2870 case Intrinsic::riscv_vssseg2_mask:
2871 case Intrinsic::riscv_vssseg3_mask:
2872 case Intrinsic::riscv_vssseg4_mask:
2873 case Intrinsic::riscv_vssseg5_mask:
2874 case Intrinsic::riscv_vssseg6_mask:
2875 case Intrinsic::riscv_vssseg7_mask:
2876 case Intrinsic::riscv_vssseg8_mask:
2877 HasMask = true;
2878 [[fallthrough]];
2879 case Intrinsic::riscv_vlse:
2880 case Intrinsic::riscv_vsse:
2881 case Intrinsic::riscv_vlsseg2:
2882 case Intrinsic::riscv_vlsseg3:
2883 case Intrinsic::riscv_vlsseg4:
2884 case Intrinsic::riscv_vlsseg5:
2885 case Intrinsic::riscv_vlsseg6:
2886 case Intrinsic::riscv_vlsseg7:
2887 case Intrinsic::riscv_vlsseg8:
2888 case Intrinsic::riscv_vssseg2:
2889 case Intrinsic::riscv_vssseg3:
2890 case Intrinsic::riscv_vssseg4:
2891 case Intrinsic::riscv_vssseg5:
2892 case Intrinsic::riscv_vssseg6:
2893 case Intrinsic::riscv_vssseg7:
2894 case Intrinsic::riscv_vssseg8: {
2895 // Intrinsic interface:
2896 // riscv_vlse(merge, ptr, stride, vl)
2897 // riscv_vlse_mask(merge, ptr, stride, mask, vl, policy)
2898 // riscv_vsse(val, ptr, stride, vl)
2899 // riscv_vsse_mask(val, ptr, stride, mask, vl, policy)
2900 // riscv_vlsseg#(merge, ptr, offset, vl, sew)
2901 // riscv_vlsseg#_mask(merge, ptr, offset, mask, vl, policy, sew)
2902 // riscv_vssseg#(val, ptr, offset, vl, sew)
2903 // riscv_vssseg#_mask(val, ptr, offset, mask, vl, sew)
2904 bool IsWrite = Inst->getType()->isVoidTy();
2905 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
2906 // The results of segment loads are TargetExtType.
2907 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
2908 unsigned SEW =
2909 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
2910 ->getZExtValue();
2911 Ty = TarExtTy->getTypeParameter(0U);
2912 Ty = ScalableVectorType::get(
2913 IntegerType::get(C, SEW),
2914 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
2915 }
2916 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
2917 unsigned VLIndex = RVVIInfo->VLOperand;
2918 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
2919 MaybeAlign Alignment =
2920 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
2921
2922 Value *Stride = Inst->getArgOperand(PtrOperandNo + 1);
2923 // Use the pointer alignment as the element alignment if the stride is a
2924 // multiple of the pointer alignment. Otherwise, the element alignment
2925 // should be the greatest common divisor of pointer alignment and stride.
2926 // For simplicity, just consider unalignment for elements.
2927 unsigned PointerAlign = Alignment.valueOrOne().value();
2928 if (!isa<ConstantInt>(Stride) ||
2929 cast<ConstantInt>(Stride)->getZExtValue() % PointerAlign != 0)
2930 Alignment = Align(1);
2931
2932 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
2933 Value *Mask = ConstantInt::getTrue(MaskType);
2934 if (HasMask)
2935 Mask = Inst->getArgOperand(VLIndex - 1);
2936 Value *EVL = Inst->getArgOperand(VLIndex);
2937 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
2938 // RVV uses contiguous elements as a segment.
2939 if (SegNum > 1) {
2940 unsigned ElemSize = Ty->getScalarSizeInBits();
2941 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
2942 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
2943 }
2944 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
2945 Alignment, Mask, EVL, Stride);
2946 return true;
2947 }
2948 case Intrinsic::riscv_vloxei_mask:
2949 case Intrinsic::riscv_vluxei_mask:
2950 case Intrinsic::riscv_vsoxei_mask:
2951 case Intrinsic::riscv_vsuxei_mask:
2952 case Intrinsic::riscv_vloxseg2_mask:
2953 case Intrinsic::riscv_vloxseg3_mask:
2954 case Intrinsic::riscv_vloxseg4_mask:
2955 case Intrinsic::riscv_vloxseg5_mask:
2956 case Intrinsic::riscv_vloxseg6_mask:
2957 case Intrinsic::riscv_vloxseg7_mask:
2958 case Intrinsic::riscv_vloxseg8_mask:
2959 case Intrinsic::riscv_vluxseg2_mask:
2960 case Intrinsic::riscv_vluxseg3_mask:
2961 case Intrinsic::riscv_vluxseg4_mask:
2962 case Intrinsic::riscv_vluxseg5_mask:
2963 case Intrinsic::riscv_vluxseg6_mask:
2964 case Intrinsic::riscv_vluxseg7_mask:
2965 case Intrinsic::riscv_vluxseg8_mask:
2966 case Intrinsic::riscv_vsoxseg2_mask:
2967 case Intrinsic::riscv_vsoxseg3_mask:
2968 case Intrinsic::riscv_vsoxseg4_mask:
2969 case Intrinsic::riscv_vsoxseg5_mask:
2970 case Intrinsic::riscv_vsoxseg6_mask:
2971 case Intrinsic::riscv_vsoxseg7_mask:
2972 case Intrinsic::riscv_vsoxseg8_mask:
2973 case Intrinsic::riscv_vsuxseg2_mask:
2974 case Intrinsic::riscv_vsuxseg3_mask:
2975 case Intrinsic::riscv_vsuxseg4_mask:
2976 case Intrinsic::riscv_vsuxseg5_mask:
2977 case Intrinsic::riscv_vsuxseg6_mask:
2978 case Intrinsic::riscv_vsuxseg7_mask:
2979 case Intrinsic::riscv_vsuxseg8_mask:
2980 HasMask = true;
2981 [[fallthrough]];
2982 case Intrinsic::riscv_vloxei:
2983 case Intrinsic::riscv_vluxei:
2984 case Intrinsic::riscv_vsoxei:
2985 case Intrinsic::riscv_vsuxei:
2986 case Intrinsic::riscv_vloxseg2:
2987 case Intrinsic::riscv_vloxseg3:
2988 case Intrinsic::riscv_vloxseg4:
2989 case Intrinsic::riscv_vloxseg5:
2990 case Intrinsic::riscv_vloxseg6:
2991 case Intrinsic::riscv_vloxseg7:
2992 case Intrinsic::riscv_vloxseg8:
2993 case Intrinsic::riscv_vluxseg2:
2994 case Intrinsic::riscv_vluxseg3:
2995 case Intrinsic::riscv_vluxseg4:
2996 case Intrinsic::riscv_vluxseg5:
2997 case Intrinsic::riscv_vluxseg6:
2998 case Intrinsic::riscv_vluxseg7:
2999 case Intrinsic::riscv_vluxseg8:
3000 case Intrinsic::riscv_vsoxseg2:
3001 case Intrinsic::riscv_vsoxseg3:
3002 case Intrinsic::riscv_vsoxseg4:
3003 case Intrinsic::riscv_vsoxseg5:
3004 case Intrinsic::riscv_vsoxseg6:
3005 case Intrinsic::riscv_vsoxseg7:
3006 case Intrinsic::riscv_vsoxseg8:
3007 case Intrinsic::riscv_vsuxseg2:
3008 case Intrinsic::riscv_vsuxseg3:
3009 case Intrinsic::riscv_vsuxseg4:
3010 case Intrinsic::riscv_vsuxseg5:
3011 case Intrinsic::riscv_vsuxseg6:
3012 case Intrinsic::riscv_vsuxseg7:
3013 case Intrinsic::riscv_vsuxseg8: {
3014 // Intrinsic interface (only listed ordered version):
3015 // riscv_vloxei(merge, ptr, index, vl)
3016 // riscv_vloxei_mask(merge, ptr, index, mask, vl, policy)
3017 // riscv_vsoxei(val, ptr, index, vl)
3018 // riscv_vsoxei_mask(val, ptr, index, mask, vl, policy)
3019 // riscv_vloxseg#(merge, ptr, index, vl, sew)
3020 // riscv_vloxseg#_mask(merge, ptr, index, mask, vl, policy, sew)
3021 // riscv_vsoxseg#(val, ptr, index, vl, sew)
3022 // riscv_vsoxseg#_mask(val, ptr, index, mask, vl, sew)
3023 bool IsWrite = Inst->getType()->isVoidTy();
3024 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
3025 // The results of segment loads are TargetExtType.
3026 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
3027 unsigned SEW =
3028 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
3029 ->getZExtValue();
3030 Ty = TarExtTy->getTypeParameter(0U);
3031 Ty = ScalableVectorType::get(
3032 IntegerType::get(C, SEW),
3033 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
3034 }
3035 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
3036 unsigned VLIndex = RVVIInfo->VLOperand;
3037 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
3038 Value *Mask;
3039 if (HasMask) {
3040 Mask = Inst->getArgOperand(VLIndex - 1);
3041 } else {
3042 // Mask cannot be nullptr here: vector GEP produces <vscale x N x ptr>,
3043 // and casting that to scalar i64 triggers a vector/scalar mismatch
3044 // assertion in CreatePointerCast. Use an all-true mask so ASan lowers it
3045 // via extractelement instead.
3046 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3047 Mask = ConstantInt::getTrue(MaskType);
3048 }
3049 Value *EVL = Inst->getArgOperand(VLIndex);
3050 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3051 // RVV uses contiguous elements as a segment.
3052 if (SegNum > 1) {
3053 unsigned ElemSize = Ty->getScalarSizeInBits();
3054 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3055 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3056 }
3057 Value *OffsetOp = Inst->getArgOperand(PtrOperandNo + 1);
3058 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3059 Align(1), Mask, EVL,
3060 /* Stride */ nullptr, OffsetOp);
3061 return true;
3062 }
3063 }
3064 return false;
3065}
3066
3068 if (Ty->isVectorTy()) {
3069 // f16 with only zvfhmin and bf16 will be promoted to f32
3070 Type *EltTy = cast<VectorType>(Ty)->getElementType();
3071 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
3072 EltTy->isBFloatTy())
3073 Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
3074 cast<VectorType>(Ty));
3075
3076 TypeSize Size = DL.getTypeSizeInBits(Ty);
3077 if (Size.isScalable() && ST->hasVInstructions())
3078 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
3079
3080 if (ST->useRVVForFixedLengthVectors())
3081 return divideCeil(Size, ST->getRealMinVLen());
3082 }
3083
3084 return BaseT::getRegUsageForType(Ty);
3085}
3086
3087unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
3088 if (SLPMaxVF.getNumOccurrences())
3089 return SLPMaxVF;
3090
3091 // Return how many elements can fit in getRegisterBitwidth. This is the
3092 // same routine as used in LoopVectorizer. We should probably be
3093 // accounting for whether we actually have instructions with the right
3094 // lane type, but we don't have enough information to do that without
3095 // some additional plumbing which hasn't been justified yet.
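// e.g. a 256-bit register width with 32-bit elements yields a maximum VF of 8.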
3096 TypeSize RegWidth =
3097 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
3098 // If no vector registers, or absurd element widths, disable
3099 // vectorization by returning 1.
3100 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
3101}
3102
3106
3108 return ST->enableUnalignedVectorMem();
3109}
3110
3111 TTI::AddressingModeKind
3112 RISCVTTIImpl::getPreferredAddressingMode(const Loop *L,
3113 ScalarEvolution *SE) const {
3114 if (ST->hasVendorXCVmem() && !ST->is64Bit())
3115 return TTI::AMK_PostIndexed;
3116 
3117 return BaseT::getPreferredAddressingMode(L, SE);
3118 }
3119
3120 bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
3121 const TargetTransformInfo::LSRCost &C2) const {
3122 // The RISC-V specific rule here is "instruction count first priority".
3123 // If we need to emit adds inside the loop to add up base registers, then
3124 // we need at least one extra temporary register.
3125 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
3126 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
3127 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
3128 C1.NumIVMuls, C1.NumBaseAdds,
3129 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
3130 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
3131 C2.NumIVMuls, C2.NumBaseAdds,
3132 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
3133}
3134
3135 bool RISCVTTIImpl::isLegalMaskedExpandLoad(Type *DataTy,
3136 Align Alignment) const {
3137 auto *VTy = dyn_cast<VectorType>(DataTy);
3138 if (!VTy || VTy->isScalableTy())
3139 return false;
3140
3141 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3142 return false;
3143
3144 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
3145 // scalarize these types with LMUL >= maximum fixed-length LMUL.
3146 if (VTy->getElementType()->isIntegerTy(8))
3147 if (VTy->getElementCount().getFixedValue() > 256)
3148 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
3149 ST->getMaxLMULForFixedLengthVectors();
3150 return true;
3151}
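// Worked example (illustrative values, max fixed-length LMUL assumed to be
// 8): a fixed <512 x i8> (4096 bits) has more than 256 elements, so with
// VLEN=128 it spans 4096/128 == 32 registers worth of data, which is not
// below 8, and the type is rejected; with VLEN=1024 it spans only 4 and is
// accepted.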
3152
3153bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy,
3154 Align Alignment) const {
3155 auto *VTy = dyn_cast<VectorType>(DataTy);
3156 if (!VTy || VTy->isScalableTy())
3157 return false;
3158
3159 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3160 return false;
3161 return true;
3162}
3163
3164/// See if \p I should be considered for address type promotion. We check if \p
3165/// I is a sext with the right type used in memory accesses. If it is used in a
3166/// "complex" getelementptr, we allow it to be promoted without finding other
3167/// sext instructions that sign extended the same initial value. A getelementptr
3168/// is considered "complex" if it has more than 2 operands.
3169bool RISCVTTIImpl::shouldConsiderAddressTypePromotion(
3170 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
3171 bool Considerable = false;
3172 AllowPromotionWithoutCommonHeader = false;
3173 if (!isa<SExtInst>(&I))
3174 return false;
3175 Type *ConsideredSExtType =
3176 Type::getInt64Ty(I.getParent()->getParent()->getContext());
3177 if (I.getType() != ConsideredSExtType)
3178 return false;
3179 // See if the sext is the one with the right type and used in at least one
3180 // GetElementPtrInst.
3181 for (const User *U : I.users()) {
3182 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
3183 Considerable = true;
3184 // A getelementptr is considered "complex" if it has more than 2
3185 // operands. We will promote a SExt used in such a complex GEP, as we
3186 // expect some computation to be merged if it is done on 64 bits.
3187 if (GEPInst->getNumOperands() > 2) {
3188 AllowPromotionWithoutCommonHeader = true;
3189 break;
3190 }
3191 }
3192 }
3193 return Considerable;
3194}
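// For example, in
//   %idx.ext = sext i32 %idx to i64
//   %p = getelementptr inbounds [16 x i32], ptr %base, i64 0, i64 %idx.ext
// the GEP has three operands, so the sext may be promoted without finding
// other sext instructions rooted at the same initial value.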
3195
3196bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
3197 switch (Opcode) {
3198 case Instruction::Add:
3199 case Instruction::Sub:
3200 case Instruction::Mul:
3201 case Instruction::And:
3202 case Instruction::Or:
3203 case Instruction::Xor:
3204 case Instruction::FAdd:
3205 case Instruction::FSub:
3206 case Instruction::FMul:
3207 case Instruction::FDiv:
3208 case Instruction::ICmp:
3209 case Instruction::FCmp:
3210 return true;
3211 case Instruction::Shl:
3212 case Instruction::LShr:
3213 case Instruction::AShr:
3214 case Instruction::UDiv:
3215 case Instruction::SDiv:
3216 case Instruction::URem:
3217 case Instruction::SRem:
3218 case Instruction::Select:
3219 return Operand == 1;
3220 default:
3221 return false;
3222 }
3223}
3224
3225bool RISCVTTIImpl::canSplatOperand(Instruction *I, int Operand) const {
3226 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3227 return false;
3228
3229 if (canSplatOperand(I->getOpcode(), Operand))
3230 return true;
3231
3232 auto *II = dyn_cast<IntrinsicInst>(I);
3233 if (!II)
3234 return false;
3235
3236 switch (II->getIntrinsicID()) {
3237 case Intrinsic::fma:
3238 case Intrinsic::vp_fma:
3239 case Intrinsic::fmuladd:
3240 case Intrinsic::vp_fmuladd:
3241 return Operand == 0 || Operand == 1;
3242 case Intrinsic::vp_shl:
3243 case Intrinsic::vp_lshr:
3244 case Intrinsic::vp_ashr:
3245 case Intrinsic::vp_udiv:
3246 case Intrinsic::vp_sdiv:
3247 case Intrinsic::vp_urem:
3248 case Intrinsic::vp_srem:
3249 case Intrinsic::ssub_sat:
3250 case Intrinsic::vp_ssub_sat:
3251 case Intrinsic::usub_sat:
3252 case Intrinsic::vp_usub_sat:
3253 case Intrinsic::vp_select:
3254 return Operand == 1;
3255 // These intrinsics are commutative.
3256 case Intrinsic::vp_add:
3257 case Intrinsic::vp_mul:
3258 case Intrinsic::vp_and:
3259 case Intrinsic::vp_or:
3260 case Intrinsic::vp_xor:
3261 case Intrinsic::vp_fadd:
3262 case Intrinsic::vp_fmul:
3263 case Intrinsic::vp_icmp:
3264 case Intrinsic::vp_fcmp:
3265 case Intrinsic::smin:
3266 case Intrinsic::vp_smin:
3267 case Intrinsic::umin:
3268 case Intrinsic::vp_umin:
3269 case Intrinsic::smax:
3270 case Intrinsic::vp_smax:
3271 case Intrinsic::umax:
3272 case Intrinsic::vp_umax:
3273 case Intrinsic::sadd_sat:
3274 case Intrinsic::vp_sadd_sat:
3275 case Intrinsic::uadd_sat:
3276 case Intrinsic::vp_uadd_sat:
3277 // These intrinsics have 'vr' versions.
3278 case Intrinsic::vp_sub:
3279 case Intrinsic::vp_fsub:
3280 case Intrinsic::vp_fdiv:
3281 return Operand == 0 || Operand == 1;
3282 default:
3283 return false;
3284 }
3285}
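// For example, given
//   %ins = insertelement <4 x i32> poison, i32 %x, i64 0
//   %splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
//   %r = add <4 x i32> %v, %splat
// either operand of the add qualifies, since the add can lower to vadd.vx
// with %x kept in a scalar register.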
3286
3287/// Check if sinking \p I's operands to I's basic block is profitable, because
3288/// the operands can be folded into a target instruction, e.g.
3289/// splats of scalars can fold into vector instructions.
3290bool RISCVTTIImpl::isProfitableToSinkOperands(
3291 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
3292 using namespace llvm::PatternMatch;
3293
3294 if (I->isBitwiseLogicOp()) {
3295 if (!I->getType()->isVectorTy()) {
3296 if (ST->hasStdExtZbb() || ST->hasStdExtZbkb()) {
3297 for (auto &Op : I->operands()) {
3298 // (and/or/xor X, (not Y)) -> (andn/orn/xnor X, Y)
3299 if (match(Op.get(), m_Not(m_Value()))) {
3300 Ops.push_back(&Op);
3301 return true;
3302 }
3303 }
3304 }
3305 } else if (I->getOpcode() == Instruction::And && ST->hasStdExtZvkb()) {
3306 for (auto &Op : I->operands()) {
3307 // (and X, (not Y)) -> (vandn.vv X, Y)
3308 if (match(Op.get(), m_Not(m_Value()))) {
3309 Ops.push_back(&Op);
3310 return true;
3311 }
3312 // (and X, (splat (not Y))) -> (vandn.vx X, Y)
3313 if (match(Op.get(), m_Shuffle(m_InsertElt(m_Value(), m_Not(m_Value()),
3314 m_ZeroInt()),
3315 m_Value(), m_ZeroMask()))) {
3316 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
3317 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
3318 Ops.push_back(&Not);
3319 Ops.push_back(&InsertElt);
3320 Ops.push_back(&Op);
3321 return true;
3322 }
3323 }
3324 }
3325 }
3326
3327 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3328 return false;
3329
3330 // Don't sink splat operands if the target prefers not to. Some targets require
3331 // S2V transfer buffers, and we can run out of them copying the same value
3332 // repeatedly.
3333 // FIXME: It could still be worth doing if it would improve vector register
3334 // pressure and prevent a vector spill.
3335 if (!ST->sinkSplatOperands())
3336 return false;
3337
3338 for (auto OpIdx : enumerate(I->operands())) {
3339 if (!canSplatOperand(I, OpIdx.index()))
3340 continue;
3341
3342 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
3343 // Make sure we are not already sinking this operand
3344 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
3345 continue;
3346
3347 // We are looking for a splat/vp.splat that can be sunk.
3348 bool IsVPSplat = match(Op, m_Intrinsic<Intrinsic::experimental_vp_splat>(
3349 m_Value(), m_Value(), m_Value()));
3350 if (!IsVPSplat &&
3351 !match(Op, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
3352 m_Value(), m_ZeroMask())))
3353 continue;
3354
3355 // Don't sink i1 splats.
3356 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
3357 continue;
3358
3359 // All uses of the shuffle should be sunk to avoid duplicating it across GPR
3360 // and vector registers.
3361 for (Use &U : Op->uses()) {
3362 Instruction *Insn = cast<Instruction>(U.getUser());
3363 if (!canSplatOperand(Insn, U.getOperandNo()))
3364 return false;
3365 }
3366
3367 // Sink any fpexts since they might be used in a widening fp pattern.
3368 if (IsVPSplat) {
3369 if (isa<FPExtInst>(Op->getOperand(0)))
3370 Ops.push_back(&Op->getOperandUse(0));
3371 } else {
3372 Use *InsertEltUse = &Op->getOperandUse(0);
3373 auto *InsertElt = cast<InsertElementInst>(InsertEltUse);
3374 if (isa<FPExtInst>(InsertElt->getOperand(1)))
3375 Ops.push_back(&InsertElt->getOperandUse(1));
3376 Ops.push_back(InsertEltUse);
3377 }
3378 Ops.push_back(&OpIdx.value());
3379 }
3380 return true;
3381}
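// Note: sinking matters here because instruction selection runs one basic
// block at a time; if the insertelement/shufflevector splat (or vp.splat)
// lives in a different block from its user, moving it next to the user lets
// ISel fold the scalar into a .vx/.vf instruction form instead of
// materializing a whole vector register.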
3382
3383TTI::MemCmpExpansionOptions
3384RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3385 TTI::MemCmpExpansionOptions Options;
3386 // TODO: Enable expansion when unaligned access is not supported after we fix
3387 // issues in ExpandMemcmp.
3388 if (!ST->enableUnalignedScalarMem())
3389 return Options;
3390
3391 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
3392 return Options;
3393
3394 Options.AllowOverlappingLoads = true;
3395 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3396 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3397 if (ST->is64Bit()) {
3398 Options.LoadSizes = {8, 4, 2, 1};
3399 Options.AllowedTailExpansions = {3, 5, 6};
3400 } else {
3401 Options.LoadSizes = {4, 2, 1};
3402 Options.AllowedTailExpansions = {3};
3403 }
3404
3405 if (IsZeroCmp && ST->hasVInstructions()) {
3406 unsigned VLenB = ST->getRealMinVLen() / 8;
3407 // The minimum size should be `XLen / 8 + 1`, and the maximum size should be
3408 // `VLenB * MaxLMUL` so that it fits in a single register group.
3409 unsigned MinSize = ST->getXLen() / 8 + 1;
3410 unsigned MaxSize = VLenB * ST->getMaxLMULForFixedLengthVectors();
3411 for (unsigned Size = MinSize; Size <= MaxSize; Size++)
3412 Options.LoadSizes.insert(Options.LoadSizes.begin(), Size);
3413 }
3414 return Options;
3415}
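// Worked example (illustrative values): on RV64 with Zbb and unaligned
// scalar access, LoadSizes starts as {8, 4, 2, 1}. For an equality-with-zero
// compare with V available, VLEN=128 (VLenB == 16) and a maximum
// fixed-length LMUL of 8, sizes 9 through 16 * 8 == 128 are prepended as
// well, so block sizes up to 128 bytes become candidates for expansion.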