1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
18#include "llvm/IR/IntrinsicsRISCV.h"
21#include <cmath>
22#include <optional>
23using namespace llvm;
24using namespace llvm::PatternMatch;
25
26#define DEBUG_TYPE "riscvtti"
27
29 "riscv-v-register-bit-width-lmul",
31 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
32 "by autovectorized code. Fractional LMULs are not supported."),
34
36 "riscv-v-slp-max-vf",
38 "Overrides result used for getMaximumVF query which is used "
39 "exclusively by SLP vectorizer."),
41
42static cl::opt<unsigned>
43 RVVMinTripCount("riscv-v-min-trip-count",
44 cl::desc("Set the lower bound of a trip count to decide on "
45 "vectorization while tail-folding."),
46 cl::init(5), cl::Hidden);
47
48static cl::opt<bool> EnableOrLikeSelectOpt("enable-riscv-or-like-select",
49 cl::init(true), cl::Hidden);
50
51InstructionCost
52RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
53 TTI::TargetCostKind CostKind) const {
54 // Check if the type is valid for all CostKind
55 if (!VT.isVector())
56 return InstructionCost::getInvalid();
57 size_t NumInstr = OpCodes.size();
58 if (CostKind == TTI::TCK_CodeSize)
59 return NumInstr;
60 InstructionCost LMULCost = TLI->getLMULCost(VT);
61 if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
62 return LMULCost * NumInstr;
63 InstructionCost Cost = 0;
64 for (auto Op : OpCodes) {
65 switch (Op) {
66 case RISCV::VRGATHER_VI:
67 Cost += TLI->getVRGatherVICost(VT);
68 break;
69 case RISCV::VRGATHER_VV:
70 Cost += TLI->getVRGatherVVCost(VT);
71 break;
72 case RISCV::VSLIDEUP_VI:
73 case RISCV::VSLIDEDOWN_VI:
74 Cost += TLI->getVSlideVICost(VT);
75 break;
76 case RISCV::VSLIDEUP_VX:
77 case RISCV::VSLIDEDOWN_VX:
78 Cost += TLI->getVSlideVXCost(VT);
79 break;
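    // The unordered reduction opcodes below are costed as a log2(VL)-deep
    // reduction tree, while the ordered VFREDOSUM_VS further down is costed
    // linearly in VL, reflecting its serial accumulation.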
80 case RISCV::VREDMAX_VS:
81 case RISCV::VREDMIN_VS:
82 case RISCV::VREDMAXU_VS:
83 case RISCV::VREDMINU_VS:
84 case RISCV::VREDSUM_VS:
85 case RISCV::VREDAND_VS:
86 case RISCV::VREDOR_VS:
87 case RISCV::VREDXOR_VS:
88 case RISCV::VFREDMAX_VS:
89 case RISCV::VFREDMIN_VS:
90 case RISCV::VFREDUSUM_VS: {
91 unsigned VL = VT.getVectorMinNumElements();
92 if (!VT.isFixedLengthVector())
93 VL *= *getVScaleForTuning();
94 Cost += Log2_32_Ceil(VL);
95 break;
96 }
97 case RISCV::VFREDOSUM_VS: {
98 unsigned VL = VT.getVectorMinNumElements();
99 if (!VT.isFixedLengthVector())
100 VL *= *getVScaleForTuning();
101 Cost += VL;
102 break;
103 }
104 case RISCV::VMV_X_S:
105 case RISCV::VMV_S_X:
106 case RISCV::VFMV_F_S:
107 case RISCV::VFMV_S_F:
108 case RISCV::VMOR_MM:
109 case RISCV::VMXOR_MM:
110 case RISCV::VMAND_MM:
111 case RISCV::VMANDN_MM:
112 case RISCV::VMNAND_MM:
113 case RISCV::VCPOP_M:
114 case RISCV::VFIRST_M:
115 Cost += 1;
116 break;
117 case RISCV::VDIV_VV:
118 case RISCV::VREM_VV:
119 Cost += LMULCost * TTI::TCC_Expensive;
120 break;
121 default:
122 Cost += LMULCost;
123 }
124 }
125 return Cost;
126}
127
128static InstructionCost getIntImmCostImpl(const DataLayout &DL,
129 const RISCVSubtarget *ST,
130 const APInt &Imm, Type *Ty,
131 TTI::TargetCostKind CostKind,
132 bool FreeZeroes) {
133 assert(Ty->isIntegerTy() &&
134 "getIntImmCost can only estimate cost of materialising integers");
135
136 // We have a Zero register, so 0 is always free.
137 if (Imm == 0)
138 return TTI::TCC_Free;
139
140 // Otherwise, we check how many instructions it will take to materialise.
141 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
142 /*CompressionCost=*/false, FreeZeroes);
143}
144
145InstructionCost
146RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
147 TTI::TargetCostKind CostKind) const {
148 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
149}
150
151// Look for patterns of shift followed by AND that can be turned into a pair of
152// shifts. We won't need to materialize an immediate for the AND so these can
153// be considered free.
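// Illustrative example (not from the original source, assuming RV64): for
// (and (shl x, 4), 0xff0), the mask 0xff0 is a shifted mask whose trailing
// zero count equals the shift amount, so the pair can instead be emitted as
//   slli x, x, 56
//   srli x, x, 52
// and no immediate needs to be materialized for the AND.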
154static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
155 uint64_t Mask = Imm.getZExtValue();
156 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
157 if (!BO || !BO->hasOneUse())
158 return false;
159
160 if (BO->getOpcode() != Instruction::Shl)
161 return false;
162
163 if (!isa<ConstantInt>(BO->getOperand(1)))
164 return false;
165
166 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
167 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
168 // is a mask shifted by c2 bits with c3 leading zeros.
169 if (isShiftedMask_64(Mask)) {
170 unsigned Trailing = llvm::countr_zero(Mask);
171 if (ShAmt == Trailing)
172 return true;
173 }
174
175 return false;
176}
177
178// If this is an i64 AND that is part of ((X & -(1 << C1) & 0xffffffff) == (C2 << C1)),
179// DAGCombiner can convert this to (sraiw X, C1) == sext(C2) for RV64. On RV32,
180// the type will be split so only the lower 32 bits need to be compared using
181// (srai/srli X, C) == C2.
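// Illustrative example (not from the original source): for an i64 compare
//   (X & 0xffffff00) == 0x4500
// the mask is -(1 << 8) in the lower 32 bits, so on RV64 this can become
//   sraiw X, 8  followed by a compare against sext(0x45)
// and the 0xffffff00 immediate never has to be materialized.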
182static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm) {
183 if (!Inst->hasOneUse())
184 return false;
185
186 // Look for equality comparison.
187 auto *Cmp = dyn_cast<ICmpInst>(*Inst->user_begin());
188 if (!Cmp || !Cmp->isEquality())
189 return false;
190
191 // Right hand side of comparison should be a constant.
192 auto *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
193 if (!C)
194 return false;
195
196 uint64_t Mask = Imm.getZExtValue();
197
198 // Mask should be of the form -(1 << C) in the lower 32 bits.
199 if (!isUInt<32>(Mask) || !isPowerOf2_32(-uint32_t(Mask)))
200 return false;
201
202 // Comparison constant should be a subset of Mask.
203 uint64_t CmpC = C->getZExtValue();
204 if ((CmpC & Mask) != CmpC)
205 return false;
206
207 // We'll need to sign extend the comparison constant and shift it right. Make
208 // sure the new constant can use addi/xori+seqz/snez.
209 unsigned ShiftBits = llvm::countr_zero(Mask);
210 int64_t NewCmpC = SignExtend64<32>(CmpC) >> ShiftBits;
211 return NewCmpC >= -2048 && NewCmpC <= 2048;
212}
213
215 const APInt &Imm, Type *Ty,
217 Instruction *Inst) const {
218 assert(Ty->isIntegerTy() &&
219 "getIntImmCost can only estimate cost of materialising integers");
220
221 // We have a Zero register, so 0 is always free.
222 if (Imm == 0)
223 return TTI::TCC_Free;
224
225 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
226 // commutative, in others the immediate comes from a specific argument index.
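  // For example, addi/andi/ori/xori all accept a sign-extended 12-bit
  // immediate directly, so such constants never need a separate
  // materialization sequence.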
227 bool Takes12BitImm = false;
228 unsigned ImmArgIdx = ~0U;
229
230 switch (Opcode) {
231 case Instruction::GetElementPtr:
232 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
233 // split up large offsets in GEP into better parts than ConstantHoisting
234 // can.
235 return TTI::TCC_Free;
236 case Instruction::Store: {
237 // Use the materialization cost regardless of whether it's the address or the
238 // value that is constant, except when the store is misaligned and
239 // misaligned accesses are not legal (experience shows constant hoisting
240 // can sometimes be harmful in such cases).
241 if (Idx == 1 || !Inst)
242 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
243 /*FreeZeroes=*/true);
244
245 StoreInst *ST = cast<StoreInst>(Inst);
246 if (!getTLI()->allowsMemoryAccessForAlignment(
247 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
248 ST->getPointerAddressSpace(), ST->getAlign()))
249 return TTI::TCC_Free;
250
251 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
252 /*FreeZeroes=*/true);
253 }
254 case Instruction::Load:
255 // If the address is a constant, use the materialization cost.
256 return getIntImmCost(Imm, Ty, CostKind);
257 case Instruction::And:
258 // zext.h
259 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
260 return TTI::TCC_Free;
261 // zext.w
262 if (Imm == UINT64_C(0xffffffff) &&
263 ((ST->hasStdExtZba() && ST->isRV64()) || ST->isRV32()))
264 return TTI::TCC_Free;
265 // bclri
266 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
267 return TTI::TCC_Free;
268 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
269 canUseShiftPair(Inst, Imm))
270 return TTI::TCC_Free;
271 if (Inst && Idx == 1 && Imm.getBitWidth() == 64 &&
272 canUseShiftCmp(Inst, Imm))
273 return TTI::TCC_Free;
274 Takes12BitImm = true;
275 break;
276 case Instruction::Add:
277 Takes12BitImm = true;
278 break;
279 case Instruction::Or:
280 case Instruction::Xor:
281 // bseti/binvi
282 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
283 return TTI::TCC_Free;
284 Takes12BitImm = true;
285 break;
286 case Instruction::Mul:
287 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
288 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
289 return TTI::TCC_Free;
290 // One more or less than a power of 2 can use SLLI+ADD/SUB.
291 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
292 return TTI::TCC_Free;
293 // FIXME: There is no MULI instruction.
294 Takes12BitImm = true;
295 break;
296 case Instruction::Sub:
297 case Instruction::Shl:
298 case Instruction::LShr:
299 case Instruction::AShr:
300 Takes12BitImm = true;
301 ImmArgIdx = 1;
302 break;
303 default:
304 break;
305 }
306
307 if (Takes12BitImm) {
308 // Check immediate is the correct argument...
309 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
310 // ... and fits into the 12-bit immediate.
311 if (Imm.getSignificantBits() <= 64 &&
312 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
313 return TTI::TCC_Free;
314 }
315 }
316
317 // Otherwise, use the full materialisation cost.
318 return getIntImmCost(Imm, Ty, CostKind);
319 }
320
321 // By default, prevent hoisting.
322 return TTI::TCC_Free;
323}
324
327 const APInt &Imm, Type *Ty,
329 // Prevent hoisting in unknown cases.
330 return TTI::TCC_Free;
331}
332
334 return ST->hasVInstructions();
335}
336
338RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
339 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
340 return ST->hasCPOPLike() ? TTI::PSK_FastHardware : TTI::PSK_Software;
341}
342
344 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
346 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
347 TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
348 if (Opcode == Instruction::FAdd)
349 return InstructionCost::getInvalid();
350
350
351 // zve32x is broken for partial_reduce_umla, but let's make sure we
352 // don't generate them.
353 if (!ST->hasStdExtZvdot4a8i() || ST->getELen() < 64 ||
354 Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
355 InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
356 !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4))
357 return InstructionCost::getInvalid();
358
359 Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
360 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
361 // Note: Assuming all vdota4* variants are equal cost
362 return LT.first *
363 getRISCVInstructionCost(RISCV::VDOTA4_VV, LT.second, CostKind);
364}
365
367 // Currently, the ExpandReductions pass can't expand scalable-vector
368 // reductions, but we still request expansion as RVV doesn't support certain
369 // reductions and the SelectionDAG can't legalize them either.
370 switch (II->getIntrinsicID()) {
371 default:
372 return false;
373 // These reductions have no equivalent in RVV
374 case Intrinsic::vector_reduce_mul:
375 case Intrinsic::vector_reduce_fmul:
376 return true;
377 }
378}
379
380std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
381 if (ST->hasVInstructions())
382 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
383 return BaseT::getMaxVScale();
384}
385
386std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
387 if (ST->hasVInstructions())
388 if (unsigned MinVLen = ST->getRealMinVLen();
389 MinVLen >= RISCV::RVVBitsPerBlock)
390 return MinVLen / RISCV::RVVBitsPerBlock;
391 return BaseT::getVScaleForTuning();
392}
393
394TypeSize
395RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
396 unsigned LMUL =
397 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
398 switch (K) {
399 case TargetTransformInfo::RGK_Scalar:
400 return TypeSize::getFixed(ST->getXLen());
401 case TargetTransformInfo::RGK_FixedWidthVector:
402 return TypeSize::getFixed(
403 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
404 case TargetTransformInfo::RGK_ScalableVector:
405 return TypeSize::getScalable(
406 (ST->hasVInstructions() &&
407 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
408 ? LMUL * RISCV::RVVBitsPerBlock
409 : 0);
410 }
411
412 llvm_unreachable("Unsupported register kind");
413}
414
415InstructionCost RISCVTTIImpl::getStaticDataAddrGenerationCost(
416 const TTI::TargetCostKind CostKind) const {
417 switch (CostKind) {
420 // Always 2 instructions
421 return 2;
422 case TTI::TCK_Latency:
424 // Depending on the memory model the address generation will
425 // require AUIPC + ADDI (medany) or LUI + ADDI (medlow). Don't
426 // have a way of getting this information here, so conservatively
427 // require both.
428 // In practice, these are generally implemented together.
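    // For example (illustrative):
    //   medany: auipc a0, %pcrel_hi(sym) ; addi a0, a0, %pcrel_lo(.Lpcrel_hi0)
    //   medlow: lui   a0, %hi(sym)       ; addi a0, a0, %lo(sym)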
429 return (ST->hasAUIPCADDIFusion() && ST->hasLUIADDIFusion()) ? 1 : 2;
430 }
431 llvm_unreachable("Unsupported cost kind");
432}
433
435RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty,
437 // Add a cost of address generation + the cost of the load. The address
438 // is expected to be a PC relative offset to a constant pool entry
439 // using auipc/addi.
440 return getStaticDataAddrGenerationCost(CostKind) +
441 getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
442 /*AddressSpace=*/0, CostKind);
443}
444
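// Returns true if Mask repeatedly concatenates its leading SubVectorSize
// elements, e.g. <0,1,0,1,0,1,0,1> with SubVectorSize = 2. The identity mask
// (e.g. <0,1,2,3>) is rejected since it is not a concatenation.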
445static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
446 unsigned Size = Mask.size();
447 if (!isPowerOf2_32(Size))
448 return false;
449 for (unsigned I = 0; I != Size; ++I) {
450 if (static_cast<unsigned>(Mask[I]) == I)
451 continue;
452 if (Mask[I] != 0)
453 return false;
454 if (Size % I != 0)
455 return false;
456 for (unsigned J = I + 1; J != Size; ++J)
457 // Check the pattern is repeated.
458 if (static_cast<unsigned>(Mask[J]) != J % I)
459 return false;
460 SubVectorSize = I;
461 return true;
462 }
463 // That means Mask is <0, 1, 2, 3>. This is not a concatenation.
464 return false;
465}
466
468 LLVMContext &C) {
469 assert((DataVT.getScalarSizeInBits() != 8 ||
470 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
471 MVT IndexVT = DataVT.changeTypeToInteger();
472 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
473 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
474 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
475}
476
477/// Attempt to approximate the cost of a shuffle which will require splitting
478/// during legalization. Note that processShuffleMasks is not an exact proxy
479/// for the algorithm used in LegalizeVectorTypes, but hopefully it's a
480/// reasonably close upper bound.
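/// For instance (illustrative), a 16-element shuffle whose legal type holds 8
/// elements is modeled via the per-destination-register sub-shuffles produced
/// by processShuffleMasks, summing one shuffle cost for each non-identity
/// sub-mask.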
481static InstructionCost costShuffleViaSplitting(const RISCVTTIImpl &TTI,
482 MVT LegalVT, VectorType *Tp,
483 ArrayRef<int> Mask,
484 TTI::TargetCostKind CostKind) {
485 assert(LegalVT.isFixedLengthVector() && !Mask.empty() &&
486 "Expected fixed vector type and non-empty mask");
487 unsigned LegalNumElts = LegalVT.getVectorNumElements();
488 // Number of destination vectors after legalization:
489 unsigned NumOfDests = divideCeil(Mask.size(), LegalNumElts);
490 // We are going to permute multiple sources and the result will be in
491 // multiple destinations. Providing an accurate cost only for splits where
492 // the element type remains the same.
493 if (NumOfDests <= 1 ||
495 Tp->getElementType()->getPrimitiveSizeInBits() ||
496 LegalNumElts >= Tp->getElementCount().getFixedValue())
498
499 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
500 unsigned LegalVTSize = LegalVT.getStoreSize();
501 // Number of source vectors after legalization:
502 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
503
504 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(), LegalNumElts);
505
506 unsigned NormalizedVF = LegalNumElts * std::max(NumOfSrcs, NumOfDests);
507 unsigned NumOfSrcRegs = NormalizedVF / LegalNumElts;
508 unsigned NumOfDestRegs = NormalizedVF / LegalNumElts;
509 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
510 assert(NormalizedVF >= Mask.size() &&
511 "Normalized mask expected to be not shorter than original mask.");
512 copy(Mask, NormalizedMask.begin());
513 InstructionCost Cost = 0;
514 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
516 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
517 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
518 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
519 return;
520 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
521 .second)
522 return;
523 Cost += TTI.getShuffleCost(
525 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
526 SingleOpTy, RegMask, CostKind, 0, nullptr);
527 },
528 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
529 Cost += TTI.getShuffleCost(
531 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
532 SingleOpTy, RegMask, CostKind, 0, nullptr);
533 });
534 return Cost;
535}
536
537/// Try to perform better estimation of the permutation.
538/// 1. Split the source/destination vectors into real registers.
539/// 2. Do the mask analysis to identify which real registers are
540/// permuted. If more than one source register is used to build a
541/// destination register, the cost for this destination register
542/// is (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one
543/// source register is used, build the mask and calculate the cost as a
544/// cost of PermuteSingleSrc.
545/// Also, for the single register permute we try to identify if the
546/// destination register is just a copy of the source register or the
547/// copy of the previous destination register (the cost is
548/// TTI::TCC_Basic). If the source register is just reused, the cost for
549/// this operation is 0.
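/// Illustrative example (assuming VLEN=128): a <8 x i32> two-source shuffle
/// splits into two 4-element registers per source; a destination register
/// built from two source registers is costed as one PermuteTwoSrc, one built
/// from a single (non-identity) source as one PermuteSingleSrc, and one that
/// merely copies a source register adds nothing.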
550static InstructionCost
551costShuffleViaVRegSplitting(const RISCVTTIImpl &TTI, MVT LegalVT,
552 std::optional<unsigned> VLen, VectorType *Tp,
553 ArrayRef<int> Mask, TTI::TargetCostKind CostKind) {
554 assert(LegalVT.isFixedLengthVector());
555 if (!VLen || Mask.empty())
557 MVT ElemVT = LegalVT.getVectorElementType();
558 unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
559 LegalVT = TTI.getTypeLegalizationCost(
560 FixedVectorType::get(Tp->getElementType(), ElemsPerVReg))
561 .second;
562 // Number of destination vectors after legalization:
563 InstructionCost NumOfDests =
564 divideCeil(Mask.size(), LegalVT.getVectorNumElements());
565 if (NumOfDests <= 1 ||
567 Tp->getElementType()->getPrimitiveSizeInBits() ||
568 LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue())
570
571 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
572 unsigned LegalVTSize = LegalVT.getStoreSize();
573 // Number of source vectors after legalization:
574 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
575
576 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
577 LegalVT.getVectorNumElements());
578
579 unsigned E = NumOfDests.getValue();
580 unsigned NormalizedVF =
581 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
582 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
583 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
584 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
585 assert(NormalizedVF >= Mask.size() &&
586 "Normalized mask expected to be not shorter than original mask.");
587 copy(Mask, NormalizedMask.begin());
588 InstructionCost Cost = 0;
589 int NumShuffles = 0;
590 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
592 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
593 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
594 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
595 return;
596 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
597 .second)
598 return;
599 ++NumShuffles;
600 Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
601 SingleOpTy, RegMask, CostKind, 0, nullptr);
602 },
603 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
604 Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
605 SingleOpTy, RegMask, CostKind, 0, nullptr);
606 NumShuffles += 2;
607 });
608 // Note: check that we do not emit too many shuffles here to prevent code
609 // size explosion.
610 // TODO: Investigate whether it can be improved by extra analysis of the masks
611 // to check if the code is more profitable.
612 if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) ||
613 (NumOfDestRegs <= 2 && NumShuffles < 4))
614 return Cost;
616}
617
618InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
619 ArrayRef<int> Mask,
621 // Avoid missing masks and length changing shuffles
622 if (Mask.size() <= 2 || Mask.size() != Tp->getNumElements())
624
625 int NumElts = Tp->getNumElements();
626 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
627 // Avoid scalarization cases
628 if (!LT.second.isFixedLengthVector())
630
631 // Requires moving elements between parts, which requires additional
632 // unmodeled instructions.
633 if (LT.first != 1)
635
636 auto GetSlideOpcode = [&](int SlideAmt) {
637 assert(SlideAmt != 0);
638 bool IsVI = isUInt<5>(std::abs(SlideAmt));
639 if (SlideAmt < 0)
640 return IsVI ? RISCV::VSLIDEDOWN_VI : RISCV::VSLIDEDOWN_VX;
641 return IsVI ? RISCV::VSLIDEUP_VI : RISCV::VSLIDEUP_VX;
642 };
643
644 std::array<std::pair<int, int>, 2> SrcInfo;
645 if (!isMaskedSlidePair(Mask, NumElts, SrcInfo))
647
648 if (SrcInfo[1].second == 0)
649 std::swap(SrcInfo[0], SrcInfo[1]);
650
651 InstructionCost FirstSlideCost = 0;
652 if (SrcInfo[0].second != 0) {
653 unsigned Opcode = GetSlideOpcode(SrcInfo[0].second);
654 FirstSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
655 }
656
657 if (SrcInfo[1].first == -1)
658 return FirstSlideCost;
659
660 InstructionCost SecondSlideCost = 0;
661 if (SrcInfo[1].second != 0) {
662 unsigned Opcode = GetSlideOpcode(SrcInfo[1].second);
663 SecondSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
664 } else {
665 SecondSlideCost =
666 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
667 }
668
669 auto EC = Tp->getElementCount();
670 VectorType *MaskTy =
672 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
673 return FirstSlideCost + SecondSlideCost + MaskCost;
674}
675
678 VectorType *SrcTy, ArrayRef<int> Mask,
679 TTI::TargetCostKind CostKind, int Index,
681 const Instruction *CxtI) const {
682 assert((Mask.empty() || DstTy->isScalableTy() ||
683 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
684 "Expected the Mask to match the return size if given");
685 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
686 "Expected the same scalar types");
687
688 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
689
690 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
691 // For now, skip all fixed vector cost analysis when P extension is available
692 // to avoid crashes in getMinRVVVectorSizeInBits()
693 if (ST->hasStdExtP() && isa<FixedVectorType>(SrcTy))
694 return 1;
695
696 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
697
698 // First, handle cases where having a fixed length vector enables us to
699 // give a more accurate cost than falling back to generic scalable codegen.
700 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
701 if (auto *FVTp = dyn_cast<FixedVectorType>(SrcTy);
702 FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) {
703 InstructionCost VRegSplittingCost = costShuffleViaVRegSplitting(
704 *this, LT.second, ST->getRealVLen(),
705 Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind);
706 if (VRegSplittingCost.isValid())
707 return VRegSplittingCost;
708 switch (Kind) {
709 default:
710 break;
712 if (Mask.size() >= 2) {
713 MVT EltTp = LT.second.getVectorElementType();
714 // If the size of the element is < ELEN then shuffles of interleaves and
715 // deinterleaves of 2 vectors can be lowered into the following
716 // sequences
717 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
718 // Example sequence:
719 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
720 // vwaddu.vv v10, v8, v9
721 // li a0, -1 (ignored)
722 // vwmaccu.vx v10, a0, v9
723 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
724 return 2 * LT.first * TLI->getLMULCost(LT.second);
725
726 if (Mask[0] == 0 || Mask[0] == 1) {
727 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
728 // Example sequence:
729 // vnsrl.wi v10, v8, 0
730 if (equal(DeinterleaveMask, Mask))
731 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
732 LT.second, CostKind);
733 }
734 }
735 int SubVectorSize;
736 if (LT.second.getScalarSizeInBits() != 1 &&
737 isRepeatedConcatMask(Mask, SubVectorSize)) {
739 unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
740 // The cost of extraction from a subvector is 0 if the index is 0.
741 for (unsigned I = 0; I != NumSlides; ++I) {
742 unsigned InsertIndex = SubVectorSize * (1 << I);
743 FixedVectorType *SubTp =
744 FixedVectorType::get(SrcTy->getElementType(), InsertIndex);
745 FixedVectorType *DestTp =
747 std::pair<InstructionCost, MVT> DestLT =
749 // Add the cost of whole vector register move because the
750 // destination vector register group for vslideup cannot overlap the
751 // source.
752 Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
753 Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {},
754 CostKind, InsertIndex, SubTp);
755 }
756 return Cost;
757 }
758 }
759
760 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
761 SlideCost.isValid())
762 return SlideCost;
763
764 // vrgather + cost of generating the mask constant.
765 // We model this for an unknown mask with a single vrgather.
766 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
767 LT.second.getVectorNumElements() <= 256)) {
768 VectorType *IdxTy =
769 getVRGatherIndexType(LT.second, *ST, SrcTy->getContext());
770 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
771 return IndexCost +
772 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
773 }
774 break;
775 }
778
779 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
780 SlideCost.isValid())
781 return SlideCost;
782
783 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
784 // register for the second vrgather. We model this for an unknown
785 // (shuffle) mask.
786 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
787 LT.second.getVectorNumElements() <= 256)) {
788 auto &C = SrcTy->getContext();
789 auto EC = SrcTy->getElementCount();
790 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
792 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
793 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
794 return 2 * IndexCost +
795 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
796 LT.second, CostKind) +
797 MaskCost;
798 }
799 break;
800 }
801 }
802
803 auto shouldSplit = [](TTI::ShuffleKind Kind) {
804 switch (Kind) {
805 default:
806 return false;
810 return true;
811 }
812 };
813
814 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
815 shouldSplit(Kind)) {
816 InstructionCost SplitCost =
817 costShuffleViaSplitting(*this, LT.second, FVTp, Mask, CostKind);
818 if (SplitCost.isValid())
819 return SplitCost;
820 }
821 }
822
823 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
824 switch (Kind) {
825 default:
826 // Fallthrough to generic handling.
827 // TODO: Most of these cases will return getInvalid in generic code, and
828 // must be implemented here.
829 break;
831 // Extract at zero is always a subregister extract
832 if (Index == 0)
833 return TTI::TCC_Free;
834
835 // If we're extracting a subvector of at most m1 size at a sub-register
836 // boundary - which unfortunately we need exact vlen to identify - this is
837 // a subregister extract at worst and thus won't require a vslidedown.
838 // TODO: Extend for aligned m2, m4 subvector extracts
839 // TODO: Extend for misaligned (but contained) extracts
840 // TODO: Extend for scalable subvector types
841 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
842 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
843 if (std::optional<unsigned> VLen = ST->getRealVLen();
844 VLen && SubLT.second.getScalarSizeInBits() * Index % *VLen == 0 &&
845 SubLT.second.getSizeInBits() <= *VLen)
846 return TTI::TCC_Free;
847 }
848
849 // Example sequence:
850 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
851 // vslidedown.vi v8, v9, 2
852 return LT.first *
853 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
855 // Example sequence:
856 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
857 // vslideup.vi v8, v9, 2
858 LT = getTypeLegalizationCost(DstTy);
859 return LT.first *
860 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
861 case TTI::SK_Select: {
862 // Example sequence:
863 // li a0, 90
864 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
865 // vmv.s.x v0, a0
866 // vmerge.vvm v8, v9, v8, v0
867 // We use 2 for the cost of the mask materialization as this is the true
868 // cost for small masks and most shuffles are small. At worst, this cost
869 // should be a very small constant for the constant pool load. As such,
870 // we may bias towards large selects slightly more than truly warranted.
871 return LT.first *
872 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
873 LT.second, CostKind));
874 }
875 case TTI::SK_Broadcast: {
876 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
877 Instruction::InsertElement);
878 if (LT.second.getScalarSizeInBits() == 1) {
879 if (HasScalar) {
880 // Example sequence:
881 // andi a0, a0, 1
882 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
883 // vmv.v.x v8, a0
884 // vmsne.vi v0, v8, 0
885 return LT.first *
886 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
887 LT.second, CostKind));
888 }
889 // Example sequence:
890 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
891 // vmv.v.i v8, 0
892 // vmerge.vim v8, v8, 1, v0
893 // vmv.x.s a0, v8
894 // andi a0, a0, 1
895 // vmv.v.x v8, a0
896 // vmsne.vi v0, v8, 0
897
898 return LT.first *
899 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
900 RISCV::VMV_X_S, RISCV::VMV_V_X,
901 RISCV::VMSNE_VI},
902 LT.second, CostKind));
903 }
904
905 if (HasScalar) {
906 // Example sequence:
907 // vmv.v.x v8, a0
908 return LT.first *
909 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
910 }
911
912 // Example sequence:
913 // vrgather.vi v9, v8, 0
914 return LT.first *
915 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
916 }
917 case TTI::SK_Splice: {
918 // vslidedown+vslideup.
919 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
920 // of similar code, but I think we expand through memory.
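    // Example sequence (illustrative, Index = 2, a0 = VL - 2):
    //   vslidedown.vi v8, v8, 2
    //   vslideup.vx   v8, v9, a0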
921 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
922 if (Index >= 0 && Index < 32)
923 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
924 else if (Index < 0 && Index > -32)
925 Opcodes[1] = RISCV::VSLIDEUP_VI;
926 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
927 }
928 case TTI::SK_Reverse: {
929
930 if (!LT.second.isVector())
932
933 // TODO: Cases to improve here:
934 // * Illegal vector types
935 // * i64 on RV32
936 if (SrcTy->getElementType()->isIntegerTy(1)) {
937 VectorType *WideTy =
938 VectorType::get(IntegerType::get(SrcTy->getContext(), 8),
939 cast<VectorType>(SrcTy)->getElementCount());
940 return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy,
942 getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0,
943 nullptr) +
944 getCastInstrCost(Instruction::Trunc, SrcTy, WideTy,
946 }
947
948 MVT ContainerVT = LT.second;
949 if (LT.second.isFixedLengthVector())
950 ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
951 MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
952 if (ContainerVT.bitsLE(M1VT)) {
953 // Example sequence:
954 // csrr a0, vlenb
955 // srli a0, a0, 3
956 // addi a0, a0, -1
957 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
958 // vid.v v9
959 // vrsub.vx v10, v9, a0
960 // vrgather.vv v9, v8, v10
961 InstructionCost LenCost = 3;
962 if (LT.second.isFixedLengthVector())
963 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
964 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
965 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
966 if (LT.second.isFixedLengthVector() &&
967 isInt<5>(LT.second.getVectorNumElements() - 1))
968 Opcodes[1] = RISCV::VRSUB_VI;
969 InstructionCost GatherCost =
970 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
971 return LT.first * (LenCost + GatherCost);
972 }
973
974 // At high LMUL, we split into a series of M1 reverses (see
975 // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate
976 // the resulting gap at the bottom (for fixed vectors only). The important
977 // bit is that the cost scales linearly, not quadratically with LMUL.
978 unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX};
979 InstructionCost FixedCost =
980 getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3;
981 unsigned Ratio =
983 InstructionCost GatherCost =
984 getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio;
985 InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 0 :
986 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind);
987 return FixedCost + LT.first * (GatherCost + SlideCost);
988 }
989 }
990 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
991 SubTp);
992}
993
994static unsigned isM1OrSmaller(MVT VT) {
995 RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
996 return (LMUL == RISCVVType::VLMUL::LMUL_F8 ||
997 LMUL == RISCVVType::VLMUL::LMUL_F4 ||
998 LMUL == RISCVVType::VLMUL::LMUL_F2 ||
999 LMUL == RISCVVType::VLMUL::LMUL_1);
1000}
1001
1003 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
1004 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
1005 TTI::VectorInstrContext VIC) const {
1008
1009 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1010 // For now, skip all fixed vector cost analysis when P extension is available
1011 // to avoid crashes in getMinRVVVectorSizeInBits()
1012 if (ST->hasStdExtP() && isa<FixedVectorType>(Ty)) {
1013 return 1; // Treat as single instruction cost for now
1014 }
1015
1016 // A build_vector (which is m1 sized or smaller) can be done in no
1017 // worse than one vslide1down.vx per element in the type. We could
1018 // in theory do an explode_vector in the inverse manner, but our
1019 // lowering today does not have a first class node for this pattern.
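  // Illustrative example (not from the original source): a 4-element
  // build_vector can be emitted as four vslide1down.vx instructions, one per
  // scalar element, which is the bound costed below.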
1020 InstructionCost Cost = BaseT::getScalarizationOverhead(
1021 Ty, DemandedElts, Insert, Extract, CostKind);
1022 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1023 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
1024 if (Ty->getScalarSizeInBits() == 1) {
1025 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
1026 // Note: Implicit scalar anyextend is assumed to be free since the i1
1027 // must be stored in a GPR.
1028 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
1029 CostKind) +
1030 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
1032 }
1033
1034 assert(LT.second.isFixedLengthVector());
1035 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
1036 if (isM1OrSmaller(ContainerVT)) {
1037 InstructionCost BV =
1038 cast<FixedVectorType>(Ty)->getNumElements() *
1039 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
1040 if (BV < Cost)
1041 Cost = BV;
1042 }
1043 }
1044 return Cost;
1045}
1046
1050 Type *DataTy = MICA.getDataType();
1051 Align Alignment = MICA.getAlignment();
1052 switch (MICA.getID()) {
1053 case Intrinsic::vp_load_ff: {
1054 EVT DataTypeVT = TLI->getValueType(DL, DataTy);
1055 if (!TLI->isLegalFirstFaultLoad(DataTypeVT, Alignment))
1057
1058 unsigned AS = MICA.getAddressSpace();
1059 return getMemoryOpCost(Instruction::Load, DataTy, Alignment, AS, CostKind,
1060 {TTI::OK_AnyValue, TTI::OP_None}, nullptr);
1061 }
1062 case Intrinsic::experimental_vp_strided_load:
1063 case Intrinsic::experimental_vp_strided_store:
1064 return getStridedMemoryOpCost(MICA, CostKind);
1065 case Intrinsic::masked_compressstore:
1066 case Intrinsic::masked_expandload:
1068 case Intrinsic::vp_scatter:
1069 case Intrinsic::vp_gather:
1070 case Intrinsic::masked_scatter:
1071 case Intrinsic::masked_gather:
1072 return getGatherScatterOpCost(MICA, CostKind);
1073 case Intrinsic::vp_load:
1074 case Intrinsic::vp_store:
1075 case Intrinsic::masked_load:
1076 case Intrinsic::masked_store:
1077 return getMaskedMemoryOpCost(MICA, CostKind);
1078 }
1080}
1081
1085 unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load
1086 : Instruction::Store;
1087 Type *Src = MICA.getDataType();
1088 Align Alignment = MICA.getAlignment();
1089 unsigned AddressSpace = MICA.getAddressSpace();
1090
1091 if (!isLegalMaskedLoadStore(Src, Alignment) ||
1094
1095 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
1096}
1097
1099 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1100 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1101 bool UseMaskForCond, bool UseMaskForGaps) const {
1102
1103 // The interleaved memory access pass will lower (de)interleave ops combined
1104 // with an adjacent appropriate memory to vlseg/vsseg intrinsics. vlseg/vsseg
1105 // only support masking per-iteration (i.e. condition), not per-segment (i.e.
1106 // gap).
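  // Illustrative example (factor 2, not from the original source): a
  // de-interleaving load of <8 x i32> can be lowered to
  //   vsetivli zero, 4, e32, m1, ta, ma
  //   vlseg2e32.v v8, (a0)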
1107 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
1108 auto *VTy = cast<VectorType>(VecTy);
1109 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
1110 // Need to make sure the type hasn't been scalarized
1111 if (LT.second.isVector()) {
1112 auto *SubVecTy =
1113 VectorType::get(VTy->getElementType(),
1114 VTy->getElementCount().divideCoefficientBy(Factor));
1115 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
1116 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
1117 AddressSpace, DL)) {
1118
1119 // Some processors optimize segment loads/stores as one wide memory op +
1120 // Factor * LMUL shuffle ops.
1121 if (ST->hasOptimizedSegmentLoadStore(Factor)) {
1123 getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
1124 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
1125 Cost += Factor * TLI->getLMULCost(SubVecVT);
1126 return LT.first * Cost;
1127 }
1128
1129 // Otherwise, the cost is proportional to the number of elements (VL *
1130 // Factor ops).
1131 InstructionCost MemOpCost =
1132 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
1133 CostKind, {TTI::OK_AnyValue, TTI::OP_None});
1134 unsigned NumLoads = getEstimatedVLFor(VTy);
1135 return NumLoads * MemOpCost;
1136 }
1137 }
1138 }
1139
1140 // TODO: Return the cost of interleaved accesses for scalable vector when
1141 // unable to convert to segment accesses instructions.
1142 if (isa<ScalableVectorType>(VecTy))
1144
1145 auto *FVTy = cast<FixedVectorType>(VecTy);
1146 // When gaps are only at the tail, for interleaved load, we can emit a wide
1147 // masked load and shufflevectors. For interleaved store, we can emit
1148 // shufflevectors and a wide masked store. The interleaved memory access pass
1149 // will lower them into vlsseg/vssseg intrinsics.
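  // Illustrative example (not from the original source): factor 3 with
  // indices {0, 1} only accesses the two leading fields, so the gap is
  // entirely at the tail and the access can still use a segment op.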
1150 if (UseMaskForGaps) {
1151 assert(llvm::is_sorted(Indices) && "Indices must be sorted");
1152 assert(llvm::adjacent_find(Indices) == Indices.end() &&
1153 "Indices should not contain duplicate elements");
1154 unsigned NumOfFields = Indices.size();
1155 bool IsTailGapOnly = NumOfFields > 1 && (NumOfFields == Indices.back() + 1);
1156 if (IsTailGapOnly &&
1157 NumOfFields <= TLI->getMaxSupportedInterleaveFactor()) {
1158 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
1159 if (LT.second.isVector() &&
1160 FVTy->getElementCount().isKnownMultipleOf(Factor)) {
1161 auto *SubVecTy = VectorType::get(
1162 FVTy->getElementType(),
1163 FVTy->getElementCount().divideCoefficientBy(Factor));
1164 if (TLI->isLegalInterleavedAccessType(SubVecTy, NumOfFields, Alignment,
1165 AddressSpace, DL)) {
1166 // The cost is proportional to the total number of element accesses.
1167 unsigned NumAccesses = getEstimatedVLFor(FVTy);
1168 return NumAccesses * TTI::TCC_Basic;
1169 }
1170 }
1171 }
1172 }
1173
1174 InstructionCost MemCost =
1175 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
1176 unsigned VF = FVTy->getNumElements() / Factor;
1177
1178 // An interleaved load will look like this for Factor=3:
1179 // %wide.vec = load <12 x i32>, ptr %3, align 4
1180 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1181 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1182 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1183 if (Opcode == Instruction::Load) {
1184 InstructionCost Cost = MemCost;
1185 for (unsigned Index : Indices) {
1186 FixedVectorType *VecTy =
1187 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
1188 auto Mask = createStrideMask(Index, Factor, VF);
1189 Mask.resize(VF * Factor, -1);
1190 InstructionCost ShuffleCost =
1192 Mask, CostKind, 0, nullptr, {});
1193 Cost += ShuffleCost;
1194 }
1195 return Cost;
1196 }
1197
1198 // TODO: Model for NF > 2
1199 // We'll need to enhance getShuffleCost to model shuffles that are just
1200 // inserts and extracts into subvectors, since they won't have the full cost
1201 // of a vrgather.
1202 // An interleaved store for 3 vectors of 4 lanes will look like
1203 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
1204 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
1205 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
1206 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
1207 // store <12 x i32> %interleaved.vec, ptr %10, align 4
1208 if (Factor != 2)
1209 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1210 Alignment, AddressSpace, CostKind,
1211 UseMaskForCond, UseMaskForGaps);
1212
1213 assert(Opcode == Instruction::Store && "Opcode must be a store");
1214 // For an interleaving store of 2 vectors, we perform one large interleaving
1215 // shuffle that goes into the wide store
1216 auto Mask = createInterleaveMask(VF, Factor);
1217 InstructionCost ShuffleCost =
1219 CostKind, 0, nullptr, {});
1220 return MemCost + ShuffleCost;
1221}
1222
1226
1227 bool IsLoad = MICA.getID() == Intrinsic::masked_gather ||
1228 MICA.getID() == Intrinsic::vp_gather;
1229 unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
1230 Type *DataTy = MICA.getDataType();
1231 Align Alignment = MICA.getAlignment();
1234
1235 if ((Opcode == Instruction::Load &&
1236 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
1237 (Opcode == Instruction::Store &&
1238 !isLegalMaskedScatter(DataTy, Align(Alignment))))
1240
1241 // Cost is proportional to the number of memory operations implied. For
1242 // scalable vectors, we use an estimate on that number since we don't
1243 // know exactly what VL will be.
1244 auto &VTy = *cast<VectorType>(DataTy);
1245 unsigned NumLoads = getEstimatedVLFor(&VTy);
1246 return NumLoads * TTI::TCC_Basic;
1247}
1248
1250 const MemIntrinsicCostAttributes &MICA,
1252 unsigned Opcode = MICA.getID() == Intrinsic::masked_expandload
1253 ? Instruction::Load
1254 : Instruction::Store;
1255 Type *DataTy = MICA.getDataType();
1256 bool VariableMask = MICA.getVariableMask();
1257 Align Alignment = MICA.getAlignment();
1258 bool IsLegal = (Opcode == Instruction::Store &&
1259 isLegalMaskedCompressStore(DataTy, Alignment)) ||
1260 (Opcode == Instruction::Load &&
1261 isLegalMaskedExpandLoad(DataTy, Alignment));
1262 if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
1264 // Example compressstore sequence:
1265 // vsetivli zero, 8, e32, m2, ta, ma (ignored)
1266 // vcompress.vm v10, v8, v0
1267 // vcpop.m a1, v0
1268 // vsetvli zero, a1, e32, m2, ta, ma
1269 // vse32.v v10, (a0)
1270 // Example expandload sequence:
1271 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
1272 // vcpop.m a1, v0
1273 // vsetvli zero, a1, e32, m2, ta, ma
1274 // vle32.v v10, (a0)
1275 // vsetivli zero, 8, e32, m2, ta, ma
1276 // viota.m v12, v0
1277 // vrgather.vv v8, v10, v12, v0.t
1278 auto MemOpCost =
1279 getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
1280 auto LT = getTypeLegalizationCost(DataTy);
1281 SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
1282 if (VariableMask)
1283 Opcodes.push_back(RISCV::VCPOP_M);
1284 if (Opcode == Instruction::Store)
1285 Opcodes.append({RISCV::VCOMPRESS_VM});
1286 else
1287 Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
1288 return MemOpCost +
1289 LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1290}
1291
1295
1296 unsigned Opcode = MICA.getID() == Intrinsic::experimental_vp_strided_load
1297 ? Instruction::Load
1298 : Instruction::Store;
1299
1300 Type *DataTy = MICA.getDataType();
1301 Align Alignment = MICA.getAlignment();
1302 const Instruction *I = MICA.getInst();
1303
1304 if (!isLegalStridedLoadStore(DataTy, Alignment))
1306
1308 return TTI::TCC_Basic;
1309
1310 // Cost is proportional to the number of memory operations implied. For
1311 // scalable vectors, we use an estimate on that number since we don't
1312 // know exactly what VL will be.
1313 // FIXME: This will overcost for i64 on rv32 with +zve64x.
1314 auto &VTy = *cast<VectorType>(DataTy);
1315 InstructionCost MemOpCost =
1316 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1317 {TTI::OK_AnyValue, TTI::OP_None}, I);
1318 unsigned NumLoads = getEstimatedVLFor(&VTy);
1319 return NumLoads * MemOpCost;
1320}
1321
1324 // FIXME: This is a property of the default vector convention, not
1325 // all possible calling conventions. Fixing that will require
1326 // some TTI API and SLP rework.
1329 for (auto *Ty : Tys) {
1330 if (!Ty->isVectorTy())
1331 continue;
1332 Align A = DL.getPrefTypeAlign(Ty);
1333 Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
1334 getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
1335 }
1336 return Cost;
1337}
1338
1339// Currently, these represent both throughput and codesize costs
1340// for the respective intrinsics. The costs in this table are simply
1341// instruction counts with the following adjustments made:
1342// * One vsetvli is considered free.
1343static const CostTblEntry VectorIntrinsicCostTable[]{
1344 {Intrinsic::floor, MVT::f32, 9},
1345 {Intrinsic::floor, MVT::f64, 9},
1346 {Intrinsic::ceil, MVT::f32, 9},
1347 {Intrinsic::ceil, MVT::f64, 9},
1348 {Intrinsic::trunc, MVT::f32, 7},
1349 {Intrinsic::trunc, MVT::f64, 7},
1350 {Intrinsic::round, MVT::f32, 9},
1351 {Intrinsic::round, MVT::f64, 9},
1352 {Intrinsic::roundeven, MVT::f32, 9},
1353 {Intrinsic::roundeven, MVT::f64, 9},
1354 {Intrinsic::rint, MVT::f32, 7},
1355 {Intrinsic::rint, MVT::f64, 7},
1356 {Intrinsic::nearbyint, MVT::f32, 9},
1357 {Intrinsic::nearbyint, MVT::f64, 9},
1358 {Intrinsic::bswap, MVT::i16, 3},
1359 {Intrinsic::bswap, MVT::i32, 12},
1360 {Intrinsic::bswap, MVT::i64, 31},
1361 {Intrinsic::vp_bswap, MVT::i16, 3},
1362 {Intrinsic::vp_bswap, MVT::i32, 12},
1363 {Intrinsic::vp_bswap, MVT::i64, 31},
1364 {Intrinsic::vp_fshl, MVT::i8, 7},
1365 {Intrinsic::vp_fshl, MVT::i16, 7},
1366 {Intrinsic::vp_fshl, MVT::i32, 7},
1367 {Intrinsic::vp_fshl, MVT::i64, 7},
1368 {Intrinsic::vp_fshr, MVT::i8, 7},
1369 {Intrinsic::vp_fshr, MVT::i16, 7},
1370 {Intrinsic::vp_fshr, MVT::i32, 7},
1371 {Intrinsic::vp_fshr, MVT::i64, 7},
1372 {Intrinsic::bitreverse, MVT::i8, 17},
1373 {Intrinsic::bitreverse, MVT::i16, 24},
1374 {Intrinsic::bitreverse, MVT::i32, 33},
1375 {Intrinsic::bitreverse, MVT::i64, 52},
1376 {Intrinsic::vp_bitreverse, MVT::i8, 17},
1377 {Intrinsic::vp_bitreverse, MVT::i16, 24},
1378 {Intrinsic::vp_bitreverse, MVT::i32, 33},
1379 {Intrinsic::vp_bitreverse, MVT::i64, 52},
1380 {Intrinsic::ctpop, MVT::i8, 12},
1381 {Intrinsic::ctpop, MVT::i16, 19},
1382 {Intrinsic::ctpop, MVT::i32, 20},
1383 {Intrinsic::ctpop, MVT::i64, 21},
1384 {Intrinsic::ctlz, MVT::i8, 19},
1385 {Intrinsic::ctlz, MVT::i16, 28},
1386 {Intrinsic::ctlz, MVT::i32, 31},
1387 {Intrinsic::ctlz, MVT::i64, 35},
1388 {Intrinsic::cttz, MVT::i8, 16},
1389 {Intrinsic::cttz, MVT::i16, 23},
1390 {Intrinsic::cttz, MVT::i32, 24},
1391 {Intrinsic::cttz, MVT::i64, 25},
1392 {Intrinsic::vp_ctpop, MVT::i8, 12},
1393 {Intrinsic::vp_ctpop, MVT::i16, 19},
1394 {Intrinsic::vp_ctpop, MVT::i32, 20},
1395 {Intrinsic::vp_ctpop, MVT::i64, 21},
1396 {Intrinsic::vp_ctlz, MVT::i8, 19},
1397 {Intrinsic::vp_ctlz, MVT::i16, 28},
1398 {Intrinsic::vp_ctlz, MVT::i32, 31},
1399 {Intrinsic::vp_ctlz, MVT::i64, 35},
1400 {Intrinsic::vp_cttz, MVT::i8, 16},
1401 {Intrinsic::vp_cttz, MVT::i16, 23},
1402 {Intrinsic::vp_cttz, MVT::i32, 24},
1403 {Intrinsic::vp_cttz, MVT::i64, 25},
1404};
1405
1406InstructionCost
1407RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1408 TTI::TargetCostKind CostKind) const {
1409 auto *RetTy = ICA.getReturnType();
1410 switch (ICA.getID()) {
1411 case Intrinsic::lrint:
1412 case Intrinsic::llrint:
1413 case Intrinsic::lround:
1414 case Intrinsic::llround: {
1415 auto LT = getTypeLegalizationCost(RetTy);
1416 Type *SrcTy = ICA.getArgTypes().front();
1417 auto SrcLT = getTypeLegalizationCost(SrcTy);
1418 if (ST->hasVInstructions() && LT.second.isVector()) {
1420 unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType());
1421 unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType());
1422 if (LT.second.getVectorElementType() == MVT::bf16) {
1423 if (!ST->hasVInstructionsBF16Minimal())
1425 if (DstEltSz == 32)
1426 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
1427 else
1428 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
1429 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1430 !ST->hasVInstructionsF16()) {
1431 if (!ST->hasVInstructionsF16Minimal())
1433 if (DstEltSz == 32)
1434 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
1435 else
1436 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};
1437
1438 } else if (SrcEltSz > DstEltSz) {
1439 Ops = {RISCV::VFNCVT_X_F_W};
1440 } else if (SrcEltSz < DstEltSz) {
1441 Ops = {RISCV::VFWCVT_X_F_V};
1442 } else {
1443 Ops = {RISCV::VFCVT_X_F_V};
1444 }
1445
1446 // We need to use the source LMUL in the case of a narrowing op, and the
1447 // destination LMUL otherwise.
1448 if (SrcEltSz > DstEltSz)
1449 return SrcLT.first *
1450 getRISCVInstructionCost(Ops, SrcLT.second, CostKind);
1451 return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind);
1452 }
1453 break;
1454 }
1455 case Intrinsic::ceil:
1456 case Intrinsic::floor:
1457 case Intrinsic::trunc:
1458 case Intrinsic::rint:
1459 case Intrinsic::round:
1460 case Intrinsic::roundeven: {
1461 // These all use the same code.
1462 auto LT = getTypeLegalizationCost(RetTy);
1463 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1464 return LT.first * 8;
1465 break;
1466 }
1467 case Intrinsic::umin:
1468 case Intrinsic::umax:
1469 case Intrinsic::smin:
1470 case Intrinsic::smax: {
1471 auto LT = getTypeLegalizationCost(RetTy);
1472 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
1473 return LT.first;
1474
1475 if (ST->hasVInstructions() && LT.second.isVector()) {
1476 unsigned Op;
1477 switch (ICA.getID()) {
1478 case Intrinsic::umin:
1479 Op = RISCV::VMINU_VV;
1480 break;
1481 case Intrinsic::umax:
1482 Op = RISCV::VMAXU_VV;
1483 break;
1484 case Intrinsic::smin:
1485 Op = RISCV::VMIN_VV;
1486 break;
1487 case Intrinsic::smax:
1488 Op = RISCV::VMAX_VV;
1489 break;
1490 }
1491 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1492 }
1493 break;
1494 }
1495 case Intrinsic::sadd_sat:
1496 case Intrinsic::ssub_sat:
1497 case Intrinsic::uadd_sat:
1498 case Intrinsic::usub_sat: {
1499 auto LT = getTypeLegalizationCost(RetTy);
1500 if (ST->hasVInstructions() && LT.second.isVector()) {
1501 unsigned Op;
1502 switch (ICA.getID()) {
1503 case Intrinsic::sadd_sat:
1504 Op = RISCV::VSADD_VV;
1505 break;
1506 case Intrinsic::ssub_sat:
1507 Op = RISCV::VSSUB_VV;
1508 break;
1509 case Intrinsic::uadd_sat:
1510 Op = RISCV::VSADDU_VV;
1511 break;
1512 case Intrinsic::usub_sat:
1513 Op = RISCV::VSSUBU_VV;
1514 break;
1515 }
1516 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1517 }
1518 break;
1519 }
1520 case Intrinsic::fma:
1521 case Intrinsic::fmuladd: {
1522 // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin
1523 auto LT = getTypeLegalizationCost(RetTy);
1524 if (ST->hasVInstructions() && LT.second.isVector())
1525 return LT.first *
1526 getRISCVInstructionCost(RISCV::VFMADD_VV, LT.second, CostKind);
1527 break;
1528 }
1529 case Intrinsic::fabs: {
1530 auto LT = getTypeLegalizationCost(RetTy);
1531 if (ST->hasVInstructions() && LT.second.isVector()) {
1532 // lui a0, 8
1533 // addi a0, a0, -1
1534 // vsetvli a1, zero, e16, m1, ta, ma
1535 // vand.vx v8, v8, a0
1536 // f16 with zvfhmin and bf16 with zvfbfmin
1537 if (LT.second.getVectorElementType() == MVT::bf16 ||
1538 (LT.second.getVectorElementType() == MVT::f16 &&
1539 !ST->hasVInstructionsF16()))
1540 return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
1541 CostKind) +
1542 2;
1543 else
1544 return LT.first *
1545 getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
1546 }
1547 break;
1548 }
1549 case Intrinsic::sqrt: {
1550 auto LT = getTypeLegalizationCost(RetTy);
1551 if (ST->hasVInstructions() && LT.second.isVector()) {
1554 MVT ConvType = LT.second;
1555 MVT FsqrtType = LT.second;
1556 // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
1557 // will be split.
1558 if (LT.second.getVectorElementType() == MVT::bf16) {
1559 if (LT.second == MVT::nxv32bf16) {
1560 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
1561 RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
1562 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1563 ConvType = MVT::nxv16f16;
1564 FsqrtType = MVT::nxv16f32;
1565 } else {
1566 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
1567 FsqrtOp = {RISCV::VFSQRT_V};
1568 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1569 }
1570 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1571 !ST->hasVInstructionsF16()) {
1572 if (LT.second == MVT::nxv32f16) {
1573 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
1574 RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
1575 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1576 ConvType = MVT::nxv16f16;
1577 FsqrtType = MVT::nxv16f32;
1578 } else {
1579 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
1580 FsqrtOp = {RISCV::VFSQRT_V};
1581 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1582 }
1583 } else {
1584 FsqrtOp = {RISCV::VFSQRT_V};
1585 }
1586
1587 return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) +
1588 getRISCVInstructionCost(ConvOp, ConvType, CostKind));
1589 }
1590 break;
1591 }
1592 case Intrinsic::cttz:
1593 case Intrinsic::ctlz:
1594 case Intrinsic::ctpop: {
1595 auto LT = getTypeLegalizationCost(RetTy);
1596 if (ST->hasStdExtZvbb() && LT.second.isVector()) {
1597 unsigned Op;
1598 switch (ICA.getID()) {
1599 case Intrinsic::cttz:
1600 Op = RISCV::VCTZ_V;
1601 break;
1602 case Intrinsic::ctlz:
1603 Op = RISCV::VCLZ_V;
1604 break;
1605 case Intrinsic::ctpop:
1606 Op = RISCV::VCPOP_V;
1607 break;
1608 }
1609 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1610 }
1611 break;
1612 }
1613 case Intrinsic::abs: {
1614 auto LT = getTypeLegalizationCost(RetTy);
1615 if (ST->hasVInstructions() && LT.second.isVector()) {
1616 // vabs.v v10, v8
1617 if (ST->hasStdExtZvabd())
1618 return LT.first *
1619 getRISCVInstructionCost({RISCV::VABS_V}, LT.second, CostKind);
1620
1621 // vrsub.vi v10, v8, 0
1622 // vmax.vv v8, v8, v10
1623 return LT.first *
1624 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
1625 LT.second, CostKind);
1626 }
1627 break;
1628 }
1629 case Intrinsic::fshl:
1630 case Intrinsic::fshr: {
1631 if (ICA.getArgs().empty())
1632 break;
1633
1634 // Funnel-shifts are ROTL/ROTR when the first and second operand are equal.
1635 // When Zbb/Zbkb is enabled we can use a single ROL(W)/ROR(I)(W)
1636 // instruction.
1637 if ((ST->hasStdExtZbb() || ST->hasStdExtZbkb()) && RetTy->isIntegerTy() &&
1638 ICA.getArgs()[0] == ICA.getArgs()[1] &&
1639 (RetTy->getIntegerBitWidth() == 32 ||
1640 RetTy->getIntegerBitWidth() == 64) &&
1641 RetTy->getIntegerBitWidth() <= ST->getXLen()) {
1642 return 1;
1643 }
1644 break;
1645 }
1646 case Intrinsic::masked_udiv:
1647 return getArithmeticInstrCost(Instruction::UDiv, ICA.getReturnType(),
1648 CostKind);
1649 case Intrinsic::masked_sdiv:
1650 return getArithmeticInstrCost(Instruction::SDiv, ICA.getReturnType(),
1651 CostKind);
1652 case Intrinsic::masked_urem:
1653 return getArithmeticInstrCost(Instruction::URem, ICA.getReturnType(),
1654 CostKind);
1655 case Intrinsic::masked_srem:
1656 return getArithmeticInstrCost(Instruction::SRem, ICA.getReturnType(),
1657 CostKind);
1658 case Intrinsic::get_active_lane_mask: {
1659 if (ST->hasVInstructions()) {
1660 Type *ExpRetTy = VectorType::get(
1661 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1662 auto LT = getTypeLegalizationCost(ExpRetTy);
1663
1664 // vid.v v8 // considered hoisted
1665 // vsaddu.vx v8, v8, a0
1666 // vmsltu.vx v0, v8, a1
1667 return LT.first *
1668 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
1669 LT.second, CostKind);
1670 }
1671 break;
1672 }
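// Illustrative note: for get_active_lane_mask the vid.v is assumed to be
// hoisted out of the loop, so only the vsaddu.vx + vmsltu.vx pair is costed,
// scaled by LT.first when the expanded compare type needs splitting.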
1673 // TODO: add more intrinsics
1674 case Intrinsic::stepvector: {
1675 auto LT = getTypeLegalizationCost(RetTy);
1676 // Legalisation of illegal types involves an `index' instruction plus
1677 // (LT.first - 1) vector adds.
1678 if (ST->hasVInstructions())
1679 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
1680 (LT.first - 1) *
1681 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
1682 return 1 + (LT.first - 1);
1683 }
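// Illustrative note: a stepvector whose type legalizes to LT.first register
// groups is costed as one vid.v plus (LT.first - 1) vadd.vx instructions to
// offset the remaining parts, e.g. LT.first == 2 gives vid.v + one vadd.vx.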
1684 case Intrinsic::vector_splice_left:
1685 case Intrinsic::vector_splice_right: {
1686 auto LT = getTypeLegalizationCost(RetTy);
1687 // Constant offsets fall through to getShuffleCost.
1688 if (!ICA.isTypeBasedOnly() && isa<ConstantInt>(ICA.getArgs()[2]))
1689 break;
1690 if (ST->hasVInstructions() && LT.second.isVector()) {
1691 return LT.first *
1692 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX},
1693 LT.second, CostKind);
1694 }
1695 break;
1696 }
1697 case Intrinsic::experimental_cttz_elts: {
1698 Type *ArgTy = ICA.getArgTypes()[0];
1699 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
1700 if (getTLI()->shouldExpandCttzElements(ArgType))
1701 break;
1702 InstructionCost Cost = getRISCVInstructionCost(
1703 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
1704
1705 // If zero_is_poison is false, then we will generate additional
1706 // cmp + select instructions to convert -1 to EVL.
1707 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
1708 if (ICA.getArgs().size() > 1 &&
1709 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
1710 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
1711 CmpInst::ICMP_EQ, CostKind) +
1712 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
1713 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1714 return Cost;
1715 return Cost;
1716 }
1717 case Intrinsic::experimental_vp_splice: {
1718 // To support type-based query from vectorizer, set the index to 0.
1719 // Note that the index only changes the cost from vslide.vx to vslide.vi,
1720 // and in the current implementation they have the same cost.
1722 cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
1724 }
1725 case Intrinsic::fptoui_sat:
1726 case Intrinsic::fptosi_sat: {
1727 InstructionCost Cost = 0;
1728 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1729 Type *SrcTy = ICA.getArgTypes()[0];
1730
1731 auto SrcLT = getTypeLegalizationCost(SrcTy);
1732 auto DstLT = getTypeLegalizationCost(RetTy);
1733 if (!SrcTy->isVectorTy())
1734 break;
1735
1736 if (!SrcLT.first.isValid() || !DstLT.first.isValid())
1737 return InstructionCost::getInvalid();
1738
1739 Cost +=
1740 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
1741 RetTy, SrcTy, TTI::CastContextHint::None, CostKind);
1742
1743 // Handle NaN.
1744 // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1.
1745 // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0.
1746 Type *CondTy = RetTy->getWithNewBitWidth(1);
1747 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, SrcTy, CondTy,
1748 CmpInst::FCMP_UNO, CostKind);
1749 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1750 CmpInst::FCMP_UNO, CostKind);
1751 return Cost;
1752 }
1753 case Intrinsic::experimental_vector_extract_last_active: {
1754 auto *ValTy = cast<VectorType>(ICA.getArgTypes()[0]);
1755 auto *MaskTy = cast<VectorType>(ICA.getArgTypes()[1]);
1756
1757 auto ValLT = getTypeLegalizationCost(ValTy);
1758 auto MaskLT = getTypeLegalizationCost(MaskTy);
1759
1760 // TODO: Return cheaper cost when the entire lane is inactive.
1761 // The expected asm sequence is:
1762 // vcpop.m a0, v0
1763 // beqz a0, exit # Return passthru when the entire lane is inactive.
1764 // vid v10, v0.t
1765 // vredmaxu.vs v10, v10, v10
1766 // vmv.x.s a0, v10
1767 // zext.b a0, a0
1768 // vslidedown.vx v8, v8, a0
1769 // vmv.x.s a0, v8
1770 // exit:
1771 // ...
1772
1773 // Find a suitable type for a stepvector.
1774 ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64));
1775 unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
1776 TLI->getVectorIdxTy(getDataLayout()), MaskTy->getElementCount(),
1777 /*ZeroIsPoison=*/true, &VScaleRange);
1778 EltWidth = std::max(EltWidth, MaskTy->getScalarSizeInBits());
1779 Type *StepTy = Type::getIntNTy(MaskTy->getContext(), EltWidth);
1780 auto *StepVecTy = VectorType::get(StepTy, ValTy->getElementCount());
1781 auto StepLT = getTypeLegalizationCost(StepVecTy);
1782
1783 // Currently expandVectorFindLastActive cannot handle step vector split.
1784 // So return invalid when the type needs split.
1785 // FIXME: Remove this if expandVectorFindLastActive supports split vector.
1786 if (StepLT.first > 1)
1787 return InstructionCost::getInvalid();
1788
1789 InstructionCost Cost = 0;
1790 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1791
1792 Cost += MaskLT.first *
1793 getRISCVInstructionCost(RISCV::VCPOP_M, MaskLT.second, CostKind);
1794 Cost += getCFInstrCost(Instruction::CondBr, CostKind, nullptr);
1795 Cost += StepLT.first *
1796 getRISCVInstructionCost(Opcodes, StepLT.second, CostKind);
1797 Cost += getCastInstrCost(Instruction::ZExt,
1798 Type::getInt64Ty(ValTy->getContext()), StepTy,
1799 TTI::CastContextHint::None, CostKind);
1800 Cost += ValLT.first *
1801 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VI, RISCV::VMV_X_S},
1802 ValLT.second, CostKind);
1803 return Cost;
1804 }
1805 }
1806
1807 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1808 if (auto LT = getTypeLegalizationCost(RetTy);
1809 LT.second.isVector()) {
1810 MVT EltTy = LT.second.getVectorElementType();
1811 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1812 ICA.getID(), EltTy))
1813 return LT.first * Entry->Cost;
1814 }
1815 }
1816
1817 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1818}
1819
1820InstructionCost
1821RISCVTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
1822 const SCEV *Ptr,
1823 TTI::TargetCostKind CostKind) const {
1824 // Address computations for vector indexed load/store likely require an offset
1825 // and/or scaling.
1826 if (ST->hasVInstructions() && PtrTy->isVectorTy())
1827 return getArithmeticInstrCost(Instruction::Add, PtrTy, CostKind);
1828
1829 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1830}
1831
1832InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1833 Type *Src,
1834 TTI::CastContextHint CCH,
1835 TTI::TargetCostKind CostKind,
1836 const Instruction *I) const {
1837 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1838 if (!IsVectorType)
1839 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1840
1841 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1842 // For now, skip all fixed vector cost analysis when P extension is available
1843 // to avoid crashes in getMinRVVVectorSizeInBits()
1844 if (ST->hasStdExtP() &&
1846 return 1; // Treat as single instruction cost for now
1847 }
1848
1849 // FIXME: Need to compute legalizing cost for illegal types. The current
1850 // code handles only legal types and those which can be trivially
1851 // promoted to legal.
1852 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1853 Dst->getScalarSizeInBits() > ST->getELen())
1854 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1855
1856 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1857 assert(ISD && "Invalid opcode");
1858 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1859 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1860
1861 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1862 // The shared implementation doesn't model vector widening during legalization
1863 // and instead assumes scalarization. In order to scalarize an <N x i1>
1864 // vector, we need to extend/trunc to/from i8. If we don't special case
1865 // this, we can get an infinite recursion cycle.
1866 switch (ISD) {
1867 default:
1868 break;
1869 case ISD::SIGN_EXTEND:
1870 case ISD::ZERO_EXTEND:
1871 if (Src->getScalarSizeInBits() == 1) {
1872 // We do not use vsext/vzext to extend from a mask vector.
1873 // Instead we use the following instructions:
1874 // vmv.v.i v8, 0
1875 // vmerge.vim v8, v8, -1, v0 (repeated per split)
1876 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1877 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1878 DstLT.second, CostKind) +
1879 DstLT.first - 1;
1880 }
1881 break;
1882 case ISD::TRUNCATE:
1883 if (Dst->getScalarSizeInBits() == 1) {
1884 // We do not use a sequence of vncvt instructions to truncate to a mask
1885 // vector, so we cannot use PowDiff to calculate the cost.
1886 // Instead we use the following instructions to truncate to mask vector:
1887 // vand.vi v8, v8, 1
1888 // vmsne.vi v0, v8, 0
1889 return SrcLT.first *
1890 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1891 SrcLT.second, CostKind) +
1892 SrcLT.first - 1;
1893 }
1894 break;
1895 };
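// Illustrative note for the i1 special cases above: zext <vscale x 8 x i1>
// to <vscale x 8 x i32> is costed as vmv.v.i plus one vmerge.vim per split
// part, and trunc to i1 as vand.vi + vmsne.vi per split part, instead of the
// scalarized estimate the shared BasicTTI implementation would produce.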
1896
1897 // Our actual lowering for the case where a wider legal type is available
1898 // uses promotion to the wider type. This is reflected in the result of
1899 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1900 // scalarized if the legalized Src and Dst are not equal sized.
1901 const DataLayout &DL = this->getDataLayout();
1902 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1903 !SrcLT.first.isValid() || !DstLT.first.isValid() ||
1904 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
1905 SrcLT.second.getSizeInBits()) ||
1906 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
1907 DstLT.second.getSizeInBits()) ||
1908 SrcLT.first > 1 || DstLT.first > 1)
1909 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1910
1911 // The split cost is handled by the base getCastInstrCost
1912 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1913
1914 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1915 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
1916 switch (ISD) {
1917 case ISD::SIGN_EXTEND:
1918 case ISD::ZERO_EXTEND: {
1919 if ((PowDiff < 1) || (PowDiff > 3))
1920 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1921 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1922 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1923 unsigned Op =
1924 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1925 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1926 }
1927 case ISD::TRUNCATE:
1928 case ISD::FP_EXTEND:
1929 case ISD::FP_ROUND: {
1930 // Counts of narrow/widen instructions.
1931 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1932 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1933
1934 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1935 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1936 : RISCV::VFNCVT_F_F_W;
1937 InstructionCost Cost = 0;
1938 for (; SrcEltSize != DstEltSize;) {
1939 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1940 ? MVT::getIntegerVT(DstEltSize)
1941 : MVT::getFloatingPointVT(DstEltSize);
1942 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1943 DstEltSize =
1944 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1945 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1946 }
1947 return Cost;
1948 }
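// Illustrative note: the loop above charges one narrowing/widening step per
// halving/doubling of the element size, e.g. truncating i64 elements to i8
// is modelled as three vnsrl.wi steps (i64->i32->i16->i8).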
1949 case ISD::FP_TO_SINT:
1950 case ISD::FP_TO_UINT: {
1951 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1952 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1953 unsigned FWCVT =
1954 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1955 unsigned FNCVT =
1956 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1957 unsigned SrcEltSize = Src->getScalarSizeInBits();
1958 unsigned DstEltSize = Dst->getScalarSizeInBits();
1959 InstructionCost Cost = 0;
1960 if ((SrcEltSize == 16) &&
1961 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1962 // If the target only supports zvfhmin, or this is an fp16-to-i64
1963 // conversion, widen to f32 first and then convert f32 to the integer type.
1964 VectorType *VecF32Ty =
1965 VectorType::get(Type::getFloatTy(Dst->getContext()),
1966 cast<VectorType>(Dst)->getElementCount());
1967 std::pair<InstructionCost, MVT> VecF32LT =
1968 getTypeLegalizationCost(VecF32Ty);
1969 Cost +=
1970 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1971 VecF32LT.second, CostKind);
1972 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1973 return Cost;
1974 }
1975 if (DstEltSize == SrcEltSize)
1976 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1977 else if (DstEltSize > SrcEltSize)
1978 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1979 else { // (SrcEltSize > DstEltSize)
1980 // First do a narrowing conversion to an integer half the size, then
1981 // truncate if needed.
1982 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1983 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1984 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1985 if ((SrcEltSize / 2) > DstEltSize) {
1986 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1987 Cost +=
1988 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1989 }
1990 }
1991 return Cost;
1992 }
1993 case ISD::SINT_TO_FP:
1994 case ISD::UINT_TO_FP: {
1995 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
1996 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
1997 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
1998 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
1999 unsigned SrcEltSize = Src->getScalarSizeInBits();
2000 unsigned DstEltSize = Dst->getScalarSizeInBits();
2001
2002 InstructionCost Cost = 0;
2003 if ((DstEltSize == 16) &&
2004 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
2005 // If the target only supports zvfhmin, or this is an i64-to-fp16
2006 // conversion, the value is converted to f32 first and then narrowed to f16.
2007 VectorType *VecF32Ty =
2008 VectorType::get(Type::getFloatTy(Dst->getContext()),
2009 cast<VectorType>(Dst)->getElementCount());
2010 std::pair<InstructionCost, MVT> VecF32LT =
2011 getTypeLegalizationCost(VecF32Ty);
2012 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
2013 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
2014 DstLT.second, CostKind);
2015 return Cost;
2016 }
2017
2018 if (DstEltSize == SrcEltSize)
2019 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
2020 else if (DstEltSize > SrcEltSize) {
2021 if ((DstEltSize / 2) > SrcEltSize) {
2022 VectorType *VecTy =
2023 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
2024 cast<VectorType>(Dst)->getElementCount());
2025 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
2026 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
2027 }
2028 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
2029 } else
2030 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
2031 return Cost;
2032 }
2033 }
2034 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
2035}
2036
2037unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) const {
2038 if (isa<ScalableVectorType>(Ty)) {
2039 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
2040 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
2041 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
2042 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
2043 }
2044 return cast<FixedVectorType>(Ty)->getNumElements();
2045}
2046
2047InstructionCost
2048RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
2049 FastMathFlags FMF,
2050 TTI::TargetCostKind CostKind) const {
2051 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2052 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
2053
2054 // Skip if scalar size of Ty is bigger than ELEN.
2055 if (Ty->getScalarSizeInBits() > ST->getELen())
2056 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
2057
2058 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2059 if (Ty->getElementType()->isIntegerTy(1)) {
2060 // SelectionDAGBuilder does following transforms:
2061 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
2062 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
2063 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
2064 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
2065 else
2066 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
2067 }
2068
2069 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
2070 SmallVector<unsigned, 3> Opcodes;
2071 InstructionCost ExtraCost = 0;
2072 switch (IID) {
2073 case Intrinsic::maximum:
2074 if (FMF.noNaNs()) {
2075 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
2076 } else {
2077 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
2078 RISCV::VFMV_F_S};
2079 // Cost of canonical NaN + branch
2080 // lui a0, 523264
2081 // fmv.w.x fa0, a0
2082 Type *DstTy = Ty->getScalarType();
2083 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
2084 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
2085 ExtraCost = 1 +
2086 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
2087 TTI::CastContextHint::None, CostKind) +
2088 getCFInstrCost(Instruction::CondBr, CostKind);
2089 }
2090 break;
2091
2092 case Intrinsic::minimum:
2093 if (FMF.noNaNs()) {
2094 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
2095 } else {
2096 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
2097 RISCV::VFMV_F_S};
2098 // Cost of canonical NaN + branch
2099 // lui a0, 523264
2100 // fmv.w.x fa0, a0
2101 Type *DstTy = Ty->getScalarType();
2102 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
2103 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
2104 ExtraCost = 1 +
2105 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
2106 TTI::CastContextHint::None, CostKind) +
2107 getCFInstrCost(Instruction::CondBr, CostKind);
2108 }
2109 break;
2110 }
2111 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2112 }
2113
2114 // An IR reduction is composed of one RVV reduction instruction and a vmv.
2115 unsigned SplitOp;
2116 SmallVector<unsigned, 3> Opcodes;
2117 switch (IID) {
2118 default:
2119 llvm_unreachable("Unsupported intrinsic");
2120 case Intrinsic::smax:
2121 SplitOp = RISCV::VMAX_VV;
2122 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
2123 break;
2124 case Intrinsic::smin:
2125 SplitOp = RISCV::VMIN_VV;
2126 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
2127 break;
2128 case Intrinsic::umax:
2129 SplitOp = RISCV::VMAXU_VV;
2130 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
2131 break;
2132 case Intrinsic::umin:
2133 SplitOp = RISCV::VMINU_VV;
2134 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
2135 break;
2136 case Intrinsic::maxnum:
2137 SplitOp = RISCV::VFMAX_VV;
2138 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
2139 break;
2140 case Intrinsic::minnum:
2141 SplitOp = RISCV::VFMIN_VV;
2142 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
2143 break;
2144 }
2145 // Add a cost for data larger than LMUL8
2146 InstructionCost SplitCost =
2147 (LT.first > 1) ? (LT.first - 1) *
2148 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2149 : 0;
2150 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2151}
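// Illustrative note: for an integer umax reduction whose type legalizes to
// LT.first > 1 register groups, the model charges (LT.first - 1) vmaxu.vv
// splits plus one vredmaxu.vs + vmv.x.s; fmaximum/fminimum without nnan add
// the NaN check (vmfne + vcpop) and the canonical-NaN materialization.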
2152
2153InstructionCost
2154RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2155 std::optional<FastMathFlags> FMF,
2156 TTI::TargetCostKind CostKind) const {
2157 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2158 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2159
2160 // Skip if scalar size of Ty is bigger than ELEN.
2161 if (Ty->getScalarSizeInBits() > ST->getELen())
2162 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2163
2164 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2165 assert(ISD && "Invalid opcode");
2166
2167 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
2168 ISD != ISD::FADD)
2169 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2170
2171 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2172 Type *ElementTy = Ty->getElementType();
2173 if (ElementTy->isIntegerTy(1)) {
2174 // Example sequences:
2175 // vfirst.m a0, v0
2176 // seqz a0, a0
2177 if (LT.second == MVT::v1i1)
2178 return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
2179 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2180 CmpInst::ICMP_EQ, CostKind);
2181
2182 if (ISD == ISD::AND) {
2183 // Example sequences:
2184 // vmand.mm v8, v9, v8 ; needed every time type is split
2185 // vmnot.m v8, v0 ; alias for vmnand
2186 // vcpop.m a0, v8
2187 // seqz a0, a0
2188
2189 // See the discussion: https://github.com/llvm/llvm-project/pull/119160
2190 // For LMUL <= 8, there is no splitting,
2191 // the sequences are vmnot, vcpop and seqz.
2192 // When LMUL > 8 and split = 1,
2193 // the sequences are vmnand, vcpop and seqz.
2194 // When LMUL > 8 and split > 1,
2195 // the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
2196 return ((LT.first > 2) ? (LT.first - 2) : 0) *
2197 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
2198 getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
2199 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2200 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2201 CmpInst::ICMP_EQ, CostKind);
2202 } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
2203 // Example sequences:
2204 // vsetvli a0, zero, e8, mf8, ta, ma
2205 // vmxor.mm v8, v0, v8 ; needed every time type is split
2206 // vcpop.m a0, v8
2207 // andi a0, a0, 1
2208 return (LT.first - 1) *
2209 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
2210 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
2211 } else {
2212 assert(ISD == ISD::OR);
2213 // Example sequences:
2214 // vsetvli a0, zero, e8, mf8, ta, ma
2215 // vmor.mm v8, v9, v8 ; needed every time type is split
2216 // vcpop.m a0, v0
2217 // snez a0, a0
2218 return (LT.first - 1) *
2219 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
2220 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2221 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2222 CmpInst::ICMP_NE, CostKind);
2223 }
2224 }
2225
2226 // An IR reduction of or/and is composed of one vmv and one RVV reduction
2227 // instruction; the other reductions are composed of two vmv and one RVV
2228 // reduction instruction.
2229 unsigned SplitOp;
2230 SmallVector<unsigned, 3> Opcodes;
2231 switch (ISD) {
2232 case ISD::ADD:
2233 SplitOp = RISCV::VADD_VV;
2234 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
2235 break;
2236 case ISD::OR:
2237 SplitOp = RISCV::VOR_VV;
2238 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
2239 break;
2240 case ISD::XOR:
2241 SplitOp = RISCV::VXOR_VV;
2242 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
2243 break;
2244 case ISD::AND:
2245 SplitOp = RISCV::VAND_VV;
2246 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
2247 break;
2248 case ISD::FADD:
2249 // We can't promote f16/bf16 fadd reductions.
2250 if ((LT.second.getScalarType() == MVT::f16 && !ST->hasVInstructionsF16()) ||
2251 LT.second.getScalarType() == MVT::bf16)
2252 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2253 if (TTI::requiresOrderedReduction(FMF)) {
2254 Opcodes.push_back(RISCV::VFMV_S_F);
2255 for (unsigned i = 0; i < LT.first.getValue(); i++)
2256 Opcodes.push_back(RISCV::VFREDOSUM_VS);
2257 Opcodes.push_back(RISCV::VFMV_F_S);
2258 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2259 }
2260 SplitOp = RISCV::VFADD_VV;
2261 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
2262 break;
2263 }
2264 // Add a cost for data larger than LMUL8
2265 InstructionCost SplitCost =
2266 (LT.first > 1) ? (LT.first - 1) *
2267 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2268 : 0;
2269 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2270}
2271
2272InstructionCost RISCVTTIImpl::getExtendedReductionCost(
2273 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
2274 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
2275 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2276 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2277 FMF, CostKind);
2278
2279 // Skip if scalar size of ResTy is bigger than ELEN.
2280 if (ResTy->getScalarSizeInBits() > ST->getELen())
2281 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2282 FMF, CostKind);
2283
2284 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
2285 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2286 FMF, CostKind);
2287
2288 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2289
2290 if (IsUnsigned && Opcode == Instruction::Add &&
2291 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
2292 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2293 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
2294 return LT.first *
2295 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
2296 }
2297
2298 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
2299 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2300 FMF, CostKind);
2301
2302 return (LT.first - 1) +
2303 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2304}
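// Illustrative note: vector_reduce_add(zext <N x i1>) is modelled as a single
// vcpop.m per split part; other extending add/fadd reductions are only costed
// specially when the result is exactly twice the legalized element width,
// matching the widening-reduction lowering, and then cost (LT.first - 1)
// plus the plain reduction on the value type.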
2305
2306InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
2307 TTI::OperandValueInfo OpInfo,
2308 TTI::TargetCostKind CostKind) const {
2309 assert(OpInfo.isConstant() && "non constant operand?");
2310 if (!isa<VectorType>(Ty))
2311 // FIXME: We need to account for immediate materialization here, but doing
2312 // a decent job requires more knowledge about the immediate than we
2313 // currently have here.
2314 return 0;
2315
2316 if (OpInfo.isUniform())
2317 // vmv.v.i, vmv.v.x, or vfmv.v.f
2318 // We ignore the cost of the scalar constant materialization to be consistent
2319 // with how we treat scalar constants themselves just above.
2320 return 1;
2321
2322 return getConstantPoolLoadCost(Ty, CostKind);
2323}
2324
2325InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
2326 Align Alignment,
2327 unsigned AddressSpace,
2328 TTI::TargetCostKind CostKind,
2329 TTI::OperandValueInfo OpInfo,
2330 const Instruction *I) const {
2331 EVT VT = TLI->getValueType(DL, Src, true);
2332 // Type legalization can't handle structs
2333 if (VT == MVT::Other)
2334 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2335 CostKind, OpInfo, I);
2336
2337 InstructionCost Cost = 0;
2338 if (Opcode == Instruction::Store && OpInfo.isConstant())
2339 Cost += getStoreImmCost(Src, OpInfo, CostKind);
2340
2341 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
2342
2343 InstructionCost BaseCost = [&]() {
2344 InstructionCost Cost = LT.first;
2345 if (CostKind != TTI::TCK_RecipThroughput)
2346 return Cost;
2347
2348 // Our actual lowering for the case where a wider legal type is available
2349 // uses a VL-predicated load on the wider type. This is reflected in
2350 // the result of getTypeLegalizationCost, but BasicTTI assumes the
2351 // widened cases are scalarized.
2352 const DataLayout &DL = this->getDataLayout();
2353 if (Src->isVectorTy() && LT.second.isVector() &&
2354 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
2355 LT.second.getSizeInBits()))
2356 return Cost;
2357
2358 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2359 CostKind, OpInfo, I);
2360 }();
2361
2362 // Assume memory ops cost scale with the number of vector registers
2363 // possibly accessed by the instruction. Note that BasicTTI already
2364 // handles the LT.first term for us.
2365 if (ST->hasVInstructions() && LT.second.isVector() &&
2366 CostKind != TTI::TCK_CodeSize)
2367 BaseCost *= TLI->getLMULCost(LT.second);
2368 return Cost + BaseCost;
2369}
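// Illustrative note: the LMUL scaling above means a throughput-cost query for
// a load of a type that legalizes to, e.g., LMUL=4 is charged roughly four
// times an LMUL=1 load, on top of any constant-store materialization cost.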
2370
2371InstructionCost RISCVTTIImpl::getCmpSelInstrCost(
2372 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
2373 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
2374 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
2375 if (CostKind != TTI::TCK_RecipThroughput)
2376 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2377 Op1Info, Op2Info, I);
2378
2379 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2380 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2381 Op1Info, Op2Info, I);
2382
2383 // Skip if scalar size of ValTy is bigger than ELEN.
2384 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
2385 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2386 Op1Info, Op2Info, I);
2387
2388 auto GetConstantMatCost =
2389 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
2390 if (OpInfo.isUniform())
2391 // We return 0 because we currently ignore the cost of materializing
2392 // scalar constants in GPRs.
2393 return 0;
2394
2395 return getConstantPoolLoadCost(ValTy, CostKind);
2396 };
2397
2398 InstructionCost ConstantMatCost;
2399 if (Op1Info.isConstant())
2400 ConstantMatCost += GetConstantMatCost(Op1Info);
2401 if (Op2Info.isConstant())
2402 ConstantMatCost += GetConstantMatCost(Op2Info);
2403
2404 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2405 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
2406 if (CondTy->isVectorTy()) {
2407 if (ValTy->getScalarSizeInBits() == 1) {
2408 // vmandn.mm v8, v8, v9
2409 // vmand.mm v9, v0, v9
2410 // vmor.mm v0, v9, v8
2411 return ConstantMatCost +
2412 LT.first *
2413 getRISCVInstructionCost(
2414 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2415 LT.second, CostKind);
2416 }
2417 // vselect and max/min are supported natively.
2418 return ConstantMatCost +
2419 LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
2420 CostKind);
2421 }
2422
2423 if (ValTy->getScalarSizeInBits() == 1) {
2424 // vmv.v.x v9, a0
2425 // vmsne.vi v9, v9, 0
2426 // vmandn.mm v8, v8, v9
2427 // vmand.mm v9, v0, v9
2428 // vmor.mm v0, v9, v8
2429 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
2430 return ConstantMatCost +
2431 LT.first *
2432 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
2433 InterimVT, CostKind) +
2434 LT.first * getRISCVInstructionCost(
2435 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2436 LT.second, CostKind);
2437 }
2438
2439 // vmv.v.x v10, a0
2440 // vmsne.vi v0, v10, 0
2441 // vmerge.vvm v8, v9, v8, v0
2442 return ConstantMatCost +
2443 LT.first * getRISCVInstructionCost(
2444 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
2445 LT.second, CostKind);
2446 }
2447
2448 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
2449 CmpInst::isIntPredicate(VecPred)) {
2450 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
2451 // provided they incur the same cost across all implementations
2452 return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
2453 LT.second,
2454 CostKind);
2455 }
2456
2457 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
2458 CmpInst::isFPPredicate(VecPred)) {
2459
2460 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
2461 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
2462 return ConstantMatCost +
2463 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
2464
2465 // If we do not support the input floating point vector type, use the base
2466 // one which will calculate as:
2467 // ScalarizeCost + Num * Cost for fixed vector,
2468 // InvalidCost for scalable vector.
2469 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
2470 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
2471 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
2472 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2473 Op1Info, Op2Info, I);
2474
2475 // Assuming vector fp compare and mask instructions are all the same cost
2476 // until a need arises to differentiate them.
2477 switch (VecPred) {
2478 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
2479 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
2480 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
2481 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
2482 return ConstantMatCost +
2483 LT.first * getRISCVInstructionCost(
2484 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
2485 LT.second, CostKind);
2486
2487 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
2488 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
2489 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
2490 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
2491 return ConstantMatCost +
2492 LT.first *
2493 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
2494 LT.second, CostKind);
2495
2496 case CmpInst::FCMP_OEQ: // vmfeq.vv
2497 case CmpInst::FCMP_OGT: // vmflt.vv
2498 case CmpInst::FCMP_OGE: // vmfle.vv
2499 case CmpInst::FCMP_OLT: // vmflt.vv
2500 case CmpInst::FCMP_OLE: // vmfle.vv
2501 case CmpInst::FCMP_UNE: // vmfne.vv
2502 return ConstantMatCost +
2503 LT.first *
2504 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
2505 default:
2506 break;
2507 }
2508 }
2509
2510 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
2511 // instructions will lower to SELECT_CC and then to PseudoCCMOVGPR, which will
2512 // generate a conditional branch + mv. The cost of scalar (icmp + select) will
2513 // be (0 + select instr cost).
2514 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
2515 ValTy->isIntegerTy() && !I->user_empty()) {
2516 if (all_of(I->users(), [&](const User *U) {
2517 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
2518 U->getType()->isIntegerTy() &&
2519 !isa<ConstantData>(U->getOperand(1)) &&
2520 !isa<ConstantData>(U->getOperand(2));
2521 }))
2522 return 0;
2523 }
2524
2525 // TODO: Add cost for scalar type.
2526
2527 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2528 Op1Info, Op2Info, I);
2529}
2530
2531InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
2532 TTI::TargetCostKind CostKind,
2533 const Instruction *I) const {
2534 if (CostKind != TTI::TCK_RecipThroughput)
2535 return Opcode == Instruction::PHI ? 0 : 1;
2536 // Branches are assumed to be predicted.
2537 return 0;
2538}
2539
2540InstructionCost RISCVTTIImpl::getVectorInstrCost(
2541 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
2542 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
2543 assert(Val->isVectorTy() && "This must be a vector type");
2544
2545 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
2546 // For now, skip all fixed vector cost analysis when P extension is available
2547 // to avoid crashes in getMinRVVVectorSizeInBits()
2548 if (ST->hasStdExtP() && isa<FixedVectorType>(Val)) {
2549 return 1; // Treat as single instruction cost for now
2550 }
2551
2552 if (Opcode != Instruction::ExtractElement &&
2553 Opcode != Instruction::InsertElement)
2554 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1,
2555 VIC);
2556
2557 // Legalize the type.
2558 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2559
2560 // This type is legalized to a scalar type.
2561 if (!LT.second.isVector()) {
2562 auto *FixedVecTy = cast<FixedVectorType>(Val);
2563 // If Index is a known constant, cost is zero.
2564 if (Index != -1U)
2565 return 0;
2566 // Extract/InsertElement with non-constant index is very costly when
2567 // scalarized; estimate cost of loads/stores sequence via the stack:
2568 // ExtractElement cost: store vector to stack, load scalar;
2569 // InsertElement cost: store vector to stack, store scalar, load vector.
2570 Type *ElemTy = FixedVecTy->getElementType();
2571 auto NumElems = FixedVecTy->getNumElements();
2572 auto Align = DL.getPrefTypeAlign(ElemTy);
2573 InstructionCost LoadCost =
2574 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
2575 InstructionCost StoreCost =
2576 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
2577 return Opcode == Instruction::ExtractElement
2578 ? StoreCost * NumElems + LoadCost
2579 : (StoreCost + LoadCost) * NumElems + StoreCost;
2580 }
2581
2582 // For unsupported scalable vector.
2583 if (LT.second.isScalableVector() && !LT.first.isValid())
2584 return LT.first;
2585
2586 // Mask vector extract/insert is expanded via e8.
2587 if (Val->getScalarSizeInBits() == 1) {
2588 VectorType *WideTy =
2589 VectorType::get(IntegerType::get(Val->getContext(), 8),
2590 cast<VectorType>(Val)->getElementCount());
2591 if (Opcode == Instruction::ExtractElement) {
2592 InstructionCost ExtendCost
2593 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2594 TTI::CastContextHint::None, CostKind);
2595 InstructionCost ExtractCost
2596 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2597 return ExtendCost + ExtractCost;
2598 }
2599 InstructionCost ExtendCost
2600 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2601 TTI::CastContextHint::None, CostKind);
2602 InstructionCost InsertCost
2603 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2604 InstructionCost TruncCost
2605 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
2606 TTI::CastContextHint::None, CostKind);
2607 return ExtendCost + InsertCost + TruncCost;
2608 }
2609
2610
2611 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
2612 // and vslideup + vmv.s.x to insert element to vector.
2613 unsigned BaseCost = 1;
2614 // For insertelement we also need an addi to add 1 to the index used by vslideup.
2615 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
2616
2617 if (Index != -1U) {
2618 // The type may be split. For fixed-width vectors we can normalize the
2619 // index to the new type.
2620 if (LT.second.isFixedLengthVector()) {
2621 unsigned Width = LT.second.getVectorNumElements();
2622 Index = Index % Width;
2623 }
2624
2625 // If exact VLEN is known, we will insert/extract into the appropriate
2626 // subvector with no additional subvector insert/extract cost.
2627 if (auto VLEN = ST->getRealVLen()) {
2628 unsigned EltSize = LT.second.getScalarSizeInBits();
2629 unsigned M1Max = *VLEN / EltSize;
2630 Index = Index % M1Max;
2631 }
2632
2633 if (Index == 0)
2634 // We can extract/insert the first element without vslidedown/vslideup.
2635 SlideCost = 0;
2636 else if (Opcode == Instruction::InsertElement)
2637 SlideCost = 1; // With a constant index, we do not need to use addi.
2638 }
2639
2640 // When the vector needs to be split into multiple register groups and the
2641 // index exceeds a single vector register group, we need to insert/extract
2642 // the element via the stack.
2643 if (LT.first > 1 &&
2644 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
2645 LT.second.isScalableVector()))) {
2646 Type *ScalarType = Val->getScalarType();
2647 Align VecAlign = DL.getPrefTypeAlign(Val);
2648 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
2649 // Extra addi for unknown index.
2650 InstructionCost IdxCost = Index == -1U ? 1 : 0;
2651
2652 // Store all split vectors into stack and load the target element.
2653 if (Opcode == Instruction::ExtractElement)
2654 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2655 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
2656 CostKind) +
2657 IdxCost;
2658
2659 // Store all split vectors into stack and store the target element and load
2660 // vectors back.
2661 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2662 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2663 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2664 CostKind) +
2665 IdxCost;
2666 }
2667
2668 // Extract i64 in the target that has XLEN=32 need more instruction.
2669 if (Val->getScalarType()->isIntegerTy() &&
2670 ST->getXLen() < Val->getScalarSizeInBits()) {
2671 // For extractelement, we need the following instructions:
2672 // vsetivli zero, 1, e64, m1, ta, mu (not counted)
2673 // vslidedown.vx v8, v8, a0
2674 // vmv.x.s a0, v8
2675 // li a1, 32
2676 // vsrl.vx v8, v8, a1
2677 // vmv.x.s a1, v8
2678
2679 // For insertelement, we need the following instructions:
2680 // vsetivli zero, 2, e32, m4, ta, mu (not counted)
2681 // vmv.v.i v12, 0
2682 // vslide1up.vx v16, v12, a1
2683 // vslide1up.vx v12, v16, a0
2684 // addi a0, a2, 1
2685 // vsetvli zero, a0, e64, m4, tu, mu (not counted)
2686 // vslideup.vx v8, v12, a2
2687
2688 // TODO: should we count these special vsetvlis?
2689 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2690 }
2691 return BaseCost + SlideCost;
2692}
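// Illustrative note: with the model above, extracting element 0 costs just the
// vmv.x.s (SlideCost == 0), a constant-index extract adds one vslidedown, an
// i64 extract on RV32 grows to the longer vsrl/vmv.x.s sequence documented in
// the comments, and split types with out-of-group indices go via the stack.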
2693
2697 unsigned Index) const {
2698 if (isa<FixedVectorType>(Val))
2700 Index);
2701
2702 // TODO: This code replicates what LoopVectorize.cpp used to do when asking
2703 // for the cost of extracting the last lane of a scalable vector. It probably
2704 // needs a more accurate cost.
2705 ElementCount EC = cast<VectorType>(Val)->getElementCount();
2706 assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
2707 return getVectorInstrCost(Opcode, Val, CostKind,
2708 EC.getKnownMinValue() - 1 - Index, nullptr,
2709 nullptr);
2710}
2711
2712/// Check to see if this instruction is expected to be combined to a simpler
2713/// operation during/before lowering. If so return the cost of the combined
2714/// operation rather than provided one. For instance, `udiv i16 %X, 2` is likely
2715/// to be combined to `lshr i16 %X, 1`, so return the cost of a `lshr` rather
2716/// than the cost of a `udiv`
2717std::optional<InstructionCost>
2719 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2720 TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info,
2721 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
2722 // Vector unsigned division/remainder will be simplified to shifts/masks.
2723 if ((Opcode == Instruction::UDiv || Opcode == Instruction::URem) &&
2724 Opd2Info.isConstant() && Opd2Info.isPowerOf2()) {
2725 if (Opcode == Instruction::UDiv)
2726 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Opd1Info,
2727 Opd2Info.getNoProps());
2728 // UREM
2729 return getArithmeticInstrCost(Instruction::And, Ty, CostKind, Opd1Info,
2730 Opd2Info.getNoProps());
2731 }
2732 return std::nullopt;
2733}
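// Illustrative note: under the rule above, udiv <4 x i32> %x, splat(8) is
// costed as a vector shift (lshr) and urem by a power of two as an AND,
// matching the strength reduction performed before/during lowering.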
2734
2735InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
2736 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2737 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2738 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
2739
2740 // TODO: Handle more cost kinds.
2741 if (CostKind != TTI::TCK_RecipThroughput)
2742 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2743 Args, CxtI);
2744
2745 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2746 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2747 Args, CxtI);
2748
2749 // Skip if scalar size of Ty is bigger than ELEN.
2750 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
2751 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2752 Args, CxtI);
2753
2754 if (std::optional<InstructionCost> CombinedCost =
2756 Op2Info, Args, CxtI))
2757 return *CombinedCost;
2758
2759 // Legalize the type.
2760 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2761 unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
2762
2763 // TODO: Handle scalar type.
2764 if (!LT.second.isVector()) {
2765 static const CostTblEntry DivTbl[]{
2766 {ISD::UDIV, MVT::i32, TTI::TCC_Expensive},
2767 {ISD::UDIV, MVT::i64, TTI::TCC_Expensive},
2768 {ISD::SDIV, MVT::i32, TTI::TCC_Expensive},
2769 {ISD::SDIV, MVT::i64, TTI::TCC_Expensive},
2770 {ISD::UREM, MVT::i32, TTI::TCC_Expensive},
2771 {ISD::UREM, MVT::i64, TTI::TCC_Expensive},
2772 {ISD::SREM, MVT::i32, TTI::TCC_Expensive},
2773 {ISD::SREM, MVT::i64, TTI::TCC_Expensive}};
2774 if (TLI->isOperationLegalOrPromote(ISDOpcode, LT.second))
2775 if (const auto *Entry = CostTableLookup(DivTbl, ISDOpcode, LT.second))
2776 return Entry->Cost * LT.first;
2777
2778 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2779 Args, CxtI);
2780 }
2781
2782 // f16 with zvfhmin and bf16 will be promoted to f32.
2783 // FIXME: nxv32[b]f16 will be custom lowered and split.
2784 InstructionCost CastCost = 0;
2785 if ((LT.second.getVectorElementType() == MVT::f16 ||
2786 LT.second.getVectorElementType() == MVT::bf16) &&
2787 TLI->getOperationAction(ISDOpcode, LT.second) ==
2788 TargetLoweringBase::LegalizeAction::Promote) {
2789 MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
2790 Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
2791 Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2792 // Add cost of extending arguments
2793 CastCost += LT.first * Args.size() *
2794 getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
2795 TTI::CastContextHint::None, CostKind);
2796 // Add cost of truncating result
2797 CastCost +=
2798 LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
2799 TTI::CastContextHint::None, CostKind);
2800 // Compute cost of op in promoted type
2801 LT.second = PromotedVT;
2802 }
2803
2804 auto getConstantMatCost =
2805 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
2806 if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
2807 // Two sub-cases:
2808 // * Has a 5 bit immediate operand which can be splatted.
2809 // * Has a larger immediate which must be materialized in scalar register
2810 // We return 0 for both as we currently ignore the cost of materializing
2811 // scalar constants in GPRs.
2812 return 0;
2813
2814 return getConstantPoolLoadCost(Ty, CostKind);
2815 };
2816
2817 // Add the cost of materializing any constant vectors required.
2818 InstructionCost ConstantMatCost = 0;
2819 if (Op1Info.isConstant())
2820 ConstantMatCost += getConstantMatCost(0, Op1Info);
2821 if (Op2Info.isConstant())
2822 ConstantMatCost += getConstantMatCost(1, Op2Info);
2823
2824 unsigned Op;
2825 switch (ISDOpcode) {
2826 case ISD::ADD:
2827 case ISD::SUB:
2828 Op = RISCV::VADD_VV;
2829 break;
2830 case ISD::SHL:
2831 case ISD::SRL:
2832 case ISD::SRA:
2833 Op = RISCV::VSLL_VV;
2834 break;
2835 case ISD::AND:
2836 case ISD::OR:
2837 case ISD::XOR:
2838 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
2839 break;
2840 case ISD::MUL:
2841 case ISD::MULHS:
2842 case ISD::MULHU:
2843 Op = RISCV::VMUL_VV;
2844 break;
2845 case ISD::SDIV:
2846 case ISD::UDIV:
2847 Op = RISCV::VDIV_VV;
2848 break;
2849 case ISD::SREM:
2850 case ISD::UREM:
2851 Op = RISCV::VREM_VV;
2852 break;
2853 case ISD::FADD:
2854 case ISD::FSUB:
2855 Op = RISCV::VFADD_VV;
2856 break;
2857 case ISD::FMUL:
2858 Op = RISCV::VFMUL_VV;
2859 break;
2860 case ISD::FDIV:
2861 Op = RISCV::VFDIV_VV;
2862 break;
2863 case ISD::FNEG:
2864 Op = RISCV::VFSGNJN_VV;
2865 break;
2866 default:
2867 // Assuming all other instructions have the same cost until a need arises to
2868 // differentiate them.
2869 return CastCost + ConstantMatCost +
2870 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2871 Args, CxtI);
2872 }
2873
2874 InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
2875 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
2876 // ops are twice as expensive as integer ops. Do the same for vectors so
2877 // scalar floating point ops aren't cheaper than their vector equivalents.
2878 if (Ty->isFPOrFPVectorTy())
2879 InstrCost *= 2;
2880 return CastCost + ConstantMatCost + LT.first * InstrCost;
2881}
2882
2883// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
2884InstructionCost RISCVTTIImpl::getPointersChainCost(
2885 ArrayRef<const Value *> Ptrs, const Value *Base,
2886 const TTI::PointersChainInfo &Info, Type *AccessTy,
2887 TTI::TargetCostKind CostKind) const {
2888 InstructionCost Cost = TTI::TCC_Free;
2889 // In the basic model we take into account GEP instructions only
2890 // (although an alloca instruction, a value, constants and/or constant
2891 // expressions, PHIs, bitcasts ... anything allowed to be used as a pointer
2892 // can appear here). Typically, if Base is not a GEP instruction and all the
2893 // pointers are relative to the same base address, all the rest are
2894 // either GEP instructions, PHIs, bitcasts or constants. When we have the
2895 // same base, we just calculate the cost of each non-Base GEP as an ADD
2896 // operation if any of their indices is non-constant.
2897 // If there are no known dependencies between the pointers, the cost is
2898 // calculated as the sum of the costs of the GEP instructions.
2899 for (auto [I, V] : enumerate(Ptrs)) {
2900 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
2901 if (!GEP)
2902 continue;
2903 if (Info.isSameBase() && V != Base) {
2904 if (GEP->hasAllConstantIndices())
2905 continue;
2906 // If the chain is unit-stride and BaseReg + stride*i is a legal
2907 // addressing mode, then presume the base GEP is sitting around in a
2908 // register somewhere and check if we can fold the offset relative to
2909 // it.
2910 unsigned Stride = DL.getTypeStoreSize(AccessTy);
2911 if (Info.isUnitStride() &&
2912 isLegalAddressingMode(AccessTy,
2913 /* BaseGV */ nullptr,
2914 /* BaseOffset */ Stride * I,
2915 /* HasBaseReg */ true,
2916 /* Scale */ 0,
2917 GEP->getType()->getPointerAddressSpace()))
2918 continue;
2919 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
2920 {TTI::OK_AnyValue, TTI::OP_None},
2921 {TTI::OK_AnyValue, TTI::OP_None}, {});
2922 } else {
2923 SmallVector<const Value *> Indices(GEP->indices());
2924 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
2925 Indices, AccessTy, CostKind);
2926 }
2927 }
2928 return Cost;
2929}
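// Illustrative note: for a unit-stride chain of GEPs off one base (e.g.
// successive elements of an array), each non-base GEP whose constant offset
// fits a legal reg+imm addressing mode is treated as free; otherwise it is
// charged as a scalar ADD.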
2930
2931void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2932 TTI::UnrollingPreferences &UP,
2933 OptimizationRemarkEmitter *ORE) const {
2934 // TODO: More tuning on benchmarks and metrics; changes would apply, as
2935 // needed, to all of the settings below to improve performance.
2936
2937
2938 if (ST->enableDefaultUnroll())
2939 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2940
2941 // Enable Upper bound unrolling universally, not dependent upon the conditions
2942 // below.
2943 UP.UpperBound = true;
2944
2945 // Disable loop unrolling for Oz and Os.
2946 UP.OptSizeThreshold = 0;
2947 UP.PartialOptSizeThreshold = 0;
2948 if (L->getHeader()->getParent()->hasOptSize())
2949 return;
2950
2951 SmallVector<BasicBlock *, 4> ExitingBlocks;
2952 L->getExitingBlocks(ExitingBlocks);
2953 LLVM_DEBUG(dbgs() << "Loop has:\n"
2954 << "Blocks: " << L->getNumBlocks() << "\n"
2955 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2956
2957 // Only allow one exit other than the latch. This acts as an early exit
2958 // as it mirrors the profitability calculation of the runtime unroller.
2959 if (ExitingBlocks.size() > 2)
2960 return;
2961
2962 // Limit the CFG of the loop body for targets with a branch predictor.
2963 // Allowing 4 blocks permits if-then-else diamonds in the body.
2964 if (L->getNumBlocks() > 4)
2965 return;
2966
2967 // Scan the loop: don't unroll loops with calls as this could prevent
2968 // inlining. Don't unroll auto-vectorized loops either, though do allow
2969 // unrolling of the scalar remainder.
2970 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
2971 InstructionCost Cost = 0;
2972 for (auto *BB : L->getBlocks()) {
2973 for (auto &I : *BB) {
2974 // Both auto-vectorized loops and the scalar remainder have the
2975 // isvectorized attribute, so differentiate between them by the presence
2976 // of vector instructions.
2977 if (IsVectorized && (I.getType()->isVectorTy() ||
2978 llvm::any_of(I.operand_values(), [](Value *V) {
2979 return V->getType()->isVectorTy();
2980 })))
2981 return;
2982
2983 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2984 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2985 if (!isLoweredToCall(F))
2986 continue;
2987 }
2988 return;
2989 }
2990
2991 SmallVector<const Value *> Operands(I.operand_values());
2992 Cost += getInstructionCost(&I, Operands,
2993 TargetTransformInfo::TCK_SizeAndLatency);
2994 }
2995 }
2996
2997 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2998
2999 UP.Partial = true;
3000 UP.Runtime = true;
3001 UP.UnrollRemainder = true;
3002 UP.UnrollAndJam = true;
3003
3004 // Forcing unrolling of small loops can be very useful because of the
3005 // branch-taken cost of the backedge.
3006 if (Cost < 12)
3007 UP.Force = true;
3008}
3009
3010void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
3011 TTI::PeelingPreferences &PP) const {
3012 BaseT::getPeelingPreferences(L, SE, PP);
3013}
3014
3015bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
3016 MemIntrinsicInfo &Info) const {
3017 const DataLayout &DL = getDataLayout();
3018 Intrinsic::ID IID = Inst->getIntrinsicID();
3019 LLVMContext &C = Inst->getContext();
3020 bool HasMask = false;
3021
3022 auto getSegNum = [](const IntrinsicInst *II, unsigned PtrOperandNo,
3023 bool IsWrite) -> int64_t {
3024 if (auto *TarExtTy =
3025 dyn_cast<TargetExtType>(II->getArgOperand(0)->getType()))
3026 return TarExtTy->getIntParameter(0);
3027
3028 return 1;
3029 };
3030
3031 switch (IID) {
3032 case Intrinsic::riscv_vle_mask:
3033 case Intrinsic::riscv_vse_mask:
3034 case Intrinsic::riscv_vlseg2_mask:
3035 case Intrinsic::riscv_vlseg3_mask:
3036 case Intrinsic::riscv_vlseg4_mask:
3037 case Intrinsic::riscv_vlseg5_mask:
3038 case Intrinsic::riscv_vlseg6_mask:
3039 case Intrinsic::riscv_vlseg7_mask:
3040 case Intrinsic::riscv_vlseg8_mask:
3041 case Intrinsic::riscv_vsseg2_mask:
3042 case Intrinsic::riscv_vsseg3_mask:
3043 case Intrinsic::riscv_vsseg4_mask:
3044 case Intrinsic::riscv_vsseg5_mask:
3045 case Intrinsic::riscv_vsseg6_mask:
3046 case Intrinsic::riscv_vsseg7_mask:
3047 case Intrinsic::riscv_vsseg8_mask:
3048 HasMask = true;
3049 [[fallthrough]];
3050 case Intrinsic::riscv_vle:
3051 case Intrinsic::riscv_vse:
3052 case Intrinsic::riscv_vlseg2:
3053 case Intrinsic::riscv_vlseg3:
3054 case Intrinsic::riscv_vlseg4:
3055 case Intrinsic::riscv_vlseg5:
3056 case Intrinsic::riscv_vlseg6:
3057 case Intrinsic::riscv_vlseg7:
3058 case Intrinsic::riscv_vlseg8:
3059 case Intrinsic::riscv_vsseg2:
3060 case Intrinsic::riscv_vsseg3:
3061 case Intrinsic::riscv_vsseg4:
3062 case Intrinsic::riscv_vsseg5:
3063 case Intrinsic::riscv_vsseg6:
3064 case Intrinsic::riscv_vsseg7:
3065 case Intrinsic::riscv_vsseg8: {
3066 // Intrinsic interface:
3067 // riscv_vle(merge, ptr, vl)
3068 // riscv_vle_mask(merge, ptr, mask, vl, policy)
3069 // riscv_vse(val, ptr, vl)
3070 // riscv_vse_mask(val, ptr, mask, vl, policy)
3071 // riscv_vlseg#(merge, ptr, vl, sew)
3072 // riscv_vlseg#_mask(merge, ptr, mask, vl, policy, sew)
3073 // riscv_vsseg#(val, ptr, vl, sew)
3074 // riscv_vsseg#_mask(val, ptr, mask, vl, sew)
3075 bool IsWrite = Inst->getType()->isVoidTy();
3076 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
3077 // The results of segment loads are TargetExtType.
3078 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
3079 unsigned SEW =
3080 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
3081 ->getZExtValue();
3082 Ty = TarExtTy->getTypeParameter(0U);
3083 Ty = ScalableVectorType::get(
3084 IntegerType::get(C, SEW),
3085 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
3086 }
3087 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
3088 unsigned VLIndex = RVVIInfo->VLOperand;
3089 unsigned PtrOperandNo = VLIndex - 1 - HasMask;
3090 MaybeAlign Alignment =
3091 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
3092 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3093 Value *Mask = ConstantInt::getTrue(MaskType);
3094 if (HasMask)
3095 Mask = Inst->getArgOperand(VLIndex - 1);
3096 Value *EVL = Inst->getArgOperand(VLIndex);
3097 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3098 // RVV uses contiguous elements as a segment.
3099 if (SegNum > 1) {
3100 unsigned ElemSize = Ty->getScalarSizeInBits();
3101 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3102 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3103 }
3104 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3105 Alignment, Mask, EVL);
3106 return true;
3107 }
3108 case Intrinsic::riscv_vlse_mask:
3109 case Intrinsic::riscv_vsse_mask:
3110 case Intrinsic::riscv_vlsseg2_mask:
3111 case Intrinsic::riscv_vlsseg3_mask:
3112 case Intrinsic::riscv_vlsseg4_mask:
3113 case Intrinsic::riscv_vlsseg5_mask:
3114 case Intrinsic::riscv_vlsseg6_mask:
3115 case Intrinsic::riscv_vlsseg7_mask:
3116 case Intrinsic::riscv_vlsseg8_mask:
3117 case Intrinsic::riscv_vssseg2_mask:
3118 case Intrinsic::riscv_vssseg3_mask:
3119 case Intrinsic::riscv_vssseg4_mask:
3120 case Intrinsic::riscv_vssseg5_mask:
3121 case Intrinsic::riscv_vssseg6_mask:
3122 case Intrinsic::riscv_vssseg7_mask:
3123 case Intrinsic::riscv_vssseg8_mask:
3124 HasMask = true;
3125 [[fallthrough]];
3126 case Intrinsic::riscv_vlse:
3127 case Intrinsic::riscv_vsse:
3128 case Intrinsic::riscv_vlsseg2:
3129 case Intrinsic::riscv_vlsseg3:
3130 case Intrinsic::riscv_vlsseg4:
3131 case Intrinsic::riscv_vlsseg5:
3132 case Intrinsic::riscv_vlsseg6:
3133 case Intrinsic::riscv_vlsseg7:
3134 case Intrinsic::riscv_vlsseg8:
3135 case Intrinsic::riscv_vssseg2:
3136 case Intrinsic::riscv_vssseg3:
3137 case Intrinsic::riscv_vssseg4:
3138 case Intrinsic::riscv_vssseg5:
3139 case Intrinsic::riscv_vssseg6:
3140 case Intrinsic::riscv_vssseg7:
3141 case Intrinsic::riscv_vssseg8: {
3142 // Intrinsic interface:
3143 // riscv_vlse(merge, ptr, stride, vl)
3144 // riscv_vlse_mask(merge, ptr, stride, mask, vl, policy)
3145 // riscv_vsse(val, ptr, stride, vl)
3146 // riscv_vsse_mask(val, ptr, stride, mask, vl, policy)
3147 // riscv_vlsseg#(merge, ptr, offset, vl, sew)
3148 // riscv_vlsseg#_mask(merge, ptr, offset, mask, vl, policy, sew)
3149 // riscv_vssseg#(val, ptr, offset, vl, sew)
3150 // riscv_vssseg#_mask(val, ptr, offset, mask, vl, sew)
3151 bool IsWrite = Inst->getType()->isVoidTy();
3152 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
3153 // The results of segment loads are TargetExtType.
3154 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
3155 unsigned SEW =
3156 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
3157 ->getZExtValue();
3158 Ty = TarExtTy->getTypeParameter(0U);
3159 Ty = ScalableVectorType::get(
3160 IntegerType::get(C, SEW),
3161 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
3162 }
3163 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
3164 unsigned VLIndex = RVVIInfo->VLOperand;
3165 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
3166 MaybeAlign Alignment =
3167 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
3168
3169 Value *Stride = Inst->getArgOperand(PtrOperandNo + 1);
3170 // Use the pointer alignment as the element alignment if the stride is a
3171 // multiple of the pointer alignment. Otherwise, the element alignment
3172 // should be the greatest common divisor of pointer alignment and stride.
3173 // For simplicity, just treat the elements as unaligned in that case.
3174 unsigned PointerAlign = Alignment.valueOrOne().value();
3175 if (!isa<ConstantInt>(Stride) ||
3176 cast<ConstantInt>(Stride)->getZExtValue() % PointerAlign != 0)
3177 Alignment = Align(1);
3178
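// Illustrative example (editor's sketch, not part of the upstream file): if
// the pointer is 8-byte aligned and the stride is a constant 24, then
// 24 % 8 == 0 and the 8-byte element alignment is kept; with a stride of 12,
// 12 % 8 != 0, so the code above conservatively uses Align(1) instead of
// computing gcd(8, 12) == 4.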
3179 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3180 Value *Mask = ConstantInt::getTrue(MaskType);
3181 if (HasMask)
3182 Mask = Inst->getArgOperand(VLIndex - 1);
3183 Value *EVL = Inst->getArgOperand(VLIndex);
3184 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3185 // RVV uses contiguous elements as a segment.
3186 if (SegNum > 1) {
3187 unsigned ElemSize = Ty->getScalarSizeInBits();
3188 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3189 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3190 }
3191 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3192 Alignment, Mask, EVL, Stride);
3193 return true;
3194 }
3195 case Intrinsic::riscv_vloxei_mask:
3196 case Intrinsic::riscv_vluxei_mask:
3197 case Intrinsic::riscv_vsoxei_mask:
3198 case Intrinsic::riscv_vsuxei_mask:
3199 case Intrinsic::riscv_vloxseg2_mask:
3200 case Intrinsic::riscv_vloxseg3_mask:
3201 case Intrinsic::riscv_vloxseg4_mask:
3202 case Intrinsic::riscv_vloxseg5_mask:
3203 case Intrinsic::riscv_vloxseg6_mask:
3204 case Intrinsic::riscv_vloxseg7_mask:
3205 case Intrinsic::riscv_vloxseg8_mask:
3206 case Intrinsic::riscv_vluxseg2_mask:
3207 case Intrinsic::riscv_vluxseg3_mask:
3208 case Intrinsic::riscv_vluxseg4_mask:
3209 case Intrinsic::riscv_vluxseg5_mask:
3210 case Intrinsic::riscv_vluxseg6_mask:
3211 case Intrinsic::riscv_vluxseg7_mask:
3212 case Intrinsic::riscv_vluxseg8_mask:
3213 case Intrinsic::riscv_vsoxseg2_mask:
3214 case Intrinsic::riscv_vsoxseg3_mask:
3215 case Intrinsic::riscv_vsoxseg4_mask:
3216 case Intrinsic::riscv_vsoxseg5_mask:
3217 case Intrinsic::riscv_vsoxseg6_mask:
3218 case Intrinsic::riscv_vsoxseg7_mask:
3219 case Intrinsic::riscv_vsoxseg8_mask:
3220 case Intrinsic::riscv_vsuxseg2_mask:
3221 case Intrinsic::riscv_vsuxseg3_mask:
3222 case Intrinsic::riscv_vsuxseg4_mask:
3223 case Intrinsic::riscv_vsuxseg5_mask:
3224 case Intrinsic::riscv_vsuxseg6_mask:
3225 case Intrinsic::riscv_vsuxseg7_mask:
3226 case Intrinsic::riscv_vsuxseg8_mask:
3227 HasMask = true;
3228 [[fallthrough]];
3229 case Intrinsic::riscv_vloxei:
3230 case Intrinsic::riscv_vluxei:
3231 case Intrinsic::riscv_vsoxei:
3232 case Intrinsic::riscv_vsuxei:
3233 case Intrinsic::riscv_vloxseg2:
3234 case Intrinsic::riscv_vloxseg3:
3235 case Intrinsic::riscv_vloxseg4:
3236 case Intrinsic::riscv_vloxseg5:
3237 case Intrinsic::riscv_vloxseg6:
3238 case Intrinsic::riscv_vloxseg7:
3239 case Intrinsic::riscv_vloxseg8:
3240 case Intrinsic::riscv_vluxseg2:
3241 case Intrinsic::riscv_vluxseg3:
3242 case Intrinsic::riscv_vluxseg4:
3243 case Intrinsic::riscv_vluxseg5:
3244 case Intrinsic::riscv_vluxseg6:
3245 case Intrinsic::riscv_vluxseg7:
3246 case Intrinsic::riscv_vluxseg8:
3247 case Intrinsic::riscv_vsoxseg2:
3248 case Intrinsic::riscv_vsoxseg3:
3249 case Intrinsic::riscv_vsoxseg4:
3250 case Intrinsic::riscv_vsoxseg5:
3251 case Intrinsic::riscv_vsoxseg6:
3252 case Intrinsic::riscv_vsoxseg7:
3253 case Intrinsic::riscv_vsoxseg8:
3254 case Intrinsic::riscv_vsuxseg2:
3255 case Intrinsic::riscv_vsuxseg3:
3256 case Intrinsic::riscv_vsuxseg4:
3257 case Intrinsic::riscv_vsuxseg5:
3258 case Intrinsic::riscv_vsuxseg6:
3259 case Intrinsic::riscv_vsuxseg7:
3260 case Intrinsic::riscv_vsuxseg8: {
3261 // Intrinsic interface (only listed ordered version):
3262 // riscv_vloxei(merge, ptr, index, vl)
3263 // riscv_vloxei_mask(merge, ptr, index, mask, vl, policy)
3264 // riscv_vsoxei(val, ptr, index, vl)
3265 // riscv_vsoxei_mask(val, ptr, index, mask, vl, policy)
3266 // riscv_vloxseg#(merge, ptr, index, vl, sew)
3267 // riscv_vloxseg#_mask(merge, ptr, index, mask, vl, policy, sew)
3268 // riscv_vsoxseg#(val, ptr, index, vl, sew)
3269 // riscv_vsoxseg#_mask(val, ptr, index, mask, vl, sew)
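// Illustrative example (editor's sketch, derived from the interfaces above;
// not part of the upstream file): riscv_vloxei(merge, ptr, index, vl) has VL
// at operand 3, so PtrOperandNo = 3 - 2 - 0 = 1 and the index vector is
// operand 2 (PtrOperandNo + 1); riscv_vloxei_mask(merge, ptr, index, mask,
// vl, policy) has VL at operand 4 and PtrOperandNo = 4 - 2 - 1 = 1 again.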
3270 bool IsWrite = Inst->getType()->isVoidTy();
3271 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
3272 // The results of segment loads are TargetExtType.
3273 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
3274 unsigned SEW =
3275 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
3276 ->getZExtValue();
3277 Ty = TarExtTy->getTypeParameter(0U);
3278 Ty = ScalableVectorType::get(
3279 IntegerType::get(C, SEW),
3280 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
3281 }
3282 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
3283 unsigned VLIndex = RVVIInfo->VLOperand;
3284 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
3285 Value *Mask;
3286 if (HasMask) {
3287 Mask = Inst->getArgOperand(VLIndex - 1);
3288 } else {
3289 // Mask cannot be nullptr here: vector GEP produces <vscale x N x ptr>,
3290 // and casting that to scalar i64 triggers a vector/scalar mismatch
3291 // assertion in CreatePointerCast. Use an all-true mask so ASan lowers it
3292 // via extractelement instead.
3293 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3294 Mask = ConstantInt::getTrue(MaskType);
3295 }
3296 Value *EVL = Inst->getArgOperand(VLIndex);
3297 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3298 // RVV uses contiguous elements as a segment.
3299 if (SegNum > 1) {
3300 unsigned ElemSize = Ty->getScalarSizeInBits();
3301 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3302 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3303 }
3304 Value *OffsetOp = Inst->getArgOperand(PtrOperandNo + 1);
3305 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3306 Align(1), Mask, EVL,
3307 /* Stride */ nullptr, OffsetOp);
3308 return true;
3309 }
3310 }
3311 return false;
3312}
3313
3314unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) const {
3315 if (Ty->isVectorTy()) {
3316 // f16 with only zvfhmin and bf16 will be promoted to f32
3317 Type *EltTy = cast<VectorType>(Ty)->getElementType();
3318 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
3319 EltTy->isBFloatTy())
3320 Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
3321 cast<VectorType>(Ty));
3322
3323 TypeSize Size = DL.getTypeSizeInBits(Ty);
3324 if (Size.isScalable() && ST->hasVInstructions())
3325 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
3326
3327 if (ST->useRVVForFixedLengthVectors())
3328 return divideCeil(Size, ST->getRealMinVLen());
3329 }
3330
3331 return BaseT::getRegUsageForType(Ty);
3332}
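// Illustrative example (editor's sketch, assuming RISCV::RVVBitsPerBlock is
// 64 and vector instructions are available; not part of the upstream file):
// a <vscale x 8 x i32> value has a known-minimum size of 256 bits, so the
// function above reports divideCeil(256, 64) == 4 registers (an LMUL=4
// value); a <vscale x 4 x bf16> value is first widened to
// <vscale x 4 x float>, giving divideCeil(128, 64) == 2.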
3333
3334unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
3335 if (SLPMaxVF.getNumOccurrences())
3336 return SLPMaxVF;
3337
3338 // Return how many elements can fit in getRegisterBitWidth. This is the
3339 // same routine as used in LoopVectorizer. We should probably be
3340 // accounting for whether we actually have instructions with the right
3341 // lane type, but we don't have enough information to do that without
3342 // some additional plumbing which hasn't been justified yet.
3343 TypeSize RegWidth =
3344 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
3345 // If no vector registers, or absurd element widths, disable
3346 // vectorization by returning 1.
3347 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
3348}
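// Illustrative example (editor's sketch with assumed values; not part of the
// upstream file): with a fixed-width vector register width of 256 bits
// (e.g. VLEN=128 and the default riscv-v-register-bit-width-lmul of 2), a
// query for 32-bit elements yields 256 / 32 == 8; the std::max keeps the
// result at 1 when the element width exceeds the register width.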
3349
3353
3355 return ST->enableUnalignedVectorMem();
3356}
3357
3358TTI::AddressingModeKind
3359RISCVTTIImpl::getPreferredAddressingMode(const Loop *L,
3360 ScalarEvolution *SE) const {
3361 if (ST->hasVendorXCVmem() && !ST->is64Bit())
3362 return TTI::AMK_PostIndexed;
3363
3364 return BaseT::getPreferredAddressingMode(L, SE);
3365}
3366
3367bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
3368 const TargetTransformInfo::LSRCost &C2) const {
3369 // The RISC-V-specific heuristic here is to give instruction count first priority.
3370 // If we need to emit adds inside the loop to add up base registers, then
3371 // we need at least one extra temporary register.
3372 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
3373 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
3374 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
3375 C1.NumIVMuls, C1.NumBaseAdds,
3376 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
3377 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
3378 C2.NumIVMuls, C2.NumBaseAdds,
3379 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
3380}
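// Illustrative example (editor's note, not part of the upstream file): a
// formula with C1.Insns == 3 is preferred over one with C2.Insns == 4
// regardless of their register counts, because Insns is the leading key of
// the lexicographic comparison above; NumBaseAdds additionally charges one
// extra register to any solution that must emit base adds inside the loop.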
3381
3383 Align Alignment) const {
3384 auto *VTy = dyn_cast<VectorType>(DataTy);
3385 if (!VTy || VTy->isScalableTy())
3386 return false;
3387
3388 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3389 return false;
3390
3391 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
3392 // scalarize these types with LMUL >= maximum fixed-length LMUL.
3393 if (VTy->getElementType()->isIntegerTy(8))
3394 if (VTy->getElementCount().getFixedValue() > 256)
3395 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
3396 ST->getMaxLMULForFixedLengthVectors();
3397 return true;
3398}
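// Illustrative example (editor's sketch, assuming getRealMinVLen() == 128 and
// a maximum fixed-length LMUL of 8; not part of the upstream file): a
// <512 x i8> access is 4096 bits, and 4096 / 128 == 32 is not less than 8,
// so the hook above returns false; a <256 x i8> access skips the check
// entirely because its element count does not exceed 256.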
3399
3401 Align Alignment) const {
3402 auto *VTy = dyn_cast<VectorType>(DataTy);
3403 if (!VTy || VTy->isScalableTy())
3404 return false;
3405
3406 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3407 return false;
3408 return true;
3409}
3410
3411/// See if \p I should be considered for address type promotion. We check if \p
3412/// I is a sext with the right type and used in memory accesses. If it is used in a
3413/// "complex" getelementptr, we allow it to be promoted without finding other
3414/// sext instructions that sign extended the same initial value. A getelementptr
3415/// is considered as "complex" if it has more than 2 operands.
3416bool RISCVTTIImpl::shouldConsiderAddressTypePromotion(
3417 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
3418 bool Considerable = false;
3419 AllowPromotionWithoutCommonHeader = false;
3420 if (!isa<SExtInst>(&I))
3421 return false;
3422 Type *ConsideredSExtType =
3423 Type::getInt64Ty(I.getParent()->getParent()->getContext());
3424 if (I.getType() != ConsideredSExtType)
3425 return false;
3426 // See if the sext is the one with the right type and used in at least one
3427 // GetElementPtrInst.
3428 for (const User *U : I.users()) {
3429 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
3430 Considerable = true;
3431 // A getelementptr is considered as "complex" if it has more than 2
3432 // operands. We will promote a SExt used in such complex GEP as we
3433 // expect some computation to be merged if they are done on 64 bits.
3434 if (GEPInst->getNumOperands() > 2) {
3435 AllowPromotionWithoutCommonHeader = true;
3436 break;
3437 }
3438 }
3439 }
3440 return Considerable;
3441}
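// Illustrative example (editor's sketch, not part of the upstream file): for
//   %s = sext i32 %i to i64
//   %p = getelementptr inbounds [16 x i32], ptr %base, i64 0, i64 %s
// the sext has the considered i64 type and its user is a GEP with three
// operands, so the function above returns true and sets
// AllowPromotionWithoutCommonHeader.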
3442
3443bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
3444 switch (Opcode) {
3445 case Instruction::Add:
3446 case Instruction::Sub:
3447 case Instruction::Mul:
3448 case Instruction::And:
3449 case Instruction::Or:
3450 case Instruction::Xor:
3451 case Instruction::FAdd:
3452 case Instruction::FSub:
3453 case Instruction::FMul:
3454 case Instruction::FDiv:
3455 case Instruction::ICmp:
3456 case Instruction::FCmp:
3457 return true;
3458 case Instruction::Shl:
3459 case Instruction::LShr:
3460 case Instruction::AShr:
3461 case Instruction::UDiv:
3462 case Instruction::SDiv:
3463 case Instruction::URem:
3464 case Instruction::SRem:
3465 case Instruction::Select:
3466 return Operand == 1;
3467 default:
3468 return false;
3469 }
3470}
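// Illustrative example (editor's note, not part of the upstream file): for
// Instruction::Shl only operand 1 qualifies, because RVV has vsll.vx/vsll.vi
// forms with a scalar shift amount but none with a scalar value being
// shifted; Instruction::Add accepts a splat on either side since vadd.vx
// exists and addition commutes.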
3471
3472bool RISCVTTIImpl::canSplatOperand(Instruction *I, int Operand) const {
3473 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3474 return false;
3475
3476 if (canSplatOperand(I->getOpcode(), Operand))
3477 return true;
3478
3479 auto *II = dyn_cast<IntrinsicInst>(I);
3480 if (!II)
3481 return false;
3482
3483 switch (II->getIntrinsicID()) {
3484 case Intrinsic::fma:
3485 case Intrinsic::vp_fma:
3486 case Intrinsic::fmuladd:
3487 case Intrinsic::vp_fmuladd:
3488 return Operand == 0 || Operand == 1;
3489 case Intrinsic::vp_shl:
3490 case Intrinsic::vp_lshr:
3491 case Intrinsic::vp_ashr:
3492 case Intrinsic::vp_udiv:
3493 case Intrinsic::vp_sdiv:
3494 case Intrinsic::vp_urem:
3495 case Intrinsic::vp_srem:
3496 case Intrinsic::ssub_sat:
3497 case Intrinsic::vp_ssub_sat:
3498 case Intrinsic::usub_sat:
3499 case Intrinsic::vp_usub_sat:
3500 case Intrinsic::vp_select:
3501 return Operand == 1;
3502 // These intrinsics are commutative.
3503 case Intrinsic::vp_add:
3504 case Intrinsic::vp_mul:
3505 case Intrinsic::vp_and:
3506 case Intrinsic::vp_or:
3507 case Intrinsic::vp_xor:
3508 case Intrinsic::vp_fadd:
3509 case Intrinsic::vp_fmul:
3510 case Intrinsic::vp_icmp:
3511 case Intrinsic::vp_fcmp:
3512 case Intrinsic::smin:
3513 case Intrinsic::vp_smin:
3514 case Intrinsic::umin:
3515 case Intrinsic::vp_umin:
3516 case Intrinsic::smax:
3517 case Intrinsic::vp_smax:
3518 case Intrinsic::umax:
3519 case Intrinsic::vp_umax:
3520 case Intrinsic::sadd_sat:
3521 case Intrinsic::vp_sadd_sat:
3522 case Intrinsic::uadd_sat:
3523 case Intrinsic::vp_uadd_sat:
3524 // These intrinsics have 'vr' versions.
3525 case Intrinsic::vp_sub:
3526 case Intrinsic::vp_fsub:
3527 case Intrinsic::vp_fdiv:
3528 return Operand == 0 || Operand == 1;
3529 default:
3530 return false;
3531 }
3532}
3533
3534/// Check if sinking \p I's operands to I's basic block is profitable, because
3535/// the operands can be folded into a target instruction, e.g.
3536/// splats of scalars can fold into vector instructions.
3537bool RISCVTTIImpl::isProfitableToSinkOperands(
3538 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
3539 using namespace llvm::PatternMatch;
3540
3541 if (I->isBitwiseLogicOp()) {
3542 if (!I->getType()->isVectorTy()) {
3543 if (ST->hasStdExtZbb() || ST->hasStdExtZbkb()) {
3544 for (auto &Op : I->operands()) {
3545 // (and/or/xor X, (not Y)) -> (andn/orn/xnor X, Y)
3546 if (match(Op.get(), m_Not(m_Value()))) {
3547 Ops.push_back(&Op);
3548 return true;
3549 }
3550 }
3551 }
3552 } else if (I->getOpcode() == Instruction::And && ST->hasStdExtZvkb()) {
3553 for (auto &Op : I->operands()) {
3554 // (and X, (not Y)) -> (vandn.vv X, Y)
3555 if (match(Op.get(), m_Not(m_Value()))) {
3556 Ops.push_back(&Op);
3557 return true;
3558 }
3559 // (and X, (splat (not Y))) -> (vandn.vx X, Y)
3561 m_ZeroInt()),
3562 m_Value(), m_ZeroMask()))) {
3563 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
3564 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
3565 Ops.push_back(&Not);
3566 Ops.push_back(&InsertElt);
3567 Ops.push_back(&Op);
3568 return true;
3569 }
3570 }
3571 }
3572 }
3573
3574 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3575 return false;
3576
3577 // Don't sink splat operands if the target prefers not to. Some targets require
3578 // S2V transfer buffers and we can run out of them copying the same value
3579 // repeatedly.
3580 // FIXME: It could still be worth doing if it would improve vector register
3581 // pressure and prevent a vector spill.
3582 if (!ST->sinkSplatOperands())
3583 return false;
3584
3585 for (auto OpIdx : enumerate(I->operands())) {
3586 if (!canSplatOperand(I, OpIdx.index()))
3587 continue;
3588
3589 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
3590 // Make sure we are not already sinking this operand
3591 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
3592 continue;
3593
3594 // We are looking for a splat that can be sunk.
3596 m_Value(), m_ZeroMask())))
3597 continue;
3598
3599 // Don't sink i1 splats.
3600 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
3601 continue;
3602
3603 // All uses of the shuffle should be sunk to avoid duplicating it across GPR
3604 // and vector registers.
3605 for (Use &U : Op->uses()) {
3606 Instruction *Insn = cast<Instruction>(U.getUser());
3607 if (!canSplatOperand(Insn, U.getOperandNo()))
3608 return false;
3609 }
3610
3611 // Sink any fpexts since they might be used in a widening fp pattern.
3612 Use *InsertEltUse = &Op->getOperandUse(0);
3613 auto *InsertElt = cast<InsertElementInst>(InsertEltUse);
3614 if (isa<FPExtInst>(InsertElt->getOperand(1)))
3615 Ops.push_back(&InsertElt->getOperandUse(1));
3616 Ops.push_back(InsertEltUse);
3617 Ops.push_back(&OpIdx.value());
3618 }
3619 return true;
3620}
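// Illustrative example (editor's note, not part of the upstream file): when a
// loop-invariant insertelement + shufflevector splat of a scalar %x feeds
// only a vector add in the loop and every use passes canSplatOperand, the
// insertelement, the shuffle and the use are queued in Ops; sinking them
// next to the add lets instruction selection form vadd.vx %x instead of
// keeping the splatted vector live across the loop.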
3621
3622TTI::MemCmpExpansionOptions
3623RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3624 TTI::MemCmpExpansionOptions Options;
3625 // TODO: Enable expansion when unaligned access is not supported after we fix
3626 // issues in ExpandMemcmp.
3627 if (!ST->enableUnalignedScalarMem())
3628 return Options;
3629
3630 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
3631 return Options;
3632
3633 Options.AllowOverlappingLoads = true;
3634 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3635 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3636 if (ST->is64Bit()) {
3637 Options.LoadSizes = {8, 4, 2, 1};
3638 Options.AllowedTailExpansions = {3, 5, 6};
3639 } else {
3640 Options.LoadSizes = {4, 2, 1};
3641 Options.AllowedTailExpansions = {3};
3642 }
3643
3644 if (IsZeroCmp && ST->hasVInstructions()) {
3645 unsigned VLenB = ST->getRealMinVLen() / 8;
3646 // The minimum size should be `XLen / 8 + 1`, and the maximum size should be
3647 // `VLenB * MaxLMUL` so that it fits in a single register group.
3648 unsigned MinSize = ST->getXLen() / 8 + 1;
3649 unsigned MaxSize = VLenB * ST->getMaxLMULForFixedLengthVectors();
3650 for (unsigned Size = MinSize; Size <= MaxSize; Size++)
3651 Options.LoadSizes.insert(Options.LoadSizes.begin(), Size);
3652 }
3653 return Options;
3654}
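// Illustrative example (editor's sketch, assuming XLen == 64, a real minimum
// VLEN of 128 and a maximum fixed-length LMUL of 8; not part of the upstream
// file): the scalar load sizes are {8, 4, 2, 1}; for equality comparisons the
// vector path additionally allows every size from 64 / 8 + 1 == 9 up to
// 16 * 8 == 128 bytes, so a 32-byte memcmp(a, b) == 0 can be expanded into a
// single vector compare instead of a libcall.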
3655
3656bool RISCVTTIImpl::shouldTreatInstructionLikeSelect(
3657 const Instruction *I) const {
3658 if (EnableOrLikeSelectOpt) {
3659 // For the binary operators (e.g. or) we need to be more careful than
3660 // selects, here we only transform them if they are already at a natural
3661 // break point in the code - the end of a block with an unconditional
3662 // terminator.
3663 if (I->getOpcode() == Instruction::Or &&
3664 isa<UncondBrInst>(I->getNextNode()))
3665 return true;
3666
3667 if (I->getOpcode() == Instruction::Add ||
3668 I->getOpcode() == Instruction::Sub)
3669 return true;
3670 }
3671 return BaseT::shouldTreatInstructionLikeSelect(I);
3672}
3673
3674bool RISCVTTIImpl::shouldCopyAttributeWhenOutliningFrom(
3675 const Function *Caller, const Attribute &Attr) const {
3676 // "interrupt" controls the prolog/epilog of interrupt handlers (and includes
3677 // restrictions on their signatures). We can outline from the bodies of these
3678 // handlers, but when we do we need to make sure we don't mark the outlined
3679 // function as an interrupt handler too.
3680 if (Attr.isStringAttribute() && Attr.getKindAsString() == "interrupt")
3681 return false;
3682
3683 return BaseT::shouldCopyAttributeWhenOutliningFrom(Caller, Attr);
3684}
3685
3686std::optional<Instruction *>
3687RISCVTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
3688 // If all operands of a vmv.v.x are constant, fold a bitcast(vmv.v.x) to scale
3689 // the vmv.v.x, enabling removal of the bitcast. The transform helps avoid
3690 // creating redundant masks.
3691 const DataLayout &DL = IC.getDataLayout();
3692 if (II.user_empty())
3693 return {};
3694 auto *TargetVecTy = dyn_cast<ScalableVectorType>(II.user_back()->getType());
3695 if (!TargetVecTy)
3696 return {};
3697 const APInt *Scalar;
3698 uint64_t VL;
3699 if (!match(&II, m_Intrinsic<Intrinsic::riscv_vmv_v_x>(
3700 m_Poison(), m_APInt(Scalar), m_ConstantInt(VL))) ||
3701 !all_of(II.users(), [TargetVecTy](User *U) {
3702 return U->getType() == TargetVecTy && match(U, m_BitCast(m_Value()));
3703 }))
3704 return {};
3705 auto *SourceVecTy = cast<ScalableVectorType>(II.getType());
3706 unsigned TargetEltBW = DL.getTypeSizeInBits(TargetVecTy->getElementType());
3707 unsigned SourceEltBW = DL.getTypeSizeInBits(SourceVecTy->getElementType());
3708 if (TargetEltBW % SourceEltBW)
3709 return {};
3710 unsigned TargetScale = TargetEltBW / SourceEltBW;
3711 if (VL % TargetScale || TargetScale == 1)
3712 return {};
3713 Type *VLTy = II.getOperand(2)->getType();
3714 ElementCount SourceEC = SourceVecTy->getElementCount();
3715 unsigned NewEltBW = SourceEltBW * TargetScale;
3716 if (!SourceEC.isKnownMultipleOf(TargetScale) ||
3717 !DL.fitsInLegalInteger(NewEltBW))
3718 return {};
3719 auto *NewEltTy = IntegerType::get(II.getContext(), NewEltBW);
3720 if (!TLI->isLegalElementTypeForRVV(TLI->getValueType(DL, NewEltTy)))
3721 return {};
3722 ElementCount NewEC = SourceEC.divideCoefficientBy(TargetScale);
3723 Type *RetTy = VectorType::get(NewEltTy, NewEC);
3724 assert(SourceVecTy->canLosslesslyBitCastTo(RetTy) &&
3725 "Lossless bitcast between types expected");
3726 APInt NewScalar = APInt::getSplat(NewEltBW, *Scalar);
3727 return IC.replaceInstUsesWith(
3728 II,
3729 IC.Builder.CreateBitCast(
3730 IC.Builder.CreateIntrinsic(
3731 RetTy, Intrinsic::riscv_vmv_v_x,
3732 {PoisonValue::get(RetTy), ConstantInt::get(NewEltTy, NewScalar),
3733 ConstantInt::get(VLTy, VL / TargetScale)}),
3734 SourceVecTy));
3735}
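// Illustrative example (editor's sketch, assuming an RV64 target with the V
// extension where i64 is a legal integer and vector element type; not part of
// the upstream file): a riscv_vmv_v_x that splats 1 into <vscale x 4 x i32>
// with VL == 8 and is only used by bitcasts to <vscale x 2 x i64> has
// TargetScale == 64 / 32 == 2; the combine above rebuilds it as a
// <vscale x 2 x i64> riscv_vmv_v_x splatting
// APInt::getSplat(64, 1) == 0x0000000100000001 with VL == 4, then bitcasts
// the result back to <vscale x 4 x i32>.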