LLVM 23.0.0git
RISCVTargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
18#include "llvm/IR/IntrinsicsRISCV.h"
21#include <cmath>
22#include <optional>
23using namespace llvm;
24using namespace llvm::PatternMatch;
25
26#define DEBUG_TYPE "riscvtti"
27
29 "riscv-v-register-bit-width-lmul",
31 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
32 "by autovectorized code. Fractional LMULs are not supported."),
34
36 "riscv-v-slp-max-vf",
38 "Overrides result used for getMaximumVF query which is used "
39 "exclusively by SLP vectorizer."),
41
43 RVVMinTripCount("riscv-v-min-trip-count",
44 cl::desc("Set the lower bound of a trip count to decide on "
45 "vectorization while tail-folding."),
47
48static cl::opt<bool> EnableOrLikeSelectOpt("enable-riscv-or-like-select",
49 cl::init(true), cl::Hidden);
50
52RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
54 // Check if the type is valid for all CostKind
55 if (!VT.isVector())
57 size_t NumInstr = OpCodes.size();
59 return NumInstr;
60 InstructionCost LMULCost = TLI->getLMULCost(VT);
62 return LMULCost * NumInstr;
63 InstructionCost Cost = 0;
64 for (auto Op : OpCodes) {
65 switch (Op) {
66 case RISCV::VRGATHER_VI:
67 Cost += TLI->getVRGatherVICost(VT);
68 break;
69 case RISCV::VRGATHER_VV:
70 Cost += TLI->getVRGatherVVCost(VT);
71 break;
72 case RISCV::VSLIDEUP_VI:
73 case RISCV::VSLIDEDOWN_VI:
74 Cost += TLI->getVSlideVICost(VT);
75 break;
76 case RISCV::VSLIDEUP_VX:
77 case RISCV::VSLIDEDOWN_VX:
78 Cost += TLI->getVSlideVXCost(VT);
79 break;
80 case RISCV::VREDMAX_VS:
81 case RISCV::VREDMIN_VS:
82 case RISCV::VREDMAXU_VS:
83 case RISCV::VREDMINU_VS:
84 case RISCV::VREDSUM_VS:
85 case RISCV::VREDAND_VS:
86 case RISCV::VREDOR_VS:
87 case RISCV::VREDXOR_VS:
88 case RISCV::VFREDMAX_VS:
89 case RISCV::VFREDMIN_VS:
90 case RISCV::VFREDUSUM_VS: {
91 unsigned VL = VT.getVectorMinNumElements();
92 if (!VT.isFixedLengthVector())
93 VL *= *getVScaleForTuning();
94 Cost += Log2_32_Ceil(VL);
95 break;
96 }
97 case RISCV::VFREDOSUM_VS: {
98 unsigned VL = VT.getVectorMinNumElements();
99 if (!VT.isFixedLengthVector())
100 VL *= *getVScaleForTuning();
101 Cost += VL;
102 break;
103 }
104 case RISCV::VMV_X_S:
105 case RISCV::VMV_S_X:
106 case RISCV::VFMV_F_S:
107 case RISCV::VFMV_S_F:
108 case RISCV::VMOR_MM:
109 case RISCV::VMXOR_MM:
110 case RISCV::VMAND_MM:
111 case RISCV::VMANDN_MM:
112 case RISCV::VMNAND_MM:
113 case RISCV::VCPOP_M:
114 case RISCV::VFIRST_M:
115 Cost += 1;
116 break;
117 case RISCV::VDIV_VV:
118 case RISCV::VREM_VV:
119 Cost += LMULCost * TTI::TCC_Expensive;
120 break;
121 default:
122 Cost += LMULCost;
123 }
124 }
125 return Cost;
126}
127
129 const RISCVSubtarget *ST,
130 const APInt &Imm, Type *Ty,
132 bool FreeZeroes) {
133 assert(Ty->isIntegerTy() &&
134 "getIntImmCost can only estimate cost of materialising integers");
135
136 // We have a Zero register, so 0 is always free.
137 if (Imm == 0)
138 return TTI::TCC_Free;
139
140 // Otherwise, we check how many instructions it will take to materialise.
141 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
142 /*CompressionCost=*/false, FreeZeroes);
143}
144
148 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
149}
150
151// Look for patterns of shift followed by AND that can be turned into a pair of
152// shifts. We won't need to materialize an immediate for the AND so these can
153// be considered free.
154static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
155 uint64_t Mask = Imm.getZExtValue();
156 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
157 if (!BO || !BO->hasOneUse())
158 return false;
159
160 if (BO->getOpcode() != Instruction::Shl)
161 return false;
162
163 if (!isa<ConstantInt>(BO->getOperand(1)))
164 return false;
165
166 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
167 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
168 // is a mask shifted by c2 bits with c3 leading zeros.
169 if (isShiftedMask_64(Mask)) {
170 unsigned Trailing = llvm::countr_zero(Mask);
171 if (ShAmt == Trailing)
172 return true;
173 }
174
175 return false;
176}
177
178// If this is i64 AND is part of (X & -(1 << C1) & 0xffffffff) == C2 << C1),
179// DAGCombiner can convert this to (sraiw X, C1) == sext(C2) for RV64. On RV32,
180// the type will be split so only the lower 32 bits need to be compared using
181// (srai/srli X, C) == C2.
182static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm) {
183 if (!Inst->hasOneUse())
184 return false;
185
186 // Look for equality comparison.
187 auto *Cmp = dyn_cast<ICmpInst>(*Inst->user_begin());
188 if (!Cmp || !Cmp->isEquality())
189 return false;
190
191 // Right hand side of comparison should be a constant.
192 auto *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
193 if (!C)
194 return false;
195
196 uint64_t Mask = Imm.getZExtValue();
197
198 // Mask should be of the form -(1 << C) in the lower 32 bits.
199 if (!isUInt<32>(Mask) || !isPowerOf2_32(-uint32_t(Mask)))
200 return false;
201
202 // Comparison constant should be a subset of Mask.
203 uint64_t CmpC = C->getZExtValue();
204 if ((CmpC & Mask) != CmpC)
205 return false;
206
207 // We'll need to sign extend the comparison constant and shift it right. Make
208 // sure the new constant can use addi/xori+seqz/snez.
209 unsigned ShiftBits = llvm::countr_zero(Mask);
210 int64_t NewCmpC = SignExtend64<32>(CmpC) >> ShiftBits;
211 return NewCmpC >= -2048 && NewCmpC <= 2048;
212}
213
215 const APInt &Imm, Type *Ty,
217 Instruction *Inst) const {
218 assert(Ty->isIntegerTy() &&
219 "getIntImmCost can only estimate cost of materialising integers");
220
221 // We have a Zero register, so 0 is always free.
222 if (Imm == 0)
223 return TTI::TCC_Free;
224
225 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
226 // commutative, in others the immediate comes from a specific argument index.
227 bool Takes12BitImm = false;
228 unsigned ImmArgIdx = ~0U;
229
230 switch (Opcode) {
231 case Instruction::GetElementPtr:
232 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
233 // split up large offsets in GEP into better parts than ConstantHoisting
234 // can.
235 return TTI::TCC_Free;
236 case Instruction::Store: {
237 // Use the materialization cost regardless of if it's the address or the
238 // value that is constant, except for if the store is misaligned and
239 // misaligned accesses are not legal (experience shows constant hoisting
240 // can sometimes be harmful in such cases).
241 if (Idx == 1 || !Inst)
242 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
243 /*FreeZeroes=*/true);
244
245 StoreInst *ST = cast<StoreInst>(Inst);
246 if (!getTLI()->allowsMemoryAccessForAlignment(
247 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
248 ST->getPointerAddressSpace(), ST->getAlign()))
249 return TTI::TCC_Free;
250
251 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
252 /*FreeZeroes=*/true);
253 }
254 case Instruction::Load:
255 // If the address is a constant, use the materialization cost.
256 return getIntImmCost(Imm, Ty, CostKind);
257 case Instruction::And:
258 // zext.h
259 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
260 return TTI::TCC_Free;
261 // zext.w
262 if (Imm == UINT64_C(0xffffffff) &&
263 ((ST->hasStdExtZba() && ST->isRV64()) || ST->isRV32()))
264 return TTI::TCC_Free;
265 // bclri
266 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
267 return TTI::TCC_Free;
268 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
269 canUseShiftPair(Inst, Imm))
270 return TTI::TCC_Free;
271 if (Inst && Idx == 1 && Imm.getBitWidth() == 64 &&
272 canUseShiftCmp(Inst, Imm))
273 return TTI::TCC_Free;
274 Takes12BitImm = true;
275 break;
276 case Instruction::Add:
277 Takes12BitImm = true;
278 break;
279 case Instruction::Or:
280 case Instruction::Xor:
281 // bseti/binvi
282 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
283 return TTI::TCC_Free;
284 Takes12BitImm = true;
285 break;
286 case Instruction::Mul:
287 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
288 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
289 return TTI::TCC_Free;
290 // One more or less than a power of 2 can use SLLI+ADD/SUB.
291 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
292 return TTI::TCC_Free;
293 // FIXME: There is no MULI instruction.
294 Takes12BitImm = true;
295 break;
296 case Instruction::Sub:
297 case Instruction::Shl:
298 case Instruction::LShr:
299 case Instruction::AShr:
300 Takes12BitImm = true;
301 ImmArgIdx = 1;
302 break;
303 default:
304 break;
305 }
306
307 if (Takes12BitImm) {
308 // Check immediate is the correct argument...
309 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
310 // ... and fits into the 12-bit immediate.
311 if (Imm.getSignificantBits() <= 64 &&
312 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
313 return TTI::TCC_Free;
314 }
315 }
316
317 // Otherwise, use the full materialisation cost.
318 return getIntImmCost(Imm, Ty, CostKind);
319 }
320
321 // By default, prevent hoisting.
322 return TTI::TCC_Free;
323}
324
327 const APInt &Imm, Type *Ty,
329 // Prevent hoisting in unknown cases.
330 return TTI::TCC_Free;
331}
332
334 return ST->hasVInstructions();
335}
336
338RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
339 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
340 return ST->hasCPOPLike() ? TTI::PSK_FastHardware : TTI::PSK_Software;
341}
342
344 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
346 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
347 TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
348 if (Opcode == Instruction::FAdd)
350
351 // zve32x is broken for partial_reduce_umla, but let's make sure we
352 // don't generate them.
353 if (!ST->hasStdExtZvdot4a8i() || ST->getELen() < 64 ||
354 Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
355 InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
356 !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4))
358
359 Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
360 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
361 // Note: Asuming all vdota4* variants are equal cost
362 return LT.first *
363 getRISCVInstructionCost(RISCV::VDOTA4_VV, LT.second, CostKind);
364}
365
367 // Currently, the ExpandReductions pass can't expand scalable-vector
368 // reductions, but we still request expansion as RVV doesn't support certain
369 // reductions and the SelectionDAG can't legalize them either.
370 switch (II->getIntrinsicID()) {
371 default:
372 return false;
373 // These reductions have no equivalent in RVV
374 case Intrinsic::vector_reduce_mul:
375 case Intrinsic::vector_reduce_fmul:
376 return true;
377 }
378}
379
380std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
381 if (ST->hasVInstructions())
382 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
383 return BaseT::getMaxVScale();
384}
385
386std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
387 if (ST->hasVInstructions())
388 if (unsigned MinVLen = ST->getRealMinVLen();
389 MinVLen >= RISCV::RVVBitsPerBlock)
390 return MinVLen / RISCV::RVVBitsPerBlock;
392}
393
396 unsigned LMUL =
397 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
398 switch (K) {
400 return TypeSize::getFixed(ST->getXLen());
402 return TypeSize::getFixed(
403 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
406 (ST->hasVInstructions() &&
407 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
409 : 0);
410 }
411
412 llvm_unreachable("Unsupported register kind");
413}
414
415InstructionCost RISCVTTIImpl::getStaticDataAddrGenerationCost(
416 const TTI::TargetCostKind CostKind) const {
417 switch (CostKind) {
420 // Always 2 instructions
421 return 2;
422 case TTI::TCK_Latency:
424 // Depending on the memory model the address generation will
425 // require AUIPC + ADDI (medany) or LUI + ADDI (medlow). Don't
426 // have a way of getting this information here, so conservatively
427 // require both.
428 // In practice, these are generally implemented together.
429 return (ST->hasAUIPCADDIFusion() && ST->hasLUIADDIFusion()) ? 1 : 2;
430 }
431 llvm_unreachable("Unsupported cost kind");
432}
433
435RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty,
437 // Add a cost of address generation + the cost of the load. The address
438 // is expected to be a PC relative offset to a constant pool entry
439 // using auipc/addi.
440 return getStaticDataAddrGenerationCost(CostKind) +
441 getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
442 /*AddressSpace=*/0, CostKind);
443}
444
445static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
446 unsigned Size = Mask.size();
447 if (!isPowerOf2_32(Size))
448 return false;
449 for (unsigned I = 0; I != Size; ++I) {
450 if (static_cast<unsigned>(Mask[I]) == I)
451 continue;
452 if (Mask[I] != 0)
453 return false;
454 if (Size % I != 0)
455 return false;
456 for (unsigned J = I + 1; J != Size; ++J)
457 // Check the pattern is repeated.
458 if (static_cast<unsigned>(Mask[J]) != J % I)
459 return false;
460 SubVectorSize = I;
461 return true;
462 }
463 // That means Mask is <0, 1, 2, 3>. This is not a concatenation.
464 return false;
465}
466
468 LLVMContext &C) {
469 assert((DataVT.getScalarSizeInBits() != 8 ||
470 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
471 MVT IndexVT = DataVT.changeTypeToInteger();
472 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
473 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
474 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
475}
476
477/// Attempt to approximate the cost of a shuffle which will require splitting
478/// during legalization. Note that processShuffleMasks is not an exact proxy
479/// for the algorithm used in LegalizeVectorTypes, but hopefully it's a
480/// reasonably close upperbound.
482 MVT LegalVT, VectorType *Tp,
483 ArrayRef<int> Mask,
485 assert(LegalVT.isFixedLengthVector() && !Mask.empty() &&
486 "Expected fixed vector type and non-empty mask");
487 unsigned LegalNumElts = LegalVT.getVectorNumElements();
488 // Number of destination vectors after legalization:
489 unsigned NumOfDests = divideCeil(Mask.size(), LegalNumElts);
490 // We are going to permute multiple sources and the result will be in
491 // multiple destinations. Providing an accurate cost only for splits where
492 // the element type remains the same.
493 if (NumOfDests <= 1 ||
495 Tp->getElementType()->getPrimitiveSizeInBits() ||
496 LegalNumElts >= Tp->getElementCount().getFixedValue())
498
499 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
500 unsigned LegalVTSize = LegalVT.getStoreSize();
501 // Number of source vectors after legalization:
502 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
503
504 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(), LegalNumElts);
505
506 unsigned NormalizedVF = LegalNumElts * std::max(NumOfSrcs, NumOfDests);
507 unsigned NumOfSrcRegs = NormalizedVF / LegalNumElts;
508 unsigned NumOfDestRegs = NormalizedVF / LegalNumElts;
509 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
510 assert(NormalizedVF >= Mask.size() &&
511 "Normalized mask expected to be not shorter than original mask.");
512 copy(Mask, NormalizedMask.begin());
513 InstructionCost Cost = 0;
514 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
516 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
517 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
518 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
519 return;
520 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
521 .second)
522 return;
523 Cost += TTI.getShuffleCost(
525 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
526 SingleOpTy, RegMask, CostKind, 0, nullptr);
527 },
528 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
529 Cost += TTI.getShuffleCost(
531 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
532 SingleOpTy, RegMask, CostKind, 0, nullptr);
533 });
534 return Cost;
535}
536
537/// Try to perform better estimation of the permutation.
538/// 1. Split the source/destination vectors into real registers.
539/// 2. Do the mask analysis to identify which real registers are
540/// permuted. If more than 1 source registers are used for the
541/// destination register building, the cost for this destination register
542/// is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
543/// source register is used, build mask and calculate the cost as a cost
544/// of PermuteSingleSrc.
545/// Also, for the single register permute we try to identify if the
546/// destination register is just a copy of the source register or the
547/// copy of the previous destination register (the cost is
548/// TTI::TCC_Basic). If the source register is just reused, the cost for
549/// this operation is 0.
550static InstructionCost
552 std::optional<unsigned> VLen, VectorType *Tp,
554 assert(LegalVT.isFixedLengthVector());
555 if (!VLen || Mask.empty())
557 MVT ElemVT = LegalVT.getVectorElementType();
558 unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
559 LegalVT = TTI.getTypeLegalizationCost(
560 FixedVectorType::get(Tp->getElementType(), ElemsPerVReg))
561 .second;
562 // Number of destination vectors after legalization:
563 InstructionCost NumOfDests =
564 divideCeil(Mask.size(), LegalVT.getVectorNumElements());
565 if (NumOfDests <= 1 ||
567 Tp->getElementType()->getPrimitiveSizeInBits() ||
568 LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue())
570
571 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
572 unsigned LegalVTSize = LegalVT.getStoreSize();
573 // Number of source vectors after legalization:
574 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
575
576 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
577 LegalVT.getVectorNumElements());
578
579 unsigned E = NumOfDests.getValue();
580 unsigned NormalizedVF =
581 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
582 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
583 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
584 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
585 assert(NormalizedVF >= Mask.size() &&
586 "Normalized mask expected to be not shorter than original mask.");
587 copy(Mask, NormalizedMask.begin());
588 InstructionCost Cost = 0;
589 int NumShuffles = 0;
590 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
592 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
593 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
594 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
595 return;
596 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
597 .second)
598 return;
599 ++NumShuffles;
600 Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
601 SingleOpTy, RegMask, CostKind, 0, nullptr);
602 },
603 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
604 Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
605 SingleOpTy, RegMask, CostKind, 0, nullptr);
606 NumShuffles += 2;
607 });
608 // Note: check that we do not emit too many shuffles here to prevent code
609 // size explosion.
610 // TODO: investigate, if it can be improved by extra analysis of the masks
611 // to check if the code is more profitable.
612 if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) ||
613 (NumOfDestRegs <= 2 && NumShuffles < 4))
614 return Cost;
616}
617
618InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
619 ArrayRef<int> Mask,
621 // Avoid missing masks and length changing shuffles
622 if (Mask.size() <= 2 || Mask.size() != Tp->getNumElements())
624
625 int NumElts = Tp->getNumElements();
626 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
627 // Avoid scalarization cases
628 if (!LT.second.isFixedLengthVector())
630
631 // Requires moving elements between parts, which requires additional
632 // unmodeled instructions.
633 if (LT.first != 1)
635
636 auto GetSlideOpcode = [&](int SlideAmt) {
637 assert(SlideAmt != 0);
638 bool IsVI = isUInt<5>(std::abs(SlideAmt));
639 if (SlideAmt < 0)
640 return IsVI ? RISCV::VSLIDEDOWN_VI : RISCV::VSLIDEDOWN_VX;
641 return IsVI ? RISCV::VSLIDEUP_VI : RISCV::VSLIDEUP_VX;
642 };
643
644 std::array<std::pair<int, int>, 2> SrcInfo;
645 if (!isMaskedSlidePair(Mask, NumElts, SrcInfo))
647
648 if (SrcInfo[1].second == 0)
649 std::swap(SrcInfo[0], SrcInfo[1]);
650
651 InstructionCost FirstSlideCost = 0;
652 if (SrcInfo[0].second != 0) {
653 unsigned Opcode = GetSlideOpcode(SrcInfo[0].second);
654 FirstSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
655 }
656
657 if (SrcInfo[1].first == -1)
658 return FirstSlideCost;
659
660 InstructionCost SecondSlideCost = 0;
661 if (SrcInfo[1].second != 0) {
662 unsigned Opcode = GetSlideOpcode(SrcInfo[1].second);
663 SecondSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
664 } else {
665 SecondSlideCost =
666 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
667 }
668
669 auto EC = Tp->getElementCount();
670 VectorType *MaskTy =
672 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
673 return FirstSlideCost + SecondSlideCost + MaskCost;
674}
675
678 VectorType *SrcTy, ArrayRef<int> Mask,
679 TTI::TargetCostKind CostKind, int Index,
681 const Instruction *CxtI) const {
682 assert((Mask.empty() || DstTy->isScalableTy() ||
683 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
684 "Expected the Mask to match the return size if given");
685 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
686 "Expected the same scalar types");
687
688 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
689
690 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
691 // For now, skip all fixed vector cost analysis when P extension is available
692 // to avoid crashes in getMinRVVVectorSizeInBits()
693 if (ST->hasStdExtP() && isa<FixedVectorType>(SrcTy))
694 return 1;
695
696 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
697
698 // First, handle cases where having a fixed length vector enables us to
699 // give a more accurate cost than falling back to generic scalable codegen.
700 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
701 if (auto *FVTp = dyn_cast<FixedVectorType>(SrcTy);
702 FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) {
704 *this, LT.second, ST->getRealVLen(),
705 Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind);
706 if (VRegSplittingCost.isValid())
707 return VRegSplittingCost;
708 switch (Kind) {
709 default:
710 break;
712 if (Mask.size() >= 2) {
713 MVT EltTp = LT.second.getVectorElementType();
714 // If the size of the element is < ELEN then shuffles of interleaves and
715 // deinterleaves of 2 vectors can be lowered into the following
716 // sequences
717 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
718 // Example sequence:
719 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
720 // vwaddu.vv v10, v8, v9
721 // li a0, -1 (ignored)
722 // vwmaccu.vx v10, a0, v9
723 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
724 return 2 * LT.first * TLI->getLMULCost(LT.second);
725
726 if (Mask[0] == 0 || Mask[0] == 1) {
727 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
728 // Example sequence:
729 // vnsrl.wi v10, v8, 0
730 if (equal(DeinterleaveMask, Mask))
731 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
732 LT.second, CostKind);
733 }
734 }
735 int SubVectorSize;
736 if (LT.second.getScalarSizeInBits() != 1 &&
737 isRepeatedConcatMask(Mask, SubVectorSize)) {
739 unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
740 // The cost of extraction from a subvector is 0 if the index is 0.
741 for (unsigned I = 0; I != NumSlides; ++I) {
742 unsigned InsertIndex = SubVectorSize * (1 << I);
743 FixedVectorType *SubTp =
744 FixedVectorType::get(SrcTy->getElementType(), InsertIndex);
745 FixedVectorType *DestTp =
747 std::pair<InstructionCost, MVT> DestLT =
749 // Add the cost of whole vector register move because the
750 // destination vector register group for vslideup cannot overlap the
751 // source.
752 Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
753 Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {},
754 CostKind, InsertIndex, SubTp);
755 }
756 return Cost;
757 }
758 }
759
760 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
761 SlideCost.isValid())
762 return SlideCost;
763
764 // vrgather + cost of generating the mask constant.
765 // We model this for an unknown mask with a single vrgather.
766 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
767 LT.second.getVectorNumElements() <= 256)) {
768 VectorType *IdxTy =
769 getVRGatherIndexType(LT.second, *ST, SrcTy->getContext());
770 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
771 return IndexCost +
772 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
773 }
774 break;
775 }
778
779 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
780 SlideCost.isValid())
781 return SlideCost;
782
783 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
784 // register for the second vrgather. We model this for an unknown
785 // (shuffle) mask.
786 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
787 LT.second.getVectorNumElements() <= 256)) {
788 auto &C = SrcTy->getContext();
789 auto EC = SrcTy->getElementCount();
790 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
792 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
793 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
794 return 2 * IndexCost +
795 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
796 LT.second, CostKind) +
797 MaskCost;
798 }
799 break;
800 }
801 }
802
803 auto shouldSplit = [](TTI::ShuffleKind Kind) {
804 switch (Kind) {
805 default:
806 return false;
810 return true;
811 }
812 };
813
814 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
815 shouldSplit(Kind)) {
816 InstructionCost SplitCost =
817 costShuffleViaSplitting(*this, LT.second, FVTp, Mask, CostKind);
818 if (SplitCost.isValid())
819 return SplitCost;
820 }
821 }
822
823 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
824 switch (Kind) {
825 default:
826 // Fallthrough to generic handling.
827 // TODO: Most of these cases will return getInvalid in generic code, and
828 // must be implemented here.
829 break;
831 // Extract at zero is always a subregister extract
832 if (Index == 0)
833 return TTI::TCC_Free;
834
835 // If we're extracting a subvector of at most m1 size at a sub-register
836 // boundary - which unfortunately we need exact vlen to identify - this is
837 // a subregister extract at worst and thus won't require a vslidedown.
838 // TODO: Extend for aligned m2, m4 subvector extracts
839 // TODO: Extend for misalgined (but contained) extracts
840 // TODO: Extend for scalable subvector types
841 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
842 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
843 if (std::optional<unsigned> VLen = ST->getRealVLen();
844 VLen && SubLT.second.getScalarSizeInBits() * Index % *VLen == 0 &&
845 SubLT.second.getSizeInBits() <= *VLen)
846 return TTI::TCC_Free;
847 }
848
849 // Example sequence:
850 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
851 // vslidedown.vi v8, v9, 2
852 return LT.first *
853 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
855 // Example sequence:
856 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
857 // vslideup.vi v8, v9, 2
858 LT = getTypeLegalizationCost(DstTy);
859 return LT.first *
860 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
861 case TTI::SK_Select: {
862 // Example sequence:
863 // li a0, 90
864 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
865 // vmv.s.x v0, a0
866 // vmerge.vvm v8, v9, v8, v0
867 // We use 2 for the cost of the mask materialization as this is the true
868 // cost for small masks and most shuffles are small. At worst, this cost
869 // should be a very small constant for the constant pool load. As such,
870 // we may bias towards large selects slightly more than truly warranted.
871 return LT.first *
872 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
873 LT.second, CostKind));
874 }
875 case TTI::SK_Broadcast: {
876 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
877 Instruction::InsertElement);
878 if (LT.second.getScalarSizeInBits() == 1) {
879 if (HasScalar) {
880 // Example sequence:
881 // andi a0, a0, 1
882 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
883 // vmv.v.x v8, a0
884 // vmsne.vi v0, v8, 0
885 return LT.first *
886 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
887 LT.second, CostKind));
888 }
889 // Example sequence:
890 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
891 // vmv.v.i v8, 0
892 // vmerge.vim v8, v8, 1, v0
893 // vmv.x.s a0, v8
894 // andi a0, a0, 1
895 // vmv.v.x v8, a0
896 // vmsne.vi v0, v8, 0
897
898 return LT.first *
899 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
900 RISCV::VMV_X_S, RISCV::VMV_V_X,
901 RISCV::VMSNE_VI},
902 LT.second, CostKind));
903 }
904
905 if (HasScalar) {
906 // Example sequence:
907 // vmv.v.x v8, a0
908 return LT.first *
909 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
910 }
911
912 // Example sequence:
913 // vrgather.vi v9, v8, 0
914 return LT.first *
915 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
916 }
917 case TTI::SK_Splice: {
918 // vslidedown+vslideup.
919 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
920 // of similar code, but I think we expand through memory.
921 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
922 if (Index >= 0 && Index < 32)
923 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
924 else if (Index < 0 && Index > -32)
925 Opcodes[1] = RISCV::VSLIDEUP_VI;
926 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
927 }
928 case TTI::SK_Reverse: {
929
930 if (!LT.second.isVector())
932
933 // TODO: Cases to improve here:
934 // * Illegal vector types
935 // * i64 on RV32
936 if (SrcTy->getElementType()->isIntegerTy(1)) {
937 VectorType *WideTy =
938 VectorType::get(IntegerType::get(SrcTy->getContext(), 8),
939 cast<VectorType>(SrcTy)->getElementCount());
940 return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy,
942 getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0,
943 nullptr) +
944 getCastInstrCost(Instruction::Trunc, SrcTy, WideTy,
946 }
947
948 MVT ContainerVT = LT.second;
949 if (LT.second.isFixedLengthVector())
950 ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
951 MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
952 if (ContainerVT.bitsLE(M1VT)) {
953 // Example sequence:
954 // csrr a0, vlenb
955 // srli a0, a0, 3
956 // addi a0, a0, -1
957 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
958 // vid.v v9
959 // vrsub.vx v10, v9, a0
960 // vrgather.vv v9, v8, v10
961 InstructionCost LenCost = 3;
962 if (LT.second.isFixedLengthVector())
963 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
964 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
965 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
966 if (LT.second.isFixedLengthVector() &&
967 isInt<5>(LT.second.getVectorNumElements() - 1))
968 Opcodes[1] = RISCV::VRSUB_VI;
969 InstructionCost GatherCost =
970 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
971 return LT.first * (LenCost + GatherCost);
972 }
973
974 // At high LMUL, we split into a series of M1 reverses (see
975 // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate
976 // the resulting gap at the bottom (for fixed vectors only). The important
977 // bit is that the cost scales linearly, not quadratically with LMUL.
978 unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX};
979 InstructionCost FixedCost =
980 getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3;
981 unsigned Ratio =
983 InstructionCost GatherCost =
984 getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio;
985 InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 0 :
986 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind);
987 return FixedCost + LT.first * (GatherCost + SlideCost);
988 }
989 }
990 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
991 SubTp);
992}
993
994static unsigned isM1OrSmaller(MVT VT) {
996 return (LMUL == RISCVVType::VLMUL::LMUL_F8 ||
1000}
1001
1003 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
1004 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
1005 TTI::VectorInstrContext VIC) const {
1008
1009 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1010 // For now, skip all fixed vector cost analysis when P extension is available
1011 // to avoid crashes in getMinRVVVectorSizeInBits()
1012 if (ST->hasStdExtP() && isa<FixedVectorType>(Ty)) {
1013 return 1; // Treat as single instruction cost for now
1014 }
1015
1016 // A build_vector (which is m1 sized or smaller) can be done in no
1017 // worse than one vslide1down.vx per element in the type. We could
1018 // in theory do an explode_vector in the inverse manner, but our
1019 // lowering today does not have a first class node for this pattern.
1021 Ty, DemandedElts, Insert, Extract, CostKind);
1022 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1023 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
1024 if (Ty->getScalarSizeInBits() == 1) {
1025 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
1026 // Note: Implicit scalar anyextend is assumed to be free since the i1
1027 // must be stored in a GPR.
1028 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
1029 CostKind) +
1030 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
1032 }
1033
1034 assert(LT.second.isFixedLengthVector());
1035 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
1036 if (isM1OrSmaller(ContainerVT)) {
1037 InstructionCost BV =
1038 cast<FixedVectorType>(Ty)->getNumElements() *
1039 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
1040 if (BV < Cost)
1041 Cost = BV;
1042 }
1043 }
1044 return Cost;
1045}
1046
1050 Type *DataTy = MICA.getDataType();
1051 Align Alignment = MICA.getAlignment();
1052 switch (MICA.getID()) {
1053 case Intrinsic::vp_load_ff: {
1054 EVT DataTypeVT = TLI->getValueType(DL, DataTy);
1055 if (!TLI->isLegalFirstFaultLoad(DataTypeVT, Alignment))
1057
1058 unsigned AS = MICA.getAddressSpace();
1059 return getMemoryOpCost(Instruction::Load, DataTy, Alignment, AS, CostKind,
1060 {TTI::OK_AnyValue, TTI::OP_None}, nullptr);
1061 }
1062 case Intrinsic::experimental_vp_strided_load:
1063 case Intrinsic::experimental_vp_strided_store:
1064 return getStridedMemoryOpCost(MICA, CostKind);
1065 case Intrinsic::masked_compressstore:
1066 case Intrinsic::masked_expandload:
1068 case Intrinsic::vp_scatter:
1069 case Intrinsic::vp_gather:
1070 case Intrinsic::masked_scatter:
1071 case Intrinsic::masked_gather:
1072 return getGatherScatterOpCost(MICA, CostKind);
1073 case Intrinsic::vp_load:
1074 case Intrinsic::vp_store:
1075 case Intrinsic::masked_load:
1076 case Intrinsic::masked_store:
1077 return getMaskedMemoryOpCost(MICA, CostKind);
1078 }
1080}
1081
1085 unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load
1086 : Instruction::Store;
1087 Type *Src = MICA.getDataType();
1088 Align Alignment = MICA.getAlignment();
1089 unsigned AddressSpace = MICA.getAddressSpace();
1090
1091 if (!isLegalMaskedLoadStore(Src, Alignment) ||
1094
1095 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
1096}
1097
1099 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1100 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1101 bool UseMaskForCond, bool UseMaskForGaps) const {
1102
1103 // The interleaved memory access pass will lower (de)interleave ops combined
1104 // with an adjacent appropriate memory to vlseg/vsseg intrinsics. vlseg/vsseg
1105 // only support masking per-iteration (i.e. condition), not per-segment (i.e.
1106 // gap).
1107 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
1108 auto *VTy = cast<VectorType>(VecTy);
1109 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
1110 // Need to make sure type hasn't been scalarized
1111 if (LT.second.isVector()) {
1112 auto *SubVecTy =
1113 VectorType::get(VTy->getElementType(),
1114 VTy->getElementCount().divideCoefficientBy(Factor));
1115 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
1116 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
1117 AddressSpace, DL)) {
1118
1119 // Some processors optimize segment loads/stores as one wide memory op +
1120 // Factor * LMUL shuffle ops.
1121 if (ST->hasOptimizedSegmentLoadStore(Factor)) {
1123 getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
1124 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
1125 Cost += Factor * TLI->getLMULCost(SubVecVT);
1126 return LT.first * Cost;
1127 }
1128
1129 // Otherwise, the cost is proportional to the number of elements (VL *
1130 // Factor ops).
1131 InstructionCost MemOpCost =
1132 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
1133 CostKind, {TTI::OK_AnyValue, TTI::OP_None});
1134 unsigned NumLoads = getEstimatedVLFor(VTy);
1135 return NumLoads * MemOpCost;
1136 }
1137 }
1138 }
1139
1140 // TODO: Return the cost of interleaved accesses for scalable vector when
1141 // unable to convert to segment accesses instructions.
1142 if (isa<ScalableVectorType>(VecTy))
1144
1145 auto *FVTy = cast<FixedVectorType>(VecTy);
1146 InstructionCost MemCost =
1147 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
1148 unsigned VF = FVTy->getNumElements() / Factor;
1149
1150 // An interleaved load will look like this for Factor=3:
1151 // %wide.vec = load <12 x i32>, ptr %3, align 4
1152 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1153 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1154 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1155 if (Opcode == Instruction::Load) {
1156 InstructionCost Cost = MemCost;
1157 for (unsigned Index : Indices) {
1158 FixedVectorType *VecTy =
1159 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
1160 auto Mask = createStrideMask(Index, Factor, VF);
1161 Mask.resize(VF * Factor, -1);
1162 InstructionCost ShuffleCost =
1164 Mask, CostKind, 0, nullptr, {});
1165 Cost += ShuffleCost;
1166 }
1167 return Cost;
1168 }
1169
1170 // TODO: Model for NF > 2
1171 // We'll need to enhance getShuffleCost to model shuffles that are just
1172 // inserts and extracts into subvectors, since they won't have the full cost
1173 // of a vrgather.
1174 // An interleaved store for 3 vectors of 4 lanes will look like
1175 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
1176 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
1177 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
1178 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
1179 // store <12 x i32> %interleaved.vec, ptr %10, align 4
1180 if (Factor != 2)
1181 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1182 Alignment, AddressSpace, CostKind,
1183 UseMaskForCond, UseMaskForGaps);
1184
1185 assert(Opcode == Instruction::Store && "Opcode must be a store");
1186 // For an interleaving store of 2 vectors, we perform one large interleaving
1187 // shuffle that goes into the wide store
1188 auto Mask = createInterleaveMask(VF, Factor);
1189 InstructionCost ShuffleCost =
1191 CostKind, 0, nullptr, {});
1192 return MemCost + ShuffleCost;
1193}
1194
1198
1199 bool IsLoad = MICA.getID() == Intrinsic::masked_gather ||
1200 MICA.getID() == Intrinsic::vp_gather;
1201 unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
1202 Type *DataTy = MICA.getDataType();
1203 Align Alignment = MICA.getAlignment();
1206
1207 if ((Opcode == Instruction::Load &&
1208 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
1209 (Opcode == Instruction::Store &&
1210 !isLegalMaskedScatter(DataTy, Align(Alignment))))
1212
1213 // Cost is proportional to the number of memory operations implied. For
1214 // scalable vectors, we use an estimate on that number since we don't
1215 // know exactly what VL will be.
1216 auto &VTy = *cast<VectorType>(DataTy);
1217 unsigned NumLoads = getEstimatedVLFor(&VTy);
1218 return NumLoads * TTI::TCC_Basic;
1219}
1220
1222 const MemIntrinsicCostAttributes &MICA,
1224 unsigned Opcode = MICA.getID() == Intrinsic::masked_expandload
1225 ? Instruction::Load
1226 : Instruction::Store;
1227 Type *DataTy = MICA.getDataType();
1228 bool VariableMask = MICA.getVariableMask();
1229 Align Alignment = MICA.getAlignment();
1230 bool IsLegal = (Opcode == Instruction::Store &&
1231 isLegalMaskedCompressStore(DataTy, Alignment)) ||
1232 (Opcode == Instruction::Load &&
1233 isLegalMaskedExpandLoad(DataTy, Alignment));
1234 if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
1236 // Example compressstore sequence:
1237 // vsetivli zero, 8, e32, m2, ta, ma (ignored)
1238 // vcompress.vm v10, v8, v0
1239 // vcpop.m a1, v0
1240 // vsetvli zero, a1, e32, m2, ta, ma
1241 // vse32.v v10, (a0)
1242 // Example expandload sequence:
1243 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
1244 // vcpop.m a1, v0
1245 // vsetvli zero, a1, e32, m2, ta, ma
1246 // vle32.v v10, (a0)
1247 // vsetivli zero, 8, e32, m2, ta, ma
1248 // viota.m v12, v0
1249 // vrgather.vv v8, v10, v12, v0.t
1250 auto MemOpCost =
1251 getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
1252 auto LT = getTypeLegalizationCost(DataTy);
1253 SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
1254 if (VariableMask)
1255 Opcodes.push_back(RISCV::VCPOP_M);
1256 if (Opcode == Instruction::Store)
1257 Opcodes.append({RISCV::VCOMPRESS_VM});
1258 else
1259 Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
1260 return MemOpCost +
1261 LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1262}
1263
1267
1268 unsigned Opcode = MICA.getID() == Intrinsic::experimental_vp_strided_load
1269 ? Instruction::Load
1270 : Instruction::Store;
1271
1272 Type *DataTy = MICA.getDataType();
1273 Align Alignment = MICA.getAlignment();
1274 const Instruction *I = MICA.getInst();
1275
1276 if (!isLegalStridedLoadStore(DataTy, Alignment))
1278
1280 return TTI::TCC_Basic;
1281
1282 // Cost is proportional to the number of memory operations implied. For
1283 // scalable vectors, we use an estimate on that number since we don't
1284 // know exactly what VL will be.
1285 // FIXME: This will overcost for i64 on rv32 with +zve64x.
1286 auto &VTy = *cast<VectorType>(DataTy);
1287 InstructionCost MemOpCost =
1288 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1289 {TTI::OK_AnyValue, TTI::OP_None}, I);
1290 unsigned NumLoads = getEstimatedVLFor(&VTy);
1291 return NumLoads * MemOpCost;
1292}
1293
1296 // FIXME: This is a property of the default vector convention, not
1297 // all possible calling conventions. Fixing that will require
1298 // some TTI API and SLP rework.
1301 for (auto *Ty : Tys) {
1302 if (!Ty->isVectorTy())
1303 continue;
1304 Align A = DL.getPrefTypeAlign(Ty);
1305 Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
1306 getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
1307 }
1308 return Cost;
1309}
1310
1311// Currently, these represent both throughput and codesize costs
1312// for the respective intrinsics. The costs in this table are simply
1313// instruction counts with the following adjustments made:
1314// * One vsetvli is considered free.
1316 {Intrinsic::floor, MVT::f32, 9},
1317 {Intrinsic::floor, MVT::f64, 9},
1318 {Intrinsic::ceil, MVT::f32, 9},
1319 {Intrinsic::ceil, MVT::f64, 9},
1320 {Intrinsic::trunc, MVT::f32, 7},
1321 {Intrinsic::trunc, MVT::f64, 7},
1322 {Intrinsic::round, MVT::f32, 9},
1323 {Intrinsic::round, MVT::f64, 9},
1324 {Intrinsic::roundeven, MVT::f32, 9},
1325 {Intrinsic::roundeven, MVT::f64, 9},
1326 {Intrinsic::rint, MVT::f32, 7},
1327 {Intrinsic::rint, MVT::f64, 7},
1328 {Intrinsic::nearbyint, MVT::f32, 9},
1329 {Intrinsic::nearbyint, MVT::f64, 9},
1330 {Intrinsic::bswap, MVT::i16, 3},
1331 {Intrinsic::bswap, MVT::i32, 12},
1332 {Intrinsic::bswap, MVT::i64, 31},
1333 {Intrinsic::vp_bswap, MVT::i16, 3},
1334 {Intrinsic::vp_bswap, MVT::i32, 12},
1335 {Intrinsic::vp_bswap, MVT::i64, 31},
1336 {Intrinsic::vp_fshl, MVT::i8, 7},
1337 {Intrinsic::vp_fshl, MVT::i16, 7},
1338 {Intrinsic::vp_fshl, MVT::i32, 7},
1339 {Intrinsic::vp_fshl, MVT::i64, 7},
1340 {Intrinsic::vp_fshr, MVT::i8, 7},
1341 {Intrinsic::vp_fshr, MVT::i16, 7},
1342 {Intrinsic::vp_fshr, MVT::i32, 7},
1343 {Intrinsic::vp_fshr, MVT::i64, 7},
1344 {Intrinsic::bitreverse, MVT::i8, 17},
1345 {Intrinsic::bitreverse, MVT::i16, 24},
1346 {Intrinsic::bitreverse, MVT::i32, 33},
1347 {Intrinsic::bitreverse, MVT::i64, 52},
1348 {Intrinsic::vp_bitreverse, MVT::i8, 17},
1349 {Intrinsic::vp_bitreverse, MVT::i16, 24},
1350 {Intrinsic::vp_bitreverse, MVT::i32, 33},
1351 {Intrinsic::vp_bitreverse, MVT::i64, 52},
1352 {Intrinsic::ctpop, MVT::i8, 12},
1353 {Intrinsic::ctpop, MVT::i16, 19},
1354 {Intrinsic::ctpop, MVT::i32, 20},
1355 {Intrinsic::ctpop, MVT::i64, 21},
1356 {Intrinsic::ctlz, MVT::i8, 19},
1357 {Intrinsic::ctlz, MVT::i16, 28},
1358 {Intrinsic::ctlz, MVT::i32, 31},
1359 {Intrinsic::ctlz, MVT::i64, 35},
1360 {Intrinsic::cttz, MVT::i8, 16},
1361 {Intrinsic::cttz, MVT::i16, 23},
1362 {Intrinsic::cttz, MVT::i32, 24},
1363 {Intrinsic::cttz, MVT::i64, 25},
1364 {Intrinsic::vp_ctpop, MVT::i8, 12},
1365 {Intrinsic::vp_ctpop, MVT::i16, 19},
1366 {Intrinsic::vp_ctpop, MVT::i32, 20},
1367 {Intrinsic::vp_ctpop, MVT::i64, 21},
1368 {Intrinsic::vp_ctlz, MVT::i8, 19},
1369 {Intrinsic::vp_ctlz, MVT::i16, 28},
1370 {Intrinsic::vp_ctlz, MVT::i32, 31},
1371 {Intrinsic::vp_ctlz, MVT::i64, 35},
1372 {Intrinsic::vp_cttz, MVT::i8, 16},
1373 {Intrinsic::vp_cttz, MVT::i16, 23},
1374 {Intrinsic::vp_cttz, MVT::i32, 24},
1375 {Intrinsic::vp_cttz, MVT::i64, 25},
1376};
1377
1381 auto *RetTy = ICA.getReturnType();
1382 switch (ICA.getID()) {
1383 case Intrinsic::lrint:
1384 case Intrinsic::llrint:
1385 case Intrinsic::lround:
1386 case Intrinsic::llround: {
1387 auto LT = getTypeLegalizationCost(RetTy);
1388 Type *SrcTy = ICA.getArgTypes().front();
1389 auto SrcLT = getTypeLegalizationCost(SrcTy);
1390 if (ST->hasVInstructions() && LT.second.isVector()) {
1392 unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType());
1393 unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType());
1394 if (LT.second.getVectorElementType() == MVT::bf16) {
1395 if (!ST->hasVInstructionsBF16Minimal())
1397 if (DstEltSz == 32)
1398 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
1399 else
1400 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
1401 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1402 !ST->hasVInstructionsF16()) {
1403 if (!ST->hasVInstructionsF16Minimal())
1405 if (DstEltSz == 32)
1406 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
1407 else
1408 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};
1409
1410 } else if (SrcEltSz > DstEltSz) {
1411 Ops = {RISCV::VFNCVT_X_F_W};
1412 } else if (SrcEltSz < DstEltSz) {
1413 Ops = {RISCV::VFWCVT_X_F_V};
1414 } else {
1415 Ops = {RISCV::VFCVT_X_F_V};
1416 }
1417
1418 // We need to use the source LMUL in the case of a narrowing op, and the
1419 // destination LMUL otherwise.
1420 if (SrcEltSz > DstEltSz)
1421 return SrcLT.first *
1422 getRISCVInstructionCost(Ops, SrcLT.second, CostKind);
1423 return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind);
1424 }
1425 break;
1426 }
1427 case Intrinsic::ceil:
1428 case Intrinsic::floor:
1429 case Intrinsic::trunc:
1430 case Intrinsic::rint:
1431 case Intrinsic::round:
1432 case Intrinsic::roundeven: {
1433 // These all use the same code.
1434 auto LT = getTypeLegalizationCost(RetTy);
1435 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1436 return LT.first * 8;
1437 break;
1438 }
1439 case Intrinsic::umin:
1440 case Intrinsic::umax:
1441 case Intrinsic::smin:
1442 case Intrinsic::smax: {
1443 auto LT = getTypeLegalizationCost(RetTy);
1444 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
1445 return LT.first;
1446
1447 if (ST->hasVInstructions() && LT.second.isVector()) {
1448 unsigned Op;
1449 switch (ICA.getID()) {
1450 case Intrinsic::umin:
1451 Op = RISCV::VMINU_VV;
1452 break;
1453 case Intrinsic::umax:
1454 Op = RISCV::VMAXU_VV;
1455 break;
1456 case Intrinsic::smin:
1457 Op = RISCV::VMIN_VV;
1458 break;
1459 case Intrinsic::smax:
1460 Op = RISCV::VMAX_VV;
1461 break;
1462 }
1463 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1464 }
1465 break;
1466 }
1467 case Intrinsic::sadd_sat:
1468 case Intrinsic::ssub_sat:
1469 case Intrinsic::uadd_sat:
1470 case Intrinsic::usub_sat: {
1471 auto LT = getTypeLegalizationCost(RetTy);
1472 if (ST->hasVInstructions() && LT.second.isVector()) {
1473 unsigned Op;
1474 switch (ICA.getID()) {
1475 case Intrinsic::sadd_sat:
1476 Op = RISCV::VSADD_VV;
1477 break;
1478 case Intrinsic::ssub_sat:
1479 Op = RISCV::VSSUBU_VV;
1480 break;
1481 case Intrinsic::uadd_sat:
1482 Op = RISCV::VSADDU_VV;
1483 break;
1484 case Intrinsic::usub_sat:
1485 Op = RISCV::VSSUBU_VV;
1486 break;
1487 }
1488 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1489 }
1490 break;
1491 }
1492 case Intrinsic::fma:
1493 case Intrinsic::fmuladd: {
1494 // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin
1495 auto LT = getTypeLegalizationCost(RetTy);
1496 if (ST->hasVInstructions() && LT.second.isVector())
1497 return LT.first *
1498 getRISCVInstructionCost(RISCV::VFMADD_VV, LT.second, CostKind);
1499 break;
1500 }
1501 case Intrinsic::fabs: {
1502 auto LT = getTypeLegalizationCost(RetTy);
1503 if (ST->hasVInstructions() && LT.second.isVector()) {
1504 // lui a0, 8
1505 // addi a0, a0, -1
1506 // vsetvli a1, zero, e16, m1, ta, ma
1507 // vand.vx v8, v8, a0
1508 // f16 with zvfhmin and bf16 with zvfhbmin
1509 if (LT.second.getVectorElementType() == MVT::bf16 ||
1510 (LT.second.getVectorElementType() == MVT::f16 &&
1511 !ST->hasVInstructionsF16()))
1512 return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
1513 CostKind) +
1514 2;
1515 else
1516 return LT.first *
1517 getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
1518 }
1519 break;
1520 }
1521 case Intrinsic::sqrt: {
1522 auto LT = getTypeLegalizationCost(RetTy);
1523 if (ST->hasVInstructions() && LT.second.isVector()) {
1526 MVT ConvType = LT.second;
1527 MVT FsqrtType = LT.second;
1528 // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
1529 // will be split.
1530 if (LT.second.getVectorElementType() == MVT::bf16) {
1531 if (LT.second == MVT::nxv32bf16) {
1532 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
1533 RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
1534 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1535 ConvType = MVT::nxv16f16;
1536 FsqrtType = MVT::nxv16f32;
1537 } else {
1538 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
1539 FsqrtOp = {RISCV::VFSQRT_V};
1540 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1541 }
1542 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1543 !ST->hasVInstructionsF16()) {
1544 if (LT.second == MVT::nxv32f16) {
1545 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
1546 RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
1547 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1548 ConvType = MVT::nxv16f16;
1549 FsqrtType = MVT::nxv16f32;
1550 } else {
1551 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
1552 FsqrtOp = {RISCV::VFSQRT_V};
1553 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1554 }
1555 } else {
1556 FsqrtOp = {RISCV::VFSQRT_V};
1557 }
1558
1559 return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) +
1560 getRISCVInstructionCost(ConvOp, ConvType, CostKind));
1561 }
1562 break;
1563 }
1564 case Intrinsic::cttz:
1565 case Intrinsic::ctlz:
1566 case Intrinsic::ctpop: {
1567 auto LT = getTypeLegalizationCost(RetTy);
1568 if (ST->hasStdExtZvbb() && LT.second.isVector()) {
1569 unsigned Op;
1570 switch (ICA.getID()) {
1571 case Intrinsic::cttz:
1572 Op = RISCV::VCTZ_V;
1573 break;
1574 case Intrinsic::ctlz:
1575 Op = RISCV::VCLZ_V;
1576 break;
1577 case Intrinsic::ctpop:
1578 Op = RISCV::VCPOP_V;
1579 break;
1580 }
1581 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1582 }
1583 break;
1584 }
1585 case Intrinsic::abs: {
1586 auto LT = getTypeLegalizationCost(RetTy);
1587 if (ST->hasVInstructions() && LT.second.isVector()) {
1588 // vabs.v v10, v8
1589 if (ST->hasStdExtZvabd())
1590 return LT.first *
1591 getRISCVInstructionCost({RISCV::VABS_V}, LT.second, CostKind);
1592
1593 // vrsub.vi v10, v8, 0
1594 // vmax.vv v8, v8, v10
1595 return LT.first *
1596 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
1597 LT.second, CostKind);
1598 }
1599 break;
1600 }
1601 case Intrinsic::fshl:
1602 case Intrinsic::fshr: {
1603 if (ICA.getArgs().empty())
1604 break;
1605
1606 // Funnel-shifts are ROTL/ROTR when the first and second operand are equal.
1607 // When Zbb/Zbkb is enabled we can use a single ROL(W)/ROR(I)(W)
1608 // instruction.
1609 if ((ST->hasStdExtZbb() || ST->hasStdExtZbkb()) && RetTy->isIntegerTy() &&
1610 ICA.getArgs()[0] == ICA.getArgs()[1] &&
1611 (RetTy->getIntegerBitWidth() == 32 ||
1612 RetTy->getIntegerBitWidth() == 64) &&
1613 RetTy->getIntegerBitWidth() <= ST->getXLen()) {
1614 return 1;
1615 }
1616 break;
1617 }
1618 case Intrinsic::get_active_lane_mask: {
1619 if (ST->hasVInstructions()) {
1620 Type *ExpRetTy = VectorType::get(
1621 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1622 auto LT = getTypeLegalizationCost(ExpRetTy);
1623
1624 // vid.v v8 // considered hoisted
1625 // vsaddu.vx v8, v8, a0
1626 // vmsltu.vx v0, v8, a1
1627 return LT.first *
1628 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
1629 LT.second, CostKind);
1630 }
1631 break;
1632 }
1633 // TODO: add more intrinsics
1634 case Intrinsic::stepvector: {
1635 auto LT = getTypeLegalizationCost(RetTy);
1636 // Legalisation of illegal types involves an `index' instruction plus
1637 // (LT.first - 1) vector adds.
1638 if (ST->hasVInstructions())
1639 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
1640 (LT.first - 1) *
1641 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
1642 return 1 + (LT.first - 1);
1643 }
1644 case Intrinsic::vector_splice_left:
1645 case Intrinsic::vector_splice_right: {
1646 auto LT = getTypeLegalizationCost(RetTy);
1647 // Constant offsets fall through to getShuffleCost.
1648 if (!ICA.isTypeBasedOnly() && isa<ConstantInt>(ICA.getArgs()[2]))
1649 break;
1650 if (ST->hasVInstructions() && LT.second.isVector()) {
1651 return LT.first *
1652 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX},
1653 LT.second, CostKind);
1654 }
1655 break;
1656 }
1657 case Intrinsic::experimental_cttz_elts: {
1658 Type *ArgTy = ICA.getArgTypes()[0];
1659 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
1660 if (getTLI()->shouldExpandCttzElements(ArgType))
1661 break;
1662 InstructionCost Cost = getRISCVInstructionCost(
1663 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
1664
1665 // If zero_is_poison is false, then we will generate additional
1666 // cmp + select instructions to convert -1 to EVL.
1667 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
1668 if (ICA.getArgs().size() > 1 &&
1669 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
1670 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
1672 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
1674
1675 return Cost;
1676 }
1677 case Intrinsic::experimental_vp_splice: {
1678 // To support type-based query from vectorizer, set the index to 0.
1679 // Note that index only change the cost from vslide.vx to vslide.vi and in
1680 // current implementations they have same costs.
1682 cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
1684 }
1685 case Intrinsic::fptoui_sat:
1686 case Intrinsic::fptosi_sat: {
1688 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1689 Type *SrcTy = ICA.getArgTypes()[0];
1690
1691 auto SrcLT = getTypeLegalizationCost(SrcTy);
1692 auto DstLT = getTypeLegalizationCost(RetTy);
1693 if (!SrcTy->isVectorTy())
1694 break;
1695
1696 if (!SrcLT.first.isValid() || !DstLT.first.isValid())
1698
1699 Cost +=
1700 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
1701 RetTy, SrcTy, TTI::CastContextHint::None, CostKind);
1702
1703 // Handle NaN.
1704 // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1.
1705 // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0.
1706 Type *CondTy = RetTy->getWithNewBitWidth(1);
1707 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, SrcTy, CondTy,
1709 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1711 return Cost;
1712 }
1713 case Intrinsic::experimental_vector_extract_last_active: {
1714 auto *ValTy = cast<VectorType>(ICA.getArgTypes()[0]);
1715 auto *MaskTy = cast<VectorType>(ICA.getArgTypes()[1]);
1716
1717 auto ValLT = getTypeLegalizationCost(ValTy);
1718 auto MaskLT = getTypeLegalizationCost(MaskTy);
1719
1720 // TODO: Return cheaper cost when the entire lane is inactive.
1721 // The expected asm sequence is:
1722 // vcpop.m a0, v0
1723 // beqz a0, exit # Return passthru when the entire lane is inactive.
1724 // vid v10, v0.t
1725 // vredmaxu.vs v10, v10, v10
1726 // vmv.x.s a0, v10
1727 // zext.b a0, a0
1728 // vslidedown.vx v8, v8, a0
1729 // vmv.x.s a0, v8
1730 // exit:
1731 // ...
1732
1733 // Find a suitable type for a stepvector.
1734 ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64));
1735 unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
1736 MaskTy->getScalarType(), MaskTy->getElementCount(),
1737 /*ZeroIsPoison=*/true, &VScaleRange);
1738 EltWidth = std::max(EltWidth, MaskTy->getScalarSizeInBits());
1739 Type *StepTy = Type::getIntNTy(MaskTy->getContext(), EltWidth);
1740 auto *StepVecTy = VectorType::get(StepTy, ValTy->getElementCount());
1741 auto StepLT = getTypeLegalizationCost(StepVecTy);
1743 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1744
1745 Cost += MaskLT.first *
1746 getRISCVInstructionCost(RISCV::VCPOP_M, MaskLT.second, CostKind);
1747 Cost += getCFInstrCost(Instruction::CondBr, CostKind, nullptr);
1748 Cost += StepLT.first *
1749 getRISCVInstructionCost(Opcodes, StepLT.second, CostKind);
1750 Cost += getCastInstrCost(Instruction::ZExt,
1751 Type::getInt64Ty(ValTy->getContext()), StepTy,
1753 Cost += ValLT.first *
1754 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VI, RISCV::VMV_X_S},
1755 ValLT.second, CostKind);
1756 return Cost;
1757 }
1758 }
1759
1760 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1761 if (auto LT = getTypeLegalizationCost(RetTy);
1762 LT.second.isVector()) {
1763 MVT EltTy = LT.second.getVectorElementType();
1764 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1765 ICA.getID(), EltTy))
1766 return LT.first * Entry->Cost;
1767 }
1768 }
1769
1771}
1772
1775 const SCEV *Ptr,
1777 // Address computations for vector indexed load/store likely require an offset
1778 // and/or scaling.
1779 if (ST->hasVInstructions() && PtrTy->isVectorTy())
1780 return getArithmeticInstrCost(Instruction::Add, PtrTy, CostKind);
1781
1782 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1783}
1784
1786 Type *Src,
1789 const Instruction *I) const {
1790 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1791 if (!IsVectorType)
1792 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1793
1794 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1795 // For now, skip all fixed vector cost analysis when P extension is available
1796 // to avoid crashes in getMinRVVVectorSizeInBits()
1797 if (ST->hasStdExtP() &&
1799 return 1; // Treat as single instruction cost for now
1800 }
1801
1802 // FIXME: Need to compute legalizing cost for illegal types. The current
1803 // code handles only legal types and those which can be trivially
1804 // promoted to legal.
1805 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1806 Dst->getScalarSizeInBits() > ST->getELen())
1807 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1808
1809 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1810 assert(ISD && "Invalid opcode");
1811 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1812 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1813
1814 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1815 // The shared implementation doesn't model vector widening during legalization
1816 // and instead assumes scalarization. In order to scalarize an <N x i1>
1817 // vector, we need to extend/trunc to/from i8. If we don't special case
1818 // this, we can get an infinite recursion cycle.
1819 switch (ISD) {
1820 default:
1821 break;
1822 case ISD::SIGN_EXTEND:
1823 case ISD::ZERO_EXTEND:
1824 if (Src->getScalarSizeInBits() == 1) {
1825 // We do not use vsext/vzext to extend from mask vector.
1826 // Instead we use the following instructions to extend from mask vector:
1827 // vmv.v.i v8, 0
1828 // vmerge.vim v8, v8, -1, v0 (repeated per split)
1829 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1830 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1831 DstLT.second, CostKind) +
1832 DstLT.first - 1;
1833 }
1834 break;
1835 case ISD::TRUNCATE:
1836 if (Dst->getScalarSizeInBits() == 1) {
1837 // We do not use several vncvt to truncate to mask vector. So we could
1838 // not use PowDiff to calculate it.
1839 // Instead we use the following instructions to truncate to mask vector:
1840 // vand.vi v8, v8, 1
1841 // vmsne.vi v0, v8, 0
1842 return SrcLT.first *
1843 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1844 SrcLT.second, CostKind) +
1845 SrcLT.first - 1;
1846 }
1847 break;
1848 };
1849
1850 // Our actual lowering for the case where a wider legal type is available
1851 // uses promotion to the wider type. This is reflected in the result of
1852 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1853 // scalarized if the legalized Src and Dst are not equal sized.
1854 const DataLayout &DL = this->getDataLayout();
1855 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1856 !SrcLT.first.isValid() || !DstLT.first.isValid() ||
1857 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
1858 SrcLT.second.getSizeInBits()) ||
1859 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
1860 DstLT.second.getSizeInBits()) ||
1861 SrcLT.first > 1 || DstLT.first > 1)
1862 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1863
1864 // The split cost is handled by the base getCastInstrCost
1865 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1866
1867 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1868 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
1869 switch (ISD) {
1870 case ISD::SIGN_EXTEND:
1871 case ISD::ZERO_EXTEND: {
1872 if ((PowDiff < 1) || (PowDiff > 3))
1873 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1874 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1875 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1876 unsigned Op =
1877 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1878 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1879 }
1880 case ISD::TRUNCATE:
1881 case ISD::FP_EXTEND:
1882 case ISD::FP_ROUND: {
1883 // Counts of narrow/widen instructions.
1884 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1885 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1886
1887 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1888 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1889 : RISCV::VFNCVT_F_F_W;
1891 for (; SrcEltSize != DstEltSize;) {
1892 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1893 ? MVT::getIntegerVT(DstEltSize)
1894 : MVT::getFloatingPointVT(DstEltSize);
1895 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1896 DstEltSize =
1897 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1898 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1899 }
1900 return Cost;
1901 }
1902 case ISD::FP_TO_SINT:
1903 case ISD::FP_TO_UINT: {
1904 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1905 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1906 unsigned FWCVT =
1907 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1908 unsigned FNCVT =
1909 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1910 unsigned SrcEltSize = Src->getScalarSizeInBits();
1911 unsigned DstEltSize = Dst->getScalarSizeInBits();
1913 if ((SrcEltSize == 16) &&
1914 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1915 // If the target only supports zvfhmin or it is fp16-to-i64 conversion
1916 // pre-widening to f32 and then convert f32 to integer
1917 VectorType *VecF32Ty =
1918 VectorType::get(Type::getFloatTy(Dst->getContext()),
1919 cast<VectorType>(Dst)->getElementCount());
1920 std::pair<InstructionCost, MVT> VecF32LT =
1921 getTypeLegalizationCost(VecF32Ty);
1922 Cost +=
1923 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1924 VecF32LT.second, CostKind);
1925 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1926 return Cost;
1927 }
1928 if (DstEltSize == SrcEltSize)
1929 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1930 else if (DstEltSize > SrcEltSize)
1931 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1932 else { // (SrcEltSize > DstEltSize)
1933 // First do a narrowing conversion to an integer half the size, then
1934 // truncate if needed.
1935 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1936 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1937 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1938 if ((SrcEltSize / 2) > DstEltSize) {
1939 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1940 Cost +=
1941 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1942 }
1943 }
1944 return Cost;
1945 }
1946 case ISD::SINT_TO_FP:
1947 case ISD::UINT_TO_FP: {
1948 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
1949 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
1950 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
1951 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
1952 unsigned SrcEltSize = Src->getScalarSizeInBits();
1953 unsigned DstEltSize = Dst->getScalarSizeInBits();
1954
1956 if ((DstEltSize == 16) &&
1957 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
1958 // If the target only supports zvfhmin or it is i64-to-fp16 conversion
1959 // it is converted to f32 and then converted to f16
1960 VectorType *VecF32Ty =
1961 VectorType::get(Type::getFloatTy(Dst->getContext()),
1962 cast<VectorType>(Dst)->getElementCount());
1963 std::pair<InstructionCost, MVT> VecF32LT =
1964 getTypeLegalizationCost(VecF32Ty);
1965 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
1966 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
1967 DstLT.second, CostKind);
1968 return Cost;
1969 }
1970
1971 if (DstEltSize == SrcEltSize)
1972 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1973 else if (DstEltSize > SrcEltSize) {
1974 if ((DstEltSize / 2) > SrcEltSize) {
1975 VectorType *VecTy =
1976 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
1977 cast<VectorType>(Dst)->getElementCount());
1978 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
1979 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
1980 }
1981 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1982 } else
1983 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
1984 return Cost;
1985 }
1986 }
1987 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1988}
1989
1990unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) const {
1991 if (isa<ScalableVectorType>(Ty)) {
1992 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1993 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1994 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1995 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1996 }
1997 return cast<FixedVectorType>(Ty)->getNumElements();
1998}
1999
2002 FastMathFlags FMF,
2004 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2005 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
2006
2007 // Skip if scalar size of Ty is bigger than ELEN.
2008 if (Ty->getScalarSizeInBits() > ST->getELen())
2009 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
2010
2011 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2012 if (Ty->getElementType()->isIntegerTy(1)) {
2013 // SelectionDAGBuilder does following transforms:
2014 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
2015 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
2016 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
2017 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
2018 else
2019 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
2020 }
2021
2022 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
2024 InstructionCost ExtraCost = 0;
2025 switch (IID) {
2026 case Intrinsic::maximum:
2027 if (FMF.noNaNs()) {
2028 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
2029 } else {
2030 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
2031 RISCV::VFMV_F_S};
2032 // Cost of Canonical Nan + branch
2033 // lui a0, 523264
2034 // fmv.w.x fa0, a0
2035 Type *DstTy = Ty->getScalarType();
2036 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
2037 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
2038 ExtraCost = 1 +
2039 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
2041 getCFInstrCost(Instruction::CondBr, CostKind);
2042 }
2043 break;
2044
2045 case Intrinsic::minimum:
2046 if (FMF.noNaNs()) {
2047 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
2048 } else {
2049 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
2050 RISCV::VFMV_F_S};
2051 // Cost of Canonical Nan + branch
2052 // lui a0, 523264
2053 // fmv.w.x fa0, a0
2054 Type *DstTy = Ty->getScalarType();
2055 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
2056 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
2057 ExtraCost = 1 +
2058 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
2060 getCFInstrCost(Instruction::CondBr, CostKind);
2061 }
2062 break;
2063 }
2064 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2065 }
2066
2067 // IR Reduction is composed by one rvv reduction instruction and vmv
2068 unsigned SplitOp;
2070 switch (IID) {
2071 default:
2072 llvm_unreachable("Unsupported intrinsic");
2073 case Intrinsic::smax:
2074 SplitOp = RISCV::VMAX_VV;
2075 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
2076 break;
2077 case Intrinsic::smin:
2078 SplitOp = RISCV::VMIN_VV;
2079 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
2080 break;
2081 case Intrinsic::umax:
2082 SplitOp = RISCV::VMAXU_VV;
2083 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
2084 break;
2085 case Intrinsic::umin:
2086 SplitOp = RISCV::VMINU_VV;
2087 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
2088 break;
2089 case Intrinsic::maxnum:
2090 SplitOp = RISCV::VFMAX_VV;
2091 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
2092 break;
2093 case Intrinsic::minnum:
2094 SplitOp = RISCV::VFMIN_VV;
2095 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
2096 break;
2097 }
2098 // Add a cost for data larger than LMUL8
2099 InstructionCost SplitCost =
2100 (LT.first > 1) ? (LT.first - 1) *
2101 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2102 : 0;
2103 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2104}
2105
2108 std::optional<FastMathFlags> FMF,
2110 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2111 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2112
2113 // Skip if scalar size of Ty is bigger than ELEN.
2114 if (Ty->getScalarSizeInBits() > ST->getELen())
2115 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2116
2117 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2118 assert(ISD && "Invalid opcode");
2119
2120 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
2121 ISD != ISD::FADD)
2122 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2123
2124 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2125 Type *ElementTy = Ty->getElementType();
2126 if (ElementTy->isIntegerTy(1)) {
2127 // Example sequences:
2128 // vfirst.m a0, v0
2129 // seqz a0, a0
2130 if (LT.second == MVT::v1i1)
2131 return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
2132 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2134
2135 if (ISD == ISD::AND) {
2136 // Example sequences:
2137 // vmand.mm v8, v9, v8 ; needed every time type is split
2138 // vmnot.m v8, v0 ; alias for vmnand
2139 // vcpop.m a0, v8
2140 // seqz a0, a0
2141
2142 // See the discussion: https://github.com/llvm/llvm-project/pull/119160
2143 // For LMUL <= 8, there is no splitting,
2144 // the sequences are vmnot, vcpop and seqz.
2145 // When LMUL > 8 and split = 1,
2146 // the sequences are vmnand, vcpop and seqz.
2147 // When LMUL > 8 and split > 1,
2148 // the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
2149 return ((LT.first > 2) ? (LT.first - 2) : 0) *
2150 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
2151 getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
2152 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2153 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2155 } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
2156 // Example sequences:
2157 // vsetvli a0, zero, e8, mf8, ta, ma
2158 // vmxor.mm v8, v0, v8 ; needed every time type is split
2159 // vcpop.m a0, v8
2160 // andi a0, a0, 1
2161 return (LT.first - 1) *
2162 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
2163 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
2164 } else {
2165 assert(ISD == ISD::OR);
2166 // Example sequences:
2167 // vsetvli a0, zero, e8, mf8, ta, ma
2168 // vmor.mm v8, v9, v8 ; needed every time type is split
2169 // vcpop.m a0, v0
2170 // snez a0, a0
2171 return (LT.first - 1) *
2172 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
2173 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2174 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2176 }
2177 }
2178
2179 // IR Reduction of or/and is composed by one vmv and one rvv reduction
2180 // instruction, and others is composed by two vmv and one rvv reduction
2181 // instruction
2182 unsigned SplitOp;
2184 switch (ISD) {
2185 case ISD::ADD:
2186 SplitOp = RISCV::VADD_VV;
2187 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
2188 break;
2189 case ISD::OR:
2190 SplitOp = RISCV::VOR_VV;
2191 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
2192 break;
2193 case ISD::XOR:
2194 SplitOp = RISCV::VXOR_VV;
2195 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
2196 break;
2197 case ISD::AND:
2198 SplitOp = RISCV::VAND_VV;
2199 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
2200 break;
2201 case ISD::FADD:
2202 // We can't promote f16/bf16 fadd reductions.
2203 if ((LT.second.getScalarType() == MVT::f16 && !ST->hasVInstructionsF16()) ||
2204 LT.second.getScalarType() == MVT::bf16)
2205 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2207 Opcodes.push_back(RISCV::VFMV_S_F);
2208 for (unsigned i = 0; i < LT.first.getValue(); i++)
2209 Opcodes.push_back(RISCV::VFREDOSUM_VS);
2210 Opcodes.push_back(RISCV::VFMV_F_S);
2211 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2212 }
2213 SplitOp = RISCV::VFADD_VV;
2214 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
2215 break;
2216 }
2217 // Add a cost for data larger than LMUL8
2218 InstructionCost SplitCost =
2219 (LT.first > 1) ? (LT.first - 1) *
2220 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2221 : 0;
2222 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2223}
2224
2226 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
2227 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
2228 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2229 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2230 FMF, CostKind);
2231
2232 // Skip if scalar size of ResTy is bigger than ELEN.
2233 if (ResTy->getScalarSizeInBits() > ST->getELen())
2234 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2235 FMF, CostKind);
2236
2237 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
2238 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2239 FMF, CostKind);
2240
2241 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2242
2243 if (IsUnsigned && Opcode == Instruction::Add &&
2244 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
2245 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2246 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
2247 return LT.first *
2248 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
2249 }
2250
2251 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
2252 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2253 FMF, CostKind);
2254
2255 return (LT.first - 1) +
2256 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2257}
2258
2262 assert(OpInfo.isConstant() && "non constant operand?");
2263 if (!isa<VectorType>(Ty))
2264 // FIXME: We need to account for immediate materialization here, but doing
2265 // a decent job requires more knowledge about the immediate than we
2266 // currently have here.
2267 return 0;
2268
2269 if (OpInfo.isUniform())
2270 // vmv.v.i, vmv.v.x, or vfmv.v.f
2271 // We ignore the cost of the scalar constant materialization to be consistent
2272 // with how we treat scalar constants themselves just above.
2273 return 1;
2274
2275 return getConstantPoolLoadCost(Ty, CostKind);
2276}
2277
2279 Align Alignment,
2280 unsigned AddressSpace,
2282 TTI::OperandValueInfo OpInfo,
2283 const Instruction *I) const {
2284 EVT VT = TLI->getValueType(DL, Src, true);
2285 // Type legalization can't handle structs
2286 if (VT == MVT::Other)
2287 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2288 CostKind, OpInfo, I);
2289
2291 if (Opcode == Instruction::Store && OpInfo.isConstant())
2292 Cost += getStoreImmCost(Src, OpInfo, CostKind);
2293
2294 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
2295
2296 InstructionCost BaseCost = [&]() {
2297 InstructionCost Cost = LT.first;
2299 return Cost;
2300
2301 // Our actual lowering for the case where a wider legal type is available
2302 // uses the a VL predicated load on the wider type. This is reflected in
2303 // the result of getTypeLegalizationCost, but BasicTTI assumes the
2304 // widened cases are scalarized.
2305 const DataLayout &DL = this->getDataLayout();
2306 if (Src->isVectorTy() && LT.second.isVector() &&
2307 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
2308 LT.second.getSizeInBits()))
2309 return Cost;
2310
2311 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2312 CostKind, OpInfo, I);
2313 }();
2314
2315 // Assume memory ops cost scale with the number of vector registers
2316 // possible accessed by the instruction. Note that BasicTTI already
2317 // handles the LT.first term for us.
2318 if (ST->hasVInstructions() && LT.second.isVector() &&
2320 BaseCost *= TLI->getLMULCost(LT.second);
2321 return Cost + BaseCost;
2322}
2323
2325 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
2327 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
2329 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2330 Op1Info, Op2Info, I);
2331
2332 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2333 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2334 Op1Info, Op2Info, I);
2335
2336 // Skip if scalar size of ValTy is bigger than ELEN.
2337 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
2338 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2339 Op1Info, Op2Info, I);
2340
2341 auto GetConstantMatCost =
2342 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
2343 if (OpInfo.isUniform())
2344 // We return 0 we currently ignore the cost of materializing scalar
2345 // constants in GPRs.
2346 return 0;
2347
2348 return getConstantPoolLoadCost(ValTy, CostKind);
2349 };
2350
2351 InstructionCost ConstantMatCost;
2352 if (Op1Info.isConstant())
2353 ConstantMatCost += GetConstantMatCost(Op1Info);
2354 if (Op2Info.isConstant())
2355 ConstantMatCost += GetConstantMatCost(Op2Info);
2356
2357 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2358 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
2359 if (CondTy->isVectorTy()) {
2360 if (ValTy->getScalarSizeInBits() == 1) {
2361 // vmandn.mm v8, v8, v9
2362 // vmand.mm v9, v0, v9
2363 // vmor.mm v0, v9, v8
2364 return ConstantMatCost +
2365 LT.first *
2366 getRISCVInstructionCost(
2367 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2368 LT.second, CostKind);
2369 }
2370 // vselect and max/min are supported natively.
2371 return ConstantMatCost +
2372 LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
2373 CostKind);
2374 }
2375
2376 if (ValTy->getScalarSizeInBits() == 1) {
2377 // vmv.v.x v9, a0
2378 // vmsne.vi v9, v9, 0
2379 // vmandn.mm v8, v8, v9
2380 // vmand.mm v9, v0, v9
2381 // vmor.mm v0, v9, v8
2382 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
2383 return ConstantMatCost +
2384 LT.first *
2385 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
2386 InterimVT, CostKind) +
2387 LT.first * getRISCVInstructionCost(
2388 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2389 LT.second, CostKind);
2390 }
2391
2392 // vmv.v.x v10, a0
2393 // vmsne.vi v0, v10, 0
2394 // vmerge.vvm v8, v9, v8, v0
2395 return ConstantMatCost +
2396 LT.first * getRISCVInstructionCost(
2397 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
2398 LT.second, CostKind);
2399 }
2400
2401 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
2402 CmpInst::isIntPredicate(VecPred)) {
2403 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
2404 // provided they incur the same cost across all implementations
2405 return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
2406 LT.second,
2407 CostKind);
2408 }
2409
2410 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
2411 CmpInst::isFPPredicate(VecPred)) {
2412
2413 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
2414 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
2415 return ConstantMatCost +
2416 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
2417
2418 // If we do not support the input floating point vector type, use the base
2419 // one which will calculate as:
2420 // ScalarizeCost + Num * Cost for fixed vector,
2421 // InvalidCost for scalable vector.
2422 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
2423 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
2424 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
2425 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2426 Op1Info, Op2Info, I);
2427
2428 // Assuming vector fp compare and mask instructions are all the same cost
2429 // until a need arises to differentiate them.
2430 switch (VecPred) {
2431 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
2432 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
2433 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
2434 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
2435 return ConstantMatCost +
2436 LT.first * getRISCVInstructionCost(
2437 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
2438 LT.second, CostKind);
2439
2440 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
2441 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
2442 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
2443 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
2444 return ConstantMatCost +
2445 LT.first *
2446 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
2447 LT.second, CostKind);
2448
2449 case CmpInst::FCMP_OEQ: // vmfeq.vv
2450 case CmpInst::FCMP_OGT: // vmflt.vv
2451 case CmpInst::FCMP_OGE: // vmfle.vv
2452 case CmpInst::FCMP_OLT: // vmflt.vv
2453 case CmpInst::FCMP_OLE: // vmfle.vv
2454 case CmpInst::FCMP_UNE: // vmfne.vv
2455 return ConstantMatCost +
2456 LT.first *
2457 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
2458 default:
2459 break;
2460 }
2461 }
2462
2463 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
2464 // instructions will lower to SELECT_CC and lower to PseudoCCMOVGPR which will
2465 // generate a conditional branch + mv. The cost of scalar (icmp + select) will
2466 // be (0 + select instr cost).
2467 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
2468 ValTy->isIntegerTy() && !I->user_empty()) {
2469 if (all_of(I->users(), [&](const User *U) {
2470 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
2471 U->getType()->isIntegerTy() &&
2472 !isa<ConstantData>(U->getOperand(1)) &&
2473 !isa<ConstantData>(U->getOperand(2));
2474 }))
2475 return 0;
2476 }
2477
2478 // TODO: Add cost for scalar type.
2479
2480 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2481 Op1Info, Op2Info, I);
2482}
2483
2486 const Instruction *I) const {
2488 return Opcode == Instruction::PHI ? 0 : 1;
2489 // Branches are assumed to be predicted.
2490 return 0;
2491}
2492
2494 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
2495 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
2496 assert(Val->isVectorTy() && "This must be a vector type");
2497
2498 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
2499 // For now, skip all fixed vector cost analysis when P extension is available
2500 // to avoid crashes in getMinRVVVectorSizeInBits()
2501 if (ST->hasStdExtP() && isa<FixedVectorType>(Val)) {
2502 return 1; // Treat as single instruction cost for now
2503 }
2504
2505 if (Opcode != Instruction::ExtractElement &&
2506 Opcode != Instruction::InsertElement)
2507 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1,
2508 VIC);
2509
2510 // Legalize the type.
2511 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2512
2513 // This type is legalized to a scalar type.
2514 if (!LT.second.isVector()) {
2515 auto *FixedVecTy = cast<FixedVectorType>(Val);
2516 // If Index is a known constant, cost is zero.
2517 if (Index != -1U)
2518 return 0;
2519 // Extract/InsertElement with non-constant index is very costly when
2520 // scalarized; estimate cost of loads/stores sequence via the stack:
2521 // ExtractElement cost: store vector to stack, load scalar;
2522 // InsertElement cost: store vector to stack, store scalar, load vector.
2523 Type *ElemTy = FixedVecTy->getElementType();
2524 auto NumElems = FixedVecTy->getNumElements();
2525 auto Align = DL.getPrefTypeAlign(ElemTy);
2526 InstructionCost LoadCost =
2527 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
2528 InstructionCost StoreCost =
2529 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
2530 return Opcode == Instruction::ExtractElement
2531 ? StoreCost * NumElems + LoadCost
2532 : (StoreCost + LoadCost) * NumElems + StoreCost;
2533 }
2534
2535 // For unsupported scalable vector.
2536 if (LT.second.isScalableVector() && !LT.first.isValid())
2537 return LT.first;
2538
2539 // Mask vector extract/insert is expanded via e8.
2540 if (Val->getScalarSizeInBits() == 1) {
2541 VectorType *WideTy =
2543 cast<VectorType>(Val)->getElementCount());
2544 if (Opcode == Instruction::ExtractElement) {
2545 InstructionCost ExtendCost
2546 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2548 InstructionCost ExtractCost
2549 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2550 return ExtendCost + ExtractCost;
2551 }
2552 InstructionCost ExtendCost
2553 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2555 InstructionCost InsertCost
2556 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2557 InstructionCost TruncCost
2558 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
2560 return ExtendCost + InsertCost + TruncCost;
2561 }
2562
2563
2564 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
2565 // and vslideup + vmv.s.x to insert element to vector.
2566 unsigned BaseCost = 1;
2567 // When insertelement we should add the index with 1 as the input of vslideup.
2568 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
2569
2570 if (Index != -1U) {
2571 // The type may be split. For fixed-width vectors we can normalize the
2572 // index to the new type.
2573 if (LT.second.isFixedLengthVector()) {
2574 unsigned Width = LT.second.getVectorNumElements();
2575 Index = Index % Width;
2576 }
2577
2578 // If exact VLEN is known, we will insert/extract into the appropriate
2579 // subvector with no additional subvector insert/extract cost.
2580 if (auto VLEN = ST->getRealVLen()) {
2581 unsigned EltSize = LT.second.getScalarSizeInBits();
2582 unsigned M1Max = *VLEN / EltSize;
2583 Index = Index % M1Max;
2584 }
2585
2586 if (Index == 0)
2587 // We can extract/insert the first element without vslidedown/vslideup.
2588 SlideCost = 0;
2589 else if (ST->hasVendorXRivosVisni() && isUInt<5>(Index) &&
2590 Val->getScalarType()->isIntegerTy())
2591 SlideCost = 0; // With ri.vinsert/ri.vextract there is no slide needed
2592 else if (Opcode == Instruction::InsertElement)
2593 SlideCost = 1; // With a constant index, we do not need to use addi.
2594 }
2595
2596 // When the vector needs to split into multiple register groups and the index
2597 // exceeds single vector register group, we need to insert/extract the element
2598 // via stack.
2599 if (LT.first > 1 &&
2600 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
2601 LT.second.isScalableVector()))) {
2602 Type *ScalarType = Val->getScalarType();
2603 Align VecAlign = DL.getPrefTypeAlign(Val);
2604 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
2605 // Extra addi for unknown index.
2606 InstructionCost IdxCost = Index == -1U ? 1 : 0;
2607
2608 // Store all split vectors into stack and load the target element.
2609 if (Opcode == Instruction::ExtractElement)
2610 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2611 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
2612 CostKind) +
2613 IdxCost;
2614
2615 // Store all split vectors into stack and store the target element and load
2616 // vectors back.
2617 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2618 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2619 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2620 CostKind) +
2621 IdxCost;
2622 }
2623
2624 // Extract i64 in the target that has XLEN=32 need more instruction.
2625 if (Val->getScalarType()->isIntegerTy() &&
2626 ST->getXLen() < Val->getScalarSizeInBits()) {
2627 // For extractelement, we need the following instructions:
2628 // vsetivli zero, 1, e64, m1, ta, mu (not count)
2629 // vslidedown.vx v8, v8, a0
2630 // vmv.x.s a0, v8
2631 // li a1, 32
2632 // vsrl.vx v8, v8, a1
2633 // vmv.x.s a1, v8
2634
2635 // For insertelement, we need the following instructions:
2636 // vsetivli zero, 2, e32, m4, ta, mu (not count)
2637 // vmv.v.i v12, 0
2638 // vslide1up.vx v16, v12, a1
2639 // vslide1up.vx v12, v16, a0
2640 // addi a0, a2, 1
2641 // vsetvli zero, a0, e64, m4, tu, mu (not count)
2642 // vslideup.vx v8, v12, a2
2643
2644 // TODO: should we count these special vsetvlis?
2645 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2646 }
2647 return BaseCost + SlideCost;
2648}
2649
// NOTE(review): the first lines of this overload's signature (and the call to
// the BaseT fixed-vector counterpart on the line after the `isa` check) were
// lost in extraction — presumably the reverse-index ("from end") vector-instr
// cost hook; confirm the exact name and signature against upstream.
unsigned Index) const {
  // Fixed-length vectors: delegate to the generic implementation, which can
  // resolve a from-the-end index exactly.
  if (isa<FixedVectorType>(Val))
                                     Index);

  // TODO: This code replicates what LoopVectorize.cpp used to do when asking
  // for the cost of extracting the last lane of a scalable vector. It probably
  // needs a more accurate cost.
  //
  // Scalable vectors: convert the from-the-end Index into a from-the-front
  // index using the minimum known element count, then reuse the normal
  // insert/extract cost model.
  ElementCount EC = cast<VectorType>(Val)->getElementCount();
  assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
  return getVectorInstrCost(Opcode, Val, CostKind,
                            EC.getKnownMinValue() - 1 - Index, nullptr,
                            nullptr);
}
2667
// Cost of a binary/unary arithmetic instruction on RISC-V. Falls back to the
// base implementation for unsupported cost kinds / illegal vector cases, then
// prices the legalized operation via a representative RVV opcode, adding any
// f16/bf16 promotion casts and constant-materialization costs.
// NOTE(review): extraction dropped the signature lines (2668/2670 with the
// Op1Info/Op2Info parameters), the cost-kind guard at 2674, and continuation
// lines 2716, 2723, 2727; code below is kept verbatim.
2669 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2671 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
2672
2673 // TODO: Handle more cost kinds.
2675 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2676 Args, CxtI);
2677
2678 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2679 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2680 Args, CxtI);
2681
2682 // Skip if scalar size of Ty is bigger than ELEN.
2683 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
2684 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2685 Args, CxtI);
2686
2687 // Legalize the type.
2688 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2689 unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
2690
2691 // TODO: Handle scalar type.
2692 if (!LT.second.isVector()) {
// Scalar div/rem are priced as "expensive" when legal or promotable.
2693 static const CostTblEntry DivTbl[]{
2694 {ISD::UDIV, MVT::i32, TTI::TCC_Expensive},
2695 {ISD::UDIV, MVT::i64, TTI::TCC_Expensive},
2696 {ISD::SDIV, MVT::i32, TTI::TCC_Expensive},
2697 {ISD::SDIV, MVT::i64, TTI::TCC_Expensive},
2698 {ISD::UREM, MVT::i32, TTI::TCC_Expensive},
2699 {ISD::UREM, MVT::i64, TTI::TCC_Expensive},
2700 {ISD::SREM, MVT::i32, TTI::TCC_Expensive},
2701 {ISD::SREM, MVT::i64, TTI::TCC_Expensive}};
2702 if (TLI->isOperationLegalOrPromote(ISDOpcode, LT.second))
2703 if (const auto *Entry = CostTableLookup(DivTbl, ISDOpcode, LT.second))
2704 return Entry->Cost * LT.first;
2705
2706 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2707 Args, CxtI);
2708 }
2709
2710 // f16 with zvfhmin and bf16 will be promoted to f32.
2711 // FIXME: nxv32[b]f16 will be custom lowered and split.
2712 InstructionCost CastCost = 0;
2713 if ((LT.second.getVectorElementType() == MVT::f16 ||
2714 LT.second.getVectorElementType() == MVT::bf16) &&
2715 TLI->getOperationAction(ISDOpcode, LT.second) ==
2717 MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
2718 Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
2719 Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2720 // Add cost of extending arguments
2721 CastCost += LT.first * Args.size() *
2722 getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
2724 // Add cost of truncating result
2725 CastCost +=
2726 LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
2728 // Compute cost of op in promoted type
2729 LT.second = PromotedVT;
2730 }
2731
2732 auto getConstantMatCost =
2733 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
2734 if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
2735 // Two sub-cases:
2736 // * Has a 5 bit immediate operand which can be splatted.
2737 // * Has a larger immediate which must be materialized in scalar register
2738 // We return 0 for both as we currently ignore the cost of materializing
2739 // scalar constants in GPRs.
2740 return 0;
2741
2742 return getConstantPoolLoadCost(Ty, CostKind);
2743 };
2744
2745 // Add the cost of materializing any constant vectors required.
2746 InstructionCost ConstantMatCost = 0;
2747 if (Op1Info.isConstant())
2748 ConstantMatCost += getConstantMatCost(0, Op1Info);
2749 if (Op2Info.isConstant())
2750 ConstantMatCost += getConstantMatCost(1, Op2Info);
// Map the ISD opcode onto a representative RVV instruction whose
// per-LMUL cost model stands in for the whole opcode class.
2752 unsigned Op;
2753 switch (ISDOpcode) {
2754 case ISD::ADD:
2755 case ISD::SUB:
2756 Op = RISCV::VADD_VV;
2757 break;
2758 case ISD::SHL:
2759 case ISD::SRL:
2760 case ISD::SRA:
2761 Op = RISCV::VSLL_VV;
2762 break;
2763 case ISD::AND:
2764 case ISD::OR:
2765 case ISD::XOR:
2766 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
2767 break;
2768 case ISD::MUL:
2769 case ISD::MULHS:
2770 case ISD::MULHU:
2771 Op = RISCV::VMUL_VV;
2772 break;
2773 case ISD::SDIV:
2774 case ISD::UDIV:
2775 Op = RISCV::VDIV_VV;
2776 break;
2777 case ISD::SREM:
2778 case ISD::UREM:
2779 Op = RISCV::VREM_VV;
2780 break;
2781 case ISD::FADD:
2782 case ISD::FSUB:
2783 Op = RISCV::VFADD_VV;
2784 break;
2785 case ISD::FMUL:
2786 Op = RISCV::VFMUL_VV;
2787 break;
2788 case ISD::FDIV:
2789 Op = RISCV::VFDIV_VV;
2790 break;
2791 case ISD::FNEG:
2792 Op = RISCV::VFSGNJN_VV;
2793 break;
2794 default:
2795 // Assuming all other instructions have the same cost until a need arises to
2796 // differentiate them.
2797 return CastCost + ConstantMatCost +
2798 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2799 Args, CxtI);
2800 }
2801
2802 InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
2803 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
2804 // ops are twice as expensive as integer ops. Do the same for vectors so
2805 // scalar floating point ops aren't cheaper than their vector equivalents.
2806 if (Ty->isFPOrFPVectorTy())
2807 InstrCost *= 2;
2808 return CastCost + ConstantMatCost + LT.first * InstrCost;
2809}
2810
// Cost of a chain of pointers (GEPs) sharing (or not) a common base.
// NOTE(review): extraction dropped the signature lines (2812, 2815-2816,
// including the declaration of the Cost accumulator used below); code is
// kept verbatim.
2811// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
2813 ArrayRef<const Value *> Ptrs, const Value *Base,
2814 const TTI::PointersChainInfo &Info, Type *AccessTy,
2817 // In the basic model we take into account GEP instructions only
2818 // (although here can come alloca instruction, a value, constants and/or
2819 // constant expressions, PHIs, bitcasts ... whatever allowed to be used as a
2820 // pointer). Typically, if Base is a not a GEP-instruction and all the
2821 // pointers are relative to the same base address, all the rest are
2822 // either GEP instructions, PHIs, bitcasts or constants. When we have same
2823 // base, we just calculate cost of each non-Base GEP as an ADD operation if
2824 // any their index is a non-const.
2825 // If no known dependencies between the pointers cost is calculated as a sum
2826 // of costs of GEP instructions.
2827 for (auto [I, V] : enumerate(Ptrs)) {
2828 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
2829 if (!GEP)
2830 continue;
2831 if (Info.isSameBase() && V != Base) {
2832 if (GEP->hasAllConstantIndices())
2833 continue;
2834 // If the chain is unit-stride and BaseReg + stride*i is a legal
2835 // addressing mode, then presume the base GEP is sitting around in a
2836 // register somewhere and check if we can fold the offset relative to
2837 // it.
2838 unsigned Stride = DL.getTypeStoreSize(AccessTy);
2839 if (Info.isUnitStride() &&
2840 isLegalAddressingMode(AccessTy,
2841 /* BaseGV */ nullptr,
2842 /* BaseOffset */ Stride * I,
2843 /* HasBaseReg */ true,
2844 /* Scale */ 0,
2845 GEP->getType()->getPointerAddressSpace()))
2846 continue;
// Non-foldable offset from the shared base: price it as one scalar add.
2847 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
2848 {TTI::OK_AnyValue, TTI::OP_None},
2849 {TTI::OK_AnyValue, TTI::OP_None}, {});
2850 } else {
2851 SmallVector<const Value *> Indices(GEP->indices());
2852 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
2853 Indices, AccessTy, CostKind);
2854 }
2855 }
2856 return Cost;
2857}
2858
// Tune loop-unrolling preferences for subtargets that opt out of the default
// unroll policy: enables upper-bound/partial/runtime unrolling for small,
// call-free, non-auto-vectorized loops.
// NOTE(review): extraction dropped the signature lines (2858-2860), line 2875,
// the declaration of the Cost accumulator at 2899, and the cost-kind argument
// line 2921; code is kept verbatim.
2861 OptimizationRemarkEmitter *ORE) const {
2862 // TODO: More tuning on benchmarks and metrics with changes as needed
2863 // would apply to all settings below to enable performance.
2864
2865
2866 if (ST->enableDefaultUnroll())
2867 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2868
2869 // Enable Upper bound unrolling universally, not dependent upon the conditions
2870 // below.
2871 UP.UpperBound = true;
2872
2873 // Disable loop unrolling for Oz and Os.
2874 UP.OptSizeThreshold = 0;
2876 if (L->getHeader()->getParent()->hasOptSize())
2877 return;
2878
2879 SmallVector<BasicBlock *, 4> ExitingBlocks;
2880 L->getExitingBlocks(ExitingBlocks);
2881 LLVM_DEBUG(dbgs() << "Loop has:\n"
2882 << "Blocks: " << L->getNumBlocks() << "\n"
2883 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2884
2885 // Only allow another exit other than the latch. This acts as an early exit
2886 // as it mirrors the profitability calculation of the runtime unroller.
2887 if (ExitingBlocks.size() > 2)
2888 return;
2889
2890 // Limit the CFG of the loop body for targets with a branch predictor.
2891 // Allowing 4 blocks permits if-then-else diamonds in the body.
2892 if (L->getNumBlocks() > 4)
2893 return;
2894
2895 // Scan the loop: don't unroll loops with calls as this could prevent
2896 // inlining. Don't unroll auto-vectorized loops either, though do allow
2897 // unrolling of the scalar remainder.
2898 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized")
2900 for (auto *BB : L->getBlocks()) {
2901 for (auto &I : *BB) {
2902 // Both auto-vectorized loops and the scalar remainder have the
2903 // isvectorized attribute, so differentiate between them by the presence
2904 // of vector instructions.
2905 if (IsVectorized && (I.getType()->isVectorTy() ||
2906 llvm::any_of(I.operand_values(), [](Value *V) {
2907 return V->getType()->isVectorTy();
2908 })))
2909 return;
2910
2911 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2912 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2913 if (!isLoweredToCall(F))
2914 continue;
2915 }
2916 return;
2917 }
2918
2919 SmallVector<const Value *> Operands(I.operand_values());
2920 Cost += getInstructionCost(&I, Operands,
2922 }
2923 }
2924
2925 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2926
2927 UP.Partial = true;
2928 UP.Runtime = true;
2929 UP.UnrollRemainder = true;
2930 UP.UnrollAndJam = true;
2931
2932 // Force unrolling small loops can be very useful because of the branch
2933 // taken cost of the backedge.
2934 if (Cost < 12)
2935 UP.Force = true;
2936}
2937
2942
// Describe RVV memory intrinsics (unit-stride, strided, and indexed
// loads/stores, plus their segment and masked variants) to callers that need
// the accessed type, pointer operand, alignment, mask, EVL, stride and index.
// NOTE(review): extraction dropped the signature line (2943, taking the
// IntrinsicInst *Inst parameter) and the `Ty = ScalableVectorType::get(`
// continuation lines at 3011, 3087 and 3206; code below is kept verbatim.
2944 MemIntrinsicInfo &Info) const {
2945 const DataLayout &DL = getDataLayout();
2946 Intrinsic::ID IID = Inst->getIntrinsicID();
2947 LLVMContext &C = Inst->getContext();
2948 bool HasMask = false;
// Segment loads/stores carry their field count (NF) as the first integer
// parameter of the riscv.vector.tuple TargetExtType; plain ops count as 1.
2950 auto getSegNum = [](const IntrinsicInst *II, unsigned PtrOperandNo,
2951 bool IsWrite) -> int64_t {
2952 if (auto *TarExtTy =
2953 dyn_cast<TargetExtType>(II->getArgOperand(0)->getType()))
2954 return TarExtTy->getIntParameter(0);
2955
2956 return 1;
2957 };
2958
2959 switch (IID) {
2960 case Intrinsic::riscv_vle_mask:
2961 case Intrinsic::riscv_vse_mask:
2962 case Intrinsic::riscv_vlseg2_mask:
2963 case Intrinsic::riscv_vlseg3_mask:
2964 case Intrinsic::riscv_vlseg4_mask:
2965 case Intrinsic::riscv_vlseg5_mask:
2966 case Intrinsic::riscv_vlseg6_mask:
2967 case Intrinsic::riscv_vlseg7_mask:
2968 case Intrinsic::riscv_vlseg8_mask:
2969 case Intrinsic::riscv_vsseg2_mask:
2970 case Intrinsic::riscv_vsseg3_mask:
2971 case Intrinsic::riscv_vsseg4_mask:
2972 case Intrinsic::riscv_vsseg5_mask:
2973 case Intrinsic::riscv_vsseg6_mask:
2974 case Intrinsic::riscv_vsseg7_mask:
2975 case Intrinsic::riscv_vsseg8_mask:
2976 HasMask = true;
2977 [[fallthrough]];
2978 case Intrinsic::riscv_vle:
2979 case Intrinsic::riscv_vse:
2980 case Intrinsic::riscv_vlseg2:
2981 case Intrinsic::riscv_vlseg3:
2982 case Intrinsic::riscv_vlseg4:
2983 case Intrinsic::riscv_vlseg5:
2984 case Intrinsic::riscv_vlseg6:
2985 case Intrinsic::riscv_vlseg7:
2986 case Intrinsic::riscv_vlseg8:
2987 case Intrinsic::riscv_vsseg2:
2988 case Intrinsic::riscv_vsseg3:
2989 case Intrinsic::riscv_vsseg4:
2990 case Intrinsic::riscv_vsseg5:
2991 case Intrinsic::riscv_vsseg6:
2992 case Intrinsic::riscv_vsseg7:
2993 case Intrinsic::riscv_vsseg8: {
2994 // Intrinsic interface:
2995 // riscv_vle(merge, ptr, vl)
2996 // riscv_vle_mask(merge, ptr, mask, vl, policy)
2997 // riscv_vse(val, ptr, vl)
2998 // riscv_vse_mask(val, ptr, mask, vl, policy)
2999 // riscv_vlseg#(merge, ptr, vl, sew)
3000 // riscv_vlseg#_mask(merge, ptr, mask, vl, policy, sew)
3001 // riscv_vsseg#(val, ptr, vl, sew)
3002 // riscv_vsseg#_mask(val, ptr, mask, vl, sew)
3003 bool IsWrite = Inst->getType()->isVoidTy();
3004 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
3005 // The results of segment loads are TargetExtType.
3006 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
3007 unsigned SEW =
3008 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
3009 ->getZExtValue();
3010 Ty = TarExtTy->getTypeParameter(0U);
3012 IntegerType::get(C, SEW),
3013 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
3014 }
// Locate the pointer operand relative to the VL operand recorded in the
// RVV intrinsics table (mask, if present, sits between them).
3015 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
3016 unsigned VLIndex = RVVIInfo->VLOperand;
3017 unsigned PtrOperandNo = VLIndex - 1 - HasMask;
3018 MaybeAlign Alignment =
3019 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
3020 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3021 Value *Mask = ConstantInt::getTrue(MaskType);
3022 if (HasMask)
3023 Mask = Inst->getArgOperand(VLIndex - 1);
3024 Value *EVL = Inst->getArgOperand(VLIndex);
3025 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3026 // RVV uses contiguous elements as a segment.
3027 if (SegNum > 1) {
3028 unsigned ElemSize = Ty->getScalarSizeInBits();
3029 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3030 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3031 }
3032 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3033 Alignment, Mask, EVL);
3034 return true;
3035 }
3036 case Intrinsic::riscv_vlse_mask:
3037 case Intrinsic::riscv_vsse_mask:
3038 case Intrinsic::riscv_vlsseg2_mask:
3039 case Intrinsic::riscv_vlsseg3_mask:
3040 case Intrinsic::riscv_vlsseg4_mask:
3041 case Intrinsic::riscv_vlsseg5_mask:
3042 case Intrinsic::riscv_vlsseg6_mask:
3043 case Intrinsic::riscv_vlsseg7_mask:
3044 case Intrinsic::riscv_vlsseg8_mask:
3045 case Intrinsic::riscv_vssseg2_mask:
3046 case Intrinsic::riscv_vssseg3_mask:
3047 case Intrinsic::riscv_vssseg4_mask:
3048 case Intrinsic::riscv_vssseg5_mask:
3049 case Intrinsic::riscv_vssseg6_mask:
3050 case Intrinsic::riscv_vssseg7_mask:
3051 case Intrinsic::riscv_vssseg8_mask:
3052 HasMask = true;
3053 [[fallthrough]];
3054 case Intrinsic::riscv_vlse:
3055 case Intrinsic::riscv_vsse:
3056 case Intrinsic::riscv_vlsseg2:
3057 case Intrinsic::riscv_vlsseg3:
3058 case Intrinsic::riscv_vlsseg4:
3059 case Intrinsic::riscv_vlsseg5:
3060 case Intrinsic::riscv_vlsseg6:
3061 case Intrinsic::riscv_vlsseg7:
3062 case Intrinsic::riscv_vlsseg8:
3063 case Intrinsic::riscv_vssseg2:
3064 case Intrinsic::riscv_vssseg3:
3065 case Intrinsic::riscv_vssseg4:
3066 case Intrinsic::riscv_vssseg5:
3067 case Intrinsic::riscv_vssseg6:
3068 case Intrinsic::riscv_vssseg7:
3069 case Intrinsic::riscv_vssseg8: {
3070 // Intrinsic interface:
3071 // riscv_vlse(merge, ptr, stride, vl)
3072 // riscv_vlse_mask(merge, ptr, stride, mask, vl, policy)
3073 // riscv_vsse(val, ptr, stride, vl)
3074 // riscv_vsse_mask(val, ptr, stride, mask, vl, policy)
3075 // riscv_vlsseg#(merge, ptr, offset, vl, sew)
3076 // riscv_vlsseg#_mask(merge, ptr, offset, mask, vl, policy, sew)
3077 // riscv_vssseg#(val, ptr, offset, vl, sew)
3078 // riscv_vssseg#_mask(val, ptr, offset, mask, vl, sew)
3079 bool IsWrite = Inst->getType()->isVoidTy();
3080 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
3081 // The results of segment loads are TargetExtType.
3082 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
3083 unsigned SEW =
3084 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
3085 ->getZExtValue();
3086 Ty = TarExtTy->getTypeParameter(0U);
3088 IntegerType::get(C, SEW),
3089 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
3090 }
3091 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
3092 unsigned VLIndex = RVVIInfo->VLOperand;
3093 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
3094 MaybeAlign Alignment =
3095 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
3096
3097 Value *Stride = Inst->getArgOperand(PtrOperandNo + 1);
3098 // Use the pointer alignment as the element alignment if the stride is a
3099 // multiple of the pointer alignment. Otherwise, the element alignment
3100 // should be the greatest common divisor of pointer alignment and stride.
3101 // For simplicity, just consider unalignment for elements.
3102 unsigned PointerAlign = Alignment.valueOrOne().value();
3103 if (!isa<ConstantInt>(Stride) ||
3104 cast<ConstantInt>(Stride)->getZExtValue() % PointerAlign != 0)
3105 Alignment = Align(1);
3106
3107 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3108 Value *Mask = ConstantInt::getTrue(MaskType);
3109 if (HasMask)
3110 Mask = Inst->getArgOperand(VLIndex - 1);
3111 Value *EVL = Inst->getArgOperand(VLIndex);
3112 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3113 // RVV uses contiguous elements as a segment.
3114 if (SegNum > 1) {
3115 unsigned ElemSize = Ty->getScalarSizeInBits();
3116 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3117 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3118 }
3119 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3120 Alignment, Mask, EVL, Stride);
3121 return true;
3122 }
3123 case Intrinsic::riscv_vloxei_mask:
3124 case Intrinsic::riscv_vluxei_mask:
3125 case Intrinsic::riscv_vsoxei_mask:
3126 case Intrinsic::riscv_vsuxei_mask:
3127 case Intrinsic::riscv_vloxseg2_mask:
3128 case Intrinsic::riscv_vloxseg3_mask:
3129 case Intrinsic::riscv_vloxseg4_mask:
3130 case Intrinsic::riscv_vloxseg5_mask:
3131 case Intrinsic::riscv_vloxseg6_mask:
3132 case Intrinsic::riscv_vloxseg7_mask:
3133 case Intrinsic::riscv_vloxseg8_mask:
3134 case Intrinsic::riscv_vluxseg2_mask:
3135 case Intrinsic::riscv_vluxseg3_mask:
3136 case Intrinsic::riscv_vluxseg4_mask:
3137 case Intrinsic::riscv_vluxseg5_mask:
3138 case Intrinsic::riscv_vluxseg6_mask:
3139 case Intrinsic::riscv_vluxseg7_mask:
3140 case Intrinsic::riscv_vluxseg8_mask:
3141 case Intrinsic::riscv_vsoxseg2_mask:
3142 case Intrinsic::riscv_vsoxseg3_mask:
3143 case Intrinsic::riscv_vsoxseg4_mask:
3144 case Intrinsic::riscv_vsoxseg5_mask:
3145 case Intrinsic::riscv_vsoxseg6_mask:
3146 case Intrinsic::riscv_vsoxseg7_mask:
3147 case Intrinsic::riscv_vsoxseg8_mask:
3148 case Intrinsic::riscv_vsuxseg2_mask:
3149 case Intrinsic::riscv_vsuxseg3_mask:
3150 case Intrinsic::riscv_vsuxseg4_mask:
3151 case Intrinsic::riscv_vsuxseg5_mask:
3152 case Intrinsic::riscv_vsuxseg6_mask:
3153 case Intrinsic::riscv_vsuxseg7_mask:
3154 case Intrinsic::riscv_vsuxseg8_mask:
3155 HasMask = true;
3156 [[fallthrough]];
3157 case Intrinsic::riscv_vloxei:
3158 case Intrinsic::riscv_vluxei:
3159 case Intrinsic::riscv_vsoxei:
3160 case Intrinsic::riscv_vsuxei:
3161 case Intrinsic::riscv_vloxseg2:
3162 case Intrinsic::riscv_vloxseg3:
3163 case Intrinsic::riscv_vloxseg4:
3164 case Intrinsic::riscv_vloxseg5:
3165 case Intrinsic::riscv_vloxseg6:
3166 case Intrinsic::riscv_vloxseg7:
3167 case Intrinsic::riscv_vloxseg8:
3168 case Intrinsic::riscv_vluxseg2:
3169 case Intrinsic::riscv_vluxseg3:
3170 case Intrinsic::riscv_vluxseg4:
3171 case Intrinsic::riscv_vluxseg5:
3172 case Intrinsic::riscv_vluxseg6:
3173 case Intrinsic::riscv_vluxseg7:
3174 case Intrinsic::riscv_vluxseg8:
3175 case Intrinsic::riscv_vsoxseg2:
3176 case Intrinsic::riscv_vsoxseg3:
3177 case Intrinsic::riscv_vsoxseg4:
3178 case Intrinsic::riscv_vsoxseg5:
3179 case Intrinsic::riscv_vsoxseg6:
3180 case Intrinsic::riscv_vsoxseg7:
3181 case Intrinsic::riscv_vsoxseg8:
3182 case Intrinsic::riscv_vsuxseg2:
3183 case Intrinsic::riscv_vsuxseg3:
3184 case Intrinsic::riscv_vsuxseg4:
3185 case Intrinsic::riscv_vsuxseg5:
3186 case Intrinsic::riscv_vsuxseg6:
3187 case Intrinsic::riscv_vsuxseg7:
3188 case Intrinsic::riscv_vsuxseg8: {
3189 // Intrinsic interface (only listed ordered version):
3190 // riscv_vloxei(merge, ptr, index, vl)
3191 // riscv_vloxei_mask(merge, ptr, index, mask, vl, policy)
3192 // riscv_vsoxei(val, ptr, index, vl)
3193 // riscv_vsoxei_mask(val, ptr, index, mask, vl, policy)
3194 // riscv_vloxseg#(merge, ptr, index, vl, sew)
3195 // riscv_vloxseg#_mask(merge, ptr, index, mask, vl, policy, sew)
3196 // riscv_vsoxseg#(val, ptr, index, vl, sew)
3197 // riscv_vsoxseg#_mask(val, ptr, index, mask, vl, sew)
3198 bool IsWrite = Inst->getType()->isVoidTy();
3199 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
3200 // The results of segment loads are TargetExtType.
3201 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
3202 unsigned SEW =
3203 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
3204 ->getZExtValue();
3205 Ty = TarExtTy->getTypeParameter(0U);
3207 IntegerType::get(C, SEW),
3208 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
3209 }
3210 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
3211 unsigned VLIndex = RVVIInfo->VLOperand;
3212 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
3213 Value *Mask;
3214 if (HasMask) {
3215 Mask = Inst->getArgOperand(VLIndex - 1);
3216 } else {
3217 // Mask cannot be nullptr here: vector GEP produces <vscale x N x ptr>,
3218 // and casting that to scalar i64 triggers a vector/scalar mismatch
3219 // assertion in CreatePointerCast. Use an all-true mask so ASan lowers it
3220 // via extractelement instead.
3221 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3222 Mask = ConstantInt::getTrue(MaskType);
3223 }
3224 Value *EVL = Inst->getArgOperand(VLIndex);
3225 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3226 // RVV uses contiguous elements as a segment.
3227 if (SegNum > 1) {
3228 unsigned ElemSize = Ty->getScalarSizeInBits();
3229 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3230 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3231 }
3232 Value *OffsetOp = Inst->getArgOperand(PtrOperandNo + 1);
3233 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3234 Align(1), Mask, EVL,
3235 /* Stride */ nullptr, OffsetOp);
3236 return true;
3237 }
3238 }
// Not an RVV memory intrinsic we know how to describe.
3239 return false;
3240}
3241
// Number of registers a value of type Ty occupies: for RVV-legal vectors this
// is the ceiling of the type size over one vector block (per-LMUL register
// group); otherwise defer to the base implementation.
// NOTE(review): the signature line (3242) was lost in extraction; code is
// kept verbatim.
3243 if (Ty->isVectorTy()) {
3244 // f16 with only zvfhmin and bf16 will be promoted to f32
3245 Type *EltTy = cast<VectorType>(Ty)->getElementType();
3246 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
3247 EltTy->isBFloatTy())
3248 Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
3249 cast<VectorType>(Ty));
3250
3251 TypeSize Size = DL.getTypeSizeInBits(Ty);
3252 if (Size.isScalable() && ST->hasVInstructions())
3253 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
3254
3255 if (ST->useRVVForFixedLengthVectors())
3256 return divideCeil(Size, ST->getRealMinVLen());
3257 }
3258
3259 return BaseT::getRegUsageForType(Ty);
3260}
3261
// Maximum SLP vectorization factor for an element width: overridable via the
// riscv-v-slp-max-vf flag, otherwise derived from the register bit width.
// NOTE(review): the initializer of RegWidth (line 3272, presumably a
// getRegisterBitWidth call) was lost in extraction; code is kept verbatim.
3262unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
3263 if (SLPMaxVF.getNumOccurrences())
3264 return SLPMaxVF;
3265
3266 // Return how many elements can fit in getRegisterBitwidth. This is the
3267 // same routine as used in LoopVectorizer. We should probably be
3268 // accounting for whether we actually have instructions with the right
3269 // lane type, but we don't have enough information to do that without
3270 // some additional plumbing which hasn't been justified yet.
3271 TypeSize RegWidth =
3273 // If no vector registers, or absurd element widths, disable
3274 // vectorization by returning 1.
3275 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
3276}
3277
// NOTE(review): the declaration of this definition (source lines ~3278-3282)
// was lost in extraction; only the body survives. It appears to be a boolean
// query that simply defers to the subtarget's unaligned vector memory
// support — recover the original signature from upstream before editing.
3281
3283 return ST->enableUnalignedVectorMem();
3284}
3285
// Preferred addressing mode: post-indexed when the XCVmem vendor extension is
// available on RV32. NOTE(review): extraction dropped the signature lines
// (3286-3287) and the final fallback return (3292); code is kept verbatim.
3288 ScalarEvolution *SE) const {
3289 if (ST->hasVendorXCVmem() && !ST->is64Bit())
3290 return TTI::AMK_PostIndexed;
3291
3293}
3294
// LSR cost comparison with instruction count as the first-priority key.
// NOTE(review): the signature line (3295, taking C1) was lost in extraction;
// code is kept verbatim.
3296 const TargetTransformInfo::LSRCost &C2) const {
3297 // RISC-V specific here are "instruction number 1st priority".
3298 // If we need to emit adds inside the loop to add up base registers, then
3299 // we need at least one extra temporary register.
3300 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
3301 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
// Lexicographic comparison: fewer instructions win, then (adjusted)
// register count, then the remaining LSR cost components.
3302 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
3303 C1.NumIVMuls, C1.NumBaseAdds,
3304 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
3305 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
3306 C2.NumIVMuls, C2.NumBaseAdds,
3307 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
3308}
3309
// Legality predicate for a masked fixed-length vector memory operation
// (presumably isLegalMaskedExpandLoad or similar — TODO confirm; the
// signature line 3310 was lost in extraction). Rejects scalable vectors and
// illegal masked load/stores, with an LMUL-based carve-out for large i8
// vectors. Code is kept verbatim.
3311 Align Alignment) const {
3312 auto *VTy = dyn_cast<VectorType>(DataTy);
3313 if (!VTy || VTy->isScalableTy())
3314 return false;
3315
3316 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3317 return false;
3318
3319 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
3320 // scalarize these types with LMUL >= maximum fixed-length LMUL.
3321 if (VTy->getElementType()->isIntegerTy(8))
3322 if (VTy->getElementCount().getFixedValue() > 256)
3323 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
3324 ST->getMaxLMULForFixedLengthVectors();
3325 return true;
3326}
3327
// Legality predicate for another masked fixed-length vector memory operation
// (presumably isLegalMaskedCompressStore — TODO confirm; the signature line
// 3328 was lost in extraction). Same scalable/masked-legality gating as the
// predicate above, without the i8 carve-out. Code is kept verbatim.
3329 Align Alignment) const {
3330 auto *VTy = dyn_cast<VectorType>(DataTy);
3331 if (!VTy || VTy->isScalableTy())
3332 return false;
3333
3334 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3335 return false;
3336 return true;
3337}
3338
3339/// See if \p I should be considered for address type promotion. We check if \p
3340/// I is a sext with right type and used in memory accesses. If it used in a
3341/// "complex" getelementptr, we allow it to be promoted without finding other
3342/// sext instructions that sign extended the same initial value. A getelementptr
3343/// is considered as "complex" if it has more than 2 operands.
// NOTE(review): the signature line (3344) was lost in extraction; code is
// kept verbatim.
3345 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
3346 bool Considerable = false;
3347 AllowPromotionWithoutCommonHeader = false;
// Only i64-typed sign extensions are candidates.
3348 if (!isa<SExtInst>(&I))
3349 return false;
3350 Type *ConsideredSExtType =
3351 Type::getInt64Ty(I.getParent()->getParent()->getContext());
3352 if (I.getType() != ConsideredSExtType)
3353 return false;
3354 // See if the sext is the one with the right type and used in at least one
3355 // GetElementPtrInst.
3356 for (const User *U : I.users()) {
3357 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
3358 Considerable = true;
3359 // A getelementptr is considered as "complex" if it has more than 2
3360 // operands. We will promote a SExt used in such complex GEP as we
3361 // expect some computation to be merged if they are done on 64 bits.
3362 if (GEPInst->getNumOperands() > 2) {
3363 AllowPromotionWithoutCommonHeader = true;
3364 break;
3365 }
3366 }
3367 }
3368 return Considerable;
3369}
3370
3371bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
3372 switch (Opcode) {
3373 case Instruction::Add:
3374 case Instruction::Sub:
3375 case Instruction::Mul:
3376 case Instruction::And:
3377 case Instruction::Or:
3378 case Instruction::Xor:
3379 case Instruction::FAdd:
3380 case Instruction::FSub:
3381 case Instruction::FMul:
3382 case Instruction::FDiv:
3383 case Instruction::ICmp:
3384 case Instruction::FCmp:
3385 return true;
3386 case Instruction::Shl:
3387 case Instruction::LShr:
3388 case Instruction::AShr:
3389 case Instruction::UDiv:
3390 case Instruction::SDiv:
3391 case Instruction::URem:
3392 case Instruction::SRem:
3393 case Instruction::Select:
3394 return Operand == 1;
3395 default:
3396 return false;
3397 }
3398}
3399
// Instruction-level canSplatOperand: extends the opcode-based check to vector
// intrinsics (vp.* and saturating/min-max intrinsics) that have vector-scalar
// forms. NOTE(review): the signature line (3400, taking Instruction *I and
// int Operand) was lost in extraction; code is kept verbatim.
3401 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3402 return false;
3403
3404 if (canSplatOperand(I->getOpcode(), Operand))
3405 return true;
3406
3407 auto *II = dyn_cast<IntrinsicInst>(I);
3408 if (!II)
3409 return false;
3410
3411 switch (II->getIntrinsicID()) {
3412 case Intrinsic::fma:
3413 case Intrinsic::vp_fma:
3414 case Intrinsic::fmuladd:
3415 case Intrinsic::vp_fmuladd:
3416 return Operand == 0 || Operand == 1;
3417 case Intrinsic::vp_shl:
3418 case Intrinsic::vp_lshr:
3419 case Intrinsic::vp_ashr:
3420 case Intrinsic::vp_udiv:
3421 case Intrinsic::vp_sdiv:
3422 case Intrinsic::vp_urem:
3423 case Intrinsic::vp_srem:
3424 case Intrinsic::ssub_sat:
3425 case Intrinsic::vp_ssub_sat:
3426 case Intrinsic::usub_sat:
3427 case Intrinsic::vp_usub_sat:
3428 case Intrinsic::vp_select:
3429 return Operand == 1;
3430 // These intrinsics are commutative.
3431 case Intrinsic::vp_add:
3432 case Intrinsic::vp_mul:
3433 case Intrinsic::vp_and:
3434 case Intrinsic::vp_or:
3435 case Intrinsic::vp_xor:
3436 case Intrinsic::vp_fadd:
3437 case Intrinsic::vp_fmul:
3438 case Intrinsic::vp_icmp:
3439 case Intrinsic::vp_fcmp:
3440 case Intrinsic::smin:
3441 case Intrinsic::vp_smin:
3442 case Intrinsic::umin:
3443 case Intrinsic::vp_umin:
3444 case Intrinsic::smax:
3445 case Intrinsic::vp_smax:
3446 case Intrinsic::umax:
3447 case Intrinsic::vp_umax:
3448 case Intrinsic::sadd_sat:
3449 case Intrinsic::vp_sadd_sat:
3450 case Intrinsic::uadd_sat:
3451 case Intrinsic::vp_uadd_sat:
3452 // These intrinsics have 'vr' versions.
3453 case Intrinsic::vp_sub:
3454 case Intrinsic::vp_fsub:
3455 case Intrinsic::vp_fdiv:
3456 return Operand == 0 || Operand == 1;
3457 default:
3458 return false;
3459 }
3460}
3461
3462/// Check if sinking \p I's operands to I's basic block is profitable, because
3463/// the operands can be folded into a target instruction, e.g.
3464/// splats of scalars can fold into vector instructions.
// NOTE(review): extraction dropped the signature lines (3465-3466) and two
// pattern-match call heads (3488, 3523, each a match(Op/...) / m_Shuffle
// expression whose trailing arguments remain); code is kept verbatim.
3467 using namespace llvm::PatternMatch;
3468
3469 if (I->isBitwiseLogicOp()) {
3470 if (!I->getType()->isVectorTy()) {
3471 if (ST->hasStdExtZbb() || ST->hasStdExtZbkb()) {
3472 for (auto &Op : I->operands()) {
3473 // (and/or/xor X, (not Y)) -> (andn/orn/xnor X, Y)
3474 if (match(Op.get(), m_Not(m_Value()))) {
3475 Ops.push_back(&Op);
3476 return true;
3477 }
3478 }
3479 }
3480 } else if (I->getOpcode() == Instruction::And && ST->hasStdExtZvkb()) {
3481 for (auto &Op : I->operands()) {
3482 // (and X, (not Y)) -> (vandn.vv X, Y)
3483 if (match(Op.get(), m_Not(m_Value()))) {
3484 Ops.push_back(&Op);
3485 return true;
3486 }
3487 // (and X, (splat (not Y))) -> (vandn.vx X, Y)
3489 m_ZeroInt()),
3490 m_Value(), m_ZeroMask()))) {
3491 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
3492 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
3493 Ops.push_back(&Not);
3494 Ops.push_back(&InsertElt);
3495 Ops.push_back(&Op);
3496 return true;
3497 }
3498 }
3499 }
3500 }
3501
3502 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3503 return false;
3504
3505 // Don't sink splat operands if the target prefers it. Some targets requires
3506 // S2V transfer buffers and we can run out of them copying the same value
3507 // repeatedly.
3508 // FIXME: It could still be worth doing if it would improve vector register
3509 // pressure and prevent a vector spill.
3510 if (!ST->sinkSplatOperands())
3511 return false;
3512
3513 for (auto OpIdx : enumerate(I->operands())) {
3514 if (!canSplatOperand(I, OpIdx.index()))
3515 continue;
3516
3517 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
3518 // Make sure we are not already sinking this operand
3519 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
3520 continue;
3521
3522 // We are looking for a splat that can be sunk.
3524 m_Value(), m_ZeroMask())))
3525 continue;
3526
3527 // Don't sink i1 splats.
3528 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
3529 continue;
3530
3531 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
3532 // and vector registers
3533 for (Use &U : Op->uses()) {
3534 Instruction *Insn = cast<Instruction>(U.getUser());
3535 if (!canSplatOperand(Insn, U.getOperandNo()))
3536 return false;
3537 }
3538
3539 // Sink any fpexts since they might be used in a widening fp pattern.
3540 Use *InsertEltUse = &Op->getOperandUse(0);
3541 auto *InsertElt = cast<InsertElementInst>(InsertEltUse);
3542 if (isa<FPExtInst>(InsertElt->getOperand(1)))
3543 Ops.push_back(&InsertElt->getOperandUse(1));
3544 Ops.push_back(InsertEltUse);
3545 Ops.push_back(&OpIdx.value());
3546 }
3547 return true;
3548}
3549
// Configure inline memcmp expansion: scalar load sizes gated on unaligned
// scalar access and Zbb/Zbkb, plus vector sizes for zero-compares when V is
// available. NOTE(review): extraction dropped the return-type line (3550) and
// the local declaration of Options at 3552; code is kept verbatim.
3551RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3553 // TODO: Enable expansion when unaligned access is not supported after we fix
3554 // issues in ExpandMemcmp.
3555 if (!ST->enableUnalignedScalarMem())
3556 return Options;
3557
3558 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
3559 return Options;
3560
3561 Options.AllowOverlappingLoads = true;
3562 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3563 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3564 if (ST->is64Bit()) {
3565 Options.LoadSizes = {8, 4, 2, 1};
3566 Options.AllowedTailExpansions = {3, 5, 6};
3567 } else {
3568 Options.LoadSizes = {4, 2, 1};
3569 Options.AllowedTailExpansions = {3};
3570 }
3571
3572 if (IsZeroCmp && ST->hasVInstructions()) {
3573 unsigned VLenB = ST->getRealMinVLen() / 8;
3574 // The minimum size should be `XLen / 8 + 1`, and the maxinum size should be
3575 // `VLenB * MaxLMUL` so that it fits in a single register group.
3576 unsigned MinSize = ST->getXLen() / 8 + 1;
3577 unsigned MaxSize = VLenB * ST->getMaxLMULForFixedLengthVectors();
// Prepend vector sizes so they stay sorted descending ahead of the
// scalar load sizes.
3578 for (unsigned Size = MinSize; Size <= MaxSize; Size++)
3579 Options.LoadSizes.insert(Options.LoadSizes.begin(), Size);
3580 }
3581 return Options;
3582}
3583
// Treat certain Or/Add/Sub instructions like selects for optimization
// purposes. NOTE(review): extraction dropped the signature lines (3583-3584),
// the enclosing guard opened before line 3587 (the stray closing brace at
// 3598 belongs to it — presumably an EnableOrLikeSelectOpt check, cf. the
// cl::opt declared at the top of the file), and the final fallback return
// (3599); code is kept verbatim.
3585 const Instruction *I) const {
3587 // For the binary operators (e.g. or) we need to be more careful than
3588 // selects, here we only transform them if they are already at a natural
3589 // break point in the code - the end of a block with an unconditional
3590 // terminator.
3591 if (I->getOpcode() == Instruction::Or &&
3592 isa<UncondBrInst>(I->getNextNode()))
3593 return true;
3594
3595 if (I->getOpcode() == Instruction::Add ||
3596 I->getOpcode() == Instruction::Sub)
3597 return true;
3598 }
3600}
3601
// Attribute-compatibility predicate used by outlining/merging decisions
// (exact hook name unknown — the signature line 3602 and the final return at
// 3611 were lost in extraction; recover before editing). Rejects the RISC-V
// "interrupt" string attribute so outlined functions are not marked as
// interrupt handlers. Code is kept verbatim.
3603 const Function *Caller, const Attribute &Attr) const {
3604 // "interrupt" controls the prolog/epilog of interrupt handlers (and includes
3605 // restrictions on their signatures). We can outline from the bodies of these
3606 // handlers, but when we do we need to make sure we don't mark the outlined
3607 // function as an interrupt handler too.
3608 if (Attr.isStringAttribute() && Attr.getKindAsString() == "interrupt")
3609 return false;
3610
3612}
3613
// InstCombine helper (presumably part of instCombineIntrinsic — the signature
// line 3615 was lost in extraction, as were the builder-call heads at
// 3657-3658): rescales a constant riscv.vmv.v.x splat so that a following
// bitcast to a wider-element vector type can be removed. Code is kept
// verbatim.
3614std::optional<Instruction *>
3616 // If all operands of a vmv.v.x are constant, fold a bitcast(vmv.v.x) to scale
3617 // the vmv.v.x, enabling removal of the bitcast. The transform helps avoid
3618 // creating redundant masks.
3619 const DataLayout &DL = IC.getDataLayout();
3620 if (II.user_empty())
3621 return {};
3622 auto *TargetVecTy = dyn_cast<ScalableVectorType>(II.user_back()->getType());
3623 if (!TargetVecTy)
3624 return {};
3625 const APInt *Scalar;
3626 uint64_t VL;
// Require a constant splat value, constant VL, and that every user is a
// bitcast to the same target vector type.
3628 m_Poison(), m_APInt(Scalar), m_ConstantInt(VL))) ||
3629 !all_of(II.users(), [TargetVecTy](User *U) {
3630 return U->getType() == TargetVecTy && match(U, m_BitCast(m_Value()));
3631 }))
3632 return {};
3633 auto *SourceVecTy = cast<ScalableVectorType>(II.getType());
3634 unsigned TargetEltBW = DL.getTypeSizeInBits(TargetVecTy->getElementType());
3635 unsigned SourceEltBW = DL.getTypeSizeInBits(SourceVecTy->getElementType());
3636 if (TargetEltBW % SourceEltBW)
3637 return {};
3638 unsigned TargetScale = TargetEltBW / SourceEltBW;
3639 if (VL % TargetScale)
3640 return {};
3641 Type *VLTy = II.getOperand(2)->getType();
3642 ElementCount SourceEC = SourceVecTy->getElementCount();
3643 unsigned NewEltBW = SourceEltBW * TargetScale;
3644 if (!SourceEC.isKnownMultipleOf(TargetScale) ||
3645 !DL.fitsInLegalInteger(NewEltBW))
3646 return {};
3647 auto *NewEltTy = IntegerType::get(II.getContext(), NewEltBW);
3648 if (!TLI->isLegalElementTypeForRVV(TLI->getValueType(DL, NewEltTy)))
3649 return {};
3650 ElementCount NewEC = SourceEC.divideCoefficientBy(TargetScale);
3651 Type *RetTy = VectorType::get(NewEltTy, NewEC);
3652 assert(SourceVecTy->canLosslesslyBitCastTo(RetTy) &&
3653 "Lossless bitcast between types expected");
// Widen the splat immediate by replicating it across the new element width,
// and emit a rescaled vmv.v.x with VL divided accordingly.
3654 APInt NewScalar = APInt::getSplat(NewEltBW, *Scalar);
3655 return IC.replaceInstUsesWith(
3656 II,
3659 RetTy, Intrinsic::riscv_vmv_v_x,
3660 {PoisonValue::get(RetTy), ConstantInt::get(NewEltTy, NewScalar),
3661 ConstantInt::get(VLTy, VL / TargetScale)}),
3662 SourceVecTy));
3663}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static bool shouldSplit(Instruction *InsertPoint, DenseSet< Value * > &PrevConditionValues, DenseSet< Value * > &ConditionValues, DominatorTree &DT, DenseSet< Instruction * > &Unhoistables)
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
Hexagon Common GEP
static cl::opt< int > InstrCost("inline-instr-cost", cl::Hidden, cl::init(5), cl::desc("Cost of a single instruction when inlining"))
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
static InstructionCost costShuffleViaVRegSplitting(const RISCVTTIImpl &TTI, MVT LegalVT, std::optional< unsigned > VLen, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind)
Try to perform better estimation of the permutation.
static InstructionCost costShuffleViaSplitting(const RISCVTTIImpl &TTI, MVT LegalVT, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind)
Attempt to approximate the cost of a shuffle which will require splitting during legalization.
static bool isRepeatedConcatMask(ArrayRef< int > Mask, int &SubVectorSize)
static unsigned isM1OrSmaller(MVT VT)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-riscv-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SLPMaxVF("riscv-v-slp-max-vf", cl::desc("Overrides result used for getMaximumVF query which is used " "exclusively by SLP vectorizer."), cl::Hidden)
static cl::opt< unsigned > RVVRegisterWidthLMUL("riscv-v-register-bit-width-lmul", cl::desc("The LMUL to use for getRegisterBitWidth queries. Affects LMUL used " "by autovectorized code. Fractional LMULs are not supported."), cl::init(2), cl::Hidden)
static cl::opt< unsigned > RVVMinTripCount("riscv-v-min-trip-count", cl::desc("Set the lower bound of a trip count to decide on " "vectorization while tail-folding."), cl::init(5), cl::Hidden)
static InstructionCost getIntImmCostImpl(const DataLayout &DL, const RISCVSubtarget *ST, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, bool FreeZeroes)
static VectorType * getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST, LLVMContext &C)
static const CostTblEntry VectorIntrinsicCostTable[]
static bool canUseShiftPair(Instruction *Inst, const APInt &Imm)
static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm)
This file defines a TargetTransformInfoImplBase conforming object specific to the RISC-V target machi...
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
This file contains some templates that are useful if you are working with the STL at all.
#define LLVM_DEBUG(...)
Definition Debug.h:114
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition APInt.h:78
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
Functions, function parameters, and return types can have attributes to indicate how they should be t...
Definition Attributes.h:105
LLVM_ABI bool isStringAttribute() const
Return true if the attribute is a string (target-dependent) attribute.
LLVM_ABI StringRef getKindAsString() const
Return the attribute's kind as a string.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
std::optional< unsigned > getMaxVScale() const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
bool isLegalAddImmediate(int64_t imm) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
std::optional< unsigned > getVScaleForTuning() const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
unsigned getRegUsageForType(Type *Ty) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
Definition InstrTypes.h:693
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:691
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:681
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition InstrTypes.h:690
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition InstrTypes.h:688
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:692
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:689
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition InstrTypes.h:678
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:770
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
This class represents a range of values.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool noNaNs() const
Definition FMF.h:68
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2199
The core instruction combiner logic.
const DataLayout & getDataLayout() const
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
BuilderTy & Builder
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
Information for memory intrinsic cost model.
const Instruction * getInst() const
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool shouldCopyAttributeWhenOutliningFrom(const Function *Caller, const Attribute &Attr) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) const override
InstructionCost getStridedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) const
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
unsigned getMinTripCountTailFoldingThreshold() const override
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const override
InstructionCost getAddressComputationCost(Type *PTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
InstructionCost getStoreImmCost(Type *VecTy, TTI::OperandValueInfo OpInfo, TTI::TargetCostKind CostKind) const
Return the cost of materializing an immediate for a value operand of a store instruction.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
bool hasActiveVectorLength() const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool canSplatOperand(Instruction *I, int Operand) const
Return true if the (vector) instruction I will be lowered to an instruction with a scalar splat opera...
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const override
unsigned getRegUsageForType(Type *Ty) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const override
bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment) const override
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const override
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
InstructionCost getExpandCompressMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool preferAlternateOpcodeVectorization() const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
std::optional< unsigned > getMaxVScale() const override
bool shouldExpandReduction(const IntrinsicInst *II) const override
std::optional< unsigned > getVScaleForTuning() const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Get memory intrinsic cost based on arguments.
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
Estimate the overhead of scalarizing an instruction.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpdInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
static MVT getM1VT(MVT VT)
Given a vector (either fixed or scalable), return the scalable vector corresponding to a vector regis...
InstructionCost getVRGatherVVCost(MVT VT) const
Return the cost of a vrgather.vv instruction for the type VT.
InstructionCost getVRGatherVICost(MVT VT) const
Return the cost of a vrgather.vi (or vx) instruction for the type VT.
static unsigned computeVLMAX(unsigned VectorBits, unsigned EltSize, unsigned MinSize)
InstructionCost getLMULCost(MVT VT) const
Return the cost of LMUL for linear operations.
InstructionCost getVSlideVICost(MVT VT) const
Return the cost of a vslidedown.vi or vslideup.vi instruction for the type VT.
InstructionCost getVSlideVXCost(MVT VT) const
Return the cost of a vslidedown.vx or vslideup.vx instruction for the type VT.
static RISCVVType::VLMUL getLMUL(MVT VT)
This class represents an analyzed expression in the program.
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:895
The main scalar evolution driver.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
virtual const DataLayout & getDataLayout() const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
virtual bool shouldCopyAttributeWhenOutliningFrom(const Function *Caller, const Attribute &Attr) const
virtual bool isLoweredToCall(const Function *F) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:314
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:290
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:65
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:147
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:290
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:403
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:967
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
constexpr bool isKnownMultipleOf(ScalarTy RHS) const
This function tells the caller whether the element count is known at compile time to be a multiple of...
Definition TypeSize.h:180
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:230
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
class_match< PoisonValue > m_Poison()
Match an arbitrary poison constant.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
bool match(Val *V, const Pattern &P)
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
int getIntMatCost(const APInt &Val, unsigned Size, const MCSubtargetInfo &STI, bool CompressionCost, bool FreeZeroes)
static constexpr unsigned RVVBitsPerBlock
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit version).
Definition MathExtras.h:273
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
TargetTransformInfo TTI
LLVM_ABI bool isMaskedSlidePair(ArrayRef< int > Mask, int NumElts, std::array< std::pair< int, int >, 2 > &SrcInfo)
Does this shuffle mask represent either one slide shuffle or a pair of two slide shuffles,...
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1885
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
LLVM_ABI void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2146
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:330
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
Extended Value Type.
Definition ValueTypes.h:35
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
Align valueOrOne() const
For convenience, returns a valid alignment or 1 if undefined.
Definition Alignment.h:130
Information about a load/store intrinsic defined by the target.
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Describe known properties for a set of pointers.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number of loop iterations is not known at compile time).
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminate compare and branch overhead).
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).