RISCVTargetTransformInfo.cpp
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
18#include "llvm/IR/IntrinsicsRISCV.h"
21#include <cmath>
22#include <optional>
23using namespace llvm;
24using namespace llvm::PatternMatch;
25
26#define DEBUG_TYPE "riscvtti"
27
28static cl::opt<unsigned> RVVRegisterWidthLMUL(
29    "riscv-v-register-bit-width-lmul",
30    cl::desc(
31        "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
32        "by autovectorized code. Fractional LMULs are not supported."),
33    cl::init(2), cl::Hidden);
34
35static cl::opt<unsigned> SLPMaxVF(
36    "riscv-v-slp-max-vf",
37    cl::desc(
38        "Overrides result used for getMaximumVF query which is used "
39        "exclusively by SLP vectorizer."),
40    cl::Hidden);
41
42static cl::opt<unsigned>
43    RVVMinTripCount("riscv-v-min-trip-count",
44                    cl::desc("Set the lower bound of a trip count to decide on "
45                             "vectorization while tail-folding."),
46                    cl::init(5), cl::Hidden);
47
48static cl::opt<bool> EnableOrLikeSelectOpt("enable-riscv-or-like-select",
49 cl::init(true), cl::Hidden);
50
51InstructionCost
52RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
53                                      TTI::TargetCostKind CostKind) const {
54  // Check if the type is valid for all CostKind
55  if (!VT.isVector())
56    return InstructionCost::getInvalid();
57  size_t NumInstr = OpCodes.size();
58  if (CostKind == TTI::TCK_CodeSize)
59    return NumInstr;
60  InstructionCost LMULCost = TLI->getLMULCost(VT);
61  if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
62    return LMULCost * NumInstr;
63 InstructionCost Cost = 0;
64 for (auto Op : OpCodes) {
65 switch (Op) {
66 case RISCV::VRGATHER_VI:
67 Cost += TLI->getVRGatherVICost(VT);
68 break;
69 case RISCV::VRGATHER_VV:
70 Cost += TLI->getVRGatherVVCost(VT);
71 break;
72 case RISCV::VSLIDEUP_VI:
73 case RISCV::VSLIDEDOWN_VI:
74 Cost += TLI->getVSlideVICost(VT);
75 break;
76 case RISCV::VSLIDEUP_VX:
77 case RISCV::VSLIDEDOWN_VX:
78 Cost += TLI->getVSlideVXCost(VT);
79 break;
80 case RISCV::VREDMAX_VS:
81 case RISCV::VREDMIN_VS:
82 case RISCV::VREDMAXU_VS:
83 case RISCV::VREDMINU_VS:
84 case RISCV::VREDSUM_VS:
85 case RISCV::VREDAND_VS:
86 case RISCV::VREDOR_VS:
87 case RISCV::VREDXOR_VS:
88 case RISCV::VFREDMAX_VS:
89 case RISCV::VFREDMIN_VS:
90 case RISCV::VFREDUSUM_VS: {
91 unsigned VL = VT.getVectorMinNumElements();
92 if (!VT.isFixedLengthVector())
93 VL *= *getVScaleForTuning();
94 Cost += Log2_32_Ceil(VL);
95 break;
96 }
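      // Unordered reductions above are modeled as a log-depth reduction tree:
      // e.g. a vredsum.vs over 32 (known or estimated) elements costs
      // ceil(log2(32)) = 5. The ordered vfredosum.vs below must visit the
      // elements sequentially, so it scales linearly with VL instead.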
97 case RISCV::VFREDOSUM_VS: {
98 unsigned VL = VT.getVectorMinNumElements();
99 if (!VT.isFixedLengthVector())
100 VL *= *getVScaleForTuning();
101 Cost += VL;
102 break;
103 }
104 case RISCV::VMV_X_S:
105 case RISCV::VMV_S_X:
106 case RISCV::VFMV_F_S:
107 case RISCV::VFMV_S_F:
108 case RISCV::VMOR_MM:
109 case RISCV::VMXOR_MM:
110 case RISCV::VMAND_MM:
111 case RISCV::VMANDN_MM:
112 case RISCV::VMNAND_MM:
113 case RISCV::VCPOP_M:
114 case RISCV::VFIRST_M:
115 Cost += 1;
116 break;
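    // Vector divide/remainder are far slower than other vector ALU ops, so
    // scale the LMUL-based cost by TCC_Expensive (4).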
117 case RISCV::VDIV_VV:
118 case RISCV::VREM_VV:
119 Cost += LMULCost * TTI::TCC_Expensive;
120 break;
121 default:
122 Cost += LMULCost;
123 }
124 }
125 return Cost;
126}
127
128static InstructionCost getIntImmCostImpl(const DataLayout &DL,
129                                         const RISCVSubtarget *ST,
130                                         const APInt &Imm, Type *Ty,
131                                         TTI::TargetCostKind CostKind,
132                                         bool FreeZeroes) {
133 assert(Ty->isIntegerTy() &&
134 "getIntImmCost can only estimate cost of materialising integers");
135
136 // We have a Zero register, so 0 is always free.
137 if (Imm == 0)
138 return TTI::TCC_Free;
139
140 // Otherwise, we check how many instructions it will take to materialise.
141 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
142 /*CompressionCost=*/false, FreeZeroes);
143}
144
145InstructionCost
146RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
147                            TTI::TargetCostKind CostKind) const {
148  return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
149}
150
151// Look for patterns of shift followed by AND that can be turned into a pair of
152// shifts. We won't need to materialize an immediate for the AND so these can
153// be considered free.
154static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
155 uint64_t Mask = Imm.getZExtValue();
156 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
157 if (!BO || !BO->hasOneUse())
158 return false;
159
160 if (BO->getOpcode() != Instruction::Shl)
161 return false;
162
163 if (!isa<ConstantInt>(BO->getOperand(1)))
164 return false;
165
166 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
167 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
168 // is a mask shifted by c2 bits with c3 leading zeros.
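  // For example, c1 = 0x3FC (eight ones shifted left by two) with c2 = 2 has
  // countr_zero(c1) == c2, so (and (shl x, 2), 0x3FC) can instead be emitted
  // as (srli (slli x, 56), 54) without materializing the mask constant.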
169 if (isShiftedMask_64(Mask)) {
170 unsigned Trailing = llvm::countr_zero(Mask);
171 if (ShAmt == Trailing)
172 return true;
173 }
174
175 return false;
176}
177
178// If this is i64 AND is part of (X & -(1 << C1) & 0xffffffff) == C2 << C1),
179// DAGCombiner can convert this to (sraiw X, C1) == sext(C2) for RV64. On RV32,
180// the type will be split so only the lower 32 bits need to be compared using
181// (srai/srli X, C) == C2.
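// For example, (X & 0xFFFFFF00) == 0x1200 has Mask == -(1 << 8) in the low 32
// bits; on RV64 it becomes (sraiw X, 8) == 18, and 18 fits an addi+seqz pair.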
182static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm) {
183 if (!Inst->hasOneUse())
184 return false;
185
186 // Look for equality comparison.
187 auto *Cmp = dyn_cast<ICmpInst>(*Inst->user_begin());
188 if (!Cmp || !Cmp->isEquality())
189 return false;
190
191 // Right hand side of comparison should be a constant.
192 auto *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
193 if (!C)
194 return false;
195
196 uint64_t Mask = Imm.getZExtValue();
197
198 // Mask should be of the form -(1 << C) in the lower 32 bits.
199 if (!isUInt<32>(Mask) || !isPowerOf2_32(-uint32_t(Mask)))
200 return false;
201
202 // Comparison constant should be a subset of Mask.
203 uint64_t CmpC = C->getZExtValue();
204 if ((CmpC & Mask) != CmpC)
205 return false;
206
207 // We'll need to sign extend the comparison constant and shift it right. Make
208 // sure the new constant can use addi/xori+seqz/snez.
209 unsigned ShiftBits = llvm::countr_zero(Mask);
210 int64_t NewCmpC = SignExtend64<32>(CmpC) >> ShiftBits;
211 return NewCmpC >= -2048 && NewCmpC <= 2048;
212}
213
214InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
215                                                const APInt &Imm, Type *Ty,
216                                                TTI::TargetCostKind CostKind,
217                                                Instruction *Inst) const {
218 assert(Ty->isIntegerTy() &&
219 "getIntImmCost can only estimate cost of materialising integers");
220
221 // We have a Zero register, so 0 is always free.
222 if (Imm == 0)
223 return TTI::TCC_Free;
224
225 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
226 // commutative, in others the immediate comes from a specific argument index.
227 bool Takes12BitImm = false;
228 unsigned ImmArgIdx = ~0U;
229
230 switch (Opcode) {
231 case Instruction::GetElementPtr:
232 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
233 // split up large offsets in GEP into better parts than ConstantHoisting
234 // can.
235 return TTI::TCC_Free;
236 case Instruction::Store: {
237 // Use the materialization cost regardless of if it's the address or the
238 // value that is constant, except for if the store is misaligned and
239 // misaligned accesses are not legal (experience shows constant hoisting
240 // can sometimes be harmful in such cases).
241 if (Idx == 1 || !Inst)
242 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
243 /*FreeZeroes=*/true);
244
245 StoreInst *ST = cast<StoreInst>(Inst);
246 if (!getTLI()->allowsMemoryAccessForAlignment(
247 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
248 ST->getPointerAddressSpace(), ST->getAlign()))
249 return TTI::TCC_Free;
250
251 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
252 /*FreeZeroes=*/true);
253 }
254 case Instruction::Load:
255 // If the address is a constant, use the materialization cost.
256 return getIntImmCost(Imm, Ty, CostKind);
257 case Instruction::And:
258 // zext.h
259 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
260 return TTI::TCC_Free;
261 // zext.w
262 if (Imm == UINT64_C(0xffffffff) &&
263 ((ST->hasStdExtZba() && ST->isRV64()) || ST->isRV32()))
264 return TTI::TCC_Free;
265 // bclri
266 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
267 return TTI::TCC_Free;
268 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
269 canUseShiftPair(Inst, Imm))
270 return TTI::TCC_Free;
271 if (Inst && Idx == 1 && Imm.getBitWidth() == 64 &&
272 canUseShiftCmp(Inst, Imm))
273 return TTI::TCC_Free;
274 Takes12BitImm = true;
275 break;
276 case Instruction::Add:
277 Takes12BitImm = true;
278 break;
279 case Instruction::Or:
280 case Instruction::Xor:
281 // bseti/binvi
282 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
283 return TTI::TCC_Free;
284 Takes12BitImm = true;
285 break;
286 case Instruction::Mul:
287 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
288 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
289 return TTI::TCC_Free;
290 // One more or less than a power of 2 can use SLLI+ADD/SUB.
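    // For example, x * 9 becomes (slli x, 3) + x and x * 7 becomes
    // (slli x, 3) - x.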
291 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
292 return TTI::TCC_Free;
293 // FIXME: There is no MULI instruction.
294 Takes12BitImm = true;
295 break;
296 case Instruction::Sub:
297 case Instruction::Shl:
298 case Instruction::LShr:
299 case Instruction::AShr:
300 Takes12BitImm = true;
301 ImmArgIdx = 1;
302 break;
303 default:
304 break;
305 }
306
307 if (Takes12BitImm) {
308 // Check immediate is the correct argument...
309 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
310 // ... and fits into the 12-bit immediate.
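      // (The I-type immediate is a sign-extended 12-bit value, i.e.
      // [-2048, 2047]; an add of 2047 is free here, while an add of 4096
      // would have to be materialized.)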
311 if (Imm.getSignificantBits() <= 64 &&
312 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
313 return TTI::TCC_Free;
314 }
315 }
316
317 // Otherwise, use the full materialisation cost.
318 return getIntImmCost(Imm, Ty, CostKind);
319 }
320
321 // By default, prevent hoisting.
322 return TTI::TCC_Free;
323}
324
325InstructionCost
326RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
327                                  const APInt &Imm, Type *Ty,
328                                  TTI::TargetCostKind CostKind) const {
329 // Prevent hoisting in unknown cases.
330 return TTI::TCC_Free;
331}
332
334 return ST->hasVInstructions();
335}
336
337TargetTransformInfo::PopcntSupportKind
338RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
339 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
340 return ST->hasCPOPLike() ? TTI::PSK_FastHardware : TTI::PSK_Software;
341}
342
343InstructionCost RISCVTTIImpl::getPartialReductionCost(
344    unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
345    ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
346 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
347 TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
348  if (Opcode == Instruction::FAdd)
349    return InstructionCost::getInvalid();
350
351 // zve32x is broken for partial_reduce_umla, but let's make sure we
352 // don't generate them.
353 if (!ST->hasStdExtZvdot4a8i() || ST->getELen() < 64 ||
354 Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
355 InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
356      !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4))
357    return InstructionCost::getInvalid();
358
359 Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
360 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
361  // Note: Assuming all vdota4* variants are equal cost
362 return LT.first *
363 getRISCVInstructionCost(RISCV::VDOTA4_VV, LT.second, CostKind);
364}
365
367 // Currently, the ExpandReductions pass can't expand scalable-vector
368 // reductions, but we still request expansion as RVV doesn't support certain
369 // reductions and the SelectionDAG can't legalize them either.
370 switch (II->getIntrinsicID()) {
371 default:
372 return false;
373 // These reductions have no equivalent in RVV
374 case Intrinsic::vector_reduce_mul:
375 case Intrinsic::vector_reduce_fmul:
376 return true;
377 }
378}
379
380std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
381 if (ST->hasVInstructions())
382 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
383 return BaseT::getMaxVScale();
384}
385
386std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
387 if (ST->hasVInstructions())
388 if (unsigned MinVLen = ST->getRealMinVLen();
389 MinVLen >= RISCV::RVVBitsPerBlock)
390 return MinVLen / RISCV::RVVBitsPerBlock;
391  return BaseT::getVScaleForTuning();
392}
393
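// For example, with riscv-v-register-bit-width-lmul=2 and a minimum VLEN of
// 128 bits, the fixed-width vector register width reported below is
// 2 * 128 = 256 bits (when RVV is used for fixed-length vectors).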
394TypeSize
395RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
396  unsigned LMUL =
397      llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
398  switch (K) {
399  case TargetTransformInfo::RGK_Scalar:
400    return TypeSize::getFixed(ST->getXLen());
401  case TargetTransformInfo::RGK_FixedWidthVector:
402    return TypeSize::getFixed(
403        ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
404  case TargetTransformInfo::RGK_ScalableVector:
405    return TypeSize::getScalable(
406        (ST->hasVInstructions() &&
407         ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
408            ? LMUL * RISCV::RVVBitsPerBlock
409            : 0);
410 }
411
412 llvm_unreachable("Unsupported register kind");
413}
414
415InstructionCost RISCVTTIImpl::getStaticDataAddrGenerationCost(
416 const TTI::TargetCostKind CostKind) const {
417  switch (CostKind) {
418  case TTI::TCK_CodeSize:
419  case TTI::TCK_SizeAndLatency:
420    // Always 2 instructions
421    return 2;
422  case TTI::TCK_Latency:
423  case TTI::TCK_RecipThroughput:
424    // Depending on the memory model the address generation will
425 // require AUIPC + ADDI (medany) or LUI + ADDI (medlow). Don't
426 // have a way of getting this information here, so conservatively
427 // require both.
428 // In practice, these are generally implemented together.
429 return (ST->hasAUIPCADDIFusion() && ST->hasLUIADDIFusion()) ? 1 : 2;
430 }
431 llvm_unreachable("Unsupported cost kind");
432}
433
434InstructionCost
435RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty,
436                                      TTI::TargetCostKind CostKind) const {
437 // Add a cost of address generation + the cost of the load. The address
438 // is expected to be a PC relative offset to a constant pool entry
439 // using auipc/addi.
440 return getStaticDataAddrGenerationCost(CostKind) +
441 getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
442 /*AddressSpace=*/0, CostKind);
443}
444
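// Returns true if Mask repeatedly concatenates the leading SubVectorSize
// elements of the first source, e.g. <0,1,0,1,0,1,0,1> gives SubVectorSize = 2.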
445static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
446 unsigned Size = Mask.size();
447 if (!isPowerOf2_32(Size))
448 return false;
449 for (unsigned I = 0; I != Size; ++I) {
450 if (static_cast<unsigned>(Mask[I]) == I)
451 continue;
452 if (Mask[I] != 0)
453 return false;
454 if (Size % I != 0)
455 return false;
456 for (unsigned J = I + 1; J != Size; ++J)
457 // Check the pattern is repeated.
458 if (static_cast<unsigned>(Mask[J]) != J % I)
459 return false;
460 SubVectorSize = I;
461 return true;
462 }
463 // That means Mask is <0, 1, 2, 3>. This is not a concatenation.
464 return false;
465}
466
467static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
468                                        LLVMContext &C) {
469 assert((DataVT.getScalarSizeInBits() != 8 ||
470 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
471 MVT IndexVT = DataVT.changeTypeToInteger();
472 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
473 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
474 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
475}
476
477/// Attempt to approximate the cost of a shuffle which will require splitting
478/// during legalization. Note that processShuffleMasks is not an exact proxy
479/// for the algorithm used in LegalizeVectorTypes, but hopefully it's a
480/// reasonably close upperbound.
481static InstructionCost costShuffleViaSplitting(const RISCVTTIImpl &TTI,
482                                               MVT LegalVT, VectorType *Tp,
483                                               ArrayRef<int> Mask,
484                                               TTI::TargetCostKind CostKind) {
485  assert(LegalVT.isFixedLengthVector() && !Mask.empty() &&
486 "Expected fixed vector type and non-empty mask");
487 unsigned LegalNumElts = LegalVT.getVectorNumElements();
488 // Number of destination vectors after legalization:
489 unsigned NumOfDests = divideCeil(Mask.size(), LegalNumElts);
490 // We are going to permute multiple sources and the result will be in
491 // multiple destinations. Providing an accurate cost only for splits where
492 // the element type remains the same.
493  if (NumOfDests <= 1 ||
494      LegalVT.getVectorElementType().getSizeInBits() !=
495          Tp->getElementType()->getPrimitiveSizeInBits() ||
496      LegalNumElts >= Tp->getElementCount().getFixedValue())
497    return InstructionCost::getInvalid();
498
499 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
500 unsigned LegalVTSize = LegalVT.getStoreSize();
501 // Number of source vectors after legalization:
502 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
503
504 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(), LegalNumElts);
505
506 unsigned NormalizedVF = LegalNumElts * std::max(NumOfSrcs, NumOfDests);
507 unsigned NumOfSrcRegs = NormalizedVF / LegalNumElts;
508 unsigned NumOfDestRegs = NormalizedVF / LegalNumElts;
509 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
510 assert(NormalizedVF >= Mask.size() &&
511 "Normalized mask expected to be not shorter than original mask.");
512 copy(Mask, NormalizedMask.begin());
513 InstructionCost Cost = 0;
514 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
515  processShuffleMasks(
516      NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
517 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
518 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
519 return;
520 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
521 .second)
522 return;
523        Cost += TTI.getShuffleCost(
524            TTI::SK_PermuteSingleSrc,
525            FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
526 SingleOpTy, RegMask, CostKind, 0, nullptr);
527 },
528 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
529        Cost += TTI.getShuffleCost(
530            TTI::SK_PermuteTwoSrc,
531            FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
532 SingleOpTy, RegMask, CostKind, 0, nullptr);
533 });
534 return Cost;
535}
536
537/// Try to perform better estimation of the permutation.
538/// 1. Split the source/destination vectors into real registers.
539/// 2. Do the mask analysis to identify which real registers are
540/// permuted. If more than 1 source registers are used for the
541/// destination register building, the cost for this destination register
542/// is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
543/// source register is used, build mask and calculate the cost as a cost
544/// of PermuteSingleSrc.
545/// Also, for the single register permute we try to identify if the
546/// destination register is just a copy of the source register or the
547/// copy of the previous destination register (the cost is
548/// TTI::TCC_Basic). If the source register is just reused, the cost for
549/// this operation is 0.
550static InstructionCost
551costShuffleViaVRegSplitting(const RISCVTTIImpl &TTI, MVT LegalVT,
552                            std::optional<unsigned> VLen, VectorType *Tp,
553                            ArrayRef<int> Mask, TTI::TargetCostKind CostKind) {
554  assert(LegalVT.isFixedLengthVector());
555  if (!VLen || Mask.empty())
556    return InstructionCost::getInvalid();
557 MVT ElemVT = LegalVT.getVectorElementType();
558 unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
559 LegalVT = TTI.getTypeLegalizationCost(
560 FixedVectorType::get(Tp->getElementType(), ElemsPerVReg))
561 .second;
562 // Number of destination vectors after legalization:
563 InstructionCost NumOfDests =
564 divideCeil(Mask.size(), LegalVT.getVectorNumElements());
565  if (NumOfDests <= 1 ||
566      LegalVT.getVectorElementType().getSizeInBits() !=
567          Tp->getElementType()->getPrimitiveSizeInBits() ||
568      LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue())
569    return InstructionCost::getInvalid();
570
571 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
572 unsigned LegalVTSize = LegalVT.getStoreSize();
573 // Number of source vectors after legalization:
574 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
575
576 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
577 LegalVT.getVectorNumElements());
578
579 unsigned E = NumOfDests.getValue();
580 unsigned NormalizedVF =
581 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
582 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
583 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
584 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
585 assert(NormalizedVF >= Mask.size() &&
586 "Normalized mask expected to be not shorter than original mask.");
587 copy(Mask, NormalizedMask.begin());
588 InstructionCost Cost = 0;
589 int NumShuffles = 0;
590 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
591  processShuffleMasks(
592      NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
593 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
594 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
595 return;
596 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
597 .second)
598 return;
599 ++NumShuffles;
600 Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
601 SingleOpTy, RegMask, CostKind, 0, nullptr);
602 },
603 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
604 Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
605 SingleOpTy, RegMask, CostKind, 0, nullptr);
606 NumShuffles += 2;
607 });
608 // Note: check that we do not emit too many shuffles here to prevent code
609 // size explosion.
610 // TODO: investigate, if it can be improved by extra analysis of the masks
611 // to check if the code is more profitable.
612 if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) ||
613 (NumOfDestRegs <= 2 && NumShuffles < 4))
614 return Cost;
615  return InstructionCost::getInvalid();
616}
617
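// Cost a shuffle expressible as at most two slides (each lane taken at a fixed
// offset within one of the sources): e.g. <1, 2, 3, -1> on a 4-element vector
// is a single vslidedown.vi by one.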
618InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
619 ArrayRef<int> Mask,
620                                           TTI::TargetCostKind CostKind) const {
621  // Avoid missing masks and length changing shuffles
622  if (Mask.size() <= 2 || Mask.size() != Tp->getNumElements())
623    return InstructionCost::getInvalid();
624
625 int NumElts = Tp->getNumElements();
626 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
627 // Avoid scalarization cases
628  if (!LT.second.isFixedLengthVector())
629    return InstructionCost::getInvalid();
630
631 // Requires moving elements between parts, which requires additional
632 // unmodeled instructions.
633  if (LT.first != 1)
634    return InstructionCost::getInvalid();
635
636 auto GetSlideOpcode = [&](int SlideAmt) {
637 assert(SlideAmt != 0);
638 bool IsVI = isUInt<5>(std::abs(SlideAmt));
639 if (SlideAmt < 0)
640 return IsVI ? RISCV::VSLIDEDOWN_VI : RISCV::VSLIDEDOWN_VX;
641 return IsVI ? RISCV::VSLIDEUP_VI : RISCV::VSLIDEUP_VX;
642 };
643
644 std::array<std::pair<int, int>, 2> SrcInfo;
645  if (!isMaskedSlidePair(Mask, NumElts, SrcInfo))
646    return InstructionCost::getInvalid();
647
648 if (SrcInfo[1].second == 0)
649 std::swap(SrcInfo[0], SrcInfo[1]);
650
651 InstructionCost FirstSlideCost = 0;
652 if (SrcInfo[0].second != 0) {
653 unsigned Opcode = GetSlideOpcode(SrcInfo[0].second);
654 FirstSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
655 }
656
657 if (SrcInfo[1].first == -1)
658 return FirstSlideCost;
659
660 InstructionCost SecondSlideCost = 0;
661 if (SrcInfo[1].second != 0) {
662 unsigned Opcode = GetSlideOpcode(SrcInfo[1].second);
663 SecondSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
664 } else {
665 SecondSlideCost =
666 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
667 }
668
669 auto EC = Tp->getElementCount();
670  VectorType *MaskTy =
671      VectorType::get(IntegerType::getInt1Ty(Tp->getContext()), EC);
672  InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
673 return FirstSlideCost + SecondSlideCost + MaskCost;
674}
675
676InstructionCost RISCVTTIImpl::getShuffleCost(
677    TTI::ShuffleKind Kind, VectorType *DstTy,
678    VectorType *SrcTy, ArrayRef<int> Mask,
679    TTI::TargetCostKind CostKind, int Index,
680    VectorType *SubTp, ArrayRef<const Value *> Args,
681    const Instruction *CxtI) const {
682 assert((Mask.empty() || DstTy->isScalableTy() ||
683 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
684 "Expected the Mask to match the return size if given");
685 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
686 "Expected the same scalar types");
687
688 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
689
690 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
691 // For now, skip all fixed vector cost analysis when P extension is available
692 // to avoid crashes in getMinRVVVectorSizeInBits()
693 if (ST->hasStdExtP() && isa<FixedVectorType>(SrcTy))
694 return 1;
695
696 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
697
698 // First, handle cases where having a fixed length vector enables us to
699 // give a more accurate cost than falling back to generic scalable codegen.
700 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
701 if (auto *FVTp = dyn_cast<FixedVectorType>(SrcTy);
702 FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) {
703    InstructionCost VRegSplittingCost = costShuffleViaVRegSplitting(
704        *this, LT.second, ST->getRealVLen(),
705 Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind);
706 if (VRegSplittingCost.isValid())
707 return VRegSplittingCost;
708 switch (Kind) {
709 default:
710 break;
711  case TTI::SK_PermuteSingleSrc: {
712    if (Mask.size() >= 2) {
713 MVT EltTp = LT.second.getVectorElementType();
714 // If the size of the element is < ELEN then shuffles of interleaves and
715 // deinterleaves of 2 vectors can be lowered into the following
716 // sequences
717 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
718 // Example sequence:
719 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
720 // vwaddu.vv v10, v8, v9
721 // li a0, -1 (ignored)
722 // vwmaccu.vx v10, a0, v9
723 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
724 return 2 * LT.first * TLI->getLMULCost(LT.second);
725
726 if (Mask[0] == 0 || Mask[0] == 1) {
727 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
728 // Example sequence:
729 // vnsrl.wi v10, v8, 0
730 if (equal(DeinterleaveMask, Mask))
731 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
732 LT.second, CostKind);
733 }
734 }
735 int SubVectorSize;
736 if (LT.second.getScalarSizeInBits() != 1 &&
737 isRepeatedConcatMask(Mask, SubVectorSize)) {
738      InstructionCost Cost = 0;
739      unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
740 // The cost of extraction from a subvector is 0 if the index is 0.
741 for (unsigned I = 0; I != NumSlides; ++I) {
742 unsigned InsertIndex = SubVectorSize * (1 << I);
743 FixedVectorType *SubTp =
744 FixedVectorType::get(SrcTy->getElementType(), InsertIndex);
745 FixedVectorType *DestTp =
747 std::pair<InstructionCost, MVT> DestLT =
749 // Add the cost of whole vector register move because the
750 // destination vector register group for vslideup cannot overlap the
751 // source.
752 Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
753 Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {},
754 CostKind, InsertIndex, SubTp);
755 }
756 return Cost;
757 }
758 }
759
760 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
761 SlideCost.isValid())
762 return SlideCost;
763
764 // vrgather + cost of generating the mask constant.
765 // We model this for an unknown mask with a single vrgather.
766 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
767 LT.second.getVectorNumElements() <= 256)) {
768 VectorType *IdxTy =
769 getVRGatherIndexType(LT.second, *ST, SrcTy->getContext());
770 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
771 return IndexCost +
772 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
773 }
774 break;
775 }
776  case TTI::SK_Transpose:
777  case TTI::SK_PermuteTwoSrc: {
778
779 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
780 SlideCost.isValid())
781 return SlideCost;
782
783 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
784 // register for the second vrgather. We model this for an unknown
785 // (shuffle) mask.
786 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
787 LT.second.getVectorNumElements() <= 256)) {
788 auto &C = SrcTy->getContext();
789 auto EC = SrcTy->getElementCount();
790 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
791      VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
792      InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
793 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
794 return 2 * IndexCost +
795 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
796 LT.second, CostKind) +
797 MaskCost;
798 }
799 break;
800 }
801 }
802
803 auto shouldSplit = [](TTI::ShuffleKind Kind) {
804 switch (Kind) {
805 default:
806 return false;
810 return true;
811 }
812 };
813
814 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
815 shouldSplit(Kind)) {
816 InstructionCost SplitCost =
817 costShuffleViaSplitting(*this, LT.second, FVTp, Mask, CostKind);
818 if (SplitCost.isValid())
819 return SplitCost;
820 }
821 }
822
823 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
824 switch (Kind) {
825 default:
826 // Fallthrough to generic handling.
827 // TODO: Most of these cases will return getInvalid in generic code, and
828 // must be implemented here.
829 break;
830  case TTI::SK_ExtractSubvector:
831    // Extract at zero is always a subregister extract
832 if (Index == 0)
833 return TTI::TCC_Free;
834
835 // If we're extracting a subvector of at most m1 size at a sub-register
836 // boundary - which unfortunately we need exact vlen to identify - this is
837 // a subregister extract at worst and thus won't require a vslidedown.
838 // TODO: Extend for aligned m2, m4 subvector extracts
839    // TODO: Extend for misaligned (but contained) extracts
840 // TODO: Extend for scalable subvector types
841 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
842 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
843 if (std::optional<unsigned> VLen = ST->getRealVLen();
844 VLen && SubLT.second.getScalarSizeInBits() * Index % *VLen == 0 &&
845 SubLT.second.getSizeInBits() <= *VLen)
846 return TTI::TCC_Free;
847 }
848
849 // Example sequence:
850 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
851 // vslidedown.vi v8, v9, 2
852 return LT.first *
853 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
854  case TTI::SK_InsertSubvector:
855    // Example sequence:
856 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
857 // vslideup.vi v8, v9, 2
858 LT = getTypeLegalizationCost(DstTy);
859 return LT.first *
860 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
861 case TTI::SK_Select: {
862 // Example sequence:
863 // li a0, 90
864 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
865 // vmv.s.x v0, a0
866 // vmerge.vvm v8, v9, v8, v0
867 // We use 2 for the cost of the mask materialization as this is the true
868 // cost for small masks and most shuffles are small. At worst, this cost
869 // should be a very small constant for the constant pool load. As such,
870 // we may bias towards large selects slightly more than truly warranted.
871 return LT.first *
872 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
873 LT.second, CostKind));
874 }
875 case TTI::SK_Broadcast: {
876 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
877 Instruction::InsertElement);
878 if (LT.second.getScalarSizeInBits() == 1) {
879 if (HasScalar) {
880 // Example sequence:
881 // andi a0, a0, 1
882 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
883 // vmv.v.x v8, a0
884 // vmsne.vi v0, v8, 0
885 return LT.first *
886 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
887 LT.second, CostKind));
888 }
889 // Example sequence:
890 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
891 // vmv.v.i v8, 0
892 // vmerge.vim v8, v8, 1, v0
893 // vmv.x.s a0, v8
894 // andi a0, a0, 1
895 // vmv.v.x v8, a0
896 // vmsne.vi v0, v8, 0
897
898 return LT.first *
899 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
900 RISCV::VMV_X_S, RISCV::VMV_V_X,
901 RISCV::VMSNE_VI},
902 LT.second, CostKind));
903 }
904
905 if (HasScalar) {
906 // Example sequence:
907 // vmv.v.x v8, a0
908 return LT.first *
909 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
910 }
911
912 // Example sequence:
913 // vrgather.vi v9, v8, 0
914 return LT.first *
915 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
916 }
917 case TTI::SK_Splice: {
918 // vslidedown+vslideup.
919 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
920 // of similar code, but I think we expand through memory.
921 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
922 if (Index >= 0 && Index < 32)
923 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
924 else if (Index < 0 && Index > -32)
925 Opcodes[1] = RISCV::VSLIDEUP_VI;
926 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
927 }
928 case TTI::SK_Reverse: {
929
930 if (!LT.second.isVector())
931      return InstructionCost::getInvalid();
932
933 // TODO: Cases to improve here:
934 // * Illegal vector types
935 // * i64 on RV32
936 if (SrcTy->getElementType()->isIntegerTy(1)) {
937 VectorType *WideTy =
938 VectorType::get(IntegerType::get(SrcTy->getContext(), 8),
939 cast<VectorType>(SrcTy)->getElementCount());
940      return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy,
941                              TTI::CastContextHint::None, CostKind) +
942             getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0,
943                            nullptr) +
944             getCastInstrCost(Instruction::Trunc, SrcTy, WideTy,
945                              TTI::CastContextHint::None, CostKind);
946    }
947
948 MVT ContainerVT = LT.second;
949 if (LT.second.isFixedLengthVector())
950 ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
951 MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
952 if (ContainerVT.bitsLE(M1VT)) {
953 // Example sequence:
954 // csrr a0, vlenb
955 // srli a0, a0, 3
956 // addi a0, a0, -1
957 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
958 // vid.v v9
959 // vrsub.vx v10, v9, a0
960 // vrgather.vv v9, v8, v10
961 InstructionCost LenCost = 3;
962 if (LT.second.isFixedLengthVector())
963 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
964 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
965 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
966 if (LT.second.isFixedLengthVector() &&
967 isInt<5>(LT.second.getVectorNumElements() - 1))
968 Opcodes[1] = RISCV::VRSUB_VI;
969 InstructionCost GatherCost =
970 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
971 return LT.first * (LenCost + GatherCost);
972 }
973
974 // At high LMUL, we split into a series of M1 reverses (see
975 // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate
976 // the resulting gap at the bottom (for fixed vectors only). The important
977 // bit is that the cost scales linearly, not quadratically with LMUL.
978 unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX};
979 InstructionCost FixedCost =
980 getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3;
981 unsigned Ratio =
982        ContainerVT.getVectorMinNumElements() / M1VT.getVectorMinNumElements();
983    InstructionCost GatherCost =
984 getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio;
985 InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 0 :
986 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind);
987 return FixedCost + LT.first * (GatherCost + SlideCost);
988 }
989 }
990 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
991 SubTp);
992}
993
994static unsigned isM1OrSmaller(MVT VT) {
995  RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
996  return (LMUL == RISCVVType::VLMUL::LMUL_F8 ||
997          LMUL == RISCVVType::VLMUL::LMUL_F4 ||
998          LMUL == RISCVVType::VLMUL::LMUL_F2 ||
999          LMUL == RISCVVType::VLMUL::LMUL_1);
1000}
1001
1002InstructionCost RISCVTTIImpl::getScalarizationOverhead(
1003    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
1004 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
1005 TTI::VectorInstrContext VIC) const {
1008
1009 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1010 // For now, skip all fixed vector cost analysis when P extension is available
1011 // to avoid crashes in getMinRVVVectorSizeInBits()
1012 if (ST->hasStdExtP() && isa<FixedVectorType>(Ty)) {
1013 return 1; // Treat as single instruction cost for now
1014 }
1015
1016 // A build_vector (which is m1 sized or smaller) can be done in no
1017 // worse than one vslide1down.vx per element in the type. We could
1018 // in theory do an explode_vector in the inverse manner, but our
1019 // lowering today does not have a first class node for this pattern.
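  // For example, inserting all four lanes of a <4 x i32> that fits in a single
  // vector register is capped at 4 * cost(vslide1down.vx).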
1020  InstructionCost Cost = BaseT::getScalarizationOverhead(
1021      Ty, DemandedElts, Insert, Extract, CostKind);
1022 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1023 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
1024 if (Ty->getScalarSizeInBits() == 1) {
1025 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
1026 // Note: Implicit scalar anyextend is assumed to be free since the i1
1027 // must be stored in a GPR.
1028 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
1029 CostKind) +
1030 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
1031                              TTI::CastContextHint::None, CostKind);
1032    }
1033
1034 assert(LT.second.isFixedLengthVector());
1035 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
1036 if (isM1OrSmaller(ContainerVT)) {
1037 InstructionCost BV =
1038 cast<FixedVectorType>(Ty)->getNumElements() *
1039 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
1040 if (BV < Cost)
1041 Cost = BV;
1042 }
1043 }
1044 return Cost;
1045}
1046
1047InstructionCost
1048RISCVTTIImpl::getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA,
1049                                       TTI::TargetCostKind CostKind) const {
1050  Type *DataTy = MICA.getDataType();
1051 Align Alignment = MICA.getAlignment();
1052 switch (MICA.getID()) {
1053 case Intrinsic::vp_load_ff: {
1054 EVT DataTypeVT = TLI->getValueType(DL, DataTy);
1055 if (!TLI->isLegalFirstFaultLoad(DataTypeVT, Alignment))
1056      return InstructionCost::getInvalid();
1057
1058 unsigned AS = MICA.getAddressSpace();
1059 return getMemoryOpCost(Instruction::Load, DataTy, Alignment, AS, CostKind,
1060 {TTI::OK_AnyValue, TTI::OP_None}, nullptr);
1061 }
1062 case Intrinsic::experimental_vp_strided_load:
1063 case Intrinsic::experimental_vp_strided_store:
1064 return getStridedMemoryOpCost(MICA, CostKind);
1065 case Intrinsic::masked_compressstore:
1066 case Intrinsic::masked_expandload:
1067    return getExpandCompressMemoryOpCost(MICA, CostKind);
1068  case Intrinsic::vp_scatter:
1069 case Intrinsic::vp_gather:
1070 case Intrinsic::masked_scatter:
1071 case Intrinsic::masked_gather:
1072 return getGatherScatterOpCost(MICA, CostKind);
1073 case Intrinsic::vp_load:
1074 case Intrinsic::vp_store:
1075 case Intrinsic::masked_load:
1076 case Intrinsic::masked_store:
1077 return getMaskedMemoryOpCost(MICA, CostKind);
1078 }
1080}
1081
1082InstructionCost
1083RISCVTTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
1084                                    TTI::TargetCostKind CostKind) const {
1085  unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load
1086 : Instruction::Store;
1087 Type *Src = MICA.getDataType();
1088 Align Alignment = MICA.getAlignment();
1089 unsigned AddressSpace = MICA.getAddressSpace();
1090
1091 if (!isLegalMaskedLoadStore(Src, Alignment) ||
1094
1095 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
1096}
1097
1098InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
1099    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1100 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1101 bool UseMaskForCond, bool UseMaskForGaps) const {
1102
1103 // The interleaved memory access pass will lower (de)interleave ops combined
1104 // with an adjacent appropriate memory to vlseg/vsseg intrinsics. vlseg/vsseg
1105 // only support masking per-iteration (i.e. condition), not per-segment (i.e.
1106 // gap).
1107 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
1108 auto *VTy = cast<VectorType>(VecTy);
1109 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
1110    // Need to make sure type hasn't been scalarized
1111 if (LT.second.isVector()) {
1112 auto *SubVecTy =
1113 VectorType::get(VTy->getElementType(),
1114 VTy->getElementCount().divideCoefficientBy(Factor));
1115 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
1116 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
1117 AddressSpace, DL)) {
1118
1119 // Some processors optimize segment loads/stores as one wide memory op +
1120 // Factor * LMUL shuffle ops.
1121 if (ST->hasOptimizedSegmentLoadStore(Factor)) {
1122          InstructionCost Cost =
1123              getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
1124 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
1125 Cost += Factor * TLI->getLMULCost(SubVecVT);
1126 return LT.first * Cost;
1127 }
1128
1129 // Otherwise, the cost is proportional to the number of elements (VL *
1130 // Factor ops).
1131 InstructionCost MemOpCost =
1132 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
1133 CostKind, {TTI::OK_AnyValue, TTI::OP_None});
1134 unsigned NumLoads = getEstimatedVLFor(VTy);
1135 return NumLoads * MemOpCost;
1136 }
1137 }
1138 }
1139
1140 // TODO: Return the cost of interleaved accesses for scalable vector when
1141 // unable to convert to segment accesses instructions.
1142 if (isa<ScalableVectorType>(VecTy))
1143    return InstructionCost::getInvalid();
1144
1145 auto *FVTy = cast<FixedVectorType>(VecTy);
1146 InstructionCost MemCost =
1147 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
1148 unsigned VF = FVTy->getNumElements() / Factor;
1149
1150 // An interleaved load will look like this for Factor=3:
1151 // %wide.vec = load <12 x i32>, ptr %3, align 4
1152 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1153 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1154 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1155 if (Opcode == Instruction::Load) {
1156 InstructionCost Cost = MemCost;
1157 for (unsigned Index : Indices) {
1158 FixedVectorType *VecTy =
1159 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
1160 auto Mask = createStrideMask(Index, Factor, VF);
1161 Mask.resize(VF * Factor, -1);
1162 InstructionCost ShuffleCost =
1164 Mask, CostKind, 0, nullptr, {});
1165 Cost += ShuffleCost;
1166 }
1167 return Cost;
1168 }
1169
1170 // TODO: Model for NF > 2
1171 // We'll need to enhance getShuffleCost to model shuffles that are just
1172 // inserts and extracts into subvectors, since they won't have the full cost
1173 // of a vrgather.
1174 // An interleaved store for 3 vectors of 4 lanes will look like
1175 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
1176 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
1177 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
1178 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
1179 // store <12 x i32> %interleaved.vec, ptr %10, align 4
1180 if (Factor != 2)
1181 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1182 Alignment, AddressSpace, CostKind,
1183 UseMaskForCond, UseMaskForGaps);
1184
1185 assert(Opcode == Instruction::Store && "Opcode must be a store");
1186 // For an interleaving store of 2 vectors, we perform one large interleaving
1187 // shuffle that goes into the wide store
1188 auto Mask = createInterleaveMask(VF, Factor);
1189 InstructionCost ShuffleCost =
1191 CostKind, 0, nullptr, {});
1192 return MemCost + ShuffleCost;
1193}
1194
1195InstructionCost
1196RISCVTTIImpl::getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA,
1197                                     TTI::TargetCostKind CostKind) const {
1198
1199 bool IsLoad = MICA.getID() == Intrinsic::masked_gather ||
1200 MICA.getID() == Intrinsic::vp_gather;
1201 unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
1202 Type *DataTy = MICA.getDataType();
1203 Align Alignment = MICA.getAlignment();
1206
1207 if ((Opcode == Instruction::Load &&
1208 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
1209 (Opcode == Instruction::Store &&
1210 !isLegalMaskedScatter(DataTy, Align(Alignment))))
1212
1213 // Cost is proportional to the number of memory operations implied. For
1214 // scalable vectors, we use an estimate on that number since we don't
1215 // know exactly what VL will be.
1216 auto &VTy = *cast<VectorType>(DataTy);
1217 unsigned NumLoads = getEstimatedVLFor(&VTy);
1218 return NumLoads * TTI::TCC_Basic;
1219}
1220
1221InstructionCost RISCVTTIImpl::getExpandCompressMemoryOpCost(
1222    const MemIntrinsicCostAttributes &MICA,
1223    TTI::TargetCostKind CostKind) const {
1224  unsigned Opcode = MICA.getID() == Intrinsic::masked_expandload
1225 ? Instruction::Load
1226 : Instruction::Store;
1227 Type *DataTy = MICA.getDataType();
1228 bool VariableMask = MICA.getVariableMask();
1229 Align Alignment = MICA.getAlignment();
1230 bool IsLegal = (Opcode == Instruction::Store &&
1231 isLegalMaskedCompressStore(DataTy, Alignment)) ||
1232 (Opcode == Instruction::Load &&
1233 isLegalMaskedExpandLoad(DataTy, Alignment));
1234 if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
1236 // Example compressstore sequence:
1237 // vsetivli zero, 8, e32, m2, ta, ma (ignored)
1238 // vcompress.vm v10, v8, v0
1239 // vcpop.m a1, v0
1240 // vsetvli zero, a1, e32, m2, ta, ma
1241 // vse32.v v10, (a0)
1242 // Example expandload sequence:
1243 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
1244 // vcpop.m a1, v0
1245 // vsetvli zero, a1, e32, m2, ta, ma
1246 // vle32.v v10, (a0)
1247 // vsetivli zero, 8, e32, m2, ta, ma
1248 // viota.m v12, v0
1249 // vrgather.vv v8, v10, v12, v0.t
1250 auto MemOpCost =
1251 getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
1252 auto LT = getTypeLegalizationCost(DataTy);
1253 SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
1254 if (VariableMask)
1255 Opcodes.push_back(RISCV::VCPOP_M);
1256 if (Opcode == Instruction::Store)
1257 Opcodes.append({RISCV::VCOMPRESS_VM});
1258 else
1259 Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
1260 return MemOpCost +
1261 LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1262}
1263
1264InstructionCost
1265RISCVTTIImpl::getStridedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
1266                                     TTI::TargetCostKind CostKind) const {
1267
1268 unsigned Opcode = MICA.getID() == Intrinsic::experimental_vp_strided_load
1269 ? Instruction::Load
1270 : Instruction::Store;
1271
1272 Type *DataTy = MICA.getDataType();
1273 Align Alignment = MICA.getAlignment();
1274 const Instruction *I = MICA.getInst();
1275
1276 if (!isLegalStridedLoadStore(DataTy, Alignment))
1278
1280 return TTI::TCC_Basic;
1281
1282 // Cost is proportional to the number of memory operations implied. For
1283 // scalable vectors, we use an estimate on that number since we don't
1284 // know exactly what VL will be.
1285 // FIXME: This will overcost for i64 on rv32 with +zve64x.
1286 auto &VTy = *cast<VectorType>(DataTy);
1287 InstructionCost MemOpCost =
1288 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1289 {TTI::OK_AnyValue, TTI::OP_None}, I);
1290 unsigned NumLoads = getEstimatedVLFor(&VTy);
1291 return NumLoads * MemOpCost;
1292}
1293
1296 // FIXME: This is a property of the default vector convention, not
1297 // all possible calling conventions. Fixing that will require
1298 // some TTI API and SLP rework.
1301 for (auto *Ty : Tys) {
1302 if (!Ty->isVectorTy())
1303 continue;
1304 Align A = DL.getPrefTypeAlign(Ty);
1305 Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
1306 getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
1307 }
1308 return Cost;
1309}
1310
1311// Currently, these represent both throughput and codesize costs
1312// for the respective intrinsics. The costs in this table are simply
1313// instruction counts with the following adjustments made:
1314// * One vsetvli is considered free.
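// For example, {Intrinsic::ctpop, MVT::i32, 20} models a vector ctpop with
// i32 elements as roughly 20 vector instructions once the type is legalized.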
1315static const CostTblEntry VectorIntrinsicCostTable[]{
1316    {Intrinsic::floor, MVT::f32, 9},
1317 {Intrinsic::floor, MVT::f64, 9},
1318 {Intrinsic::ceil, MVT::f32, 9},
1319 {Intrinsic::ceil, MVT::f64, 9},
1320 {Intrinsic::trunc, MVT::f32, 7},
1321 {Intrinsic::trunc, MVT::f64, 7},
1322 {Intrinsic::round, MVT::f32, 9},
1323 {Intrinsic::round, MVT::f64, 9},
1324 {Intrinsic::roundeven, MVT::f32, 9},
1325 {Intrinsic::roundeven, MVT::f64, 9},
1326 {Intrinsic::rint, MVT::f32, 7},
1327 {Intrinsic::rint, MVT::f64, 7},
1328 {Intrinsic::nearbyint, MVT::f32, 9},
1329 {Intrinsic::nearbyint, MVT::f64, 9},
1330 {Intrinsic::bswap, MVT::i16, 3},
1331 {Intrinsic::bswap, MVT::i32, 12},
1332 {Intrinsic::bswap, MVT::i64, 31},
1333 {Intrinsic::vp_bswap, MVT::i16, 3},
1334 {Intrinsic::vp_bswap, MVT::i32, 12},
1335 {Intrinsic::vp_bswap, MVT::i64, 31},
1336 {Intrinsic::vp_fshl, MVT::i8, 7},
1337 {Intrinsic::vp_fshl, MVT::i16, 7},
1338 {Intrinsic::vp_fshl, MVT::i32, 7},
1339 {Intrinsic::vp_fshl, MVT::i64, 7},
1340 {Intrinsic::vp_fshr, MVT::i8, 7},
1341 {Intrinsic::vp_fshr, MVT::i16, 7},
1342 {Intrinsic::vp_fshr, MVT::i32, 7},
1343 {Intrinsic::vp_fshr, MVT::i64, 7},
1344 {Intrinsic::bitreverse, MVT::i8, 17},
1345 {Intrinsic::bitreverse, MVT::i16, 24},
1346 {Intrinsic::bitreverse, MVT::i32, 33},
1347 {Intrinsic::bitreverse, MVT::i64, 52},
1348 {Intrinsic::vp_bitreverse, MVT::i8, 17},
1349 {Intrinsic::vp_bitreverse, MVT::i16, 24},
1350 {Intrinsic::vp_bitreverse, MVT::i32, 33},
1351 {Intrinsic::vp_bitreverse, MVT::i64, 52},
1352 {Intrinsic::ctpop, MVT::i8, 12},
1353 {Intrinsic::ctpop, MVT::i16, 19},
1354 {Intrinsic::ctpop, MVT::i32, 20},
1355 {Intrinsic::ctpop, MVT::i64, 21},
1356 {Intrinsic::ctlz, MVT::i8, 19},
1357 {Intrinsic::ctlz, MVT::i16, 28},
1358 {Intrinsic::ctlz, MVT::i32, 31},
1359 {Intrinsic::ctlz, MVT::i64, 35},
1360 {Intrinsic::cttz, MVT::i8, 16},
1361 {Intrinsic::cttz, MVT::i16, 23},
1362 {Intrinsic::cttz, MVT::i32, 24},
1363 {Intrinsic::cttz, MVT::i64, 25},
1364 {Intrinsic::vp_ctpop, MVT::i8, 12},
1365 {Intrinsic::vp_ctpop, MVT::i16, 19},
1366 {Intrinsic::vp_ctpop, MVT::i32, 20},
1367 {Intrinsic::vp_ctpop, MVT::i64, 21},
1368 {Intrinsic::vp_ctlz, MVT::i8, 19},
1369 {Intrinsic::vp_ctlz, MVT::i16, 28},
1370 {Intrinsic::vp_ctlz, MVT::i32, 31},
1371 {Intrinsic::vp_ctlz, MVT::i64, 35},
1372 {Intrinsic::vp_cttz, MVT::i8, 16},
1373 {Intrinsic::vp_cttz, MVT::i16, 23},
1374 {Intrinsic::vp_cttz, MVT::i32, 24},
1375 {Intrinsic::vp_cttz, MVT::i64, 25},
1376};
1377
1378InstructionCost
1379RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1380                                    TTI::TargetCostKind CostKind) const {
1381  auto *RetTy = ICA.getReturnType();
1382 switch (ICA.getID()) {
1383 case Intrinsic::lrint:
1384 case Intrinsic::llrint:
1385 case Intrinsic::lround:
1386 case Intrinsic::llround: {
1387 auto LT = getTypeLegalizationCost(RetTy);
1388 Type *SrcTy = ICA.getArgTypes().front();
1389 auto SrcLT = getTypeLegalizationCost(SrcTy);
1390 if (ST->hasVInstructions() && LT.second.isVector()) {
1391      SmallVector<unsigned, 2> Ops;
1392      unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType());
1393 unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType());
1394 if (LT.second.getVectorElementType() == MVT::bf16) {
1395 if (!ST->hasVInstructionsBF16Minimal())
1396          return InstructionCost::getInvalid();
1397        if (DstEltSz == 32)
1398 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
1399 else
1400 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
1401 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1402 !ST->hasVInstructionsF16()) {
1403 if (!ST->hasVInstructionsF16Minimal())
1404          return InstructionCost::getInvalid();
1405        if (DstEltSz == 32)
1406 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
1407 else
1408 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};
1409
1410 } else if (SrcEltSz > DstEltSz) {
1411 Ops = {RISCV::VFNCVT_X_F_W};
1412 } else if (SrcEltSz < DstEltSz) {
1413 Ops = {RISCV::VFWCVT_X_F_V};
1414 } else {
1415 Ops = {RISCV::VFCVT_X_F_V};
1416 }
1417
1418 // We need to use the source LMUL in the case of a narrowing op, and the
1419 // destination LMUL otherwise.
1420 if (SrcEltSz > DstEltSz)
1421 return SrcLT.first *
1422 getRISCVInstructionCost(Ops, SrcLT.second, CostKind);
1423 return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind);
1424 }
1425 break;
1426 }
1427 case Intrinsic::ceil:
1428 case Intrinsic::floor:
1429 case Intrinsic::trunc:
1430 case Intrinsic::rint:
1431 case Intrinsic::round:
1432 case Intrinsic::roundeven: {
1433 // These all use the same code.
1434 auto LT = getTypeLegalizationCost(RetTy);
1435 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1436 return LT.first * 8;
1437 break;
1438 }
1439 case Intrinsic::umin:
1440 case Intrinsic::umax:
1441 case Intrinsic::smin:
1442 case Intrinsic::smax: {
1443 auto LT = getTypeLegalizationCost(RetTy);
1444 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
1445 return LT.first;
1446
1447 if (ST->hasVInstructions() && LT.second.isVector()) {
1448 unsigned Op;
1449 switch (ICA.getID()) {
1450 case Intrinsic::umin:
1451 Op = RISCV::VMINU_VV;
1452 break;
1453 case Intrinsic::umax:
1454 Op = RISCV::VMAXU_VV;
1455 break;
1456 case Intrinsic::smin:
1457 Op = RISCV::VMIN_VV;
1458 break;
1459 case Intrinsic::smax:
1460 Op = RISCV::VMAX_VV;
1461 break;
1462 }
1463 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1464 }
1465 break;
1466 }
1467 case Intrinsic::sadd_sat:
1468 case Intrinsic::ssub_sat:
1469 case Intrinsic::uadd_sat:
1470 case Intrinsic::usub_sat: {
1471 auto LT = getTypeLegalizationCost(RetTy);
1472 if (ST->hasVInstructions() && LT.second.isVector()) {
1473 unsigned Op;
1474 switch (ICA.getID()) {
1475 case Intrinsic::sadd_sat:
1476 Op = RISCV::VSADD_VV;
1477 break;
1478 case Intrinsic::ssub_sat:
1479 Op = RISCV::VSSUB_VV;
1480 break;
1481 case Intrinsic::uadd_sat:
1482 Op = RISCV::VSADDU_VV;
1483 break;
1484 case Intrinsic::usub_sat:
1485 Op = RISCV::VSSUBU_VV;
1486 break;
1487 }
1488 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1489 }
1490 break;
1491 }
1492 case Intrinsic::fma:
1493 case Intrinsic::fmuladd: {
1494 // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin
1495 auto LT = getTypeLegalizationCost(RetTy);
1496 if (ST->hasVInstructions() && LT.second.isVector())
1497 return LT.first *
1498 getRISCVInstructionCost(RISCV::VFMADD_VV, LT.second, CostKind);
1499 break;
1500 }
1501 case Intrinsic::fabs: {
1502 auto LT = getTypeLegalizationCost(RetTy);
1503 if (ST->hasVInstructions() && LT.second.isVector()) {
1504 // lui a0, 8
1505 // addi a0, a0, -1
1506 // vsetvli a1, zero, e16, m1, ta, ma
1507 // vand.vx v8, v8, a0
1508      // f16 with zvfhmin and bf16 with zvfbfmin
1509 if (LT.second.getVectorElementType() == MVT::bf16 ||
1510 (LT.second.getVectorElementType() == MVT::f16 &&
1511 !ST->hasVInstructionsF16()))
1512 return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
1513 CostKind) +
1514 2;
1515 else
1516 return LT.first *
1517 getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
1518 }
1519 break;
1520 }
1521 case Intrinsic::sqrt: {
1522 auto LT = getTypeLegalizationCost(RetTy);
1523 if (ST->hasVInstructions() && LT.second.isVector()) {
1524      SmallVector<unsigned, 2> ConvOp;
1525      SmallVector<unsigned, 2> FsqrtOp;
1526      MVT ConvType = LT.second;
1527 MVT FsqrtType = LT.second;
1528 // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
1529      // will be split.
1530 if (LT.second.getVectorElementType() == MVT::bf16) {
1531 if (LT.second == MVT::nxv32bf16) {
1532 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
1533 RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
1534 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1535 ConvType = MVT::nxv16f16;
1536 FsqrtType = MVT::nxv16f32;
1537 } else {
1538 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
1539 FsqrtOp = {RISCV::VFSQRT_V};
1540 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1541 }
1542 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1543 !ST->hasVInstructionsF16()) {
1544 if (LT.second == MVT::nxv32f16) {
1545 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
1546 RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
1547 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1548 ConvType = MVT::nxv16f16;
1549 FsqrtType = MVT::nxv16f32;
1550 } else {
1551 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
1552 FsqrtOp = {RISCV::VFSQRT_V};
1553 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1554 }
1555 } else {
1556 FsqrtOp = {RISCV::VFSQRT_V};
1557 }
1558
1559 return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) +
1560 getRISCVInstructionCost(ConvOp, ConvType, CostKind));
1561 }
1562 break;
1563 }
1564 case Intrinsic::cttz:
1565 case Intrinsic::ctlz:
1566 case Intrinsic::ctpop: {
1567 auto LT = getTypeLegalizationCost(RetTy);
1568 if (ST->hasStdExtZvbb() && LT.second.isVector()) {
1569 unsigned Op;
1570 switch (ICA.getID()) {
1571 case Intrinsic::cttz:
1572 Op = RISCV::VCTZ_V;
1573 break;
1574 case Intrinsic::ctlz:
1575 Op = RISCV::VCLZ_V;
1576 break;
1577 case Intrinsic::ctpop:
1578 Op = RISCV::VCPOP_V;
1579 break;
1580 }
1581 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1582 }
1583 break;
1584 }
1585 case Intrinsic::abs: {
1586 auto LT = getTypeLegalizationCost(RetTy);
1587 if (ST->hasVInstructions() && LT.second.isVector()) {
1588 // vabs.v v10, v8
1589 if (ST->hasStdExtZvabd())
1590 return LT.first *
1591 getRISCVInstructionCost({RISCV::VABS_V}, LT.second, CostKind);
1592
1593 // vrsub.vi v10, v8, 0
1594 // vmax.vv v8, v8, v10
1595 return LT.first *
1596 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
1597 LT.second, CostKind);
1598 }
1599 break;
1600 }
1601 case Intrinsic::fshl:
1602 case Intrinsic::fshr: {
1603 if (ICA.getArgs().empty())
1604 break;
1605
1606 // Funnel-shifts are ROTL/ROTR when the first and second operand are equal.
1607 // When Zbb/Zbkb is enabled we can use a single ROL(W)/ROR(I)(W)
1608 // instruction.
1609 if ((ST->hasStdExtZbb() || ST->hasStdExtZbkb()) && RetTy->isIntegerTy() &&
1610 ICA.getArgs()[0] == ICA.getArgs()[1] &&
1611 (RetTy->getIntegerBitWidth() == 32 ||
1612 RetTy->getIntegerBitWidth() == 64) &&
1613 RetTy->getIntegerBitWidth() <= ST->getXLen()) {
1614 return 1;
1615 }
1616 break;
1617 }
1618 case Intrinsic::get_active_lane_mask: {
1619 if (ST->hasVInstructions()) {
1620 Type *ExpRetTy = VectorType::get(
1621 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1622 auto LT = getTypeLegalizationCost(ExpRetTy);
1623
1624 // vid.v v8 // considered hoisted
1625 // vsaddu.vx v8, v8, a0
1626 // vmsltu.vx v0, v8, a1
1627 return LT.first *
1628 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
1629 LT.second, CostKind);
1630 }
1631 break;
1632 }
1633 // TODO: add more intrinsic
1634 case Intrinsic::stepvector: {
1635 auto LT = getTypeLegalizationCost(RetTy);
1636 // Legalisation of illegal types involves an `index' instruction plus
1637 // (LT.first - 1) vector adds.
1638 if (ST->hasVInstructions())
1639 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
1640 (LT.first - 1) *
1641 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
1642 return 1 + (LT.first - 1);
1643 }
1644 case Intrinsic::vector_splice_left:
1645 case Intrinsic::vector_splice_right: {
1646 auto LT = getTypeLegalizationCost(RetTy);
1647 // Constant offsets fall through to getShuffleCost.
1648 if (!ICA.isTypeBasedOnly() && isa<ConstantInt>(ICA.getArgs()[2]))
1649 break;
1650 if (ST->hasVInstructions() && LT.second.isVector()) {
1651 return LT.first *
1652 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX},
1653 LT.second, CostKind);
1654 }
1655 break;
1656 }
1657 case Intrinsic::experimental_cttz_elts: {
1658 Type *ArgTy = ICA.getArgTypes()[0];
1659 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
1660 if (getTLI()->shouldExpandCttzElements(ArgType))
1661 break;
1662 InstructionCost Cost = getRISCVInstructionCost(
1663 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
1664
1665 // If zero_is_poison is false, then we will generate additional
1666 // cmp + select instructions to convert -1 to EVL.
1667 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
1668 if (ICA.getArgs().size() > 1 &&
1669 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
1670 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
1671 CmpInst::ICMP_EQ, CostKind) +
1672 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
1673 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1674
1675 return Cost;
1676 }
1677 case Intrinsic::experimental_vp_splice: {
1678 // To support type-based query from vectorizer, set the index to 0.
1679 // Note that the index only changes the cost from vslide.vx to vslide.vi
1680 // and in the current implementation they have the same cost.
1681 return getShuffleCost(TTI::SK_Splice, cast<VectorType>(RetTy),
1682 cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
1683 0, cast<VectorType>(RetTy));
1684 }
1685 case Intrinsic::fptoui_sat:
1686 case Intrinsic::fptosi_sat: {
1687 InstructionCost Cost = 0;
1688 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1689 Type *SrcTy = ICA.getArgTypes()[0];
1690
1691 auto SrcLT = getTypeLegalizationCost(SrcTy);
1692 auto DstLT = getTypeLegalizationCost(RetTy);
1693 if (!SrcTy->isVectorTy())
1694 break;
1695
1696 if (!SrcLT.first.isValid() || !DstLT.first.isValid())
1697 return InstructionCost::getInvalid();
1698
1699 Cost +=
1700 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
1701 RetTy, SrcTy, TTI::CastContextHint::None, CostKind);
1702
1703 // Handle NaN.
1704 // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1.
1705 // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0.
1706 Type *CondTy = RetTy->getWithNewBitWidth(1);
1707 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, SrcTy, CondTy,
1708 CmpInst::FCMP_UNO, CostKind);
1709 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1710 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1711 return Cost;
1712 }
1713 case Intrinsic::experimental_vector_extract_last_active: {
1714 auto *ValTy = cast<VectorType>(ICA.getArgTypes()[0]);
1715 auto *MaskTy = cast<VectorType>(ICA.getArgTypes()[1]);
1716
1717 auto ValLT = getTypeLegalizationCost(ValTy);
1718 auto MaskLT = getTypeLegalizationCost(MaskTy);
1719
1720 // TODO: Return a cheaper cost when all lanes are inactive.
1721 // The expected asm sequence is:
1722 // vcpop.m a0, v0
1723 // beqz a0, exit # Return passthru when all lanes are inactive.
1724 // vid v10, v0.t
1725 // vredmaxu.vs v10, v10, v10
1726 // vmv.x.s a0, v10
1727 // zext.b a0, a0
1728 // vslidedown.vx v8, v8, a0
1729 // vmv.x.s a0, v8
1730 // exit:
1731 // ...
1732
1733 // Find a suitable type for a stepvector.
1734 ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64));
1735 unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
1736 TLI->getVectorIdxTy(getDataLayout()), MaskTy->getElementCount(),
1737 /*ZeroIsPoison=*/true, &VScaleRange);
1738 EltWidth = std::max(EltWidth, MaskTy->getScalarSizeInBits());
1739 Type *StepTy = Type::getIntNTy(MaskTy->getContext(), EltWidth);
1740 auto *StepVecTy = VectorType::get(StepTy, ValTy->getElementCount());
1741 auto StepLT = getTypeLegalizationCost(StepVecTy);
1742
1743 // Currently expandVectorFindLastActive cannot handle step vector split.
1744 // So return an invalid cost when the type needs splitting.
1745 // FIXME: Remove this if expandVectorFindLastActive supports split vectors.
1746 if (StepLT.first > 1)
1747 return InstructionCost::getInvalid();
1748
1749 InstructionCost Cost = 0;
1750 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1751
1752 Cost += MaskLT.first *
1753 getRISCVInstructionCost(RISCV::VCPOP_M, MaskLT.second, CostKind);
1754 Cost += getCFInstrCost(Instruction::CondBr, CostKind, nullptr);
1755 Cost += StepLT.first *
1756 getRISCVInstructionCost(Opcodes, StepLT.second, CostKind);
1757 Cost += getCastInstrCost(Instruction::ZExt,
1758 Type::getInt64Ty(ValTy->getContext()), StepTy,
1759 TTI::CastContextHint::None, CostKind);
1760 Cost += ValLT.first *
1761 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VI, RISCV::VMV_X_S},
1762 ValLT.second, CostKind);
1763 return Cost;
1764 }
1765 }
1766
1767 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1768 if (auto LT = getTypeLegalizationCost(RetTy);
1769 LT.second.isVector()) {
1770 MVT EltTy = LT.second.getVectorElementType();
1771 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1772 ICA.getID(), EltTy))
1773 return LT.first * Entry->Cost;
1774 }
1775 }
1776
1777 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1778}
1779
1780InstructionCost
1781RISCVTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
1782 const SCEV *Ptr,
1783 TTI::TargetCostKind CostKind) const {
1784 // Address computations for vector indexed load/store likely require an offset
1785 // and/or scaling.
1786 if (ST->hasVInstructions() && PtrTy->isVectorTy())
1787 return getArithmeticInstrCost(Instruction::Add, PtrTy, CostKind);
1788
1789 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1790}
1791
1792InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1793 Type *Src,
1794 TTI::CastContextHint CCH,
1795 TTI::TargetCostKind CostKind,
1796 const Instruction *I) const {
1797 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1798 if (!IsVectorType)
1799 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1800
1801 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1802 // For now, skip all fixed vector cost analysis when P extension is available
1803 // to avoid crashes in getMinRVVVectorSizeInBits()
1804 if (ST->hasStdExtP() &&
1805 (isa<FixedVectorType>(Dst) || isa<FixedVectorType>(Src))) {
1806 return 1; // Treat as single instruction cost for now
1807 }
1808
1809 // FIXME: Need to compute legalizing cost for illegal types. The current
1810 // code handles only legal types and those which can be trivially
1811 // promoted to legal.
1812 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1813 Dst->getScalarSizeInBits() > ST->getELen())
1814 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1815
1816 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1817 assert(ISD && "Invalid opcode");
1818 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1819 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1820
1821 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1822 // The shared implementation doesn't model vector widening during legalization
1823 // and instead assumes scalarization. In order to scalarize an <N x i1>
1824 // vector, we need to extend/trunc to/from i8. If we don't special case
1825 // this, we can get an infinite recursion cycle.
1826 switch (ISD) {
1827 default:
1828 break;
1829 case ISD::SIGN_EXTEND:
1830 case ISD::ZERO_EXTEND:
1831 if (Src->getScalarSizeInBits() == 1) {
1832 // We do not use vsext/vzext to extend from mask vector.
1833 // Instead we use the following instructions to extend from mask vector:
1834 // vmv.v.i v8, 0
1835 // vmerge.vim v8, v8, -1, v0 (repeated per split)
1836 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1837 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1838 DstLT.second, CostKind) +
1839 DstLT.first - 1;
1840 }
1841 break;
1842 case ISD::TRUNCATE:
1843 if (Dst->getScalarSizeInBits() == 1) {
1844 // We do not use several vncvt instructions to truncate to a mask vector,
1845 // so we cannot use PowDiff to calculate it.
1846 // Instead we use the following instructions to truncate to mask vector:
1847 // vand.vi v8, v8, 1
1848 // vmsne.vi v0, v8, 0
1849 return SrcLT.first *
1850 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1851 SrcLT.second, CostKind) +
1852 SrcLT.first - 1;
1853 }
1854 break;
1855 };
1856
1857 // Our actual lowering for the case where a wider legal type is available
1858 // uses promotion to the wider type. This is reflected in the result of
1859 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1860 // scalarized if the legalized Src and Dst are not equal sized.
1861 const DataLayout &DL = this->getDataLayout();
1862 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1863 !SrcLT.first.isValid() || !DstLT.first.isValid() ||
1864 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
1865 SrcLT.second.getSizeInBits()) ||
1866 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
1867 DstLT.second.getSizeInBits()) ||
1868 SrcLT.first > 1 || DstLT.first > 1)
1869 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1870
1871 // The split cost is handled by the base getCastInstrCost
1872 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1873
1874 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1875 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
1876 switch (ISD) {
1877 case ISD::SIGN_EXTEND:
1878 case ISD::ZERO_EXTEND: {
1879 if ((PowDiff < 1) || (PowDiff > 3))
1880 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1881 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1882 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1883 unsigned Op =
1884 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1885 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1886 }
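// Illustrative example: zext from nxv4i8 to nxv4i32 has PowDiff == 2 and is
// therefore costed as a single vzext.vf4 at the destination type.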
1887 case ISD::TRUNCATE:
1888 case ISD::FP_EXTEND:
1889 case ISD::FP_ROUND: {
1890 // Counts of narrow/widen instructions.
1891 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1892 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1893
1894 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1895 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1896 : RISCV::VFNCVT_F_F_W;
1897 InstructionCost Cost = 0;
1898 for (; SrcEltSize != DstEltSize;) {
1899 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1900 ? MVT::getIntegerVT(DstEltSize)
1901 : MVT::getFloatingPointVT(DstEltSize);
1902 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1903 DstEltSize =
1904 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1905 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1906 }
1907 return Cost;
1908 }
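// Illustrative example: truncating nxv2i64 to nxv2i8 needs three narrowing
// steps (i64 -> i32 -> i16 -> i8), so the loop above accumulates the cost of
// three vnsrl.wi instructions at the intermediate vector types.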
1909 case ISD::FP_TO_SINT:
1910 case ISD::FP_TO_UINT: {
1911 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1912 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1913 unsigned FWCVT =
1914 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1915 unsigned FNCVT =
1916 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1917 unsigned SrcEltSize = Src->getScalarSizeInBits();
1918 unsigned DstEltSize = Dst->getScalarSizeInBits();
1919 InstructionCost Cost = 0;
1920 if ((SrcEltSize == 16) &&
1921 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1922 // If the target only supports Zvfhmin, or this is an fp16-to-i64
1923 // conversion, pre-widen to f32 and then convert f32 to the integer type.
1924 VectorType *VecF32Ty =
1925 VectorType::get(Type::getFloatTy(Dst->getContext()),
1926 cast<VectorType>(Dst)->getElementCount());
1927 std::pair<InstructionCost, MVT> VecF32LT =
1928 getTypeLegalizationCost(VecF32Ty);
1929 Cost +=
1930 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1931 VecF32LT.second, CostKind);
1932 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1933 return Cost;
1934 }
1935 if (DstEltSize == SrcEltSize)
1936 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1937 else if (DstEltSize > SrcEltSize)
1938 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1939 else { // (SrcEltSize > DstEltSize)
1940 // First do a narrowing conversion to an integer half the size, then
1941 // truncate if needed.
1942 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1943 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1944 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1945 if ((SrcEltSize / 2) > DstEltSize) {
1946 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1947 Cost +=
1948 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1949 }
1950 }
1951 return Cost;
1952 }
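// Illustrative example: fptosi from nxv2f16 to nxv2i64 takes the pre-widening
// path above even with full Zvfh, because i64 is more than twice as wide as
// f16: one vfwcvt.f.f.v to f32 plus the recursive cost of the f32-to-i64
// conversion.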
1953 case ISD::SINT_TO_FP:
1954 case ISD::UINT_TO_FP: {
1955 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
1956 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
1957 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
1958 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
1959 unsigned SrcEltSize = Src->getScalarSizeInBits();
1960 unsigned DstEltSize = Dst->getScalarSizeInBits();
1961
1962 InstructionCost Cost = 0;
1963 if ((DstEltSize == 16) &&
1964 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
1965 // If the target only supports Zvfhmin, or this is an i64-to-fp16
1966 // conversion, the value is converted to f32 first and then narrowed to f16.
1967 VectorType *VecF32Ty =
1968 VectorType::get(Type::getFloatTy(Dst->getContext()),
1969 cast<VectorType>(Dst)->getElementCount());
1970 std::pair<InstructionCost, MVT> VecF32LT =
1971 getTypeLegalizationCost(VecF32Ty);
1972 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
1973 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
1974 DstLT.second, CostKind);
1975 return Cost;
1976 }
1977
1978 if (DstEltSize == SrcEltSize)
1979 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1980 else if (DstEltSize > SrcEltSize) {
1981 if ((DstEltSize / 2) > SrcEltSize) {
1982 VectorType *VecTy =
1983 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
1984 cast<VectorType>(Dst)->getElementCount());
1985 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
1986 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
1987 }
1988 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1989 } else
1990 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
1991 return Cost;
1992 }
1993 }
1994 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1995}
1996
1997unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) const {
1998 if (isa<ScalableVectorType>(Ty)) {
1999 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
2000 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
2001 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
2002 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
2003 }
2004 return cast<FixedVectorType>(Ty)->getNumElements();
2005}
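// Illustrative example: for <vscale x 4 x i32>, and assuming
// getVScaleForTuning() == 2 (a 128-bit tuning VLEN), the estimate is
// computeVLMAX(128, 32, 128) == 8 lanes; for fixed-length vectors the exact
// element count is returned.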
2006
2007InstructionCost
2008RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
2009 FastMathFlags FMF,
2010 TTI::TargetCostKind CostKind) const {
2011 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2012 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
2013
2014 // Skip if scalar size of Ty is bigger than ELEN.
2015 if (Ty->getScalarSizeInBits() > ST->getELen())
2016 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
2017
2018 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2019 if (Ty->getElementType()->isIntegerTy(1)) {
2020 // SelectionDAGBuilder does following transforms:
2021 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
2022 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
2023 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
2024 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
2025 else
2026 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
2027 }
2028
2029 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
2030 SmallVector<unsigned, 3> Opcodes;
2031 InstructionCost ExtraCost = 0;
2032 switch (IID) {
2033 case Intrinsic::maximum:
2034 if (FMF.noNaNs()) {
2035 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
2036 } else {
2037 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
2038 RISCV::VFMV_F_S};
2039 // Cost of canonical NaN + branch
2040 // lui a0, 523264
2041 // fmv.w.x fa0, a0
2042 Type *DstTy = Ty->getScalarType();
2043 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
2044 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
2045 ExtraCost = 1 +
2046 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
2047 TTI::CastContextHint::None, CostKind) +
2048 getCFInstrCost(Instruction::CondBr, CostKind);
2049 }
2050 break;
2051
2052 case Intrinsic::minimum:
2053 if (FMF.noNaNs()) {
2054 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
2055 } else {
2056 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
2057 RISCV::VFMV_F_S};
2058 // Cost of canonical NaN + branch
2059 // lui a0, 523264
2060 // fmv.w.x fa0, a0
2061 Type *DstTy = Ty->getScalarType();
2062 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
2063 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
2064 ExtraCost = 1 +
2065 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
2066 TTI::CastContextHint::None, CostKind) +
2067 getCFInstrCost(Instruction::CondBr, CostKind);
2068 }
2069 break;
2070 }
2071 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2072 }
2073
2074 // An IR reduction is composed of one RVV reduction instruction and a vmv.
2075 unsigned SplitOp;
2076 SmallVector<unsigned, 3> Opcodes;
2077 switch (IID) {
2078 default:
2079 llvm_unreachable("Unsupported intrinsic");
2080 case Intrinsic::smax:
2081 SplitOp = RISCV::VMAX_VV;
2082 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
2083 break;
2084 case Intrinsic::smin:
2085 SplitOp = RISCV::VMIN_VV;
2086 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
2087 break;
2088 case Intrinsic::umax:
2089 SplitOp = RISCV::VMAXU_VV;
2090 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
2091 break;
2092 case Intrinsic::umin:
2093 SplitOp = RISCV::VMINU_VV;
2094 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
2095 break;
2096 case Intrinsic::maxnum:
2097 SplitOp = RISCV::VFMAX_VV;
2098 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
2099 break;
2100 case Intrinsic::minnum:
2101 SplitOp = RISCV::VFMIN_VV;
2102 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
2103 break;
2104 }
2105 // Add a cost for data larger than LMUL8
2106 InstructionCost SplitCost =
2107 (LT.first > 1) ? (LT.first - 1) *
2108 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2109 : 0;
2110 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2111}
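// Illustrative example: a umax reduction over nxv32i32 legalizes with
// LT.first == 2, so the cost is one vmaxu.vv to combine the halves plus the
// vredmaxu.vs / vmv.x.s pair, the reduction itself being costed as roughly
// log2(VL) by getRISCVInstructionCost.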
2112
2113InstructionCost
2114RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2115 std::optional<FastMathFlags> FMF,
2116 TTI::TargetCostKind CostKind) const {
2117 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2118 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2119
2120 // Skip if scalar size of Ty is bigger than ELEN.
2121 if (Ty->getScalarSizeInBits() > ST->getELen())
2122 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2123
2124 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2125 assert(ISD && "Invalid opcode");
2126
2127 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
2128 ISD != ISD::FADD)
2129 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2130
2131 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2132 Type *ElementTy = Ty->getElementType();
2133 if (ElementTy->isIntegerTy(1)) {
2134 // Example sequences:
2135 // vfirst.m a0, v0
2136 // seqz a0, a0
2137 if (LT.second == MVT::v1i1)
2138 return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
2139 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2140 CmpInst::ICMP_EQ, CostKind);
2141
2142 if (ISD == ISD::AND) {
2143 // Example sequences:
2144 // vmand.mm v8, v9, v8 ; needed every time type is split
2145 // vmnot.m v8, v0 ; alias for vmnand
2146 // vcpop.m a0, v8
2147 // seqz a0, a0
2148
2149 // See the discussion: https://github.com/llvm/llvm-project/pull/119160
2150 // For LMUL <= 8, there is no splitting,
2151 // the sequences are vmnot, vcpop and seqz.
2152 // When LMUL > 8 and split = 1,
2153 // the sequences are vmnand, vcpop and seqz.
2154 // When LMUL > 8 and split > 1,
2155 // the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
2156 return ((LT.first > 2) ? (LT.first - 2) : 0) *
2157 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
2158 getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
2159 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2160 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2161 CmpInst::ICMP_EQ, CostKind);
2162 } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
2163 // Example sequences:
2164 // vsetvli a0, zero, e8, mf8, ta, ma
2165 // vmxor.mm v8, v0, v8 ; needed every time type is split
2166 // vcpop.m a0, v8
2167 // andi a0, a0, 1
2168 return (LT.first - 1) *
2169 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
2170 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
2171 } else {
2172 assert(ISD == ISD::OR);
2173 // Example sequences:
2174 // vsetvli a0, zero, e8, mf8, ta, ma
2175 // vmor.mm v8, v9, v8 ; needed every time type is split
2176 // vcpop.m a0, v0
2177 // snez a0, a0
2178 return (LT.first - 1) *
2179 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
2180 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2181 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2182 CmpInst::ICMP_NE, CostKind);
2183 }
2184 }
2185
2186 // An IR or/and reduction is composed of one vmv and one RVV reduction
2187 // instruction; the other reductions are composed of two vmv and one RVV
2188 // reduction instruction.
2189 unsigned SplitOp;
2190 SmallVector<unsigned, 3> Opcodes;
2191 switch (ISD) {
2192 case ISD::ADD:
2193 SplitOp = RISCV::VADD_VV;
2194 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
2195 break;
2196 case ISD::OR:
2197 SplitOp = RISCV::VOR_VV;
2198 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
2199 break;
2200 case ISD::XOR:
2201 SplitOp = RISCV::VXOR_VV;
2202 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
2203 break;
2204 case ISD::AND:
2205 SplitOp = RISCV::VAND_VV;
2206 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
2207 break;
2208 case ISD::FADD:
2209 // We can't promote f16/bf16 fadd reductions.
2210 if ((LT.second.getScalarType() == MVT::f16 && !ST->hasVInstructionsF16()) ||
2211 LT.second.getScalarType() == MVT::bf16)
2212 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2213 if (TTI::requiresOrderedReduction(FMF)) {
2214 Opcodes.push_back(RISCV::VFMV_S_F);
2215 for (unsigned i = 0; i < LT.first.getValue(); i++)
2216 Opcodes.push_back(RISCV::VFREDOSUM_VS);
2217 Opcodes.push_back(RISCV::VFMV_F_S);
2218 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2219 }
2220 SplitOp = RISCV::VFADD_VV;
2221 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
2222 break;
2223 }
2224 // Add a cost for data larger than LMUL8
2225 InstructionCost SplitCost =
2226 (LT.first > 1) ? (LT.first - 1) *
2227 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2228 : 0;
2229 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2230}
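// Illustrative example: an ordered (strict FP) fadd reduction over nxv4f32 is
// costed as vfmv.s.f + one vfredosum.vs + vfmv.f.s, with the ordered reduction
// costed linearly in VL by getRISCVInstructionCost; the unordered form uses
// vfredusum.vs instead, which is costed as roughly log2(VL).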
2231
2232InstructionCost RISCVTTIImpl::getExtendedReductionCost(
2233 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
2234 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
2235 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2236 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2237 FMF, CostKind);
2238
2239 // Skip if scalar size of ResTy is bigger than ELEN.
2240 if (ResTy->getScalarSizeInBits() > ST->getELen())
2241 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2242 FMF, CostKind);
2243
2244 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
2245 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2246 FMF, CostKind);
2247
2248 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2249
2250 if (IsUnsigned && Opcode == Instruction::Add &&
2251 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
2252 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2253 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
2254 return LT.first *
2255 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
2256 }
2257
2258 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
2259 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2260 FMF, CostKind);
2261
2262 return (LT.first - 1) +
2263 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2264}
2265
2266InstructionCost
2267RISCVTTIImpl::getStoreImmCost(Type *Ty, TTI::OperandValueInfo OpInfo,
2268 TTI::TargetCostKind CostKind) const {
2269 assert(OpInfo.isConstant() && "non constant operand?");
2270 if (!isa<VectorType>(Ty))
2271 // FIXME: We need to account for immediate materialization here, but doing
2272 // a decent job requires more knowledge about the immediate than we
2273 // currently have here.
2274 return 0;
2275
2276 if (OpInfo.isUniform())
2277 // vmv.v.i, vmv.v.x, or vfmv.v.f
2278 // We ignore the cost of the scalar constant materialization to be consistent
2279 // with how we treat scalar constants themselves just above.
2280 return 1;
2281
2282 return getConstantPoolLoadCost(Ty, CostKind);
2283}
2284
2285InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
2286 Align Alignment,
2287 unsigned AddressSpace,
2288 TTI::TargetCostKind CostKind,
2289 TTI::OperandValueInfo OpInfo,
2290 const Instruction *I) const {
2291 EVT VT = TLI->getValueType(DL, Src, true);
2292 // Type legalization can't handle structs
2293 if (VT == MVT::Other)
2294 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2295 CostKind, OpInfo, I);
2296
2297 InstructionCost Cost = 0;
2298 if (Opcode == Instruction::Store && OpInfo.isConstant())
2299 Cost += getStoreImmCost(Src, OpInfo, CostKind);
2300
2301 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
2302
2303 InstructionCost BaseCost = [&]() {
2304 InstructionCost Cost = LT.first;
2305 if (CostKind != TTI::TCK_RecipThroughput)
2306 return Cost;
2307
2308 // Our actual lowering for the case where a wider legal type is available
2309 // uses a VL-predicated load on the wider type. This is reflected in
2310 // the result of getTypeLegalizationCost, but BasicTTI assumes the
2311 // widened cases are scalarized.
2312 const DataLayout &DL = this->getDataLayout();
2313 if (Src->isVectorTy() && LT.second.isVector() &&
2314 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
2315 LT.second.getSizeInBits()))
2316 return Cost;
2317
2318 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2319 CostKind, OpInfo, I);
2320 }();
2321
2322 // Assume memory op costs scale with the number of vector registers
2323 // possibly accessed by the instruction. Note that BasicTTI already
2324 // handles the LT.first term for us.
2325 if (ST->hasVInstructions() && LT.second.isVector() &&
2326 CostKind != TTI::TCK_CodeSize)
2327 BaseCost *= TLI->getLMULCost(LT.second);
2328 return Cost + BaseCost;
2329}
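// Illustrative example: a load of nxv8i32 legalizes to an LMUL=4 register
// group, so for throughput/latency cost kinds BaseCost is scaled by
// getLMULCost(m4), while code-size queries count the load once.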
2330
2331InstructionCost RISCVTTIImpl::getCmpSelInstrCost(
2332 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
2333 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
2334 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
2335 if (CostKind != TTI::TCK_RecipThroughput)
2336 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2337 Op1Info, Op2Info, I);
2338
2339 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2340 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2341 Op1Info, Op2Info, I);
2342
2343 // Skip if scalar size of ValTy is bigger than ELEN.
2344 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
2345 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2346 Op1Info, Op2Info, I);
2347
2348 auto GetConstantMatCost =
2349 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
2350 if (OpInfo.isUniform())
2351 // We return 0 because we currently ignore the cost of materializing scalar
2352 // constants in GPRs.
2353 return 0;
2354
2355 return getConstantPoolLoadCost(ValTy, CostKind);
2356 };
2357
2358 InstructionCost ConstantMatCost;
2359 if (Op1Info.isConstant())
2360 ConstantMatCost += GetConstantMatCost(Op1Info);
2361 if (Op2Info.isConstant())
2362 ConstantMatCost += GetConstantMatCost(Op2Info);
2363
2364 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2365 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
2366 if (CondTy->isVectorTy()) {
2367 if (ValTy->getScalarSizeInBits() == 1) {
2368 // vmandn.mm v8, v8, v9
2369 // vmand.mm v9, v0, v9
2370 // vmor.mm v0, v9, v8
2371 return ConstantMatCost +
2372 LT.first *
2373 getRISCVInstructionCost(
2374 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2375 LT.second, CostKind);
2376 }
2377 // vselect and max/min are supported natively.
2378 return ConstantMatCost +
2379 LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
2380 CostKind);
2381 }
2382
2383 if (ValTy->getScalarSizeInBits() == 1) {
2384 // vmv.v.x v9, a0
2385 // vmsne.vi v9, v9, 0
2386 // vmandn.mm v8, v8, v9
2387 // vmand.mm v9, v0, v9
2388 // vmor.mm v0, v9, v8
2389 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
2390 return ConstantMatCost +
2391 LT.first *
2392 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
2393 InterimVT, CostKind) +
2394 LT.first * getRISCVInstructionCost(
2395 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2396 LT.second, CostKind);
2397 }
2398
2399 // vmv.v.x v10, a0
2400 // vmsne.vi v0, v10, 0
2401 // vmerge.vvm v8, v9, v8, v0
2402 return ConstantMatCost +
2403 LT.first * getRISCVInstructionCost(
2404 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
2405 LT.second, CostKind);
2406 }
2407
2408 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
2409 CmpInst::isIntPredicate(VecPred)) {
2410 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
2411 // provided they incur the same cost across all implementations
2412 return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
2413 LT.second,
2414 CostKind);
2415 }
2416
2417 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
2418 CmpInst::isFPPredicate(VecPred)) {
2419
2420 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
2421 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
2422 return ConstantMatCost +
2423 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
2424
2425 // If we do not support the input floating point vector type, use the base
2426 // one which will calculate as:
2427 // ScalarizeCost + Num * Cost for fixed vector,
2428 // InvalidCost for scalable vector.
2429 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
2430 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
2431 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
2432 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2433 Op1Info, Op2Info, I);
2434
2435 // Assuming vector fp compare and mask instructions are all the same cost
2436 // until a need arises to differentiate them.
2437 switch (VecPred) {
2438 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
2439 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
2440 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
2441 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
2442 return ConstantMatCost +
2443 LT.first * getRISCVInstructionCost(
2444 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
2445 LT.second, CostKind);
2446
2447 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
2448 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
2449 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
2450 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
2451 return ConstantMatCost +
2452 LT.first *
2453 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
2454 LT.second, CostKind);
2455
2456 case CmpInst::FCMP_OEQ: // vmfeq.vv
2457 case CmpInst::FCMP_OGT: // vmflt.vv
2458 case CmpInst::FCMP_OGE: // vmfle.vv
2459 case CmpInst::FCMP_OLT: // vmflt.vv
2460 case CmpInst::FCMP_OLE: // vmfle.vv
2461 case CmpInst::FCMP_UNE: // vmfne.vv
2462 return ConstantMatCost +
2463 LT.first *
2464 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
2465 default:
2466 break;
2467 }
2468 }
2469
2470 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
2471 // instructions will lower to SELECT_CC and lower to PseudoCCMOVGPR which will
2472 // generate a conditional branch + mv. The cost of scalar (icmp + select) will
2473 // be (0 + select instr cost).
2474 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
2475 ValTy->isIntegerTy() && !I->user_empty()) {
2476 if (all_of(I->users(), [&](const User *U) {
2477 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
2478 U->getType()->isIntegerTy() &&
2479 !isa<ConstantData>(U->getOperand(1)) &&
2480 !isa<ConstantData>(U->getOperand(2));
2481 }))
2482 return 0;
2483 }
2484
2485 // TODO: Add cost for scalar type.
2486
2487 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2488 Op1Info, Op2Info, I);
2489}
2490
2491InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
2492 TTI::TargetCostKind CostKind,
2493 const Instruction *I) const {
2494 if (CostKind != TTI::TCK_RecipThroughput)
2495 return Opcode == Instruction::PHI ? 0 : 1;
2496 // Branches are assumed to be predicted.
2497 return 0;
2498}
2499
2500InstructionCost RISCVTTIImpl::getVectorInstrCost(
2501 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
2502 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
2503 assert(Val->isVectorTy() && "This must be a vector type");
2504
2505 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
2506 // For now, skip all fixed vector cost analysis when P extension is available
2507 // to avoid crashes in getMinRVVVectorSizeInBits()
2508 if (ST->hasStdExtP() && isa<FixedVectorType>(Val)) {
2509 return 1; // Treat as single instruction cost for now
2510 }
2511
2512 if (Opcode != Instruction::ExtractElement &&
2513 Opcode != Instruction::InsertElement)
2514 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1,
2515 VIC);
2516
2517 // Legalize the type.
2518 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2519
2520 // This type is legalized to a scalar type.
2521 if (!LT.second.isVector()) {
2522 auto *FixedVecTy = cast<FixedVectorType>(Val);
2523 // If Index is a known constant, cost is zero.
2524 if (Index != -1U)
2525 return 0;
2526 // Extract/InsertElement with non-constant index is very costly when
2527 // scalarized; estimate cost of loads/stores sequence via the stack:
2528 // ExtractElement cost: store vector to stack, load scalar;
2529 // InsertElement cost: store vector to stack, store scalar, load vector.
2530 Type *ElemTy = FixedVecTy->getElementType();
2531 auto NumElems = FixedVecTy->getNumElements();
2532 auto Align = DL.getPrefTypeAlign(ElemTy);
2533 InstructionCost LoadCost =
2534 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
2535 InstructionCost StoreCost =
2536 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
2537 return Opcode == Instruction::ExtractElement
2538 ? StoreCost * NumElems + LoadCost
2539 : (StoreCost + LoadCost) * NumElems + StoreCost;
2540 }
2541
2542 // For unsupported scalable vector.
2543 if (LT.second.isScalableVector() && !LT.first.isValid())
2544 return LT.first;
2545
2546 // Mask vector extract/insert is expanded via e8.
2547 if (Val->getScalarSizeInBits() == 1) {
2548 VectorType *WideTy =
2550 cast<VectorType>(Val)->getElementCount());
2551 if (Opcode == Instruction::ExtractElement) {
2552 InstructionCost ExtendCost
2553 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2555 InstructionCost ExtractCost
2556 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2557 return ExtendCost + ExtractCost;
2558 }
2559 InstructionCost ExtendCost
2560 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2562 InstructionCost InsertCost
2563 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2564 InstructionCost TruncCost
2565 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
2567 return ExtendCost + InsertCost + TruncCost;
2568 }
2569
2570
2571 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
2572 // and vslideup + vmv.s.x to insert element to vector.
2573 unsigned BaseCost = 1;
2574 // For insertelement we also need an addi to add 1 to the index that feeds vslideup.
2575 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
2576
2577 if (Index != -1U) {
2578 // The type may be split. For fixed-width vectors we can normalize the
2579 // index to the new type.
2580 if (LT.second.isFixedLengthVector()) {
2581 unsigned Width = LT.second.getVectorNumElements();
2582 Index = Index % Width;
2583 }
2584
2585 // If exact VLEN is known, we will insert/extract into the appropriate
2586 // subvector with no additional subvector insert/extract cost.
2587 if (auto VLEN = ST->getRealVLen()) {
2588 unsigned EltSize = LT.second.getScalarSizeInBits();
2589 unsigned M1Max = *VLEN / EltSize;
2590 Index = Index % M1Max;
2591 }
2592
2593 if (Index == 0)
2594 // We can extract/insert the first element without vslidedown/vslideup.
2595 SlideCost = 0;
2596 else if (Opcode == Instruction::InsertElement)
2597 SlideCost = 1; // With a constant index, we do not need to use addi.
2598 }
2599
2600 // When the vector needs to be split into multiple register groups and the
2601 // index exceeds a single vector register group, we need to insert/extract
2602 // the element via the stack.
2603 if (LT.first > 1 &&
2604 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
2605 LT.second.isScalableVector()))) {
2606 Type *ScalarType = Val->getScalarType();
2607 Align VecAlign = DL.getPrefTypeAlign(Val);
2608 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
2609 // Extra addi for unknown index.
2610 InstructionCost IdxCost = Index == -1U ? 1 : 0;
2611
2612 // Store all split vectors into stack and load the target element.
2613 if (Opcode == Instruction::ExtractElement)
2614 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2615 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
2616 CostKind) +
2617 IdxCost;
2618
2619 // Store all split vectors into stack and store the target element and load
2620 // vectors back.
2621 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2622 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2623 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2624 CostKind) +
2625 IdxCost;
2626 }
2627
2628 // Extracting an i64 on a target with XLEN=32 needs more instructions.
2629 if (Val->getScalarType()->isIntegerTy() &&
2630 ST->getXLen() < Val->getScalarSizeInBits()) {
2631 // For extractelement, we need the following instructions:
2632 // vsetivli zero, 1, e64, m1, ta, mu (not count)
2633 // vslidedown.vx v8, v8, a0
2634 // vmv.x.s a0, v8
2635 // li a1, 32
2636 // vsrl.vx v8, v8, a1
2637 // vmv.x.s a1, v8
2638
2639 // For insertelement, we need the following instructions:
2640 // vsetivli zero, 2, e32, m4, ta, mu (not count)
2641 // vmv.v.i v12, 0
2642 // vslide1up.vx v16, v12, a1
2643 // vslide1up.vx v12, v16, a0
2644 // addi a0, a2, 1
2645 // vsetvli zero, a0, e64, m4, tu, mu (not count)
2646 // vslideup.vx v8, v12, a2
2647
2648 // TODO: should we count these special vsetvlis?
2649 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2650 }
2651 return BaseCost + SlideCost;
2652}
2653
2657 unsigned Index) const {
2658 if (isa<FixedVectorType>(Val))
2660 Index);
2661
2662 // TODO: This code replicates what LoopVectorize.cpp used to do when asking
2663 // for the cost of extracting the last lane of a scalable vector. It probably
2664 // needs a more accurate cost.
2665 ElementCount EC = cast<VectorType>(Val)->getElementCount();
2666 assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
2667 return getVectorInstrCost(Opcode, Val, CostKind,
2668 EC.getKnownMinValue() - 1 - Index, nullptr,
2669 nullptr);
2670}
2671
2672InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
2673 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2674 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2675 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
2676
2677 // TODO: Handle more cost kinds.
2678 if (CostKind != TTI::TCK_RecipThroughput)
2679 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2680 Args, CxtI);
2681
2682 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2683 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2684 Args, CxtI);
2685
2686 // Skip if scalar size of Ty is bigger than ELEN.
2687 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
2688 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2689 Args, CxtI);
2690
2691 // Legalize the type.
2692 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2693 unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
2694
2695 // TODO: Handle scalar type.
2696 if (!LT.second.isVector()) {
2697 static const CostTblEntry DivTbl[]{
2698 {ISD::UDIV, MVT::i32, TTI::TCC_Expensive},
2699 {ISD::UDIV, MVT::i64, TTI::TCC_Expensive},
2700 {ISD::SDIV, MVT::i32, TTI::TCC_Expensive},
2701 {ISD::SDIV, MVT::i64, TTI::TCC_Expensive},
2702 {ISD::UREM, MVT::i32, TTI::TCC_Expensive},
2703 {ISD::UREM, MVT::i64, TTI::TCC_Expensive},
2704 {ISD::SREM, MVT::i32, TTI::TCC_Expensive},
2705 {ISD::SREM, MVT::i64, TTI::TCC_Expensive}};
2706 if (TLI->isOperationLegalOrPromote(ISDOpcode, LT.second))
2707 if (const auto *Entry = CostTableLookup(DivTbl, ISDOpcode, LT.second))
2708 return Entry->Cost * LT.first;
2709
2710 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2711 Args, CxtI);
2712 }
2713
2714 // f16 with zvfhmin and bf16 will be promoted to f32.
2715 // FIXME: nxv32[b]f16 will be custom lowered and split.
2716 InstructionCost CastCost = 0;
2717 if ((LT.second.getVectorElementType() == MVT::f16 ||
2718 LT.second.getVectorElementType() == MVT::bf16) &&
2719 TLI->getOperationAction(ISDOpcode, LT.second) ==
2720 TargetLoweringBase::LegalizeAction::Promote) {
2721 MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
2722 Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
2723 Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2724 // Add cost of extending arguments
2725 CastCost += LT.first * Args.size() *
2726 getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
2728 // Add cost of truncating result
2729 CastCost +=
2730 LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
2731 TTI::CastContextHint::None, CostKind);
2732 // Compute cost of op in promoted type
2733 LT.second = PromotedVT;
2734 }
2735
2736 auto getConstantMatCost =
2737 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
2738 if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
2739 // Two sub-cases:
2740 // * Has a 5 bit immediate operand which can be splatted.
2741 // * Has a larger immediate which must be materialized in scalar register
2742 // We return 0 for both as we currently ignore the cost of materializing
2743 // scalar constants in GPRs.
2744 return 0;
2745
2746 return getConstantPoolLoadCost(Ty, CostKind);
2747 };
2748
2749 // Add the cost of materializing any constant vectors required.
2750 InstructionCost ConstantMatCost = 0;
2751 if (Op1Info.isConstant())
2752 ConstantMatCost += getConstantMatCost(0, Op1Info);
2753 if (Op2Info.isConstant())
2754 ConstantMatCost += getConstantMatCost(1, Op2Info);
2755
2756 unsigned Op;
2757 switch (ISDOpcode) {
2758 case ISD::ADD:
2759 case ISD::SUB:
2760 Op = RISCV::VADD_VV;
2761 break;
2762 case ISD::SHL:
2763 case ISD::SRL:
2764 case ISD::SRA:
2765 Op = RISCV::VSLL_VV;
2766 break;
2767 case ISD::AND:
2768 case ISD::OR:
2769 case ISD::XOR:
2770 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
2771 break;
2772 case ISD::MUL:
2773 case ISD::MULHS:
2774 case ISD::MULHU:
2775 Op = RISCV::VMUL_VV;
2776 break;
2777 case ISD::SDIV:
2778 case ISD::UDIV:
2779 Op = RISCV::VDIV_VV;
2780 break;
2781 case ISD::SREM:
2782 case ISD::UREM:
2783 Op = RISCV::VREM_VV;
2784 break;
2785 case ISD::FADD:
2786 case ISD::FSUB:
2787 Op = RISCV::VFADD_VV;
2788 break;
2789 case ISD::FMUL:
2790 Op = RISCV::VFMUL_VV;
2791 break;
2792 case ISD::FDIV:
2793 Op = RISCV::VFDIV_VV;
2794 break;
2795 case ISD::FNEG:
2796 Op = RISCV::VFSGNJN_VV;
2797 break;
2798 default:
2799 // Assuming all other instructions have the same cost until a need arises to
2800 // differentiate them.
2801 return CastCost + ConstantMatCost +
2802 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2803 Args, CxtI);
2804 }
2805
2806 InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
2807 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
2808 // ops are twice as expensive as integer ops. Do the same for vectors so
2809 // scalar floating point ops aren't cheaper than their vector equivalents.
2810 if (Ty->isFPOrFPVectorTy())
2811 InstrCost *= 2;
2812 return CastCost + ConstantMatCost + LT.first * InstrCost;
2813}
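// Illustrative example: fadd on nxv4f16 with only Zvfhmin is promoted to f32,
// so the returned cost is the fpext of both operands plus the f32 vfadd.vv
// (doubled by the FP factor above) plus the fptrunc back to f16.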
2814
2815// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
2816InstructionCost RISCVTTIImpl::getPointersChainCost(
2817 ArrayRef<const Value *> Ptrs, const Value *Base,
2818 const TTI::PointersChainInfo &Info, Type *AccessTy,
2819 TTI::TargetCostKind CostKind) const {
2820 InstructionCost Cost = TTI::TCC_Free;
2821 // In the basic model we take into account GEP instructions only
2822 // (although a pointer here can also be an alloca, a plain value, a constant
2823 // or constant expression, a PHI, a bitcast, or anything else allowed to be
2824 // used as a pointer). Typically, if Base is not a GEP instruction and all
2825 // the pointers are relative to the same base address, the rest are either
2826 // GEP instructions, PHIs, bitcasts or constants. When we have the same
2827 // base, we just calculate the cost of each non-Base GEP as an ADD operation
2828 // if any of its indices is a non-constant.
2829 // If there are no known dependencies between the pointers, the cost is
2830 // calculated as a sum of the costs of the GEP instructions.
2831 for (auto [I, V] : enumerate(Ptrs)) {
2832 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
2833 if (!GEP)
2834 continue;
2835 if (Info.isSameBase() && V != Base) {
2836 if (GEP->hasAllConstantIndices())
2837 continue;
2838 // If the chain is unit-stride and BaseReg + stride*i is a legal
2839 // addressing mode, then presume the base GEP is sitting around in a
2840 // register somewhere and check if we can fold the offset relative to
2841 // it.
2842 unsigned Stride = DL.getTypeStoreSize(AccessTy);
2843 if (Info.isUnitStride() &&
2844 isLegalAddressingMode(AccessTy,
2845 /* BaseGV */ nullptr,
2846 /* BaseOffset */ Stride * I,
2847 /* HasBaseReg */ true,
2848 /* Scale */ 0,
2849 GEP->getType()->getPointerAddressSpace()))
2850 continue;
2851 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
2852 {TTI::OK_AnyValue, TTI::OP_None},
2853 {TTI::OK_AnyValue, TTI::OP_None}, {});
2854 } else {
2855 SmallVector<const Value *> Indices(GEP->indices());
2856 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
2857 Indices, AccessTy, CostKind);
2858 }
2859 }
2860 return Cost;
2861}
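// Illustrative example: for a same-base, unit-stride chain of i32 accesses,
// a non-base GEP whose offset fits a reg+imm addressing mode adds no cost,
// while a GEP with a variable index contributes one ADD.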
2862
2863void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2864 TTI::UnrollingPreferences &UP,
2865 OptimizationRemarkEmitter *ORE) const {
2866 // TODO: More tuning on benchmarks and metrics with changes as needed
2867 // would apply to all settings below to enable performance.
2868
2869
2870 if (ST->enableDefaultUnroll())
2871 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2872
2873 // Enable Upper bound unrolling universally, not dependent upon the conditions
2874 // below.
2875 UP.UpperBound = true;
2876
2877 // Disable loop unrolling for Oz and Os.
2878 UP.OptSizeThreshold = 0;
2879 UP.PartialOptSizeThreshold = 0;
2880 if (L->getHeader()->getParent()->hasOptSize())
2881 return;
2882
2883 SmallVector<BasicBlock *, 4> ExitingBlocks;
2884 L->getExitingBlocks(ExitingBlocks);
2885 LLVM_DEBUG(dbgs() << "Loop has:\n"
2886 << "Blocks: " << L->getNumBlocks() << "\n"
2887 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2888
2889 // Only allow another exit other than the latch. This acts as an early exit
2890 // as it mirrors the profitability calculation of the runtime unroller.
2891 if (ExitingBlocks.size() > 2)
2892 return;
2893
2894 // Limit the CFG of the loop body for targets with a branch predictor.
2895 // Allowing 4 blocks permits if-then-else diamonds in the body.
2896 if (L->getNumBlocks() > 4)
2897 return;
2898
2899 // Scan the loop: don't unroll loops with calls as this could prevent
2900 // inlining. Don't unroll auto-vectorized loops either, though do allow
2901 // unrolling of the scalar remainder.
2902 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
2903 InstructionCost Cost = 0;
2904 for (auto *BB : L->getBlocks()) {
2905 for (auto &I : *BB) {
2906 // Both auto-vectorized loops and the scalar remainder have the
2907 // isvectorized attribute, so differentiate between them by the presence
2908 // of vector instructions.
2909 if (IsVectorized && (I.getType()->isVectorTy() ||
2910 llvm::any_of(I.operand_values(), [](Value *V) {
2911 return V->getType()->isVectorTy();
2912 })))
2913 return;
2914
2915 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2916 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2917 if (!isLoweredToCall(F))
2918 continue;
2919 }
2920 return;
2921 }
2922
2923 SmallVector<const Value *> Operands(I.operand_values());
2924 Cost += getInstructionCost(&I, Operands,
2925 TargetTransformInfo::TCK_SizeAndLatency);
2926 }
2927 }
2928
2929 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2930
2931 UP.Partial = true;
2932 UP.Runtime = true;
2933 UP.UnrollRemainder = true;
2934 UP.UnrollAndJam = true;
2935
2936 // Force-unrolling small loops can be very useful because of the branch-taken
2937 // cost of the backedge.
2938 if (Cost < 12)
2939 UP.Force = true;
2940}
2941
2946
2947bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
2948 MemIntrinsicInfo &Info) const {
2949 const DataLayout &DL = getDataLayout();
2950 Intrinsic::ID IID = Inst->getIntrinsicID();
2951 LLVMContext &C = Inst->getContext();
2952 bool HasMask = false;
2953
2954 auto getSegNum = [](const IntrinsicInst *II, unsigned PtrOperandNo,
2955 bool IsWrite) -> int64_t {
2956 if (auto *TarExtTy =
2957 dyn_cast<TargetExtType>(II->getArgOperand(0)->getType()))
2958 return TarExtTy->getIntParameter(0);
2959
2960 return 1;
2961 };
2962
2963 switch (IID) {
2964 case Intrinsic::riscv_vle_mask:
2965 case Intrinsic::riscv_vse_mask:
2966 case Intrinsic::riscv_vlseg2_mask:
2967 case Intrinsic::riscv_vlseg3_mask:
2968 case Intrinsic::riscv_vlseg4_mask:
2969 case Intrinsic::riscv_vlseg5_mask:
2970 case Intrinsic::riscv_vlseg6_mask:
2971 case Intrinsic::riscv_vlseg7_mask:
2972 case Intrinsic::riscv_vlseg8_mask:
2973 case Intrinsic::riscv_vsseg2_mask:
2974 case Intrinsic::riscv_vsseg3_mask:
2975 case Intrinsic::riscv_vsseg4_mask:
2976 case Intrinsic::riscv_vsseg5_mask:
2977 case Intrinsic::riscv_vsseg6_mask:
2978 case Intrinsic::riscv_vsseg7_mask:
2979 case Intrinsic::riscv_vsseg8_mask:
2980 HasMask = true;
2981 [[fallthrough]];
2982 case Intrinsic::riscv_vle:
2983 case Intrinsic::riscv_vse:
2984 case Intrinsic::riscv_vlseg2:
2985 case Intrinsic::riscv_vlseg3:
2986 case Intrinsic::riscv_vlseg4:
2987 case Intrinsic::riscv_vlseg5:
2988 case Intrinsic::riscv_vlseg6:
2989 case Intrinsic::riscv_vlseg7:
2990 case Intrinsic::riscv_vlseg8:
2991 case Intrinsic::riscv_vsseg2:
2992 case Intrinsic::riscv_vsseg3:
2993 case Intrinsic::riscv_vsseg4:
2994 case Intrinsic::riscv_vsseg5:
2995 case Intrinsic::riscv_vsseg6:
2996 case Intrinsic::riscv_vsseg7:
2997 case Intrinsic::riscv_vsseg8: {
2998 // Intrinsic interface:
2999 // riscv_vle(merge, ptr, vl)
3000 // riscv_vle_mask(merge, ptr, mask, vl, policy)
3001 // riscv_vse(val, ptr, vl)
3002 // riscv_vse_mask(val, ptr, mask, vl, policy)
3003 // riscv_vlseg#(merge, ptr, vl, sew)
3004 // riscv_vlseg#_mask(merge, ptr, mask, vl, policy, sew)
3005 // riscv_vsseg#(val, ptr, vl, sew)
3006 // riscv_vsseg#_mask(val, ptr, mask, vl, sew)
3007 bool IsWrite = Inst->getType()->isVoidTy();
3008 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
3009 // The results of segment loads are TargetExtType.
3010 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
3011 unsigned SEW =
3012 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
3013 ->getZExtValue();
3014 Ty = TarExtTy->getTypeParameter(0U);
3015 Ty = ScalableVectorType::get(
3016 IntegerType::get(C, SEW),
3017 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
3018 }
3019 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
3020 unsigned VLIndex = RVVIInfo->VLOperand;
3021 unsigned PtrOperandNo = VLIndex - 1 - HasMask;
3022 MaybeAlign Alignment =
3023 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
3024 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3025 Value *Mask = ConstantInt::getTrue(MaskType);
3026 if (HasMask)
3027 Mask = Inst->getArgOperand(VLIndex - 1);
3028 Value *EVL = Inst->getArgOperand(VLIndex);
3029 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3030 // RVV uses contiguous elements as a segment.
3031 if (SegNum > 1) {
3032 unsigned ElemSize = Ty->getScalarSizeInBits();
3033 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3034 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3035 }
3036 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3037 Alignment, Mask, EVL);
3038 return true;
3039 }
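// Illustrative example: for a riscv_vlseg3 whose fields are <vscale x 4 x i32>,
// the trailing operand holds log2(SEW) (5 for SEW=32) and each three-field
// segment is modeled as a single <vscale x 4 x i96> element in the
// interesting-operand record above.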
3040 case Intrinsic::riscv_vlse_mask:
3041 case Intrinsic::riscv_vsse_mask:
3042 case Intrinsic::riscv_vlsseg2_mask:
3043 case Intrinsic::riscv_vlsseg3_mask:
3044 case Intrinsic::riscv_vlsseg4_mask:
3045 case Intrinsic::riscv_vlsseg5_mask:
3046 case Intrinsic::riscv_vlsseg6_mask:
3047 case Intrinsic::riscv_vlsseg7_mask:
3048 case Intrinsic::riscv_vlsseg8_mask:
3049 case Intrinsic::riscv_vssseg2_mask:
3050 case Intrinsic::riscv_vssseg3_mask:
3051 case Intrinsic::riscv_vssseg4_mask:
3052 case Intrinsic::riscv_vssseg5_mask:
3053 case Intrinsic::riscv_vssseg6_mask:
3054 case Intrinsic::riscv_vssseg7_mask:
3055 case Intrinsic::riscv_vssseg8_mask:
3056 HasMask = true;
3057 [[fallthrough]];
3058 case Intrinsic::riscv_vlse:
3059 case Intrinsic::riscv_vsse:
3060 case Intrinsic::riscv_vlsseg2:
3061 case Intrinsic::riscv_vlsseg3:
3062 case Intrinsic::riscv_vlsseg4:
3063 case Intrinsic::riscv_vlsseg5:
3064 case Intrinsic::riscv_vlsseg6:
3065 case Intrinsic::riscv_vlsseg7:
3066 case Intrinsic::riscv_vlsseg8:
3067 case Intrinsic::riscv_vssseg2:
3068 case Intrinsic::riscv_vssseg3:
3069 case Intrinsic::riscv_vssseg4:
3070 case Intrinsic::riscv_vssseg5:
3071 case Intrinsic::riscv_vssseg6:
3072 case Intrinsic::riscv_vssseg7:
3073 case Intrinsic::riscv_vssseg8: {
3074 // Intrinsic interface:
3075 // riscv_vlse(merge, ptr, stride, vl)
3076 // riscv_vlse_mask(merge, ptr, stride, mask, vl, policy)
3077 // riscv_vsse(val, ptr, stride, vl)
3078 // riscv_vsse_mask(val, ptr, stride, mask, vl, policy)
3079 // riscv_vlsseg#(merge, ptr, offset, vl, sew)
3080 // riscv_vlsseg#_mask(merge, ptr, offset, mask, vl, policy, sew)
3081 // riscv_vssseg#(val, ptr, offset, vl, sew)
3082 // riscv_vssseg#_mask(val, ptr, offset, mask, vl, sew)
3083 bool IsWrite = Inst->getType()->isVoidTy();
3084 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
3085 // The results of segment loads are TargetExtType.
3086 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
3087 unsigned SEW =
3088 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
3089 ->getZExtValue();
3090 Ty = TarExtTy->getTypeParameter(0U);
3091 Ty = ScalableVectorType::get(
3092 IntegerType::get(C, SEW),
3093 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
3094 }
3095 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
3096 unsigned VLIndex = RVVIInfo->VLOperand;
3097 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
3098 MaybeAlign Alignment =
3099 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
3100
3101 Value *Stride = Inst->getArgOperand(PtrOperandNo + 1);
3102 // Use the pointer alignment as the element alignment if the stride is a
3103 // multiple of the pointer alignment. Otherwise, the element alignment
3104 // should be the greatest common divisor of pointer alignment and stride.
3105 // For simplicity, just treat the elements as unaligned in that case.
3106 unsigned PointerAlign = Alignment.valueOrOne().value();
3107 if (!isa<ConstantInt>(Stride) ||
3108 cast<ConstantInt>(Stride)->getZExtValue() % PointerAlign != 0)
3109 Alignment = Align(1);
3110
3111 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3112 Value *Mask = ConstantInt::getTrue(MaskType);
3113 if (HasMask)
3114 Mask = Inst->getArgOperand(VLIndex - 1);
3115 Value *EVL = Inst->getArgOperand(VLIndex);
3116 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3117 // RVV uses contiguous elements as a segment.
3118 if (SegNum > 1) {
3119 unsigned ElemSize = Ty->getScalarSizeInBits();
3120 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3121 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3122 }
3123 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3124 Alignment, Mask, EVL, Stride);
3125 return true;
3126 }
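// Illustrative example: a riscv_vlse with a 4-byte-aligned pointer keeps
// element alignment 4 for a constant stride of 8, but falls back to alignment
// 1 for a stride of 6 or a non-constant stride.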
3127 case Intrinsic::riscv_vloxei_mask:
3128 case Intrinsic::riscv_vluxei_mask:
3129 case Intrinsic::riscv_vsoxei_mask:
3130 case Intrinsic::riscv_vsuxei_mask:
3131 case Intrinsic::riscv_vloxseg2_mask:
3132 case Intrinsic::riscv_vloxseg3_mask:
3133 case Intrinsic::riscv_vloxseg4_mask:
3134 case Intrinsic::riscv_vloxseg5_mask:
3135 case Intrinsic::riscv_vloxseg6_mask:
3136 case Intrinsic::riscv_vloxseg7_mask:
3137 case Intrinsic::riscv_vloxseg8_mask:
3138 case Intrinsic::riscv_vluxseg2_mask:
3139 case Intrinsic::riscv_vluxseg3_mask:
3140 case Intrinsic::riscv_vluxseg4_mask:
3141 case Intrinsic::riscv_vluxseg5_mask:
3142 case Intrinsic::riscv_vluxseg6_mask:
3143 case Intrinsic::riscv_vluxseg7_mask:
3144 case Intrinsic::riscv_vluxseg8_mask:
3145 case Intrinsic::riscv_vsoxseg2_mask:
3146 case Intrinsic::riscv_vsoxseg3_mask:
3147 case Intrinsic::riscv_vsoxseg4_mask:
3148 case Intrinsic::riscv_vsoxseg5_mask:
3149 case Intrinsic::riscv_vsoxseg6_mask:
3150 case Intrinsic::riscv_vsoxseg7_mask:
3151 case Intrinsic::riscv_vsoxseg8_mask:
3152 case Intrinsic::riscv_vsuxseg2_mask:
3153 case Intrinsic::riscv_vsuxseg3_mask:
3154 case Intrinsic::riscv_vsuxseg4_mask:
3155 case Intrinsic::riscv_vsuxseg5_mask:
3156 case Intrinsic::riscv_vsuxseg6_mask:
3157 case Intrinsic::riscv_vsuxseg7_mask:
3158 case Intrinsic::riscv_vsuxseg8_mask:
3159 HasMask = true;
3160 [[fallthrough]];
3161 case Intrinsic::riscv_vloxei:
3162 case Intrinsic::riscv_vluxei:
3163 case Intrinsic::riscv_vsoxei:
3164 case Intrinsic::riscv_vsuxei:
3165 case Intrinsic::riscv_vloxseg2:
3166 case Intrinsic::riscv_vloxseg3:
3167 case Intrinsic::riscv_vloxseg4:
3168 case Intrinsic::riscv_vloxseg5:
3169 case Intrinsic::riscv_vloxseg6:
3170 case Intrinsic::riscv_vloxseg7:
3171 case Intrinsic::riscv_vloxseg8:
3172 case Intrinsic::riscv_vluxseg2:
3173 case Intrinsic::riscv_vluxseg3:
3174 case Intrinsic::riscv_vluxseg4:
3175 case Intrinsic::riscv_vluxseg5:
3176 case Intrinsic::riscv_vluxseg6:
3177 case Intrinsic::riscv_vluxseg7:
3178 case Intrinsic::riscv_vluxseg8:
3179 case Intrinsic::riscv_vsoxseg2:
3180 case Intrinsic::riscv_vsoxseg3:
3181 case Intrinsic::riscv_vsoxseg4:
3182 case Intrinsic::riscv_vsoxseg5:
3183 case Intrinsic::riscv_vsoxseg6:
3184 case Intrinsic::riscv_vsoxseg7:
3185 case Intrinsic::riscv_vsoxseg8:
3186 case Intrinsic::riscv_vsuxseg2:
3187 case Intrinsic::riscv_vsuxseg3:
3188 case Intrinsic::riscv_vsuxseg4:
3189 case Intrinsic::riscv_vsuxseg5:
3190 case Intrinsic::riscv_vsuxseg6:
3191 case Intrinsic::riscv_vsuxseg7:
3192 case Intrinsic::riscv_vsuxseg8: {
3193 // Intrinsic interface (only listed ordered version):
3194 // riscv_vloxei(merge, ptr, index, vl)
3195 // riscv_vloxei_mask(merge, ptr, index, mask, vl, policy)
3196 // riscv_vsoxei(val, ptr, index, vl)
3197 // riscv_vsoxei_mask(val, ptr, index, mask, vl, policy)
3198 // riscv_vloxseg#(merge, ptr, index, vl, sew)
3199 // riscv_vloxseg#_mask(merge, ptr, index, mask, vl, policy, sew)
3200 // riscv_vsoxseg#(val, ptr, index, vl, sew)
3201 // riscv_vsoxseg#_mask(val, ptr, index, mask, vl, sew)
3202 bool IsWrite = Inst->getType()->isVoidTy();
3203 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
3204 // The results of segment loads are TargetExtType.
3205 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
3206 unsigned SEW =
3207 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
3208 ->getZExtValue();
3209 Ty = TarExtTy->getTypeParameter(0U);
3210 Ty = ScalableVectorType::get(
3211 IntegerType::get(C, SEW),
3212 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
3213 }
3214 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
3215 unsigned VLIndex = RVVIInfo->VLOperand;
3216 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
3217 Value *Mask;
3218 if (HasMask) {
3219 Mask = Inst->getArgOperand(VLIndex - 1);
3220 } else {
3221 // Mask cannot be nullptr here: vector GEP produces <vscale x N x ptr>,
3222 // and casting that to scalar i64 triggers a vector/scalar mismatch
3223 // assertion in CreatePointerCast. Use an all-true mask so ASan lowers it
3224 // via extractelement instead.
3225 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3226 Mask = ConstantInt::getTrue(MaskType);
3227 }
3228 Value *EVL = Inst->getArgOperand(VLIndex);
3229 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3230 // RVV uses contiguous elements as a segment.
3231 if (SegNum > 1) {
3232 unsigned ElemSize = Ty->getScalarSizeInBits();
3233 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3234 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3235 }
3236 Value *OffsetOp = Inst->getArgOperand(PtrOperandNo + 1);
3237 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3238 Align(1), Mask, EVL,
3239 /* Stride */ nullptr, OffsetOp);
3240 return true;
3241 }
3242 }
3243 return false;
3244}
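The strided case above keeps the pointer's alignment for each element only when the stride is a constant multiple of that alignment, and otherwise falls back to byte alignment. A minimal standalone sketch of that rule, not part of this file (the helper name is hypothetical):

    #include <cstdint>
    #include <optional>

    // Mirrors the stride/alignment check above: a non-constant stride, or a
    // constant stride that is not a multiple of the pointer alignment, forces
    // an element alignment of 1.
    static uint64_t elementAlignForStride(uint64_t PtrAlign,
                                          std::optional<uint64_t> ConstStride) {
      if (!ConstStride || *ConstStride % PtrAlign != 0)
        return 1;
      return PtrAlign;
    }

For example, a 4-byte-aligned base pointer with a constant stride of 12 keeps alignment 4, while a stride of 6 (or a runtime stride) is treated as alignment 1.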
3245
3246 unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) const {
3247 if (Ty->isVectorTy()) {
3248 // f16 (with only zvfhmin) and bf16 will be promoted to f32
3249 Type *EltTy = cast<VectorType>(Ty)->getElementType();
3250 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
3251 EltTy->isBFloatTy())
3252 Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
3253 cast<VectorType>(Ty));
3254
3255 TypeSize Size = DL.getTypeSizeInBits(Ty);
3256 if (Size.isScalable() && ST->hasVInstructions())
3257 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
3258
3259 if (ST->useRVVForFixedLengthVectors())
3260 return divideCeil(Size, ST->getRealMinVLen());
3261 }
3262
3263 return BaseT::getRegUsageForType(Ty);
3264}
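As a rough standalone model of the scalable-vector branch above (assuming the backend's RVVBitsPerBlock value of 64, i.e. one vector register at LMUL=1; the helper name is hypothetical):

    #include <cstdint>

    // Number of LMUL=1 register groups a scalable type occupies:
    // ceil(KnownMinBits / RVVBitsPerBlock).
    static unsigned regUsageForScalableBits(uint64_t KnownMinBits) {
      constexpr uint64_t RVVBitsPerBlock = 64;
      return static_cast<unsigned>((KnownMinBits + RVVBitsPerBlock - 1) /
                                   RVVBitsPerBlock);
    }

For example, <vscale x 4 x i32> has a known-minimum size of 128 bits, so the query reports 2 registers (an LMUL=2 group).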
3265
3266unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
3267 if (SLPMaxVF.getNumOccurrences())
3268 return SLPMaxVF;
3269
3270 // Return how many elements can fit in getRegisterBitWidth. This is the
3271 // same routine as used in LoopVectorizer. We should probably be
3272 // accounting for whether we actually have instructions with the right
3273 // lane type, but we don't have enough information to do that without
3274 // some additional plumbing which hasn't been justified yet.
3275 TypeSize RegWidth =
3276 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
3277 // If no vector registers, or absurd element widths, disable
3278 // vectorization by returning 1.
3279 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
3280}
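A worked instance of the clamp above, assuming getRegisterBitWidth reports 128 bits for fixed-width vectors (hypothetical helper, not part of this file):

    #include <algorithm>

    // Never return 0; a result of 1 tells the SLP vectorizer not to vectorize.
    static unsigned maxVFForElemWidth(unsigned RegWidthBits, unsigned ElemWidthBits) {
      return std::max(1u, RegWidthBits / ElemWidthBits);
    }

maxVFForElemWidth(128, 32) == 4, maxVFForElemWidth(128, 8) == 16, and an absurd 256-bit element width still yields 1.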
3281
3285
3287 return ST->enableUnalignedVectorMem();
3288}
3289
3290 TTI::AddressingModeKind
3291 RISCVTTIImpl::getPreferredAddressingMode(const Loop *L,
3292 ScalarEvolution *SE) const {
3293 if (ST->hasVendorXCVmem() && !ST->is64Bit())
3294 return TTI::AMK_PostIndexed;
3295
3296 return BaseT::getPreferredAddressingMode(L, SE);
3297}
3298
3299 bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
3300 const TargetTransformInfo::LSRCost &C2) const {
3301 // RISC-V specific here are "instruction number 1st priority".
3302 // If we need to emit adds inside the loop to add up base registers, then
3303 // we need at least one extra temporary register.
3304 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
3305 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
3306 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
3307 C1.NumIVMuls, C1.NumBaseAdds,
3308 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
3309 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
3310 C2.NumIVMuls, C2.NumBaseAdds,
3311 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
3312}
3313
3315 Align Alignment) const {
3316 auto *VTy = dyn_cast<VectorType>(DataTy);
3317 if (!VTy || VTy->isScalableTy())
3318 return false;
3319
3320 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3321 return false;
3322
3323 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
3324 // scalarize these types with LMUL >= maximum fixed-length LMUL.
3325 if (VTy->getElementType()->isIntegerTy(8))
3326 if (VTy->getElementCount().getFixedValue() > 256)
3327 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
3328 ST->getMaxLMULForFixedLengthVectors();
3329 return true;
3330}
3331
3333 Align Alignment) const {
3334 auto *VTy = dyn_cast<VectorType>(DataTy);
3335 if (!VTy || VTy->isScalableTy())
3336 return false;
3337
3338 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3339 return false;
3340 return true;
3341}
3342
3343/// See if \p I should be considered for address type promotion. We check if \p
3344 /// I is a sext with the right type and used in memory accesses. If it is used in a
3345/// "complex" getelementptr, we allow it to be promoted without finding other
3346/// sext instructions that sign extended the same initial value. A getelementptr
3347/// is considered as "complex" if it has more than 2 operands.
3348 bool RISCVTTIImpl::shouldConsiderAddressTypePromotion(
3349 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
3350 bool Considerable = false;
3351 AllowPromotionWithoutCommonHeader = false;
3352 if (!isa<SExtInst>(&I))
3353 return false;
3354 Type *ConsideredSExtType =
3355 Type::getInt64Ty(I.getParent()->getParent()->getContext());
3356 if (I.getType() != ConsideredSExtType)
3357 return false;
3358 // See if the sext is the one with the right type and used in at least one
3359 // GetElementPtrInst.
3360 for (const User *U : I.users()) {
3361 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
3362 Considerable = true;
3363 // A getelementptr is considered as "complex" if it has more than 2
3364 // operands. We will promote a SExt used in such complex GEP as we
3365 // expect some computation to be merged if they are done on 64 bits.
3366 if (GEPInst->getNumOperands() > 2) {
3367 AllowPromotionWithoutCommonHeader = true;
3368 break;
3369 }
3370 }
3371 }
3372 return Considerable;
3373}
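For illustration (an assumed example, not taken from this file): a sext of an i32 induction variable to i64 whose user is a three-operand GEP such as getelementptr %struct.S, ptr %base, i64 %idx, i32 1 is reported as considerable with AllowPromotionWithoutCommonHeader set, since the extra index arithmetic is expected to fold once it is performed in 64 bits; a plain two-operand GEP only makes the sext considerable, without the promotion-without-common-header allowance.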
3374
3375bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
3376 switch (Opcode) {
3377 case Instruction::Add:
3378 case Instruction::Sub:
3379 case Instruction::Mul:
3380 case Instruction::And:
3381 case Instruction::Or:
3382 case Instruction::Xor:
3383 case Instruction::FAdd:
3384 case Instruction::FSub:
3385 case Instruction::FMul:
3386 case Instruction::FDiv:
3387 case Instruction::ICmp:
3388 case Instruction::FCmp:
3389 return true;
3390 case Instruction::Shl:
3391 case Instruction::LShr:
3392 case Instruction::AShr:
3393 case Instruction::UDiv:
3394 case Instruction::SDiv:
3395 case Instruction::URem:
3396 case Instruction::SRem:
3397 case Instruction::Select:
3398 return Operand == 1;
3399 default:
3400 return false;
3401 }
3402}
3403
3404 bool RISCVTTIImpl::canSplatOperand(Instruction *I, int Operand) const {
3405 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3406 return false;
3407
3408 if (canSplatOperand(I->getOpcode(), Operand))
3409 return true;
3410
3411 auto *II = dyn_cast<IntrinsicInst>(I);
3412 if (!II)
3413 return false;
3414
3415 switch (II->getIntrinsicID()) {
3416 case Intrinsic::fma:
3417 case Intrinsic::vp_fma:
3418 case Intrinsic::fmuladd:
3419 case Intrinsic::vp_fmuladd:
3420 return Operand == 0 || Operand == 1;
3421 case Intrinsic::vp_shl:
3422 case Intrinsic::vp_lshr:
3423 case Intrinsic::vp_ashr:
3424 case Intrinsic::vp_udiv:
3425 case Intrinsic::vp_sdiv:
3426 case Intrinsic::vp_urem:
3427 case Intrinsic::vp_srem:
3428 case Intrinsic::ssub_sat:
3429 case Intrinsic::vp_ssub_sat:
3430 case Intrinsic::usub_sat:
3431 case Intrinsic::vp_usub_sat:
3432 case Intrinsic::vp_select:
3433 return Operand == 1;
3434 // These intrinsics are commutative.
3435 case Intrinsic::vp_add:
3436 case Intrinsic::vp_mul:
3437 case Intrinsic::vp_and:
3438 case Intrinsic::vp_or:
3439 case Intrinsic::vp_xor:
3440 case Intrinsic::vp_fadd:
3441 case Intrinsic::vp_fmul:
3442 case Intrinsic::vp_icmp:
3443 case Intrinsic::vp_fcmp:
3444 case Intrinsic::smin:
3445 case Intrinsic::vp_smin:
3446 case Intrinsic::umin:
3447 case Intrinsic::vp_umin:
3448 case Intrinsic::smax:
3449 case Intrinsic::vp_smax:
3450 case Intrinsic::umax:
3451 case Intrinsic::vp_umax:
3452 case Intrinsic::sadd_sat:
3453 case Intrinsic::vp_sadd_sat:
3454 case Intrinsic::uadd_sat:
3455 case Intrinsic::vp_uadd_sat:
3456 // These intrinsics have 'vr' versions.
3457 case Intrinsic::vp_sub:
3458 case Intrinsic::vp_fsub:
3459 case Intrinsic::vp_fdiv:
3460 return Operand == 0 || Operand == 1;
3461 default:
3462 return false;
3463 }
3464}
3465
3466/// Check if sinking \p I's operands to I's basic block is profitable, because
3467/// the operands can be folded into a target instruction, e.g.
3468/// splats of scalars can fold into vector instructions.
3469 bool RISCVTTIImpl::isProfitableToSinkOperands(
3470 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
3471 using namespace llvm::PatternMatch;
3472
3473 if (I->isBitwiseLogicOp()) {
3474 if (!I->getType()->isVectorTy()) {
3475 if (ST->hasStdExtZbb() || ST->hasStdExtZbkb()) {
3476 for (auto &Op : I->operands()) {
3477 // (and/or/xor X, (not Y)) -> (andn/orn/xnor X, Y)
3478 if (match(Op.get(), m_Not(m_Value()))) {
3479 Ops.push_back(&Op);
3480 return true;
3481 }
3482 }
3483 }
3484 } else if (I->getOpcode() == Instruction::And && ST->hasStdExtZvkb()) {
3485 for (auto &Op : I->operands()) {
3486 // (and X, (not Y)) -> (vandn.vv X, Y)
3487 if (match(Op.get(), m_Not(m_Value()))) {
3488 Ops.push_back(&Op);
3489 return true;
3490 }
3491 // (and X, (splat (not Y))) -> (vandn.vx X, Y)
3492 if (match(Op.get(), m_Shuffle(m_InsertElt(m_Value(), m_Not(m_Value()),
3493 m_ZeroInt()),
3494 m_Value(), m_ZeroMask()))) {
3495 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
3496 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
3497 Ops.push_back(&Not);
3498 Ops.push_back(&InsertElt);
3499 Ops.push_back(&Op);
3500 return true;
3501 }
3502 }
3503 }
3504 }
3505
3506 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3507 return false;
3508
3509 // Don't sink splat operands unless the target prefers it. Some targets require
3510 // S2V transfer buffers and we can run out of them copying the same value
3511 // repeatedly.
3512 // FIXME: It could still be worth doing if it would improve vector register
3513 // pressure and prevent a vector spill.
3514 if (!ST->sinkSplatOperands())
3515 return false;
3516
3517 for (auto OpIdx : enumerate(I->operands())) {
3518 if (!canSplatOperand(I, OpIdx.index()))
3519 continue;
3520
3521 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
3522 // Make sure we are not already sinking this operand
3523 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
3524 continue;
3525
3526 // We are looking for a splat that can be sunk.
3527 if (!match(Op, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
3528 m_Value(), m_ZeroMask())))
3529 continue;
3530
3531 // Don't sink i1 splats.
3532 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
3533 continue;
3534
3535 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
3536 // and vector registers
3537 for (Use &U : Op->uses()) {
3538 Instruction *Insn = cast<Instruction>(U.getUser());
3539 if (!canSplatOperand(Insn, U.getOperandNo()))
3540 return false;
3541 }
3542
3543 // Sink any fpexts since they might be used in a widening fp pattern.
3544 Use *InsertEltUse = &Op->getOperandUse(0);
3545 auto *InsertElt = cast<InsertElementInst>(InsertEltUse);
3546 if (isa<FPExtInst>(InsertElt->getOperand(1)))
3547 Ops.push_back(&InsertElt->getOperandUse(1));
3548 Ops.push_back(InsertEltUse);
3549 Ops.push_back(&OpIdx.value());
3550 }
3551 return true;
3552}
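For illustration (an assumed example, not taken from this file): when a scalar defined outside a loop is splatted via insertelement plus a zero-mask shufflevector and then multiplied with a vector inside the loop, the insertelement, the shuffle, and the use are all pushed into Ops; sinking the whole splat next to its user lets instruction selection fold it into a .vx form (e.g. vmul.vx) instead of keeping the broadcast live in a vector register across the loop.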
3553
3554 TTI::MemCmpExpansionOptions
3555 RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3556 TTI::MemCmpExpansionOptions Options;
3557 // TODO: Enable expansion when unaligned access is not supported after we fix
3558 // issues in ExpandMemcmp.
3559 if (!ST->enableUnalignedScalarMem())
3560 return Options;
3561
3562 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
3563 return Options;
3564
3565 Options.AllowOverlappingLoads = true;
3566 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3567 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3568 if (ST->is64Bit()) {
3569 Options.LoadSizes = {8, 4, 2, 1};
3570 Options.AllowedTailExpansions = {3, 5, 6};
3571 } else {
3572 Options.LoadSizes = {4, 2, 1};
3573 Options.AllowedTailExpansions = {3};
3574 }
3575
3576 if (IsZeroCmp && ST->hasVInstructions()) {
3577 unsigned VLenB = ST->getRealMinVLen() / 8;
3578 // The minimum size should be `XLen / 8 + 1`, and the maximum size should be
3579 // `VLenB * MaxLMUL` so that it fits in a single register group.
3580 unsigned MinSize = ST->getXLen() / 8 + 1;
3581 unsigned MaxSize = VLenB * ST->getMaxLMULForFixedLengthVectors();
3582 for (unsigned Size = MinSize; Size <= MaxSize; Size++)
3583 Options.LoadSizes.insert(Options.LoadSizes.begin(), Size);
3584 }
3585 return Options;
3586}
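A worked instance of the bounds computed above, assuming RV64 (XLen = 64), VLEN = 128, and a maximum fixed-length LMUL of 8 (hypothetical helper, not part of this file):

    #include <utility>

    // {MinSize, MaxSize} in bytes for the vector memcmp-with-zero expansion:
    // scalar loads already cover sizes up to XLen/8, and one register group
    // covers at most (VLen/8) * MaxLMUL bytes.
    static std::pair<unsigned, unsigned>
    vectorZeroCmpSizes(unsigned XLen, unsigned VLen, unsigned MaxLMUL) {
      return {XLen / 8 + 1, (VLen / 8) * MaxLMUL};
    }

vectorZeroCmpSizes(64, 128, 8) == {9, 128}, so every compare length from 9 to 128 bytes gets a vector load size of its own.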
3587
3588 bool RISCVTTIImpl::shouldTreatInstructionLikeSelect(
3589 const Instruction *I) const {
3590 if (EnableOrLikeSelectOpt) {
3591 // For the binary operators (e.g. or) we need to be more careful than
3592 // selects, here we only transform them if they are already at a natural
3593 // break point in the code - the end of a block with an unconditional
3594 // terminator.
3595 if (I->getOpcode() == Instruction::Or &&
3596 isa<UncondBrInst>(I->getNextNode()))
3597 return true;
3598
3599 if (I->getOpcode() == Instruction::Add ||
3600 I->getOpcode() == Instruction::Sub)
3601 return true;
3602 }
3603 return BaseT::shouldTreatInstructionLikeSelect(I);
3604}
3605
3606 bool RISCVTTIImpl::shouldCopyAttributeWhenOutliningFrom(
3607 const Function *Caller, const Attribute &Attr) const {
3608 // "interrupt" controls the prolog/epilog of interrupt handlers (and includes
3609 // restrictions on their signatures). We can outline from the bodies of these
3610 // handlers, but when we do we need to make sure we don't mark the outlined
3611 // function as an interrupt handler too.
3612 if (Attr.isStringAttribute() && Attr.getKindAsString() == "interrupt")
3613 return false;
3614
3615 return BaseT::shouldCopyAttributeWhenOutliningFrom(Caller, Attr);
3616}
3617
3618std::optional<Instruction *>
3619 RISCVTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
3620 // If all operands of a vmv.v.x are constant, fold a bitcast(vmv.v.x) to scale
3621 // the vmv.v.x, enabling removal of the bitcast. The transform helps avoid
3622 // creating redundant masks.
3623 const DataLayout &DL = IC.getDataLayout();
3624 if (II.user_empty())
3625 return {};
3626 auto *TargetVecTy = dyn_cast<ScalableVectorType>(II.user_back()->getType());
3627 if (!TargetVecTy)
3628 return {};
3629 const APInt *Scalar;
3630 uint64_t VL;
3631 if (!match(&II, m_Intrinsic<Intrinsic::riscv_vmv_v_x>(
3632 m_Poison(), m_APInt(Scalar), m_ConstantInt(VL))) ||
3633 !all_of(II.users(), [TargetVecTy](User *U) {
3634 return U->getType() == TargetVecTy && match(U, m_BitCast(m_Value()));
3635 }))
3636 return {};
3637 auto *SourceVecTy = cast<ScalableVectorType>(II.getType());
3638 unsigned TargetEltBW = DL.getTypeSizeInBits(TargetVecTy->getElementType());
3639 unsigned SourceEltBW = DL.getTypeSizeInBits(SourceVecTy->getElementType());
3640 if (TargetEltBW % SourceEltBW)
3641 return {};
3642 unsigned TargetScale = TargetEltBW / SourceEltBW;
3643 if (VL % TargetScale)
3644 return {};
3645 Type *VLTy = II.getOperand(2)->getType();
3646 ElementCount SourceEC = SourceVecTy->getElementCount();
3647 unsigned NewEltBW = SourceEltBW * TargetScale;
3648 if (!SourceEC.isKnownMultipleOf(TargetScale) ||
3649 !DL.fitsInLegalInteger(NewEltBW))
3650 return {};
3651 auto *NewEltTy = IntegerType::get(II.getContext(), NewEltBW);
3652 if (!TLI->isLegalElementTypeForRVV(TLI->getValueType(DL, NewEltTy)))
3653 return {};
3654 ElementCount NewEC = SourceEC.divideCoefficientBy(TargetScale);
3655 Type *RetTy = VectorType::get(NewEltTy, NewEC);
3656 assert(SourceVecTy->canLosslesslyBitCastTo(RetTy) &&
3657 "Lossless bitcast between types expected");
3658 APInt NewScalar = APInt::getSplat(NewEltBW, *Scalar);
3659 return IC.replaceInstUsesWith(
3660 II,
3661 IC.Builder.CreateBitCast(
3662 IC.Builder.CreateIntrinsic(
3663 RetTy, Intrinsic::riscv_vmv_v_x,
3664 {PoisonValue::get(RetTy), ConstantInt::get(NewEltTy, NewScalar),
3665 ConstantInt::get(VLTy, VL / TargetScale)}),
3666 SourceVecTy));
3667}
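A worked instance of the fold above (an assumed example, not taken from this file): a vmv.v.x producing <vscale x 4 x i8> of the constant 0x5A with VL = 16, all of whose users bitcast it to <vscale x 1 x i32>, has TargetScale = 4; the immediate is re-splatted over the wider element and VL is divided by the scale. The splat arithmetic, as a small standalone sketch:

    #include <cstdint>

    // Repeat an 8-bit immediate Scale times into a wider element, matching the
    // APInt::getSplat call above.
    static uint64_t splatToWiderElt(uint8_t Imm, unsigned Scale) {
      uint64_t Out = 0;
      for (unsigned I = 0; I < Scale; ++I)
        Out |= static_cast<uint64_t>(Imm) << (8 * I);
      return Out;
    }

splatToWiderElt(0x5A, 4) == 0x5A5A5A5A, and the rewritten vmv.v.x uses VL = 16 / 4 = 4.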