1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "RISCVTargetTransformInfo.h"
10#include "MCTargetDesc/RISCVMatInt.h"
11#include "llvm/ADT/STLExtras.h"
12#include "llvm/Analysis/TargetTransformInfo.h"
13#include "llvm/CodeGen/BasicTTIImpl.h"
14#include "llvm/CodeGen/CostTable.h"
15#include "llvm/CodeGen/TargetLowering.h"
16#include "llvm/CodeGen/ValueTypes.h"
17#include "llvm/IR/Instructions.h"
18#include "llvm/IR/IntrinsicsRISCV.h"
19#include "llvm/IR/PatternMatch.h"
20#include <cmath>
21#include <optional>
22using namespace llvm;
23using namespace llvm::PatternMatch;
24
25#define DEBUG_TYPE "riscvtti"
26
28 "riscv-v-register-bit-width-lmul",
30 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
31 "by autovectorized code. Fractional LMULs are not supported."),
33
35 "riscv-v-slp-max-vf",
37 "Overrides result used for getMaximumVF query which is used "
38 "exclusively by SLP vectorizer."),
40
42 RVVMinTripCount("riscv-v-min-trip-count",
43 cl::desc("Set the lower bound of a trip count to decide on "
44 "vectorization while tail-folding."),
46
47static cl::opt<bool> EnableOrLikeSelectOpt("enable-riscv-or-like-select",
48 cl::init(true), cl::Hidden);
49
50InstructionCost
51RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
52 TTI::TargetCostKind CostKind) const {
53 // Check if the type is valid for all CostKind
54 if (!VT.isVector())
55 return InstructionCost::getInvalid();
56 size_t NumInstr = OpCodes.size();
57 if (CostKind == TTI::TCK_CodeSize)
58 return NumInstr;
59 InstructionCost LMULCost = TLI->getLMULCost(VT);
60 if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
61 return LMULCost * NumInstr;
62 InstructionCost Cost = 0;
63 for (auto Op : OpCodes) {
64 switch (Op) {
65 case RISCV::VRGATHER_VI:
66 Cost += TLI->getVRGatherVICost(VT);
67 break;
68 case RISCV::VRGATHER_VV:
69 Cost += TLI->getVRGatherVVCost(VT);
70 break;
71 case RISCV::VSLIDEUP_VI:
72 case RISCV::VSLIDEDOWN_VI:
73 Cost += TLI->getVSlideVICost(VT);
74 break;
75 case RISCV::VSLIDEUP_VX:
76 case RISCV::VSLIDEDOWN_VX:
77 Cost += TLI->getVSlideVXCost(VT);
78 break;
79 case RISCV::VREDMAX_VS:
80 case RISCV::VREDMIN_VS:
81 case RISCV::VREDMAXU_VS:
82 case RISCV::VREDMINU_VS:
83 case RISCV::VREDSUM_VS:
84 case RISCV::VREDAND_VS:
85 case RISCV::VREDOR_VS:
86 case RISCV::VREDXOR_VS:
87 case RISCV::VFREDMAX_VS:
88 case RISCV::VFREDMIN_VS:
89 case RISCV::VFREDUSUM_VS: {
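// Unordered reductions are costed as a log2(VL)-deep tree of reduction
// steps; e.g. with VL = 8 this adds roughly 3 to the cost. This is a rough
// model, not an exact instruction count.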
90 unsigned VL = VT.getVectorMinNumElements();
91 if (!VT.isFixedLengthVector())
92 VL *= *getVScaleForTuning();
93 Cost += Log2_32_Ceil(VL);
94 break;
95 }
96 case RISCV::VFREDOSUM_VS: {
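// By contrast, an ordered FP reduction (vfredosum) must accumulate the
// elements sequentially, so its cost below scales linearly with VL.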
97 unsigned VL = VT.getVectorMinNumElements();
98 if (!VT.isFixedLengthVector())
99 VL *= *getVScaleForTuning();
100 Cost += VL;
101 break;
102 }
103 case RISCV::VMV_X_S:
104 case RISCV::VMV_S_X:
105 case RISCV::VFMV_F_S:
106 case RISCV::VFMV_S_F:
107 case RISCV::VMOR_MM:
108 case RISCV::VMXOR_MM:
109 case RISCV::VMAND_MM:
110 case RISCV::VMANDN_MM:
111 case RISCV::VMNAND_MM:
112 case RISCV::VCPOP_M:
113 case RISCV::VFIRST_M:
114 Cost += 1;
115 break;
116 case RISCV::VDIV_VV:
117 case RISCV::VREM_VV:
118 Cost += LMULCost * TTI::TCC_Expensive;
119 break;
120 default:
121 Cost += LMULCost;
122 }
123 }
124 return Cost;
125}
126
127static InstructionCost getIntImmCostImpl(const DataLayout &DL,
128 const RISCVSubtarget *ST,
129 const APInt &Imm, Type *Ty,
130 TTI::TargetCostKind CostKind,
131 bool FreeZeroes) {
132 assert(Ty->isIntegerTy() &&
133 "getIntImmCost can only estimate cost of materialising integers");
134
135 // We have a Zero register, so 0 is always free.
136 if (Imm == 0)
137 return TTI::TCC_Free;
138
139 // Otherwise, we check how many instructions it will take to materialise.
140 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
141 /*CompressionCost=*/false, FreeZeroes);
142}
143
144InstructionCost
145RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
146 TTI::TargetCostKind CostKind) const {
147 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
148}
149
150// Look for patterns of shift followed by AND that can be turned into a pair of
151// shifts. We won't need to materialize an immediate for the AND so these can
152// be considered free.
153static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
154 uint64_t Mask = Imm.getZExtValue();
155 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
156 if (!BO || !BO->hasOneUse())
157 return false;
158
159 if (BO->getOpcode() != Instruction::Shl)
160 return false;
161
162 if (!isa<ConstantInt>(BO->getOperand(1)))
163 return false;
164
165 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
166 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
167 // is a mask shifted by c2 bits with c3 leading zeros.
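// For example (64-bit): (and (shl x, 4), 0xff0) keeps bits 0..7 of x at
// positions 4..11, so it can instead be lowered as (srli (slli x, 56), 52)
// and the 0xff0 immediate never needs to be materialized.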
168 if (isShiftedMask_64(Mask)) {
169 unsigned Trailing = llvm::countr_zero(Mask);
170 if (ShAmt == Trailing)
171 return true;
172 }
173
174 return false;
175}
176
177// If this is i64 AND is part of (X & -(1 << C1) & 0xffffffff) == C2 << C1),
178// DAGCombiner can convert this to (sraiw X, C1) == sext(C2) for RV64. On RV32,
179// the type will be split so only the lower 32 bits need to be compared using
180// (srai/srli X, C) == C2.
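// For example, on RV64 (X & 0xffffff00) == 0x4500 has Mask = -(1 << 8) in
// the low 32 bits and CmpC = 0x4500; after the transform the comparison
// constant becomes sext(0x4500) >> 8 = 0x45, which fits an addi immediate.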
181static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm) {
182 if (!Inst->hasOneUse())
183 return false;
184
185 // Look for equality comparison.
186 auto *Cmp = dyn_cast<ICmpInst>(*Inst->user_begin());
187 if (!Cmp || !Cmp->isEquality())
188 return false;
189
190 // Right hand side of comparison should be a constant.
191 auto *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
192 if (!C)
193 return false;
194
195 uint64_t Mask = Imm.getZExtValue();
196
197 // Mask should be of the form -(1 << C) in the lower 32 bits.
198 if (!isUInt<32>(Mask) || !isPowerOf2_32(-uint32_t(Mask)))
199 return false;
200
201 // Comparison constant should be a subset of Mask.
202 uint64_t CmpC = C->getZExtValue();
203 if ((CmpC & Mask) != CmpC)
204 return false;
205
206 // We'll need to sign extend the comparison constant and shift it right. Make
207 // sure the new constant can use addi/xori+seqz/snez.
208 unsigned ShiftBits = llvm::countr_zero(Mask);
209 int64_t NewCmpC = SignExtend64<32>(CmpC) >> ShiftBits;
210 return NewCmpC >= -2048 && NewCmpC <= 2048;
211}
212
213InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
214 const APInt &Imm, Type *Ty,
215 TTI::TargetCostKind CostKind,
216 Instruction *Inst) const {
217 assert(Ty->isIntegerTy() &&
218 "getIntImmCost can only estimate cost of materialising integers");
219
220 // We have a Zero register, so 0 is always free.
221 if (Imm == 0)
222 return TTI::TCC_Free;
223
224 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
225 // commutative, in others the immediate comes from a specific argument index.
226 bool Takes12BitImm = false;
227 unsigned ImmArgIdx = ~0U;
228
229 switch (Opcode) {
230 case Instruction::GetElementPtr:
231 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
232 // split up large offsets in GEP into better parts than ConstantHoisting
233 // can.
234 return TTI::TCC_Free;
235 case Instruction::Store: {
236 // Use the materialization cost regardless of whether it's the address or the
237 // value that is constant, except when the store is misaligned and
238 // misaligned accesses are not legal (experience shows constant hoisting
239 // can sometimes be harmful in such cases).
240 if (Idx == 1 || !Inst)
241 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
242 /*FreeZeroes=*/true);
243
244 StoreInst *ST = cast<StoreInst>(Inst);
245 if (!getTLI()->allowsMemoryAccessForAlignment(
246 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
247 ST->getPointerAddressSpace(), ST->getAlign()))
248 return TTI::TCC_Free;
249
250 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
251 /*FreeZeroes=*/true);
252 }
253 case Instruction::Load:
254 // If the address is a constant, use the materialization cost.
255 return getIntImmCost(Imm, Ty, CostKind);
256 case Instruction::And:
257 // zext.h
258 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
259 return TTI::TCC_Free;
260 // zext.w
261 if (Imm == UINT64_C(0xffffffff) &&
262 ((ST->hasStdExtZba() && ST->isRV64()) || ST->isRV32()))
263 return TTI::TCC_Free;
264 // bclri
265 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
266 return TTI::TCC_Free;
267 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
268 canUseShiftPair(Inst, Imm))
269 return TTI::TCC_Free;
270 if (Inst && Idx == 1 && Imm.getBitWidth() == 64 &&
271 canUseShiftCmp(Inst, Imm))
272 return TTI::TCC_Free;
273 Takes12BitImm = true;
274 break;
275 case Instruction::Add:
276 Takes12BitImm = true;
277 break;
278 case Instruction::Or:
279 case Instruction::Xor:
280 // bseti/binvi
281 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
282 return TTI::TCC_Free;
283 Takes12BitImm = true;
284 break;
285 case Instruction::Mul:
286 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
287 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
288 return TTI::TCC_Free;
289 // One more or less than a power of 2 can use SLLI+ADD/SUB.
290 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
291 return TTI::TCC_Free;
292 // FIXME: There is no MULI instruction.
293 Takes12BitImm = true;
294 break;
295 case Instruction::Sub:
296 case Instruction::Shl:
297 case Instruction::LShr:
298 case Instruction::AShr:
299 Takes12BitImm = true;
300 ImmArgIdx = 1;
301 break;
302 default:
303 break;
304 }
305
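// For example, an 'add' with immediate 2047 is free here (it fits addi),
// while an immediate of 4096 falls through to the materialization cost.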
306 if (Takes12BitImm) {
307 // Check immediate is the correct argument...
308 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
309 // ... and fits into the 12-bit immediate.
310 if (Imm.getSignificantBits() <= 64 &&
311 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
312 return TTI::TCC_Free;
313 }
314 }
315
316 // Otherwise, use the full materialisation cost.
317 return getIntImmCost(Imm, Ty, CostKind);
318 }
319
320 // By default, prevent hoisting.
321 return TTI::TCC_Free;
322}
323
324InstructionCost
325RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
326 const APInt &Imm, Type *Ty,
327 TTI::TargetCostKind CostKind) const {
328 // Prevent hoisting in unknown cases.
329 return TTI::TCC_Free;
330}
331
332bool RISCVTTIImpl::hasActiveVectorLength() const {
333 return ST->hasVInstructions();
334}
335
336TargetTransformInfo::PopcntSupportKind
337RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
338 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
339 return ST->hasCPOPLike() ? TTI::PSK_FastHardware : TTI::PSK_Software;
340}
341
342InstructionCost RISCVTTIImpl::getPartialReductionCost(
343 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
344 ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
345 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
346 TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
347 if (Opcode == Instruction::FAdd)
348 return InstructionCost::getInvalid();
349
350 // zve32x is broken for partial_reduce_umla, but let's make sure we
351 // don't generate them.
352 if (!ST->hasStdExtZvdot4a8i() || ST->getELen() < 64 ||
353 Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
354 InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
355 !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4))
356 return InstructionCost::getInvalid();
357
358 Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
359 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
360 // Note: Assuming all vdota4* variants have equal cost
361 return LT.first *
362 getRISCVInstructionCost(RISCV::VDOTA4_VV, LT.second, CostKind);
363}
364
365bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
366 // Currently, the ExpandReductions pass can't expand scalable-vector
367 // reductions, but we still request expansion as RVV doesn't support certain
368 // reductions and the SelectionDAG can't legalize them either.
369 switch (II->getIntrinsicID()) {
370 default:
371 return false;
372 // These reductions have no equivalent in RVV
373 case Intrinsic::vector_reduce_mul:
374 case Intrinsic::vector_reduce_fmul:
375 return true;
376 }
377}
378
379std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
380 if (ST->hasVInstructions())
381 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
382 return BaseT::getMaxVScale();
383}
384
385std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
386 if (ST->hasVInstructions())
387 if (unsigned MinVLen = ST->getRealMinVLen();
388 MinVLen >= RISCV::RVVBitsPerBlock)
389 return MinVLen / RISCV::RVVBitsPerBlock;
390 return BaseT::getVScaleForTuning();
391}
392
393TypeSize
394RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
395 unsigned LMUL =
396 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
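// The option value is clamped to [1, 8] and rounded down to a power of two;
// e.g. riscv-v-register-bit-width-lmul=3 yields LMUL = 2.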
397 switch (K) {
398 case TTI::RGK_Scalar:
399 return TypeSize::getFixed(ST->getXLen());
400 case TTI::RGK_FixedWidthVector:
401 return TypeSize::getFixed(
402 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
403 case TTI::RGK_ScalableVector:
404 return TypeSize::getScalable(
405 (ST->hasVInstructions() &&
406 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
407 ? LMUL * RISCV::RVVBitsPerBlock
408 : 0);
409 }
410
411 llvm_unreachable("Unsupported register kind");
412}
413
414InstructionCost RISCVTTIImpl::getStaticDataAddrGenerationCost(
415 const TTI::TargetCostKind CostKind) const {
416 switch (CostKind) {
417 case TTI::TCK_CodeSize:
418 case TTI::TCK_SizeAndLatency:
419 // Always 2 instructions
420 return 2;
421 case TTI::TCK_Latency:
422 case TTI::TCK_RecipThroughput:
423 // Depending on the memory model the address generation will
424 // require AUIPC + ADDI (medany) or LUI + ADDI (medlow). Don't
425 // have a way of getting this information here, so conservatively
426 // require both.
427 // In practice, these are generally implemented together.
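// For reference: medany emits 'auipc a0, %pcrel_hi(sym); addi a0, a0,
// %pcrel_lo(...)', while medlow emits 'lui a0, %hi(sym); addi a0, a0,
// %lo(sym)'.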
428 return (ST->hasAUIPCADDIFusion() && ST->hasLUIADDIFusion()) ? 1 : 2;
429 }
430 llvm_unreachable("Unsupported cost kind");
431}
432
433InstructionCost
434RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty,
435 TTI::TargetCostKind CostKind) const {
436 // Add a cost of address generation + the cost of the load. The address
437 // is expected to be a PC relative offset to a constant pool entry
438 // using auipc/addi.
439 return getStaticDataAddrGenerationCost(CostKind) +
440 getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
441 /*AddressSpace=*/0, CostKind);
442}
443
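// Returns true if Mask repeatedly concatenates its leading subvector with
// itself, e.g. <0, 1, 0, 1, 0, 1, 0, 1> repeats a 2-element subvector
// (SubVectorSize = 2). The identity mask is rejected below.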
444static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
445 unsigned Size = Mask.size();
446 if (!isPowerOf2_32(Size))
447 return false;
448 for (unsigned I = 0; I != Size; ++I) {
449 if (static_cast<unsigned>(Mask[I]) == I)
450 continue;
451 if (Mask[I] != 0)
452 return false;
453 if (Size % I != 0)
454 return false;
455 for (unsigned J = I + 1; J != Size; ++J)
456 // Check the pattern is repeated.
457 if (static_cast<unsigned>(Mask[J]) != J % I)
458 return false;
459 SubVectorSize = I;
460 return true;
461 }
462 // That means Mask is <0, 1, 2, 3>. This is not a concatenation.
463 return false;
464}
465
466static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
467 LLVMContext &C) {
468 assert((DataVT.getScalarSizeInBits() != 8 ||
469 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
470 MVT IndexVT = DataVT.changeTypeToInteger();
471 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
472 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
473 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
474}
475
476/// Attempt to approximate the cost of a shuffle which will require splitting
477/// during legalization. Note that processShuffleMasks is not an exact proxy
478/// for the algorithm used in LegalizeVectorTypes, but hopefully it's a
479/// reasonably close upper bound.
480static InstructionCost costShuffleViaSplitting(const RISCVTTIImpl &TTI,
481 MVT LegalVT, VectorType *Tp,
482 ArrayRef<int> Mask,
483 TTI::TargetCostKind CostKind) {
484 assert(LegalVT.isFixedLengthVector() && !Mask.empty() &&
485 "Expected fixed vector type and non-empty mask");
486 unsigned LegalNumElts = LegalVT.getVectorNumElements();
487 // Number of destination vectors after legalization:
488 unsigned NumOfDests = divideCeil(Mask.size(), LegalNumElts);
489 // We are going to permute multiple sources and the result will be in
490 // multiple destinations. Providing an accurate cost only for splits where
491 // the element type remains the same.
492 if (NumOfDests <= 1 ||
493 LegalVT.getVectorElementType().getSizeInBits() !=
494 Tp->getElementType()->getPrimitiveSizeInBits() ||
495 LegalNumElts >= Tp->getElementCount().getFixedValue())
496 return InstructionCost::getInvalid();
497
498 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
499 unsigned LegalVTSize = LegalVT.getStoreSize();
500 // Number of source vectors after legalization:
501 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
502
503 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(), LegalNumElts);
504
505 unsigned NormalizedVF = LegalNumElts * std::max(NumOfSrcs, NumOfDests);
506 unsigned NumOfSrcRegs = NormalizedVF / LegalNumElts;
507 unsigned NumOfDestRegs = NormalizedVF / LegalNumElts;
508 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
509 assert(NormalizedVF >= Mask.size() &&
510 "Normalized mask expected to be not shorter than original mask.");
511 copy(Mask, NormalizedMask.begin());
512 InstructionCost Cost = 0;
513 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
514 processShuffleMasks(
515 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
516 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
517 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
518 return;
519 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
520 .second)
521 return;
522 Cost += TTI.getShuffleCost(
523 TTI::SK_PermuteSingleSrc,
524 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
525 SingleOpTy, RegMask, CostKind, 0, nullptr);
526 },
527 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
528 Cost += TTI.getShuffleCost(
529 TTI::SK_PermuteTwoSrc,
530 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
531 SingleOpTy, RegMask, CostKind, 0, nullptr);
532 });
533 return Cost;
534}
535
536/// Try to perform better estimation of the permutation.
537/// 1. Split the source/destination vectors into real registers.
538/// 2. Do the mask analysis to identify which real registers are
539/// permuted. If more than 1 source registers are used for the
540/// destination register building, the cost for this destination register
541/// is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
542/// source register is used, build mask and calculate the cost as a cost
543/// of PermuteSingleSrc.
544/// Also, for the single register permute we try to identify if the
545/// destination register is just a copy of the source register or the
546/// copy of the previous destination register (the cost is
547/// TTI::TCC_Basic). If the source register is just reused, the cost for
548/// this operation is 0.
549static InstructionCost
550costShuffleViaVRegSplitting(const RISCVTTIImpl &TTI, MVT LegalVT,
551 std::optional<unsigned> VLen, VectorType *Tp,
552 ArrayRef<int> Mask, TTI::TargetCostKind CostKind) {
553 assert(LegalVT.isFixedLengthVector());
554 if (!VLen || Mask.empty())
555 return InstructionCost::getInvalid();
556 MVT ElemVT = LegalVT.getVectorElementType();
557 unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
558 LegalVT = TTI.getTypeLegalizationCost(
559 FixedVectorType::get(Tp->getElementType(), ElemsPerVReg))
560 .second;
561 // Number of destination vectors after legalization:
562 InstructionCost NumOfDests =
563 divideCeil(Mask.size(), LegalVT.getVectorNumElements());
564 if (NumOfDests <= 1 ||
565 LegalVT.getVectorElementType().getSizeInBits() !=
566 Tp->getElementType()->getPrimitiveSizeInBits() ||
567 LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue())
568 return InstructionCost::getInvalid();
569
570 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
571 unsigned LegalVTSize = LegalVT.getStoreSize();
572 // Number of source vectors after legalization:
573 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
574
575 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
576 LegalVT.getVectorNumElements());
577
578 unsigned E = NumOfDests.getValue();
579 unsigned NormalizedVF =
580 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
581 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
582 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
583 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
584 assert(NormalizedVF >= Mask.size() &&
585 "Normalized mask expected to be not shorter than original mask.");
586 copy(Mask, NormalizedMask.begin());
587 InstructionCost Cost = 0;
588 int NumShuffles = 0;
589 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
590 processShuffleMasks(
591 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
592 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
593 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
594 return;
595 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
596 .second)
597 return;
598 ++NumShuffles;
599 Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
600 SingleOpTy, RegMask, CostKind, 0, nullptr);
601 },
602 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
603 Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
604 SingleOpTy, RegMask, CostKind, 0, nullptr);
605 NumShuffles += 2;
606 });
607 // Note: check that we do not emit too many shuffles here to prevent code
608 // size explosion.
609 // TODO: investigate, if it can be improved by extra analysis of the masks
610 // to check if the code is more profitable.
611 if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) ||
612 (NumOfDestRegs <= 2 && NumShuffles < 4))
613 return Cost;
614 return InstructionCost::getInvalid();
615}
616
617InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
618 ArrayRef<int> Mask,
619 TTI::TargetCostKind CostKind) const {
620 // Avoid missing masks and length changing shuffles
621 if (Mask.size() <= 2 || Mask.size() != Tp->getNumElements())
623
624 int NumElts = Tp->getNumElements();
625 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
626 // Avoid scalarization cases
627 if (!LT.second.isFixedLengthVector())
628 return InstructionCost::getInvalid();
629
630 // Requires moving elements between parts, which requires additional
631 // unmodeled instructions.
632 if (LT.first != 1)
633 return InstructionCost::getInvalid();
634
635 auto GetSlideOpcode = [&](int SlideAmt) {
636 assert(SlideAmt != 0);
637 bool IsVI = isUInt<5>(std::abs(SlideAmt));
638 if (SlideAmt < 0)
639 return IsVI ? RISCV::VSLIDEDOWN_VI : RISCV::VSLIDEDOWN_VX;
640 return IsVI ? RISCV::VSLIDEUP_VI : RISCV::VSLIDEUP_VX;
641 };
642
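// Roughly: a "masked slide pair" is a shuffle in which every defined lane
// reads one of (at most) two sources at a fixed per-source offset, so it can
// be lowered as up to two slides plus, when both sources are live, a vmerge
// under a constant mask.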
643 std::array<std::pair<int, int>, 2> SrcInfo;
644 if (!isMaskedSlidePair(Mask, NumElts, SrcInfo))
645 return InstructionCost::getInvalid();
646
647 if (SrcInfo[1].second == 0)
648 std::swap(SrcInfo[0], SrcInfo[1]);
649
650 InstructionCost FirstSlideCost = 0;
651 if (SrcInfo[0].second != 0) {
652 unsigned Opcode = GetSlideOpcode(SrcInfo[0].second);
653 FirstSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
654 }
655
656 if (SrcInfo[1].first == -1)
657 return FirstSlideCost;
658
659 InstructionCost SecondSlideCost = 0;
660 if (SrcInfo[1].second != 0) {
661 unsigned Opcode = GetSlideOpcode(SrcInfo[1].second);
662 SecondSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
663 } else {
664 SecondSlideCost =
665 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
666 }
667
668 auto EC = Tp->getElementCount();
669 VectorType *MaskTy =
670 VectorType::get(IntegerType::getInt1Ty(Tp->getContext()), EC);
671 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
672 return FirstSlideCost + SecondSlideCost + MaskCost;
673}
674
675InstructionCost
676RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
677 VectorType *SrcTy, ArrayRef<int> Mask,
678 TTI::TargetCostKind CostKind, int Index,
679 VectorType *SubTp, ArrayRef<const Value *> Args,
680 const Instruction *CxtI) const {
681 assert((Mask.empty() || DstTy->isScalableTy() ||
682 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
683 "Expected the Mask to match the return size if given");
684 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
685 "Expected the same scalar types");
686
687 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
688 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
689
690 // First, handle cases where having a fixed length vector enables us to
691 // give a more accurate cost than falling back to generic scalable codegen.
692 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
693 if (auto *FVTp = dyn_cast<FixedVectorType>(SrcTy);
694 FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) {
695 InstructionCost VRegSplittingCost = costShuffleViaVRegSplitting(
696 *this, LT.second, ST->getRealVLen(),
697 Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind);
698 if (VRegSplittingCost.isValid())
699 return VRegSplittingCost;
700 switch (Kind) {
701 default:
702 break;
703 case TTI::SK_PermuteSingleSrc: {
704 if (Mask.size() >= 2) {
705 MVT EltTp = LT.second.getVectorElementType();
706 // If the size of the element is < ELEN then shuffles of interleaves and
707 // deinterleaves of 2 vectors can be lowered into the following
708 // sequences
709 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
710 // Example sequence:
711 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
712 // vwaddu.vv v10, v8, v9
713 // li a0, -1 (ignored)
714 // vwmaccu.vx v10, a0, v9
715 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
716 return 2 * LT.first * TLI->getLMULCost(LT.second);
717
718 if (Mask[0] == 0 || Mask[0] == 1) {
719 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
720 // Example sequence:
721 // vnsrl.wi v10, v8, 0
722 if (equal(DeinterleaveMask, Mask))
723 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
724 LT.second, CostKind);
725 }
726 }
727 int SubVectorSize;
728 if (LT.second.getScalarSizeInBits() != 1 &&
729 isRepeatedConcatMask(Mask, SubVectorSize)) {
730 InstructionCost Cost = 0;
731 unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
732 // The cost of extraction from a subvector is 0 if the index is 0.
733 for (unsigned I = 0; I != NumSlides; ++I) {
734 unsigned InsertIndex = SubVectorSize * (1 << I);
735 FixedVectorType *SubTp =
736 FixedVectorType::get(SrcTy->getElementType(), InsertIndex);
737 FixedVectorType *DestTp =
738 FixedVectorType::get(SrcTy->getElementType(), InsertIndex * 2);
739 std::pair<InstructionCost, MVT> DestLT =
740 getTypeLegalizationCost(DestTp);
741 // Add the cost of whole vector register move because the
742 // destination vector register group for vslideup cannot overlap the
743 // source.
744 Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
745 Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {},
746 CostKind, InsertIndex, SubTp);
747 }
748 return Cost;
749 }
750 }
751
752 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
753 SlideCost.isValid())
754 return SlideCost;
755
756 // vrgather + cost of generating the mask constant.
757 // We model this for an unknown mask with a single vrgather.
758 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
759 LT.second.getVectorNumElements() <= 256)) {
760 VectorType *IdxTy =
761 getVRGatherIndexType(LT.second, *ST, SrcTy->getContext());
762 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
763 return IndexCost +
764 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
765 }
766 break;
767 }
768 case TTI::SK_Transpose:
769 case TTI::SK_PermuteTwoSrc: {
770
771 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
772 SlideCost.isValid())
773 return SlideCost;
774
775 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
776 // register for the second vrgather. We model this for an unknown
777 // (shuffle) mask.
778 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
779 LT.second.getVectorNumElements() <= 256)) {
780 auto &C = SrcTy->getContext();
781 auto EC = SrcTy->getElementCount();
782 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
783 VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
784 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
785 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
786 return 2 * IndexCost +
787 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
788 LT.second, CostKind) +
789 MaskCost;
790 }
791 break;
792 }
793 }
794
795 auto shouldSplit = [](TTI::ShuffleKind Kind) {
796 switch (Kind) {
797 default:
798 return false;
799 case TTI::SK_PermuteSingleSrc:
800 case TTI::SK_Transpose:
801 case TTI::SK_PermuteTwoSrc:
802 return true;
803 }
804 };
805
806 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
807 shouldSplit(Kind)) {
808 InstructionCost SplitCost =
809 costShuffleViaSplitting(*this, LT.second, FVTp, Mask, CostKind);
810 if (SplitCost.isValid())
811 return SplitCost;
812 }
813 }
814
815 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
816 switch (Kind) {
817 default:
818 // Fallthrough to generic handling.
819 // TODO: Most of these cases will return getInvalid in generic code, and
820 // must be implemented here.
821 break;
822 case TTI::SK_ExtractSubvector:
823 // Extract at zero is always a subregister extract
824 if (Index == 0)
825 return TTI::TCC_Free;
826
827 // If we're extracting a subvector of at most m1 size at a sub-register
828 // boundary - which unfortunately we need exact vlen to identify - this is
829 // a subregister extract at worst and thus won't require a vslidedown.
830 // TODO: Extend for aligned m2, m4 subvector extracts
831 // TODO: Extend for misaligned (but contained) extracts
832 // TODO: Extend for scalable subvector types
833 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
834 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
835 if (std::optional<unsigned> VLen = ST->getRealVLen();
836 VLen && SubLT.second.getScalarSizeInBits() * Index % *VLen == 0 &&
837 SubLT.second.getSizeInBits() <= *VLen)
838 return TTI::TCC_Free;
839 }
840
841 // Example sequence:
842 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
843 // vslidedown.vi v8, v9, 2
844 return LT.first *
845 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
846 case TTI::SK_InsertSubvector:
847 // Example sequence:
848 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
849 // vslideup.vi v8, v9, 2
850 LT = getTypeLegalizationCost(DstTy);
851 return LT.first *
852 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
853 case TTI::SK_Select: {
854 // Example sequence:
855 // li a0, 90
856 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
857 // vmv.s.x v0, a0
858 // vmerge.vvm v8, v9, v8, v0
859 // We use 2 for the cost of the mask materialization as this is the true
860 // cost for small masks and most shuffles are small. At worst, this cost
861 // should be a very small constant for the constant pool load. As such,
862 // we may bias towards large selects slightly more than truly warranted.
863 return LT.first *
864 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
865 LT.second, CostKind));
866 }
867 case TTI::SK_Broadcast: {
868 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
869 Instruction::InsertElement);
870 if (LT.second.getScalarSizeInBits() == 1) {
871 if (HasScalar) {
872 // Example sequence:
873 // andi a0, a0, 1
874 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
875 // vmv.v.x v8, a0
876 // vmsne.vi v0, v8, 0
877 return LT.first *
878 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
879 LT.second, CostKind));
880 }
881 // Example sequence:
882 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
883 // vmv.v.i v8, 0
884 // vmerge.vim v8, v8, 1, v0
885 // vmv.x.s a0, v8
886 // andi a0, a0, 1
887 // vmv.v.x v8, a0
888 // vmsne.vi v0, v8, 0
889
890 return LT.first *
891 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
892 RISCV::VMV_X_S, RISCV::VMV_V_X,
893 RISCV::VMSNE_VI},
894 LT.second, CostKind));
895 }
896
897 if (HasScalar) {
898 // Example sequence:
899 // vmv.v.x v8, a0
900 return LT.first *
901 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
902 }
903
904 // Example sequence:
905 // vrgather.vi v9, v8, 0
906 return LT.first *
907 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
908 }
909 case TTI::SK_Splice: {
910 // vslidedown+vslideup.
911 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
912 // of similar code, but I think we expand through memory.
913 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
914 if (Index >= 0 && Index < 32)
915 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
916 else if (Index < 0 && Index > -32)
917 Opcodes[1] = RISCV::VSLIDEUP_VI;
918 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
919 }
920 case TTI::SK_Reverse: {
921
922 if (!LT.second.isVector())
923 return InstructionCost::getInvalid();
924
925 // TODO: Cases to improve here:
926 // * Illegal vector types
927 // * i64 on RV32
928 if (SrcTy->getElementType()->isIntegerTy(1)) {
929 VectorType *WideTy =
930 VectorType::get(IntegerType::get(SrcTy->getContext(), 8),
931 cast<VectorType>(SrcTy)->getElementCount());
932 return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy,
933 TTI::CastContextHint::None, CostKind) +
934 getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0,
935 nullptr) +
936 getCastInstrCost(Instruction::Trunc, SrcTy, WideTy,
937 TTI::CastContextHint::None, CostKind);
938 }
939
940 MVT ContainerVT = LT.second;
941 if (LT.second.isFixedLengthVector())
942 ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
943 MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
944 if (ContainerVT.bitsLE(M1VT)) {
945 // Example sequence:
946 // csrr a0, vlenb
947 // srli a0, a0, 3
948 // addi a0, a0, -1
949 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
950 // vid.v v9
951 // vrsub.vx v10, v9, a0
952 // vrgather.vv v9, v8, v10
953 InstructionCost LenCost = 3;
954 if (LT.second.isFixedLengthVector())
955 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
956 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
957 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
958 if (LT.second.isFixedLengthVector() &&
959 isInt<5>(LT.second.getVectorNumElements() - 1))
960 Opcodes[1] = RISCV::VRSUB_VI;
961 InstructionCost GatherCost =
962 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
963 return LT.first * (LenCost + GatherCost);
964 }
965
966 // At high LMUL, we split into a series of M1 reverses (see
967 // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate
968 // the resulting gap at the bottom (for fixed vectors only). The important
969 // bit is that the cost scales linearly, not quadratically with LMUL.
970 unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX};
971 InstructionCost FixedCost =
972 getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3;
973 unsigned Ratio =
974 ContainerVT.getVectorMinNumElements() / M1VT.getVectorMinNumElements();
975 InstructionCost GatherCost =
976 getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio;
977 InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 0 :
978 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind);
979 return FixedCost + LT.first * (GatherCost + SlideCost);
980 }
981 }
982 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
983 SubTp);
984}
985
986static unsigned isM1OrSmaller(MVT VT) {
987 RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
988 return (LMUL == RISCVVType::VLMUL::LMUL_F8 ||
989 LMUL == RISCVVType::VLMUL::LMUL_F4 ||
990 LMUL == RISCVVType::VLMUL::LMUL_F2 ||
991 LMUL == RISCVVType::VLMUL::LMUL_1);
992}
993
994InstructionCost RISCVTTIImpl::getScalarizationOverhead(
995 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
996 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
997 TTI::VectorInstrContext VIC) const {
998 if (isa<ScalableVectorType>(Ty))
999 return InstructionCost::getInvalid();
1000
1001 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1002 // For now, skip all fixed vector cost analysis when P extension is available
1003 // to avoid crashes in getMinRVVVectorSizeInBits()
1004 if (ST->hasStdExtP() && isa<FixedVectorType>(Ty)) {
1005 return 1; // Treat as single instruction cost for now
1006 }
1007
1008 // A build_vector (which is m1 sized or smaller) can be done in no
1009 // worse than one vslide1down.vx per element in the type. We could
1010 // in theory do an explode_vector in the inverse manner, but our
1011 // lowering today does not have a first class node for this pattern.
1012 InstructionCost Cost = BaseT::getScalarizationOverhead(
1013 Ty, DemandedElts, Insert, Extract, CostKind);
1014 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1015 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
1016 if (Ty->getScalarSizeInBits() == 1) {
1017 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
1018 // Note: Implicit scalar anyextend is assumed to be free since the i1
1019 // must be stored in a GPR.
1020 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
1021 CostKind) +
1022 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
1023 TTI::CastContextHint::None, CostKind);
1024 }
1025
1026 assert(LT.second.isFixedLengthVector());
1027 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
1028 if (isM1OrSmaller(ContainerVT)) {
1029 InstructionCost BV =
1030 cast<FixedVectorType>(Ty)->getNumElements() *
1031 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
1032 if (BV < Cost)
1033 Cost = BV;
1034 }
1035 }
1036 return Cost;
1037}
1038
1042 Type *DataTy = MICA.getDataType();
1043 Align Alignment = MICA.getAlignment();
1044 switch (MICA.getID()) {
1045 case Intrinsic::vp_load_ff: {
1046 EVT DataTypeVT = TLI->getValueType(DL, DataTy);
1047 if (!TLI->isLegalFirstFaultLoad(DataTypeVT, Alignment))
1048 return InstructionCost::getInvalid();
1049
1050 unsigned AS = MICA.getAddressSpace();
1051 return getMemoryOpCost(Instruction::Load, DataTy, Alignment, AS, CostKind,
1052 {TTI::OK_AnyValue, TTI::OP_None}, nullptr);
1053 }
1054 case Intrinsic::experimental_vp_strided_load:
1055 case Intrinsic::experimental_vp_strided_store:
1056 return getStridedMemoryOpCost(MICA, CostKind);
1057 case Intrinsic::masked_compressstore:
1058 case Intrinsic::masked_expandload:
1059 return getExpandCompressMemoryOpCost(MICA, CostKind);
1060 case Intrinsic::vp_scatter:
1061 case Intrinsic::vp_gather:
1062 case Intrinsic::masked_scatter:
1063 case Intrinsic::masked_gather:
1064 return getGatherScatterOpCost(MICA, CostKind);
1065 case Intrinsic::vp_load:
1066 case Intrinsic::vp_store:
1067 case Intrinsic::masked_load:
1068 case Intrinsic::masked_store:
1069 return getMaskedMemoryOpCost(MICA, CostKind);
1070 }
1072}
1073
1074InstructionCost
1075RISCVTTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
1076 TTI::TargetCostKind CostKind) const {
1077 unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load
1078 : Instruction::Store;
1079 Type *Src = MICA.getDataType();
1080 Align Alignment = MICA.getAlignment();
1081 unsigned AddressSpace = MICA.getAddressSpace();
1082
1083 if (!isLegalMaskedLoadStore(Src, Alignment) ||
1086
1087 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
1088}
1089
1090InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
1091 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1092 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1093 bool UseMaskForCond, bool UseMaskForGaps) const {
1094
1095 // The interleaved memory access pass will lower (de)interleave ops combined
1096 // with an adjacent appropriate memory to vlseg/vsseg intrinsics. vlseg/vsseg
1097 // only support masking per-iteration (i.e. condition), not per-segment (i.e.
1098 // gap).
1099 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
1100 auto *VTy = cast<VectorType>(VecTy);
1101 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
1102 // Need to make sure the type hasn't been scalarized
1103 if (LT.second.isVector()) {
1104 auto *SubVecTy =
1105 VectorType::get(VTy->getElementType(),
1106 VTy->getElementCount().divideCoefficientBy(Factor));
1107 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
1108 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
1109 AddressSpace, DL)) {
1110
1111 // Some processors optimize segment loads/stores as one wide memory op +
1112 // Factor * LMUL shuffle ops.
1113 if (ST->hasOptimizedSegmentLoadStore(Factor)) {
1114 InstructionCost Cost =
1115 getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
1116 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
1117 Cost += Factor * TLI->getLMULCost(SubVecVT);
1118 return LT.first * Cost;
1119 }
1120
1121 // Otherwise, the cost is proportional to the number of elements (VL *
1122 // Factor ops).
1123 InstructionCost MemOpCost =
1124 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
1125 CostKind, {TTI::OK_AnyValue, TTI::OP_None});
1126 unsigned NumLoads = getEstimatedVLFor(VTy);
1127 return NumLoads * MemOpCost;
1128 }
1129 }
1130 }
1131
1132 // TODO: Return the cost of interleaved accesses for scalable vector when
1133 // unable to convert to segment accesses instructions.
1134 if (isa<ScalableVectorType>(VecTy))
1135 return InstructionCost::getInvalid();
1136
1137 auto *FVTy = cast<FixedVectorType>(VecTy);
1138 InstructionCost MemCost =
1139 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
1140 unsigned VF = FVTy->getNumElements() / Factor;
1141
1142 // An interleaved load will look like this for Factor=3:
1143 // %wide.vec = load <12 x i32>, ptr %3, align 4
1144 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1145 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1146 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1147 if (Opcode == Instruction::Load) {
1148 InstructionCost Cost = MemCost;
1149 for (unsigned Index : Indices) {
1150 FixedVectorType *VecTy =
1151 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
1152 auto Mask = createStrideMask(Index, Factor, VF);
1153 Mask.resize(VF * Factor, -1);
1154 InstructionCost ShuffleCost =
1155 getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, VecTy,
1156 Mask, CostKind, 0, nullptr, {});
1157 Cost += ShuffleCost;
1158 }
1159 return Cost;
1160 }
1161
1162 // TODO: Model for NF > 2
1163 // We'll need to enhance getShuffleCost to model shuffles that are just
1164 // inserts and extracts into subvectors, since they won't have the full cost
1165 // of a vrgather.
1166 // An interleaved store for 3 vectors of 4 lanes will look like
1167 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
1168 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
1169 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
1170 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
1171 // store <12 x i32> %interleaved.vec, ptr %10, align 4
1172 if (Factor != 2)
1173 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1174 Alignment, AddressSpace, CostKind,
1175 UseMaskForCond, UseMaskForGaps);
1176
1177 assert(Opcode == Instruction::Store && "Opcode must be a store");
1178 // For an interleaving store of 2 vectors, we perform one large interleaving
1179 // shuffle that goes into the wide store
1180 auto Mask = createInterleaveMask(VF, Factor);
1181 InstructionCost ShuffleCost =
1182 getShuffleCost(TTI::SK_PermuteSingleSrc, FVTy, FVTy, Mask,
1183 CostKind, 0, nullptr, {});
1184 return MemCost + ShuffleCost;
1185}
1186
1187InstructionCost
1188RISCVTTIImpl::getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA,
1189 TTI::TargetCostKind CostKind) const {
1190
1191 bool IsLoad = MICA.getID() == Intrinsic::masked_gather ||
1192 MICA.getID() == Intrinsic::vp_gather;
1193 unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
1194 Type *DataTy = MICA.getDataType();
1195 Align Alignment = MICA.getAlignment();
1198
1199 if ((Opcode == Instruction::Load &&
1200 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
1201 (Opcode == Instruction::Store &&
1202 !isLegalMaskedScatter(DataTy, Align(Alignment))))
1204
1205 // Cost is proportional to the number of memory operations implied. For
1206 // scalable vectors, we use an estimate on that number since we don't
1207 // know exactly what VL will be.
1208 auto &VTy = *cast<VectorType>(DataTy);
1209 unsigned NumLoads = getEstimatedVLFor(&VTy);
1210 return NumLoads * TTI::TCC_Basic;
1211}
1212
1213InstructionCost RISCVTTIImpl::getExpandCompressMemoryOpCost(
1214 const MemIntrinsicCostAttributes &MICA,
1215 TTI::TargetCostKind CostKind) const {
1216 unsigned Opcode = MICA.getID() == Intrinsic::masked_expandload
1217 ? Instruction::Load
1218 : Instruction::Store;
1219 Type *DataTy = MICA.getDataType();
1220 bool VariableMask = MICA.getVariableMask();
1221 Align Alignment = MICA.getAlignment();
1222 bool IsLegal = (Opcode == Instruction::Store &&
1223 isLegalMaskedCompressStore(DataTy, Alignment)) ||
1224 (Opcode == Instruction::Load &&
1225 isLegalMaskedExpandLoad(DataTy, Alignment));
1226 if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
1227 return InstructionCost::getInvalid();
1228 // Example compressstore sequence:
1229 // vsetivli zero, 8, e32, m2, ta, ma (ignored)
1230 // vcompress.vm v10, v8, v0
1231 // vcpop.m a1, v0
1232 // vsetvli zero, a1, e32, m2, ta, ma
1233 // vse32.v v10, (a0)
1234 // Example expandload sequence:
1235 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
1236 // vcpop.m a1, v0
1237 // vsetvli zero, a1, e32, m2, ta, ma
1238 // vle32.v v10, (a0)
1239 // vsetivli zero, 8, e32, m2, ta, ma
1240 // viota.m v12, v0
1241 // vrgather.vv v8, v10, v12, v0.t
1242 auto MemOpCost =
1243 getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
1244 auto LT = getTypeLegalizationCost(DataTy);
1245 SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
1246 if (VariableMask)
1247 Opcodes.push_back(RISCV::VCPOP_M);
1248 if (Opcode == Instruction::Store)
1249 Opcodes.append({RISCV::VCOMPRESS_VM});
1250 else
1251 Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
1252 return MemOpCost +
1253 LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1254}
1255
1256InstructionCost
1257RISCVTTIImpl::getStridedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
1258 TTI::TargetCostKind CostKind) const {
1259
1260 unsigned Opcode = MICA.getID() == Intrinsic::experimental_vp_strided_load
1261 ? Instruction::Load
1262 : Instruction::Store;
1263
1264 Type *DataTy = MICA.getDataType();
1265 Align Alignment = MICA.getAlignment();
1266 const Instruction *I = MICA.getInst();
1267
1268 if (!isLegalStridedLoadStore(DataTy, Alignment))
1270
1271 if (CostKind == TTI::TCK_CodeSize)
1272 return TTI::TCC_Basic;
1273
1274 // Cost is proportional to the number of memory operations implied. For
1275 // scalable vectors, we use an estimate on that number since we don't
1276 // know exactly what VL will be.
1277 // FIXME: This will overcost for i64 on rv32 with +zve64x.
1278 auto &VTy = *cast<VectorType>(DataTy);
1279 InstructionCost MemOpCost =
1280 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1281 {TTI::OK_AnyValue, TTI::OP_None}, I);
1282 unsigned NumLoads = getEstimatedVLFor(&VTy);
1283 return NumLoads * MemOpCost;
1284}
1285
1286InstructionCost
1287RISCVTTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {
1288 // FIXME: This is a property of the default vector convention, not
1289 // all possible calling conventions. Fixing that will require
1290 // some TTI API and SLP rework.
1291 InstructionCost Cost = 0;
1292 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1293 for (auto *Ty : Tys) {
1294 if (!Ty->isVectorTy())
1295 continue;
1296 Align A = DL.getPrefTypeAlign(Ty);
1297 Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
1298 getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
1299 }
1300 return Cost;
1301}
1302
1303// Currently, these represent both throughput and codesize costs
1304// for the respective intrinsics. The costs in this table are simply
1305// instruction counts with the following adjustments made:
1306// * One vsetvli is considered free.
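// For example, the first entry below says a vectorized llvm.floor over f32
// elements costs about 9 instructions for each legalized vector it is split
// into.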
1307static const CostTblEntry VectorIntrinsicCostTable[]{
1308 {Intrinsic::floor, MVT::f32, 9},
1309 {Intrinsic::floor, MVT::f64, 9},
1310 {Intrinsic::ceil, MVT::f32, 9},
1311 {Intrinsic::ceil, MVT::f64, 9},
1312 {Intrinsic::trunc, MVT::f32, 7},
1313 {Intrinsic::trunc, MVT::f64, 7},
1314 {Intrinsic::round, MVT::f32, 9},
1315 {Intrinsic::round, MVT::f64, 9},
1316 {Intrinsic::roundeven, MVT::f32, 9},
1317 {Intrinsic::roundeven, MVT::f64, 9},
1318 {Intrinsic::rint, MVT::f32, 7},
1319 {Intrinsic::rint, MVT::f64, 7},
1320 {Intrinsic::nearbyint, MVT::f32, 9},
1321 {Intrinsic::nearbyint, MVT::f64, 9},
1322 {Intrinsic::bswap, MVT::i16, 3},
1323 {Intrinsic::bswap, MVT::i32, 12},
1324 {Intrinsic::bswap, MVT::i64, 31},
1325 {Intrinsic::vp_bswap, MVT::i16, 3},
1326 {Intrinsic::vp_bswap, MVT::i32, 12},
1327 {Intrinsic::vp_bswap, MVT::i64, 31},
1328 {Intrinsic::vp_fshl, MVT::i8, 7},
1329 {Intrinsic::vp_fshl, MVT::i16, 7},
1330 {Intrinsic::vp_fshl, MVT::i32, 7},
1331 {Intrinsic::vp_fshl, MVT::i64, 7},
1332 {Intrinsic::vp_fshr, MVT::i8, 7},
1333 {Intrinsic::vp_fshr, MVT::i16, 7},
1334 {Intrinsic::vp_fshr, MVT::i32, 7},
1335 {Intrinsic::vp_fshr, MVT::i64, 7},
1336 {Intrinsic::bitreverse, MVT::i8, 17},
1337 {Intrinsic::bitreverse, MVT::i16, 24},
1338 {Intrinsic::bitreverse, MVT::i32, 33},
1339 {Intrinsic::bitreverse, MVT::i64, 52},
1340 {Intrinsic::vp_bitreverse, MVT::i8, 17},
1341 {Intrinsic::vp_bitreverse, MVT::i16, 24},
1342 {Intrinsic::vp_bitreverse, MVT::i32, 33},
1343 {Intrinsic::vp_bitreverse, MVT::i64, 52},
1344 {Intrinsic::ctpop, MVT::i8, 12},
1345 {Intrinsic::ctpop, MVT::i16, 19},
1346 {Intrinsic::ctpop, MVT::i32, 20},
1347 {Intrinsic::ctpop, MVT::i64, 21},
1348 {Intrinsic::ctlz, MVT::i8, 19},
1349 {Intrinsic::ctlz, MVT::i16, 28},
1350 {Intrinsic::ctlz, MVT::i32, 31},
1351 {Intrinsic::ctlz, MVT::i64, 35},
1352 {Intrinsic::cttz, MVT::i8, 16},
1353 {Intrinsic::cttz, MVT::i16, 23},
1354 {Intrinsic::cttz, MVT::i32, 24},
1355 {Intrinsic::cttz, MVT::i64, 25},
1356 {Intrinsic::vp_ctpop, MVT::i8, 12},
1357 {Intrinsic::vp_ctpop, MVT::i16, 19},
1358 {Intrinsic::vp_ctpop, MVT::i32, 20},
1359 {Intrinsic::vp_ctpop, MVT::i64, 21},
1360 {Intrinsic::vp_ctlz, MVT::i8, 19},
1361 {Intrinsic::vp_ctlz, MVT::i16, 28},
1362 {Intrinsic::vp_ctlz, MVT::i32, 31},
1363 {Intrinsic::vp_ctlz, MVT::i64, 35},
1364 {Intrinsic::vp_cttz, MVT::i8, 16},
1365 {Intrinsic::vp_cttz, MVT::i16, 23},
1366 {Intrinsic::vp_cttz, MVT::i32, 24},
1367 {Intrinsic::vp_cttz, MVT::i64, 25},
1368};
1369
1370InstructionCost
1371RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1372 TTI::TargetCostKind CostKind) const {
1373 auto *RetTy = ICA.getReturnType();
1374 switch (ICA.getID()) {
1375 case Intrinsic::lrint:
1376 case Intrinsic::llrint:
1377 case Intrinsic::lround:
1378 case Intrinsic::llround: {
1379 auto LT = getTypeLegalizationCost(RetTy);
1380 Type *SrcTy = ICA.getArgTypes().front();
1381 auto SrcLT = getTypeLegalizationCost(SrcTy);
1382 if (ST->hasVInstructions() && LT.second.isVector()) {
1383 SmallVector<unsigned, 2> Ops;
1384 unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType());
1385 unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType());
1386 if (LT.second.getVectorElementType() == MVT::bf16) {
1387 if (!ST->hasVInstructionsBF16Minimal())
1388 return InstructionCost::getInvalid();
1389 if (DstEltSz == 32)
1390 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
1391 else
1392 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
1393 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1394 !ST->hasVInstructionsF16()) {
1395 if (!ST->hasVInstructionsF16Minimal())
1396 return InstructionCost::getInvalid();
1397 if (DstEltSz == 32)
1398 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
1399 else
1400 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};
1401
1402 } else if (SrcEltSz > DstEltSz) {
1403 Ops = {RISCV::VFNCVT_X_F_W};
1404 } else if (SrcEltSz < DstEltSz) {
1405 Ops = {RISCV::VFWCVT_X_F_V};
1406 } else {
1407 Ops = {RISCV::VFCVT_X_F_V};
1408 }
1409
1410 // We need to use the source LMUL in the case of a narrowing op, and the
1411 // destination LMUL otherwise.
1412 if (SrcEltSz > DstEltSz)
1413 return SrcLT.first *
1414 getRISCVInstructionCost(Ops, SrcLT.second, CostKind);
1415 return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind);
1416 }
1417 break;
1418 }
1419 case Intrinsic::ceil:
1420 case Intrinsic::floor:
1421 case Intrinsic::trunc:
1422 case Intrinsic::rint:
1423 case Intrinsic::round:
1424 case Intrinsic::roundeven: {
1425 // These all use the same code.
1426 auto LT = getTypeLegalizationCost(RetTy);
1427 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1428 return LT.first * 8;
1429 break;
1430 }
1431 case Intrinsic::umin:
1432 case Intrinsic::umax:
1433 case Intrinsic::smin:
1434 case Intrinsic::smax: {
1435 auto LT = getTypeLegalizationCost(RetTy);
1436 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
1437 return LT.first;
1438
1439 if (ST->hasVInstructions() && LT.second.isVector()) {
1440 unsigned Op;
1441 switch (ICA.getID()) {
1442 case Intrinsic::umin:
1443 Op = RISCV::VMINU_VV;
1444 break;
1445 case Intrinsic::umax:
1446 Op = RISCV::VMAXU_VV;
1447 break;
1448 case Intrinsic::smin:
1449 Op = RISCV::VMIN_VV;
1450 break;
1451 case Intrinsic::smax:
1452 Op = RISCV::VMAX_VV;
1453 break;
1454 }
1455 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1456 }
1457 break;
1458 }
1459 case Intrinsic::sadd_sat:
1460 case Intrinsic::ssub_sat:
1461 case Intrinsic::uadd_sat:
1462 case Intrinsic::usub_sat: {
1463 auto LT = getTypeLegalizationCost(RetTy);
1464 if (ST->hasVInstructions() && LT.second.isVector()) {
1465 unsigned Op;
1466 switch (ICA.getID()) {
1467 case Intrinsic::sadd_sat:
1468 Op = RISCV::VSADD_VV;
1469 break;
1470 case Intrinsic::ssub_sat:
1471 Op = RISCV::VSSUB_VV;
1472 break;
1473 case Intrinsic::uadd_sat:
1474 Op = RISCV::VSADDU_VV;
1475 break;
1476 case Intrinsic::usub_sat:
1477 Op = RISCV::VSSUBU_VV;
1478 break;
1479 }
1480 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1481 }
1482 break;
1483 }
1484 case Intrinsic::fma:
1485 case Intrinsic::fmuladd: {
1486 // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin
1487 auto LT = getTypeLegalizationCost(RetTy);
1488 if (ST->hasVInstructions() && LT.second.isVector())
1489 return LT.first *
1490 getRISCVInstructionCost(RISCV::VFMADD_VV, LT.second, CostKind);
1491 break;
1492 }
1493 case Intrinsic::fabs: {
1494 auto LT = getTypeLegalizationCost(RetTy);
1495 if (ST->hasVInstructions() && LT.second.isVector()) {
1496 // lui a0, 8
1497 // addi a0, a0, -1
1498 // vsetvli a1, zero, e16, m1, ta, ma
1499 // vand.vx v8, v8, a0
1500 // f16 with zvfhmin and bf16 with zvfbfmin
1501 if (LT.second.getVectorElementType() == MVT::bf16 ||
1502 (LT.second.getVectorElementType() == MVT::f16 &&
1503 !ST->hasVInstructionsF16()))
1504 return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
1505 CostKind) +
1506 2;
1507 else
1508 return LT.first *
1509 getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
1510 }
1511 break;
1512 }
1513 case Intrinsic::sqrt: {
1514 auto LT = getTypeLegalizationCost(RetTy);
1515 if (ST->hasVInstructions() && LT.second.isVector()) {
1516 SmallVector<unsigned, 4> ConvOp;
1517 SmallVector<unsigned, 2> FsqrtOp;
1518 MVT ConvType = LT.second;
1519 MVT FsqrtType = LT.second;
1520 // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
1521 // will be split.
1522 if (LT.second.getVectorElementType() == MVT::bf16) {
1523 if (LT.second == MVT::nxv32bf16) {
1524 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
1525 RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
1526 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1527 ConvType = MVT::nxv16f16;
1528 FsqrtType = MVT::nxv16f32;
1529 } else {
1530 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
1531 FsqrtOp = {RISCV::VFSQRT_V};
1532 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1533 }
1534 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1535 !ST->hasVInstructionsF16()) {
1536 if (LT.second == MVT::nxv32f16) {
1537 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
1538 RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
1539 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1540 ConvType = MVT::nxv16f16;
1541 FsqrtType = MVT::nxv16f32;
1542 } else {
1543 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
1544 FsqrtOp = {RISCV::VFSQRT_V};
1545 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1546 }
1547 } else {
1548 FsqrtOp = {RISCV::VFSQRT_V};
1549 }
1550
1551 return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) +
1552 getRISCVInstructionCost(ConvOp, ConvType, CostKind));
1553 }
1554 break;
1555 }
1556 case Intrinsic::cttz:
1557 case Intrinsic::ctlz:
1558 case Intrinsic::ctpop: {
1559 auto LT = getTypeLegalizationCost(RetTy);
1560 if (ST->hasStdExtZvbb() && LT.second.isVector()) {
1561 unsigned Op;
1562 switch (ICA.getID()) {
1563 case Intrinsic::cttz:
1564 Op = RISCV::VCTZ_V;
1565 break;
1566 case Intrinsic::ctlz:
1567 Op = RISCV::VCLZ_V;
1568 break;
1569 case Intrinsic::ctpop:
1570 Op = RISCV::VCPOP_V;
1571 break;
1572 }
1573 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1574 }
1575 break;
1576 }
1577 case Intrinsic::abs: {
1578 auto LT = getTypeLegalizationCost(RetTy);
1579 if (ST->hasVInstructions() && LT.second.isVector()) {
1580 // vabs.v v10, v8
1581 if (ST->hasStdExtZvabd())
1582 return LT.first *
1583 getRISCVInstructionCost({RISCV::VABS_V}, LT.second, CostKind);
1584
1585 // vrsub.vi v10, v8, 0
1586 // vmax.vv v8, v8, v10
1587 return LT.first *
1588 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
1589 LT.second, CostKind);
1590 }
1591 break;
1592 }
1593 case Intrinsic::fshl:
1594 case Intrinsic::fshr: {
1595 if (ICA.getArgs().empty())
1596 break;
1597
1598 // Funnel-shifts are ROTL/ROTR when the first and second operand are equal.
1599 // When Zbb/Zbkb is enabled we can use a single ROL(W)/ROR(I)(W)
1600 // instruction.
1601 if ((ST->hasStdExtZbb() || ST->hasStdExtZbkb()) && RetTy->isIntegerTy() &&
1602 ICA.getArgs()[0] == ICA.getArgs()[1] &&
1603 (RetTy->getIntegerBitWidth() == 32 ||
1604 RetTy->getIntegerBitWidth() == 64) &&
1605 RetTy->getIntegerBitWidth() <= ST->getXLen()) {
1606 return 1;
1607 }
1608 break;
1609 }
1610 case Intrinsic::get_active_lane_mask: {
1611 if (ST->hasVInstructions()) {
1612 Type *ExpRetTy = VectorType::get(
1613 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1614 auto LT = getTypeLegalizationCost(ExpRetTy);
1615
1616 // vid.v v8 // considered hoisted
1617 // vsaddu.vx v8, v8, a0
1618 // vmsltu.vx v0, v8, a1
1619 return LT.first *
1620 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
1621 LT.second, CostKind);
1622 }
1623 break;
1624 }
1625 // TODO: add more intrinsic
1626 case Intrinsic::stepvector: {
1627 auto LT = getTypeLegalizationCost(RetTy);
1628 // Legalisation of illegal types involves an `index' instruction plus
1629 // (LT.first - 1) vector adds.
1630 if (ST->hasVInstructions())
1631 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
1632 (LT.first - 1) *
1633 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
1634 return 1 + (LT.first - 1);
1635 }
1636 case Intrinsic::vector_splice_left:
1637 case Intrinsic::vector_splice_right: {
1638 auto LT = getTypeLegalizationCost(RetTy);
1639 // Constant offsets fall through to getShuffleCost.
1640 if (!ICA.isTypeBasedOnly() && isa<ConstantInt>(ICA.getArgs()[2]))
1641 break;
1642 if (ST->hasVInstructions() && LT.second.isVector()) {
1643 return LT.first *
1644 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX},
1645 LT.second, CostKind);
1646 }
1647 break;
1648 }
1649 case Intrinsic::experimental_cttz_elts: {
1650 Type *ArgTy = ICA.getArgTypes()[0];
1651 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
1652 if (getTLI()->shouldExpandCttzElements(ArgType))
1653 break;
1654 InstructionCost Cost = getRISCVInstructionCost(
1655 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
1656
1657 // If zero_is_poison is false, then we will generate additional
1658 // cmp + select instructions to convert -1 to EVL.
1659 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
1660 if (ICA.getArgs().size() > 1 &&
1661 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
1662 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
1663 CmpInst::ICMP_SLT, CostKind) +
1664 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
1665 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1666
1667 return Cost;
1668 }
1669 case Intrinsic::experimental_vp_splice: {
1670 // To support type-based queries from the vectorizer, set the index to 0.
1671 // Note that the index only changes the cost from vslide.vx to vslide.vi,
1672 // and in the current implementation they have the same cost.
1674 cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
1676 }
1677 case Intrinsic::fptoui_sat:
1678 case Intrinsic::fptosi_sat: {
1679 InstructionCost Cost = 0;
1680 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1681 Type *SrcTy = ICA.getArgTypes()[0];
1682
1683 auto SrcLT = getTypeLegalizationCost(SrcTy);
1684 auto DstLT = getTypeLegalizationCost(RetTy);
1685 if (!SrcTy->isVectorTy())
1686 break;
1687
1688 if (!SrcLT.first.isValid() || !DstLT.first.isValid())
1689 return InstructionCost::getInvalid();
1690
1691 Cost +=
1692 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
1693 RetTy, SrcTy, TTI::CastContextHint::None, CostKind);
1694
1695 // Handle NaN.
1696 // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1.
1697 // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0.
1698 Type *CondTy = RetTy->getWithNewBitWidth(1);
1699 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, SrcTy, CondTy,
1700 CmpInst::FCMP_UNO, CostKind);
1701 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1702 CmpInst::FCMP_UNO, CostKind);
1703 return Cost;
1704 }
1705 }
1706
1707 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1708 if (auto LT = getTypeLegalizationCost(RetTy);
1709 LT.second.isVector()) {
1710 MVT EltTy = LT.second.getVectorElementType();
1711 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1712 ICA.getID(), EltTy))
1713 return LT.first * Entry->Cost;
1714 }
1715 }
1716
1717 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1718}
1719
1720 InstructionCost
1721 RISCVTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
1722 const SCEV *Ptr,
1723 TTI::TargetCostKind CostKind) const {
1724 // Address computations for vector indexed load/store likely require an offset
1725 // and/or scaling.
1726 if (ST->hasVInstructions() && PtrTy->isVectorTy())
1727 return getArithmeticInstrCost(Instruction::Add, PtrTy, CostKind);
1728
1729 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1730}
1731
1732 InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1733 Type *Src,
1734 TTI::CastContextHint CCH,
1735 TTI::TargetCostKind CostKind,
1736 const Instruction *I) const {
1737 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1738 if (!IsVectorType)
1739 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1740
1741 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1742 // For now, skip all fixed vector cost analysis when P extension is available
1743 // to avoid crashes in getMinRVVVectorSizeInBits()
1744 if (ST->hasStdExtP() &&
1745 (isa<FixedVectorType>(Dst) || isa<FixedVectorType>(Src))) {
1746 return 1; // Treat as single instruction cost for now
1747 }
1748
1749 // FIXME: Need to compute legalizing cost for illegal types. The current
1750 // code handles only legal types and those which can be trivially
1751 // promoted to legal.
1752 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1753 Dst->getScalarSizeInBits() > ST->getELen())
1754 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1755
1756 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1757 assert(ISD && "Invalid opcode");
1758 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1759 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1760
1761 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1762 // The shared implementation doesn't model vector widening during legalization
1763 // and instead assumes scalarization. In order to scalarize an <N x i1>
1764 // vector, we need to extend/trunc to/from i8. If we don't special case
1765 // this, we can get an infinite recursion cycle.
1766 switch (ISD) {
1767 default:
1768 break;
1769 case ISD::SIGN_EXTEND:
1770 case ISD::ZERO_EXTEND:
1771 if (Src->getScalarSizeInBits() == 1) {
1772 // We do not use vsext/vzext to extend from mask vector.
1773 // Instead we use the following instructions to extend from mask vector:
1774 // vmv.v.i v8, 0
1775 // vmerge.vim v8, v8, -1, v0 (repeated per split)
1776 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1777 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1778 DstLT.second, CostKind) +
1779 DstLT.first - 1;
1780 }
1781 break;
1782 case ISD::TRUNCATE:
1783 if (Dst->getScalarSizeInBits() == 1) {
1784 // We do not use a chain of vncvt instructions to truncate to a mask vector,
1785 // so we cannot use PowDiff to calculate the cost.
1786 // Instead we use the following instructions to truncate to a mask vector:
1787 // vand.vi v8, v8, 1
1788 // vmsne.vi v0, v8, 0
1789 return SrcLT.first *
1790 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1791 SrcLT.second, CostKind) +
1792 SrcLT.first - 1;
1793 }
1794 break;
1795 };
1796
1797 // Our actual lowering for the case where a wider legal type is available
1798 // uses promotion to the wider type. This is reflected in the result of
1799 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1800 // scalarized if the legalized Src and Dst are not equal sized.
1801 const DataLayout &DL = this->getDataLayout();
1802 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1803 !SrcLT.first.isValid() || !DstLT.first.isValid() ||
1804 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
1805 SrcLT.second.getSizeInBits()) ||
1806 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
1807 DstLT.second.getSizeInBits()) ||
1808 SrcLT.first > 1 || DstLT.first > 1)
1809 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1810
1811 // The split cost is handled by the base getCastInstrCost
1812 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1813
1814 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1815 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
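// PowDiff is the log2 ratio between destination and source element sizes; a
// positive value of 1/2/3 selects the VF2/VF4/VF8 extension variants below.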
1816 switch (ISD) {
1817 case ISD::SIGN_EXTEND:
1818 case ISD::ZERO_EXTEND: {
1819 if ((PowDiff < 1) || (PowDiff > 3))
1820 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1821 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1822 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1823 unsigned Op =
1824 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1825 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1826 }
1827 case ISD::TRUNCATE:
1828 case ISD::FP_EXTEND:
1829 case ISD::FP_ROUND: {
1830 // Counts of narrow/widen instructions.
1831 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1832 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1833
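// The loop below steps the destination element width one power of two at a
// time, adding one narrowing/widening conversion at each intermediate type.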
1834 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1835 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1836 : RISCV::VFNCVT_F_F_W;
1837 InstructionCost Cost = 0;
1838 for (; SrcEltSize != DstEltSize;) {
1839 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1840 ? MVT::getIntegerVT(DstEltSize)
1841 : MVT::getFloatingPointVT(DstEltSize);
1842 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1843 DstEltSize =
1844 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1845 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1846 }
1847 return Cost;
1848 }
1849 case ISD::FP_TO_SINT:
1850 case ISD::FP_TO_UINT: {
1851 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1852 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1853 unsigned FWCVT =
1854 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1855 unsigned FNCVT =
1856 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1857 unsigned SrcEltSize = Src->getScalarSizeInBits();
1858 unsigned DstEltSize = Dst->getScalarSizeInBits();
1859 InstructionCost Cost = 0;
1860 if ((SrcEltSize == 16) &&
1861 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1862 // If the target only supports zvfhmin, or this is an fp16-to-i64 conversion,
1863 // pre-widen to f32 and then convert from f32 to the integer type.
1864 VectorType *VecF32Ty =
1865 VectorType::get(Type::getFloatTy(Dst->getContext()),
1866 cast<VectorType>(Dst)->getElementCount());
1867 std::pair<InstructionCost, MVT> VecF32LT =
1868 getTypeLegalizationCost(VecF32Ty);
1869 Cost +=
1870 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1871 VecF32LT.second, CostKind);
1872 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1873 return Cost;
1874 }
1875 if (DstEltSize == SrcEltSize)
1876 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1877 else if (DstEltSize > SrcEltSize)
1878 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1879 else { // (SrcEltSize > DstEltSize)
1880 // First do a narrowing conversion to an integer half the size, then
1881 // truncate if needed.
1882 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1883 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1884 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1885 if ((SrcEltSize / 2) > DstEltSize) {
1886 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1887 Cost +=
1888 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1889 }
1890 }
1891 return Cost;
1892 }
1893 case ISD::SINT_TO_FP:
1894 case ISD::UINT_TO_FP: {
1895 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
1896 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
1897 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
1898 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
1899 unsigned SrcEltSize = Src->getScalarSizeInBits();
1900 unsigned DstEltSize = Dst->getScalarSizeInBits();
1901
1902 InstructionCost Cost = 0;
1903 if ((DstEltSize == 16) &&
1904 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
1905 // If the target only supports zvfhmin, or this is an i64-to-fp16 conversion,
1906 // convert to f32 first and then narrow the result to f16.
1907 VectorType *VecF32Ty =
1908 VectorType::get(Type::getFloatTy(Dst->getContext()),
1909 cast<VectorType>(Dst)->getElementCount());
1910 std::pair<InstructionCost, MVT> VecF32LT =
1911 getTypeLegalizationCost(VecF32Ty);
1912 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
1913 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
1914 DstLT.second, CostKind);
1915 return Cost;
1916 }
1917
1918 if (DstEltSize == SrcEltSize)
1919 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1920 else if (DstEltSize > SrcEltSize) {
1921 if ((DstEltSize / 2) > SrcEltSize) {
1922 VectorType *VecTy =
1923 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
1924 cast<VectorType>(Dst)->getElementCount());
1925 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
1926 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
1927 }
1928 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1929 } else
1930 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
1931 return Cost;
1932 }
1933 }
1934 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1935}
1936
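// Return an estimate of the runtime VL for Ty: VLMAX derived from the tuning
// vscale for scalable types, or the exact element count for fixed vectors.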
1937unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) const {
1938 if (isa<ScalableVectorType>(Ty)) {
1939 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1940 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1941 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1942 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1943 }
1944 return cast<FixedVectorType>(Ty)->getNumElements();
1945}
1946
1947 InstructionCost
1948 RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1949 FastMathFlags FMF,
1950 TTI::TargetCostKind CostKind) const {
1951 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1952 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1953
1954 // Skip if scalar size of Ty is bigger than ELEN.
1955 if (Ty->getScalarSizeInBits() > ST->getELen())
1956 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1957
1958 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1959 if (Ty->getElementType()->isIntegerTy(1)) {
1960 // SelectionDAGBuilder does the following transforms:
1961 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1962 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
1963 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1964 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1965 else
1966 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1967 }
1968
1969 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1970 SmallVector<unsigned, 3> Opcodes;
1971 InstructionCost ExtraCost = 0;
1972 switch (IID) {
1973 case Intrinsic::maximum:
1974 if (FMF.noNaNs()) {
1975 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1976 } else {
1977 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1978 RISCV::VFMV_F_S};
1979 // Cost of canonical NaN materialization + branch
1980 // lui a0, 523264
1981 // fmv.w.x fa0, a0
1982 Type *DstTy = Ty->getScalarType();
1983 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1984 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1985 ExtraCost = 1 +
1986 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1987 TTI::CastContextHint::None, CostKind) +
1988 getCFInstrCost(Instruction::Br, CostKind);
1989 }
1990 break;
1991
1992 case Intrinsic::minimum:
1993 if (FMF.noNaNs()) {
1994 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1995 } else {
1996 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1997 RISCV::VFMV_F_S};
1998 // Cost of canonical NaN materialization + branch
1999 // lui a0, 523264
2000 // fmv.w.x fa0, a0
2001 Type *DstTy = Ty->getScalarType();
2002 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
2003 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
2004 ExtraCost = 1 +
2005 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
2006 TTI::CastContextHint::None, CostKind) +
2007 getCFInstrCost(Instruction::Br, CostKind);
2008 }
2009 break;
2010 }
2011 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2012 }
2013
2014 // An IR reduction is composed of one RVV reduction instruction and a vmv.
2015 unsigned SplitOp;
2016 SmallVector<unsigned, 3> Opcodes;
2017 switch (IID) {
2018 default:
2019 llvm_unreachable("Unsupported intrinsic");
2020 case Intrinsic::smax:
2021 SplitOp = RISCV::VMAX_VV;
2022 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
2023 break;
2024 case Intrinsic::smin:
2025 SplitOp = RISCV::VMIN_VV;
2026 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
2027 break;
2028 case Intrinsic::umax:
2029 SplitOp = RISCV::VMAXU_VV;
2030 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
2031 break;
2032 case Intrinsic::umin:
2033 SplitOp = RISCV::VMINU_VV;
2034 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
2035 break;
2036 case Intrinsic::maxnum:
2037 SplitOp = RISCV::VFMAX_VV;
2038 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
2039 break;
2040 case Intrinsic::minnum:
2041 SplitOp = RISCV::VFMIN_VV;
2042 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
2043 break;
2044 }
2045 // Add a cost for data larger than LMUL8
2046 InstructionCost SplitCost =
2047 (LT.first > 1) ? (LT.first - 1) *
2048 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2049 : 0;
2050 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2051}
2052
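// An arithmetic reduction is costed as one RVV reduction instruction plus the
// scalar moves around it; i1 reductions use mask ops + vcpop, and types wider
// than a single register group pay one extra element-wise op per split.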
2053 InstructionCost
2054 RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2055 std::optional<FastMathFlags> FMF,
2056 TTI::TargetCostKind CostKind) const {
2057 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2058 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2059
2060 // Skip if scalar size of Ty is bigger than ELEN.
2061 if (Ty->getScalarSizeInBits() > ST->getELen())
2062 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2063
2064 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2065 assert(ISD && "Invalid opcode");
2066
2067 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
2068 ISD != ISD::FADD)
2069 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2070
2071 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2072 Type *ElementTy = Ty->getElementType();
2073 if (ElementTy->isIntegerTy(1)) {
2074 // Example sequences:
2075 // vfirst.m a0, v0
2076 // seqz a0, a0
2077 if (LT.second == MVT::v1i1)
2078 return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
2079 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2080 CmpInst::ICMP_EQ, CostKind);
2081
2082 if (ISD == ISD::AND) {
2083 // Example sequences:
2084 // vmand.mm v8, v9, v8 ; needed every time type is split
2085 // vmnot.m v8, v0 ; alias for vmnand
2086 // vcpop.m a0, v8
2087 // seqz a0, a0
2088
2089 // See the discussion: https://github.com/llvm/llvm-project/pull/119160
2090 // For LMUL <= 8, there is no splitting,
2091 // the sequences are vmnot, vcpop and seqz.
2092 // When LMUL > 8 and split = 1,
2093 // the sequences are vmnand, vcpop and seqz.
2094 // When LMUL > 8 and split > 1,
2095 // the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
2096 return ((LT.first > 2) ? (LT.first - 2) : 0) *
2097 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
2098 getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
2099 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2100 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2101 CmpInst::ICMP_EQ, CostKind);
2102 } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
2103 // Example sequences:
2104 // vsetvli a0, zero, e8, mf8, ta, ma
2105 // vmxor.mm v8, v0, v8 ; needed every time type is split
2106 // vcpop.m a0, v8
2107 // andi a0, a0, 1
2108 return (LT.first - 1) *
2109 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
2110 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
2111 } else {
2112 assert(ISD == ISD::OR);
2113 // Example sequences:
2114 // vsetvli a0, zero, e8, mf8, ta, ma
2115 // vmor.mm v8, v9, v8 ; needed every time type is split
2116 // vcpop.m a0, v0
2117 // snez a0, a0
2118 return (LT.first - 1) *
2119 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
2120 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2121 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2122 CmpInst::ICMP_NE, CostKind);
2123 }
2124 }
2125
2126 // An IR reduction of or/and is composed of one vmv and one RVV reduction
2127 // instruction; the other reductions are composed of two vmv and one RVV
2128 // reduction instruction.
2129 unsigned SplitOp;
2130 SmallVector<unsigned, 3> Opcodes;
2131 switch (ISD) {
2132 case ISD::ADD:
2133 SplitOp = RISCV::VADD_VV;
2134 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
2135 break;
2136 case ISD::OR:
2137 SplitOp = RISCV::VOR_VV;
2138 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
2139 break;
2140 case ISD::XOR:
2141 SplitOp = RISCV::VXOR_VV;
2142 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
2143 break;
2144 case ISD::AND:
2145 SplitOp = RISCV::VAND_VV;
2146 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
2147 break;
2148 case ISD::FADD:
2149 // We can't promote f16/bf16 fadd reductions.
2150 if ((LT.second.getScalarType() == MVT::f16 && !ST->hasVInstructionsF16()) ||
2151 LT.second.getScalarType() == MVT::bf16)
2152 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2153 if (TTI::requiresOrderedReduction(FMF)) {
2154 Opcodes.push_back(RISCV::VFMV_S_F);
2155 for (unsigned i = 0; i < LT.first.getValue(); i++)
2156 Opcodes.push_back(RISCV::VFREDOSUM_VS);
2157 Opcodes.push_back(RISCV::VFMV_F_S);
2158 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2159 }
2160 SplitOp = RISCV::VFADD_VV;
2161 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
2162 break;
2163 }
2164 // Add a cost for data larger than LMUL8
2165 InstructionCost SplitCost =
2166 (LT.first > 1) ? (LT.first - 1) *
2167 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2168 : 0;
2169 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2170}
2171
2173 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
2174 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
2175 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2176 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2177 FMF, CostKind);
2178
2179 // Skip if scalar size of ResTy is bigger than ELEN.
2180 if (ResTy->getScalarSizeInBits() > ST->getELen())
2181 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2182 FMF, CostKind);
2183
2184 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
2185 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2186 FMF, CostKind);
2187
2188 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2189
2190 if (IsUnsigned && Opcode == Instruction::Add &&
2191 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
2192 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2193 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
2194 return LT.first *
2195 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
2196 }
2197
2198 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
2199 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2200 FMF, CostKind);
2201
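// Only a 2x widening (e.g. vwredsum[u]/vfwredusum) is modeled here; cost it
// as the non-extended reduction plus one extra op per additional split.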
2202 return (LT.first - 1) +
2203 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2204}
2205
2206 InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
2207 TTI::OperandValueInfo OpInfo,
2208 TTI::TargetCostKind CostKind) const {
2209 assert(OpInfo.isConstant() && "non constant operand?");
2210 if (!isa<VectorType>(Ty))
2211 // FIXME: We need to account for immediate materialization here, but doing
2212 // a decent job requires more knowledge about the immediate than we
2213 // currently have here.
2214 return 0;
2215
2216 if (OpInfo.isUniform())
2217 // vmv.v.i, vmv.v.x, or vfmv.v.f
2218 // We ignore the cost of the scalar constant materialization to be consistent
2219 // with how we treat scalar constants themselves just above.
2220 return 1;
2221
2222 return getConstantPoolLoadCost(Ty, CostKind);
2223}
2224
2225 InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
2226 Align Alignment,
2227 unsigned AddressSpace,
2228 TTI::TargetCostKind CostKind,
2229 TTI::OperandValueInfo OpInfo,
2230 const Instruction *I) const {
2231 EVT VT = TLI->getValueType(DL, Src, true);
2232 // Type legalization can't handle structs
2233 if (VT == MVT::Other)
2234 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2235 CostKind, OpInfo, I);
2236
2237 InstructionCost Cost = 0;
2238 if (Opcode == Instruction::Store && OpInfo.isConstant())
2239 Cost += getStoreImmCost(Src, OpInfo, CostKind);
2240
2241 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
2242
2243 InstructionCost BaseCost = [&]() {
2244 InstructionCost Cost = LT.first;
2245 if (CostKind != TTI::TCK_RecipThroughput)
2246 return Cost;
2247
2248 // Our actual lowering for the case where a wider legal type is available
2249 // uses a VL-predicated load on the wider type. This is reflected in
2250 // the result of getTypeLegalizationCost, but BasicTTI assumes the
2251 // widened cases are scalarized.
2252 const DataLayout &DL = this->getDataLayout();
2253 if (Src->isVectorTy() && LT.second.isVector() &&
2254 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
2255 LT.second.getSizeInBits()))
2256 return Cost;
2257
2258 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2259 CostKind, OpInfo, I);
2260 }();
2261
2262 // Assume memory ops cost scale with the number of vector registers
2263 // possible accessed by the instruction. Note that BasicTTI already
2264 // handles the LT.first term for us.
2265 if (ST->hasVInstructions() && LT.second.isVector() &&
2266 CostKind != TTI::TCK_CodeSize)
2267 BaseCost *= TLI->getLMULCost(LT.second);
2268 return Cost + BaseCost;
2269}
2270
2271 InstructionCost RISCVTTIImpl::getCmpSelInstrCost(
2272 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
2273 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
2274 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
2275 if (CostKind != TTI::TCK_RecipThroughput)
2276 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2277 Op1Info, Op2Info, I);
2278
2279 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2280 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2281 Op1Info, Op2Info, I);
2282
2283 // Skip if scalar size of ValTy is bigger than ELEN.
2284 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
2285 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2286 Op1Info, Op2Info, I);
2287
2288 auto GetConstantMatCost =
2289 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
2290 if (OpInfo.isUniform())
2291 // We return 0 because we currently ignore the cost of materializing scalar
2292 // constants in GPRs.
2293 return 0;
2294
2295 return getConstantPoolLoadCost(ValTy, CostKind);
2296 };
2297
2298 InstructionCost ConstantMatCost;
2299 if (Op1Info.isConstant())
2300 ConstantMatCost += GetConstantMatCost(Op1Info);
2301 if (Op2Info.isConstant())
2302 ConstantMatCost += GetConstantMatCost(Op2Info);
2303
2304 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2305 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
2306 if (CondTy->isVectorTy()) {
2307 if (ValTy->getScalarSizeInBits() == 1) {
2308 // vmandn.mm v8, v8, v9
2309 // vmand.mm v9, v0, v9
2310 // vmor.mm v0, v9, v8
2311 return ConstantMatCost +
2312 LT.first *
2313 getRISCVInstructionCost(
2314 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2315 LT.second, CostKind);
2316 }
2317 // vselect and max/min are supported natively.
2318 return ConstantMatCost +
2319 LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
2320 CostKind);
2321 }
2322
2323 if (ValTy->getScalarSizeInBits() == 1) {
2324 // vmv.v.x v9, a0
2325 // vmsne.vi v9, v9, 0
2326 // vmandn.mm v8, v8, v9
2327 // vmand.mm v9, v0, v9
2328 // vmor.mm v0, v9, v8
2329 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
2330 return ConstantMatCost +
2331 LT.first *
2332 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
2333 InterimVT, CostKind) +
2334 LT.first * getRISCVInstructionCost(
2335 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2336 LT.second, CostKind);
2337 }
2338
2339 // vmv.v.x v10, a0
2340 // vmsne.vi v0, v10, 0
2341 // vmerge.vvm v8, v9, v8, v0
2342 return ConstantMatCost +
2343 LT.first * getRISCVInstructionCost(
2344 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
2345 LT.second, CostKind);
2346 }
2347
2348 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
2349 CmpInst::isIntPredicate(VecPred)) {
2350 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
2351 // provided they incur the same cost across all implementations
2352 return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
2353 LT.second,
2354 CostKind);
2355 }
2356
2357 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
2358 CmpInst::isFPPredicate(VecPred)) {
2359
2360 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
2361 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
2362 return ConstantMatCost +
2363 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
2364
2365 // If we do not support the input floating point vector type, use the base
2366 // one which will calculate as:
2367 // ScalarizeCost + Num * Cost for fixed vector,
2368 // InvalidCost for scalable vector.
2369 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
2370 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
2371 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
2372 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2373 Op1Info, Op2Info, I);
2374
2375 // Assuming vector fp compare and mask instructions are all the same cost
2376 // until a need arises to differentiate them.
2377 switch (VecPred) {
2378 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
2379 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
2380 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
2381 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
2382 return ConstantMatCost +
2383 LT.first * getRISCVInstructionCost(
2384 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
2385 LT.second, CostKind);
2386
2387 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
2388 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
2389 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
2390 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
2391 return ConstantMatCost +
2392 LT.first *
2393 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
2394 LT.second, CostKind);
2395
2396 case CmpInst::FCMP_OEQ: // vmfeq.vv
2397 case CmpInst::FCMP_OGT: // vmflt.vv
2398 case CmpInst::FCMP_OGE: // vmfle.vv
2399 case CmpInst::FCMP_OLT: // vmflt.vv
2400 case CmpInst::FCMP_OLE: // vmfle.vv
2401 case CmpInst::FCMP_UNE: // vmfne.vv
2402 return ConstantMatCost +
2403 LT.first *
2404 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
2405 default:
2406 break;
2407 }
2408 }
2409
2410 // With ShortForwardBranchOpt or ConditionalMoveFusion, a scalar icmp + select
2411 // pair lowers to SELECT_CC and then to PseudoCCMOVGPR, which generates a
2412 // conditional branch + mv. The cost of the scalar (icmp + select) is
2413 // therefore (0 + select instruction cost).
2414 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
2415 ValTy->isIntegerTy() && !I->user_empty()) {
2416 if (all_of(I->users(), [&](const User *U) {
2417 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
2418 U->getType()->isIntegerTy() &&
2419 !isa<ConstantData>(U->getOperand(1)) &&
2420 !isa<ConstantData>(U->getOperand(2));
2421 }))
2422 return 0;
2423 }
2424
2425 // TODO: Add cost for scalar type.
2426
2427 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2428 Op1Info, Op2Info, I);
2429}
2430
2431 InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
2432 TTI::TargetCostKind CostKind,
2433 const Instruction *I) const {
2434 if (CostKind != TTI::TCK_RecipThroughput)
2435 return Opcode == Instruction::PHI ? 0 : 1;
2436 // Branches are assumed to be predicted.
2437 return 0;
2438}
2439
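// Extract/insert element cost: the common case is a slide plus a scalar
// move; non-constant indices on split types go through the stack, and i64
// elements on RV32 need extra instructions.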
2440 InstructionCost RISCVTTIImpl::getVectorInstrCost(
2441 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
2442 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
2443 assert(Val->isVectorTy() && "This must be a vector type");
2444
2445 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
2446 // For now, skip all fixed vector cost analysis when P extension is available
2447 // to avoid crashes in getMinRVVVectorSizeInBits()
2448 if (ST->hasStdExtP() && isa<FixedVectorType>(Val)) {
2449 return 1; // Treat as single instruction cost for now
2450 }
2451
2452 if (Opcode != Instruction::ExtractElement &&
2453 Opcode != Instruction::InsertElement)
2454 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1,
2455 VIC);
2456
2457 // Legalize the type.
2458 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2459
2460 // This type is legalized to a scalar type.
2461 if (!LT.second.isVector()) {
2462 auto *FixedVecTy = cast<FixedVectorType>(Val);
2463 // If Index is a known constant, cost is zero.
2464 if (Index != -1U)
2465 return 0;
2466 // Extract/InsertElement with non-constant index is very costly when
2467 // scalarized; estimate cost of loads/stores sequence via the stack:
2468 // ExtractElement cost: store vector to stack, load scalar;
2469 // InsertElement cost: store vector to stack, store scalar, load vector.
2470 Type *ElemTy = FixedVecTy->getElementType();
2471 auto NumElems = FixedVecTy->getNumElements();
2472 auto Align = DL.getPrefTypeAlign(ElemTy);
2473 InstructionCost LoadCost =
2474 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
2475 InstructionCost StoreCost =
2476 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
2477 return Opcode == Instruction::ExtractElement
2478 ? StoreCost * NumElems + LoadCost
2479 : (StoreCost + LoadCost) * NumElems + StoreCost;
2480 }
2481
2482 // For unsupported scalable vector.
2483 if (LT.second.isScalableVector() && !LT.first.isValid())
2484 return LT.first;
2485
2486 // Mask vector extract/insert is expanded via e8.
2487 if (Val->getScalarSizeInBits() == 1) {
2488 VectorType *WideTy =
2489 VectorType::get(IntegerType::get(Val->getContext(), 8),
2490 cast<VectorType>(Val)->getElementCount());
2491 if (Opcode == Instruction::ExtractElement) {
2492 InstructionCost ExtendCost
2493 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2494 TTI::CastContextHint::None, CostKind);
2495 InstructionCost ExtractCost
2496 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2497 return ExtendCost + ExtractCost;
2498 }
2499 InstructionCost ExtendCost
2500 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2501 TTI::CastContextHint::None, CostKind);
2502 InstructionCost InsertCost
2503 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2504 InstructionCost TruncCost
2505 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
2506 TTI::CastContextHint::None, CostKind);
2507 return ExtendCost + InsertCost + TruncCost;
2508 }
2509
2510
2511 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
2512 // and vslideup + vmv.s.x to insert element to vector.
2513 unsigned BaseCost = 1;
2514 // For insertelement we need an extra addi to compute index + 1 as the input of vslideup.
2515 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
2516
2517 if (Index != -1U) {
2518 // The type may be split. For fixed-width vectors we can normalize the
2519 // index to the new type.
2520 if (LT.second.isFixedLengthVector()) {
2521 unsigned Width = LT.second.getVectorNumElements();
2522 Index = Index % Width;
2523 }
2524
2525 // If exact VLEN is known, we will insert/extract into the appropriate
2526 // subvector with no additional subvector insert/extract cost.
2527 if (auto VLEN = ST->getRealVLen()) {
2528 unsigned EltSize = LT.second.getScalarSizeInBits();
2529 unsigned M1Max = *VLEN / EltSize;
2530 Index = Index % M1Max;
2531 }
2532
2533 if (Index == 0)
2534 // We can extract/insert the first element without vslidedown/vslideup.
2535 SlideCost = 0;
2536 else if (ST->hasVendorXRivosVisni() && isUInt<5>(Index) &&
2537 Val->getScalarType()->isIntegerTy())
2538 SlideCost = 0; // With ri.vinsert/ri.vextract there is no slide needed
2539 else if (Opcode == Instruction::InsertElement)
2540 SlideCost = 1; // With a constant index, we do not need to use addi.
2541 }
2542
2543 // When the vector needs to be split into multiple register groups and the
2544 // index exceeds a single vector register group, we need to insert/extract
2545 // the element via the stack.
2546 if (LT.first > 1 &&
2547 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
2548 LT.second.isScalableVector()))) {
2549 Type *ScalarType = Val->getScalarType();
2550 Align VecAlign = DL.getPrefTypeAlign(Val);
2551 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
2552 // Extra addi for unknown index.
2553 InstructionCost IdxCost = Index == -1U ? 1 : 0;
2554
2555 // Store all split vectors into stack and load the target element.
2556 if (Opcode == Instruction::ExtractElement)
2557 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2558 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
2559 CostKind) +
2560 IdxCost;
2561
2562 // Store all split vectors into stack and store the target element and load
2563 // vectors back.
2564 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2565 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2566 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2567 CostKind) +
2568 IdxCost;
2569 }
2570
2571 // Extracting an i64 element on a target with XLEN=32 needs more instructions.
2572 if (Val->getScalarType()->isIntegerTy() &&
2573 ST->getXLen() < Val->getScalarSizeInBits()) {
2574 // For extractelement, we need the following instructions:
2575 // vsetivli zero, 1, e64, m1, ta, mu (not count)
2576 // vslidedown.vx v8, v8, a0
2577 // vmv.x.s a0, v8
2578 // li a1, 32
2579 // vsrl.vx v8, v8, a1
2580 // vmv.x.s a1, v8
2581
2582 // For insertelement, we need the following instructions:
2583 // vsetivli zero, 2, e32, m4, ta, mu (not count)
2584 // vmv.v.i v12, 0
2585 // vslide1up.vx v16, v12, a1
2586 // vslide1up.vx v12, v16, a0
2587 // addi a0, a2, 1
2588 // vsetvli zero, a0, e64, m4, tu, mu (not count)
2589 // vslideup.vx v8, v12, a2
2590
2591 // TODO: should we count these special vsetvlis?
2592 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2593 }
2594 return BaseCost + SlideCost;
2595}
2596
2600 unsigned Index) const {
2601 if (isa<FixedVectorType>(Val))
2603 Index);
2604
2605 // TODO: This code replicates what LoopVectorize.cpp used to do when asking
2606 // for the cost of extracting the last lane of a scalable vector. It probably
2607 // needs a more accurate cost.
2608 ElementCount EC = cast<VectorType>(Val)->getElementCount();
2609 assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
2610 return getVectorInstrCost(Opcode, Val, CostKind,
2611 EC.getKnownMinValue() - 1 - Index, nullptr,
2612 nullptr);
2613}
2614
2615 InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
2616 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2617 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2618 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
2619
2620 // TODO: Handle more cost kinds.
2621 if (CostKind != TTI::TCK_RecipThroughput)
2622 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2623 Args, CxtI);
2624
2625 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2626 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2627 Args, CxtI);
2628
2629 // Skip if scalar size of Ty is bigger than ELEN.
2630 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
2631 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2632 Args, CxtI);
2633
2634 // Legalize the type.
2635 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2636 unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
2637
2638 // TODO: Handle scalar type.
2639 if (!LT.second.isVector()) {
2640 static const CostTblEntry DivTbl[]{
2641 {ISD::UDIV, MVT::i32, TTI::TCC_Expensive},
2642 {ISD::UDIV, MVT::i64, TTI::TCC_Expensive},
2643 {ISD::SDIV, MVT::i32, TTI::TCC_Expensive},
2644 {ISD::SDIV, MVT::i64, TTI::TCC_Expensive},
2645 {ISD::UREM, MVT::i32, TTI::TCC_Expensive},
2646 {ISD::UREM, MVT::i64, TTI::TCC_Expensive},
2647 {ISD::SREM, MVT::i32, TTI::TCC_Expensive},
2648 {ISD::SREM, MVT::i64, TTI::TCC_Expensive}};
2649 if (TLI->isOperationLegalOrPromote(ISDOpcode, LT.second))
2650 if (const auto *Entry = CostTableLookup(DivTbl, ISDOpcode, LT.second))
2651 return Entry->Cost * LT.first;
2652
2653 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2654 Args, CxtI);
2655 }
2656
2657 // f16 with zvfhmin and bf16 will be promoted to f32.
2658 // FIXME: nxv32[b]f16 will be custom lowered and split.
2659 InstructionCost CastCost = 0;
2660 if ((LT.second.getVectorElementType() == MVT::f16 ||
2661 LT.second.getVectorElementType() == MVT::bf16) &&
2662 TLI->getOperationAction(ISDOpcode, LT.second) ==
2663 TargetLoweringBase::LegalizeAction::Promote) {
2664 MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
2665 Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
2666 Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2667 // Add cost of extending arguments
2668 CastCost += LT.first * Args.size() *
2669 getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
2670 TTI::CastContextHint::None, CostKind);
2671 // Add cost of truncating result
2672 CastCost +=
2673 LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
2674 TTI::CastContextHint::None, CostKind);
2675 // Compute cost of op in promoted type
2676 LT.second = PromotedVT;
2677 }
2678
2679 auto getConstantMatCost =
2680 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
2681 if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
2682 // Two sub-cases:
2683 // * Has a 5 bit immediate operand which can be splatted.
2684 // * Has a larger immediate which must be materialized in scalar register
2685 // We return 0 for both as we currently ignore the cost of materializing
2686 // scalar constants in GPRs.
2687 return 0;
2688
2689 return getConstantPoolLoadCost(Ty, CostKind);
2690 };
2691
2692 // Add the cost of materializing any constant vectors required.
2693 InstructionCost ConstantMatCost = 0;
2694 if (Op1Info.isConstant())
2695 ConstantMatCost += getConstantMatCost(0, Op1Info);
2696 if (Op2Info.isConstant())
2697 ConstantMatCost += getConstantMatCost(1, Op2Info);
2698
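// Map the IR opcode to a representative RVV instruction; opcodes sharing a
// case below are assumed to have the same cost.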
2699 unsigned Op;
2700 switch (ISDOpcode) {
2701 case ISD::ADD:
2702 case ISD::SUB:
2703 Op = RISCV::VADD_VV;
2704 break;
2705 case ISD::SHL:
2706 case ISD::SRL:
2707 case ISD::SRA:
2708 Op = RISCV::VSLL_VV;
2709 break;
2710 case ISD::AND:
2711 case ISD::OR:
2712 case ISD::XOR:
2713 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
2714 break;
2715 case ISD::MUL:
2716 case ISD::MULHS:
2717 case ISD::MULHU:
2718 Op = RISCV::VMUL_VV;
2719 break;
2720 case ISD::SDIV:
2721 case ISD::UDIV:
2722 Op = RISCV::VDIV_VV;
2723 break;
2724 case ISD::SREM:
2725 case ISD::UREM:
2726 Op = RISCV::VREM_VV;
2727 break;
2728 case ISD::FADD:
2729 case ISD::FSUB:
2730 Op = RISCV::VFADD_VV;
2731 break;
2732 case ISD::FMUL:
2733 Op = RISCV::VFMUL_VV;
2734 break;
2735 case ISD::FDIV:
2736 Op = RISCV::VFDIV_VV;
2737 break;
2738 case ISD::FNEG:
2739 Op = RISCV::VFSGNJN_VV;
2740 break;
2741 default:
2742 // Assuming all other instructions have the same cost until a need arises to
2743 // differentiate them.
2744 return CastCost + ConstantMatCost +
2745 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2746 Args, CxtI);
2747 }
2748
2749 InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
2750 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
2751 // ops are twice as expensive as integer ops. Do the same for vectors so
2752 // scalar floating point ops aren't cheaper than their vector equivalents.
2753 if (Ty->isFPOrFPVectorTy())
2754 InstrCost *= 2;
2755 return CastCost + ConstantMatCost + LT.first * InstrCost;
2756}
2757
2758// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
2759 InstructionCost RISCVTTIImpl::getPointersChainCost(
2760 ArrayRef<const Value *> Ptrs, const Value *Base,
2761 const TTI::PointersChainInfo &Info, Type *AccessTy,
2762 TTI::TargetCostKind CostKind) const {
2763 InstructionCost Cost = 0;
2764 // In the basic model we take into account GEP instructions only (although
2765 // here can also come an alloca instruction, a value, constants and/or
2766 // constant expressions, PHIs, bitcasts ... whatever is allowed to be used
2767 // as a pointer). Typically, if Base is not a GEP instruction and all the
2768 // pointers are relative to the same base address, all the rest are either
2769 // GEP instructions, PHIs, bitcasts or constants. When we have the same
2770 // base, we just calculate the cost of each non-Base GEP as an ADD operation
2771 // if any of its indices is a non-constant.
2772 // If there are no known dependencies between the pointers, the cost is
2773 // calculated as a sum of the costs of the GEP instructions.
2774 for (auto [I, V] : enumerate(Ptrs)) {
2775 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
2776 if (!GEP)
2777 continue;
2778 if (Info.isSameBase() && V != Base) {
2779 if (GEP->hasAllConstantIndices())
2780 continue;
2781 // If the chain is unit-stride and BaseReg + stride*i is a legal
2782 // addressing mode, then presume the base GEP is sitting around in a
2783 // register somewhere and check if we can fold the offset relative to
2784 // it.
2785 unsigned Stride = DL.getTypeStoreSize(AccessTy);
2786 if (Info.isUnitStride() &&
2787 isLegalAddressingMode(AccessTy,
2788 /* BaseGV */ nullptr,
2789 /* BaseOffset */ Stride * I,
2790 /* HasBaseReg */ true,
2791 /* Scale */ 0,
2792 GEP->getType()->getPointerAddressSpace()))
2793 continue;
2794 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
2795 {TTI::OK_AnyValue, TTI::OP_None},
2796 {TTI::OK_AnyValue, TTI::OP_None}, {});
2797 } else {
2798 SmallVector<const Value *> Indices(GEP->indices());
2799 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
2800 Indices, AccessTy, CostKind);
2801 }
2802 }
2803 return Cost;
2804}
2805
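// Unrolling heuristics for subtargets that opt out of the default unroller:
// enable runtime/partial unrolling of small, call-free, non-vectorized loops
// and force-unroll the very small ones.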
2806 void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2807 TTI::UnrollingPreferences &UP,
2808 OptimizationRemarkEmitter *ORE) const {
2809 // TODO: More tuning on benchmarks and metrics with changes as needed
2810 // would apply to all settings below to enable performance.
2811
2812
2813 if (ST->enableDefaultUnroll())
2814 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2815
2816 // Enable Upper bound unrolling universally, not dependent upon the conditions
2817 // below.
2818 UP.UpperBound = true;
2819
2820 // Disable loop unrolling for Oz and Os.
2821 UP.OptSizeThreshold = 0;
2822 UP.PartialOptSizeThreshold = 0;
2823 if (L->getHeader()->getParent()->hasOptSize())
2824 return;
2825
2826 SmallVector<BasicBlock *, 4> ExitingBlocks;
2827 L->getExitingBlocks(ExitingBlocks);
2828 LLVM_DEBUG(dbgs() << "Loop has:\n"
2829 << "Blocks: " << L->getNumBlocks() << "\n"
2830 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2831
2832 // Allow at most one exit other than the latch. This acts as an early exit
2833 // as it mirrors the profitability calculation of the runtime unroller.
2834 if (ExitingBlocks.size() > 2)
2835 return;
2836
2837 // Limit the CFG of the loop body for targets with a branch predictor.
2838 // Allowing 4 blocks permits if-then-else diamonds in the body.
2839 if (L->getNumBlocks() > 4)
2840 return;
2841
2842 // Scan the loop: don't unroll loops with calls as this could prevent
2843 // inlining. Don't unroll auto-vectorized loops either, though do allow
2844 // unrolling of the scalar remainder.
2845 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
2846 InstructionCost Cost = 0;
2847 for (auto *BB : L->getBlocks()) {
2848 for (auto &I : *BB) {
2849 // Both auto-vectorized loops and the scalar remainder have the
2850 // isvectorized attribute, so differentiate between them by the presence
2851 // of vector instructions.
2852 if (IsVectorized && (I.getType()->isVectorTy() ||
2853 llvm::any_of(I.operand_values(), [](Value *V) {
2854 return V->getType()->isVectorTy();
2855 })))
2856 return;
2857
2858 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2859 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2860 if (!isLoweredToCall(F))
2861 continue;
2862 }
2863 return;
2864 }
2865
2866 SmallVector<const Value *> Operands(I.operand_values());
2867 Cost += getInstructionCost(&I, Operands,
2868 TargetTransformInfo::TCK_SizeAndLatency);
2869 }
2870 }
2871
2872 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2873
2874 UP.Partial = true;
2875 UP.Runtime = true;
2876 UP.UnrollRemainder = true;
2877 UP.UnrollAndJam = true;
2878
2879 // Force-unrolling small loops can be very useful because of the
2880 // branch-taken cost of the backedge.
2881 if (Cost < 12)
2882 UP.Force = true;
2883}
2884
2889
2890 bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
2891 MemIntrinsicInfo &Info) const {
2892 const DataLayout &DL = getDataLayout();
2893 Intrinsic::ID IID = Inst->getIntrinsicID();
2894 LLVMContext &C = Inst->getContext();
2895 bool HasMask = false;
2896
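// For segment load/store intrinsics the data operand is a RISC-V vector
// tuple TargetExtType whose first integer parameter encodes the field count
// (NF); non-segment accesses count as a single segment.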
2897 auto getSegNum = [](const IntrinsicInst *II, unsigned PtrOperandNo,
2898 bool IsWrite) -> int64_t {
2899 if (auto *TarExtTy =
2900 dyn_cast<TargetExtType>(II->getArgOperand(0)->getType()))
2901 return TarExtTy->getIntParameter(0);
2902
2903 return 1;
2904 };
2905
2906 switch (IID) {
2907 case Intrinsic::riscv_vle_mask:
2908 case Intrinsic::riscv_vse_mask:
2909 case Intrinsic::riscv_vlseg2_mask:
2910 case Intrinsic::riscv_vlseg3_mask:
2911 case Intrinsic::riscv_vlseg4_mask:
2912 case Intrinsic::riscv_vlseg5_mask:
2913 case Intrinsic::riscv_vlseg6_mask:
2914 case Intrinsic::riscv_vlseg7_mask:
2915 case Intrinsic::riscv_vlseg8_mask:
2916 case Intrinsic::riscv_vsseg2_mask:
2917 case Intrinsic::riscv_vsseg3_mask:
2918 case Intrinsic::riscv_vsseg4_mask:
2919 case Intrinsic::riscv_vsseg5_mask:
2920 case Intrinsic::riscv_vsseg6_mask:
2921 case Intrinsic::riscv_vsseg7_mask:
2922 case Intrinsic::riscv_vsseg8_mask:
2923 HasMask = true;
2924 [[fallthrough]];
2925 case Intrinsic::riscv_vle:
2926 case Intrinsic::riscv_vse:
2927 case Intrinsic::riscv_vlseg2:
2928 case Intrinsic::riscv_vlseg3:
2929 case Intrinsic::riscv_vlseg4:
2930 case Intrinsic::riscv_vlseg5:
2931 case Intrinsic::riscv_vlseg6:
2932 case Intrinsic::riscv_vlseg7:
2933 case Intrinsic::riscv_vlseg8:
2934 case Intrinsic::riscv_vsseg2:
2935 case Intrinsic::riscv_vsseg3:
2936 case Intrinsic::riscv_vsseg4:
2937 case Intrinsic::riscv_vsseg5:
2938 case Intrinsic::riscv_vsseg6:
2939 case Intrinsic::riscv_vsseg7:
2940 case Intrinsic::riscv_vsseg8: {
2941 // Intrinsic interface:
2942 // riscv_vle(merge, ptr, vl)
2943 // riscv_vle_mask(merge, ptr, mask, vl, policy)
2944 // riscv_vse(val, ptr, vl)
2945 // riscv_vse_mask(val, ptr, mask, vl, policy)
2946 // riscv_vlseg#(merge, ptr, vl, sew)
2947 // riscv_vlseg#_mask(merge, ptr, mask, vl, policy, sew)
2948 // riscv_vsseg#(val, ptr, vl, sew)
2949 // riscv_vsseg#_mask(val, ptr, mask, vl, sew)
2950 bool IsWrite = Inst->getType()->isVoidTy();
2951 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
2952 // The results of segment loads are TargetExtType.
2953 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
2954 unsigned SEW =
2955 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
2956 ->getZExtValue();
2957 Ty = TarExtTy->getTypeParameter(0U);
2958 Ty = ScalableVectorType::get(
2959 IntegerType::get(C, SEW),
2960 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
2961 }
2962 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
2963 unsigned VLIndex = RVVIInfo->VLOperand;
2964 unsigned PtrOperandNo = VLIndex - 1 - HasMask;
2965 MaybeAlign Alignment =
2966 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
2967 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
2968 Value *Mask = ConstantInt::getTrue(MaskType);
2969 if (HasMask)
2970 Mask = Inst->getArgOperand(VLIndex - 1);
2971 Value *EVL = Inst->getArgOperand(VLIndex);
2972 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
2973 // RVV uses contiguous elements as a segment.
2974 if (SegNum > 1) {
2975 unsigned ElemSize = Ty->getScalarSizeInBits();
2976 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
2977 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
2978 }
2979 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
2980 Alignment, Mask, EVL);
2981 return true;
2982 }
2983 case Intrinsic::riscv_vlse_mask:
2984 case Intrinsic::riscv_vsse_mask:
2985 case Intrinsic::riscv_vlsseg2_mask:
2986 case Intrinsic::riscv_vlsseg3_mask:
2987 case Intrinsic::riscv_vlsseg4_mask:
2988 case Intrinsic::riscv_vlsseg5_mask:
2989 case Intrinsic::riscv_vlsseg6_mask:
2990 case Intrinsic::riscv_vlsseg7_mask:
2991 case Intrinsic::riscv_vlsseg8_mask:
2992 case Intrinsic::riscv_vssseg2_mask:
2993 case Intrinsic::riscv_vssseg3_mask:
2994 case Intrinsic::riscv_vssseg4_mask:
2995 case Intrinsic::riscv_vssseg5_mask:
2996 case Intrinsic::riscv_vssseg6_mask:
2997 case Intrinsic::riscv_vssseg7_mask:
2998 case Intrinsic::riscv_vssseg8_mask:
2999 HasMask = true;
3000 [[fallthrough]];
3001 case Intrinsic::riscv_vlse:
3002 case Intrinsic::riscv_vsse:
3003 case Intrinsic::riscv_vlsseg2:
3004 case Intrinsic::riscv_vlsseg3:
3005 case Intrinsic::riscv_vlsseg4:
3006 case Intrinsic::riscv_vlsseg5:
3007 case Intrinsic::riscv_vlsseg6:
3008 case Intrinsic::riscv_vlsseg7:
3009 case Intrinsic::riscv_vlsseg8:
3010 case Intrinsic::riscv_vssseg2:
3011 case Intrinsic::riscv_vssseg3:
3012 case Intrinsic::riscv_vssseg4:
3013 case Intrinsic::riscv_vssseg5:
3014 case Intrinsic::riscv_vssseg6:
3015 case Intrinsic::riscv_vssseg7:
3016 case Intrinsic::riscv_vssseg8: {
3017 // Intrinsic interface:
3018 // riscv_vlse(merge, ptr, stride, vl)
3019 // riscv_vlse_mask(merge, ptr, stride, mask, vl, policy)
3020 // riscv_vsse(val, ptr, stride, vl)
3021 // riscv_vsse_mask(val, ptr, stride, mask, vl, policy)
3022 // riscv_vlsseg#(merge, ptr, offset, vl, sew)
3023 // riscv_vlsseg#_mask(merge, ptr, offset, mask, vl, policy, sew)
3024 // riscv_vssseg#(val, ptr, offset, vl, sew)
3025 // riscv_vssseg#_mask(val, ptr, offset, mask, vl, sew)
3026 bool IsWrite = Inst->getType()->isVoidTy();
3027 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
3028 // The results of segment loads are TargetExtType.
3029 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
3030 unsigned SEW =
3031 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
3032 ->getZExtValue();
3033 Ty = TarExtTy->getTypeParameter(0U);
3034 Ty = ScalableVectorType::get(
3035 IntegerType::get(C, SEW),
3036 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
3037 }
3038 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
3039 unsigned VLIndex = RVVIInfo->VLOperand;
3040 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
3041 MaybeAlign Alignment =
3042 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
3043
3044 Value *Stride = Inst->getArgOperand(PtrOperandNo + 1);
3045 // Use the pointer alignment as the element alignment if the stride is a
3046 // multiple of the pointer alignment. Otherwise, the element alignment
3047 // should be the greatest common divisor of pointer alignment and stride.
3048 // For simplicity, just treat the elements as unaligned in that case.
3049 unsigned PointerAlign = Alignment.valueOrOne().value();
3050 if (!isa<ConstantInt>(Stride) ||
3051 cast<ConstantInt>(Stride)->getZExtValue() % PointerAlign != 0)
3052 Alignment = Align(1);
3053
3054 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3055 Value *Mask = ConstantInt::getTrue(MaskType);
3056 if (HasMask)
3057 Mask = Inst->getArgOperand(VLIndex - 1);
3058 Value *EVL = Inst->getArgOperand(VLIndex);
3059 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3060 // RVV uses contiguous elements as a segment.
3061 if (SegNum > 1) {
3062 unsigned ElemSize = Ty->getScalarSizeInBits();
3063 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3064 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3065 }
3066 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3067 Alignment, Mask, EVL, Stride);
3068 return true;
3069 }
3070 case Intrinsic::riscv_vloxei_mask:
3071 case Intrinsic::riscv_vluxei_mask:
3072 case Intrinsic::riscv_vsoxei_mask:
3073 case Intrinsic::riscv_vsuxei_mask:
3074 case Intrinsic::riscv_vloxseg2_mask:
3075 case Intrinsic::riscv_vloxseg3_mask:
3076 case Intrinsic::riscv_vloxseg4_mask:
3077 case Intrinsic::riscv_vloxseg5_mask:
3078 case Intrinsic::riscv_vloxseg6_mask:
3079 case Intrinsic::riscv_vloxseg7_mask:
3080 case Intrinsic::riscv_vloxseg8_mask:
3081 case Intrinsic::riscv_vluxseg2_mask:
3082 case Intrinsic::riscv_vluxseg3_mask:
3083 case Intrinsic::riscv_vluxseg4_mask:
3084 case Intrinsic::riscv_vluxseg5_mask:
3085 case Intrinsic::riscv_vluxseg6_mask:
3086 case Intrinsic::riscv_vluxseg7_mask:
3087 case Intrinsic::riscv_vluxseg8_mask:
3088 case Intrinsic::riscv_vsoxseg2_mask:
3089 case Intrinsic::riscv_vsoxseg3_mask:
3090 case Intrinsic::riscv_vsoxseg4_mask:
3091 case Intrinsic::riscv_vsoxseg5_mask:
3092 case Intrinsic::riscv_vsoxseg6_mask:
3093 case Intrinsic::riscv_vsoxseg7_mask:
3094 case Intrinsic::riscv_vsoxseg8_mask:
3095 case Intrinsic::riscv_vsuxseg2_mask:
3096 case Intrinsic::riscv_vsuxseg3_mask:
3097 case Intrinsic::riscv_vsuxseg4_mask:
3098 case Intrinsic::riscv_vsuxseg5_mask:
3099 case Intrinsic::riscv_vsuxseg6_mask:
3100 case Intrinsic::riscv_vsuxseg7_mask:
3101 case Intrinsic::riscv_vsuxseg8_mask:
3102 HasMask = true;
3103 [[fallthrough]];
3104 case Intrinsic::riscv_vloxei:
3105 case Intrinsic::riscv_vluxei:
3106 case Intrinsic::riscv_vsoxei:
3107 case Intrinsic::riscv_vsuxei:
3108 case Intrinsic::riscv_vloxseg2:
3109 case Intrinsic::riscv_vloxseg3:
3110 case Intrinsic::riscv_vloxseg4:
3111 case Intrinsic::riscv_vloxseg5:
3112 case Intrinsic::riscv_vloxseg6:
3113 case Intrinsic::riscv_vloxseg7:
3114 case Intrinsic::riscv_vloxseg8:
3115 case Intrinsic::riscv_vluxseg2:
3116 case Intrinsic::riscv_vluxseg3:
3117 case Intrinsic::riscv_vluxseg4:
3118 case Intrinsic::riscv_vluxseg5:
3119 case Intrinsic::riscv_vluxseg6:
3120 case Intrinsic::riscv_vluxseg7:
3121 case Intrinsic::riscv_vluxseg8:
3122 case Intrinsic::riscv_vsoxseg2:
3123 case Intrinsic::riscv_vsoxseg3:
3124 case Intrinsic::riscv_vsoxseg4:
3125 case Intrinsic::riscv_vsoxseg5:
3126 case Intrinsic::riscv_vsoxseg6:
3127 case Intrinsic::riscv_vsoxseg7:
3128 case Intrinsic::riscv_vsoxseg8:
3129 case Intrinsic::riscv_vsuxseg2:
3130 case Intrinsic::riscv_vsuxseg3:
3131 case Intrinsic::riscv_vsuxseg4:
3132 case Intrinsic::riscv_vsuxseg5:
3133 case Intrinsic::riscv_vsuxseg6:
3134 case Intrinsic::riscv_vsuxseg7:
3135 case Intrinsic::riscv_vsuxseg8: {
3136 // Intrinsic interface (only listed ordered version):
3137 // riscv_vloxei(merge, ptr, index, vl)
3138 // riscv_vloxei_mask(merge, ptr, index, mask, vl, policy)
3139 // riscv_vsoxei(val, ptr, index, vl)
3140 // riscv_vsoxei_mask(val, ptr, index, mask, vl, policy)
3141 // riscv_vloxseg#(merge, ptr, index, vl, sew)
3142 // riscv_vloxseg#_mask(merge, ptr, index, mask, vl, policy, sew)
3143 // riscv_vsoxseg#(val, ptr, index, vl, sew)
3144 // riscv_vsoxseg#_mask(val, ptr, index, mask, vl, sew)
3145 bool IsWrite = Inst->getType()->isVoidTy();
3146 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
3147 // The results of segment loads are TargetExtType.
3148 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
3149 unsigned SEW =
3150 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
3151 ->getZExtValue();
3152 Ty = TarExtTy->getTypeParameter(0U);
3153 Ty = ScalableVectorType::get(
3154 IntegerType::get(C, SEW),
3155 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
3156 }
3157 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
3158 unsigned VLIndex = RVVIInfo->VLOperand;
3159 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
3160 Value *Mask;
3161 if (HasMask) {
3162 Mask = Inst->getArgOperand(VLIndex - 1);
3163 } else {
3164 // Mask cannot be nullptr here: vector GEP produces <vscale x N x ptr>,
3165 // and casting that to scalar i64 triggers a vector/scalar mismatch
3166 // assertion in CreatePointerCast. Use an all-true mask so ASan lowers it
3167 // via extractelement instead.
3168 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3169 Mask = ConstantInt::getTrue(MaskType);
3170 }
3171 Value *EVL = Inst->getArgOperand(VLIndex);
3172 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3173 // RVV uses contiguous elements as a segment.
3174 if (SegNum > 1) {
3175 unsigned ElemSize = Ty->getScalarSizeInBits();
3176 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3177 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3178 }
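 // E.g. a two-field segment (SegNum = 2) of <vscale x 4 x i32> is modelled as
 // a single access of <vscale x 4 x i64>, because the fields of one segment
 // are contiguous in memory.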
3179 Value *OffsetOp = Inst->getArgOperand(PtrOperandNo + 1);
3180 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3181 Align(1), Mask, EVL,
3182 /* Stride */ nullptr, OffsetOp);
3183 return true;
3184 }
3185 }
3186 return false;
3187}
3188
3189unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) const {
3190 if (Ty->isVectorTy()) {
3191 // f16 with only zvfhmin and bf16 will be promoted to f32
3192 Type *EltTy = cast<VectorType>(Ty)->getElementType();
3193 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
3194 EltTy->isBFloatTy())
3195 Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
3196 cast<VectorType>(Ty));
3197
3198 TypeSize Size = DL.getTypeSizeInBits(Ty);
3199 if (Size.isScalable() && ST->hasVInstructions())
3200 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
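 // E.g. the scalable query above reports 2 for <vscale x 4 x i32>: its known
 // minimum size is 128 bits and RVVBitsPerBlock is 64, i.e. the value occupies
 // two vector registers (LMUL = 2).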
3201
3202 if (ST->useRVVForFixedLengthVectors())
3203 return divideCeil(Size, ST->getRealMinVLen());
3204 }
3205
3206 return BaseT::getRegUsageForType(Ty);
3207}
3208
3209unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
3210 if (SLPMaxVF.getNumOccurrences())
3211 return SLPMaxVF;
3212
3213 // Return how many elements can fit in getRegisterBitwidth. This is the
3214 // same routine as used in LoopVectorizer. We should probably be
3215 // accounting for whether we actually have instructions with the right
3216 // lane type, but we don't have enough information to do that without
3217 // some additional plumbing which hasn't been justified yet.
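 // For example, a 128-bit fixed-width vector register and 32-bit elements give
 // a maximum VF of 4.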
3218 TypeSize RegWidth =
3219 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
3220 // If no vector registers, or absurd element widths, disable
3221 // vectorization by returning 1.
3222 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
3223}
3224
3228
3230 return ST->enableUnalignedVectorMem();
3231}
3232
3233TTI::AddressingModeKind
3234RISCVTTIImpl::getPreferredAddressingMode(const Loop *L,
3235 ScalarEvolution *SE) const {
3236 if (ST->hasVendorXCVmem() && !ST->is64Bit())
3237 return TTI::AMK_PostIndexed;
3238
3239 return BaseT::getPreferredAddressingMode(L, SE);
3240}
3241
3242bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
3243 const TargetTransformInfo::LSRCost &C2) const {
3244 // The RISC-V-specific policy here is "instruction count first priority".
3245 // If we need to emit adds inside the loop to add up base registers, then
3246 // we need at least one extra temporary register.
3247 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
3248 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
3249 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
3250 C1.NumIVMuls, C1.NumBaseAdds,
3251 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
3252 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
3253 C2.NumIVMuls, C2.NumBaseAdds,
3254 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
3255}
3256
3257bool RISCVTTIImpl::isLegalMaskedExpandLoad(Type *DataTy,
3258 Align Alignment) const {
3259 auto *VTy = dyn_cast<VectorType>(DataTy);
3260 if (!VTy || VTy->isScalableTy())
3261 return false;
3262
3263 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3264 return false;
3265
3266 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
3267 // scalarize these types with LMUL >= maximum fixed-length LMUL.
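 // For example, with a 128-bit minimum VLEN and a maximum fixed-length LMUL of
 // 8, <512 x i8> would need 4096 / 128 = 32 register blocks and is rejected by
 // the check below.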
3268 if (VTy->getElementType()->isIntegerTy(8))
3269 if (VTy->getElementCount().getFixedValue() > 256)
3270 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
3271 ST->getMaxLMULForFixedLengthVectors();
3272 return true;
3273}
3274
3275bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy,
3276 Align Alignment) const {
3277 auto *VTy = dyn_cast<VectorType>(DataTy);
3278 if (!VTy || VTy->isScalableTy())
3279 return false;
3280
3281 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3282 return false;
3283 return true;
3284}
3285
3286/// See if \p I should be considered for address type promotion. We check if \p
3287/// I is a sext with the right type and used in memory accesses. If it is used in a
3288/// "complex" getelementptr, we allow it to be promoted without finding other
3289/// sext instructions that sign extended the same initial value. A getelementptr
3290/// is considered as "complex" if it has more than 2 operands.
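/// For example, in
///   %idx = sext i32 %i to i64
///   %p = getelementptr inbounds [64 x i32], ptr %base, i64 0, i64 %idx
/// the getelementptr has three operands, so the sext qualifies and
/// AllowPromotionWithoutCommonHeader is set.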
3291bool RISCVTTIImpl::shouldConsiderAddressTypePromotion(
3292 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
3293 bool Considerable = false;
3294 AllowPromotionWithoutCommonHeader = false;
3295 if (!isa<SExtInst>(&I))
3296 return false;
3297 Type *ConsideredSExtType =
3298 Type::getInt64Ty(I.getParent()->getParent()->getContext());
3299 if (I.getType() != ConsideredSExtType)
3300 return false;
3301 // See if the sext is the one with the right type and used in at least one
3302 // GetElementPtrInst.
3303 for (const User *U : I.users()) {
3304 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
3305 Considerable = true;
3306 // A getelementptr is considered as "complex" if it has more than 2
3307 // operands. We will promote a SExt used in such complex GEP as we
3308 // expect some computation to be merged if they are done on 64 bits.
3309 if (GEPInst->getNumOperands() > 2) {
3310 AllowPromotionWithoutCommonHeader = true;
3311 break;
3312 }
3313 }
3314 }
3315 return Considerable;
3316}
3317
3318bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
3319 switch (Opcode) {
3320 case Instruction::Add:
3321 case Instruction::Sub:
3322 case Instruction::Mul:
3323 case Instruction::And:
3324 case Instruction::Or:
3325 case Instruction::Xor:
3326 case Instruction::FAdd:
3327 case Instruction::FSub:
3328 case Instruction::FMul:
3329 case Instruction::FDiv:
3330 case Instruction::ICmp:
3331 case Instruction::FCmp:
3332 return true;
3333 case Instruction::Shl:
3334 case Instruction::LShr:
3335 case Instruction::AShr:
3336 case Instruction::UDiv:
3337 case Instruction::SDiv:
3338 case Instruction::URem:
3339 case Instruction::SRem:
3340 case Instruction::Select:
3341 return Operand == 1;
3342 default:
3343 return false;
3344 }
3345}
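// The asymmetric cases above mirror which scalar forms exist: vsll.vx shifts by
// a scalar amount, vdiv.vx divides by a scalar and vmerge.vxm takes the scalar
// as the value selected when the mask bit is set, so only operand 1 of shifts,
// divisions, remainders and selects can usefully be a splat.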
3346
3347bool RISCVTTIImpl::canSplatOperand(Instruction *I, int Operand) const {
3348 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3349 return false;
3350
3351 if (canSplatOperand(I->getOpcode(), Operand))
3352 return true;
3353
3354 auto *II = dyn_cast<IntrinsicInst>(I);
3355 if (!II)
3356 return false;
3357
3358 switch (II->getIntrinsicID()) {
3359 case Intrinsic::fma:
3360 case Intrinsic::vp_fma:
3361 case Intrinsic::fmuladd:
3362 case Intrinsic::vp_fmuladd:
3363 return Operand == 0 || Operand == 1;
3364 case Intrinsic::vp_shl:
3365 case Intrinsic::vp_lshr:
3366 case Intrinsic::vp_ashr:
3367 case Intrinsic::vp_udiv:
3368 case Intrinsic::vp_sdiv:
3369 case Intrinsic::vp_urem:
3370 case Intrinsic::vp_srem:
3371 case Intrinsic::ssub_sat:
3372 case Intrinsic::vp_ssub_sat:
3373 case Intrinsic::usub_sat:
3374 case Intrinsic::vp_usub_sat:
3375 case Intrinsic::vp_select:
3376 return Operand == 1;
3377 // These intrinsics are commutative.
3378 case Intrinsic::vp_add:
3379 case Intrinsic::vp_mul:
3380 case Intrinsic::vp_and:
3381 case Intrinsic::vp_or:
3382 case Intrinsic::vp_xor:
3383 case Intrinsic::vp_fadd:
3384 case Intrinsic::vp_fmul:
3385 case Intrinsic::vp_icmp:
3386 case Intrinsic::vp_fcmp:
3387 case Intrinsic::smin:
3388 case Intrinsic::vp_smin:
3389 case Intrinsic::umin:
3390 case Intrinsic::vp_umin:
3391 case Intrinsic::smax:
3392 case Intrinsic::vp_smax:
3393 case Intrinsic::umax:
3394 case Intrinsic::vp_umax:
3395 case Intrinsic::sadd_sat:
3396 case Intrinsic::vp_sadd_sat:
3397 case Intrinsic::uadd_sat:
3398 case Intrinsic::vp_uadd_sat:
3399 // These intrinsics have 'vr' versions.
3400 case Intrinsic::vp_sub:
3401 case Intrinsic::vp_fsub:
3402 case Intrinsic::vp_fdiv:
3403 return Operand == 0 || Operand == 1;
3404 default:
3405 return false;
3406 }
3407}
3408
3409/// Check if sinking \p I's operands to I's basic block is profitable, because
3410/// the operands can be folded into a target instruction, e.g.
3411/// splats of scalars can fold into vector instructions.
3412bool RISCVTTIImpl::isProfitableToSinkOperands(
3413 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
3414 using namespace llvm::PatternMatch;
3415
3416 if (I->isBitwiseLogicOp()) {
3417 if (!I->getType()->isVectorTy()) {
3418 if (ST->hasStdExtZbb() || ST->hasStdExtZbkb()) {
3419 for (auto &Op : I->operands()) {
3420 // (and/or/xor X, (not Y)) -> (andn/orn/xnor X, Y)
3421 if (match(Op.get(), m_Not(m_Value()))) {
3422 Ops.push_back(&Op);
3423 return true;
3424 }
3425 }
3426 }
3427 } else if (I->getOpcode() == Instruction::And && ST->hasStdExtZvkb()) {
3428 for (auto &Op : I->operands()) {
3429 // (and X, (not Y)) -> (vandn.vv X, Y)
3430 if (match(Op.get(), m_Not(m_Value()))) {
3431 Ops.push_back(&Op);
3432 return true;
3433 }
3434 // (and X, (splat (not Y))) -> (vandn.vx X, Y)
3435 if (match(Op.get(), m_Shuffle(m_InsertElt(m_Value(), m_Not(m_Value()),
3436 m_ZeroInt()),
3437 m_Value(), m_ZeroMask()))) {
3438 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
3439 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
3440 Ops.push_back(&Not);
3441 Ops.push_back(&InsertElt);
3442 Ops.push_back(&Op);
3443 return true;
3444 }
3445 }
3446 }
3447 }
3448
3449 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3450 return false;
3451
3452 // Don't sink splat operands unless the target prefers it. Some targets require
3453 // S2V transfer buffers, and we can run out of them copying the same value
3454 // repeatedly.
3455 // FIXME: It could still be worth doing if it would improve vector register
3456 // pressure and prevent a vector spill.
3457 if (!ST->sinkSplatOperands())
3458 return false;
3459
3460 for (auto OpIdx : enumerate(I->operands())) {
3461 if (!canSplatOperand(I, OpIdx.index()))
3462 continue;
3463
3464 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
3465 // Make sure we are not already sinking this operand
3466 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
3467 continue;
3468
3469 // We are looking for a splat that can be sunk.
3470 if (!match(Op, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
3471 m_Value(), m_ZeroMask())))
3472 continue;
3473
3474 // Don't sink i1 splats.
3475 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
3476 continue;
3477
3478 // All uses of the shuffle should be sunk to avoid duplicating it across GPR
3479 // and vector registers.
3480 for (Use &U : Op->uses()) {
3481 Instruction *Insn = cast<Instruction>(U.getUser());
3482 if (!canSplatOperand(Insn, U.getOperandNo()))
3483 return false;
3484 }
3485
3486 // Sink any fpexts since they might be used in a widening fp pattern.
3487 Use *InsertEltUse = &Op->getOperandUse(0);
3488 auto *InsertElt = cast<InsertElementInst>(InsertEltUse);
3489 if (isa<FPExtInst>(InsertElt->getOperand(1)))
3490 Ops.push_back(&InsertElt->getOperandUse(1));
3491 Ops.push_back(InsertEltUse);
3492 Ops.push_back(&OpIdx.value());
3493 }
3494 return true;
3495}
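// A typical candidate handled by the loop above (a sketch, assuming the splat
// is defined in a different basic block than its user):
//   %ins   = insertelement <4 x i32> poison, i32 %x, i32 0
//   %splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
//   ...
//   %add   = add <4 x i32> %v, %splat
// Sinking %ins and %splat next to %add lets instruction selection fold the
// splat into a scalar-operand form such as vadd.vx.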
3496
3497TTI::MemCmpExpansionOptions
3498RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3499 TTI::MemCmpExpansionOptions Options;
3500 // TODO: Enable expansion when unaligned access is not supported after we fix
3501 // issues in ExpandMemcmp.
3502 if (!ST->enableUnalignedScalarMem())
3503 return Options;
3504
3505 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
3506 return Options;
3507
3508 Options.AllowOverlappingLoads = true;
3509 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3510 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3511 if (ST->is64Bit()) {
3512 Options.LoadSizes = {8, 4, 2, 1};
3513 Options.AllowedTailExpansions = {3, 5, 6};
3514 } else {
3515 Options.LoadSizes = {4, 2, 1};
3516 Options.AllowedTailExpansions = {3};
3517 }
3518
3519 if (IsZeroCmp && ST->hasVInstructions()) {
3520 unsigned VLenB = ST->getRealMinVLen() / 8;
3521 // The minimum size should be `XLen / 8 + 1`, and the maximum size should be
3522 // `VLenB * MaxLMUL` so that it fits in a single register group.
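 // For example, with XLEN = 64, VLEN = 128 and a maximum fixed-length LMUL of
 // 8, the loop below adds the sizes 9 through 128.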
3523 unsigned MinSize = ST->getXLen() / 8 + 1;
3524 unsigned MaxSize = VLenB * ST->getMaxLMULForFixedLengthVectors();
3525 for (unsigned Size = MinSize; Size <= MaxSize; Size++)
3526 Options.LoadSizes.insert(Options.LoadSizes.begin(), Size);
3527 }
3528 return Options;
3529}
3530
3531bool RISCVTTIImpl::shouldTreatInstructionLikeSelect(
3532 const Instruction *I) const {
3533 if (EnableOrLikeSelectOpt) {
3534 // For the binary operators (e.g. or) we need to be more careful than for
3535 // selects; here we only transform them if they are already at a natural
3536 // break point in the code - the end of a block with an unconditional
3537 // terminator.
3538 if (I->getOpcode() == Instruction::Or &&
3539 isa<BranchInst>(I->getNextNode()) &&
3540 cast<BranchInst>(I->getNextNode())->isUnconditional())
3541 return true;
3542
3543 if (I->getOpcode() == Instruction::Add ||
3544 I->getOpcode() == Instruction::Sub)
3545 return true;
3546 }
3547 return BaseT::shouldTreatInstructionLikeSelect(I);
3548}
3549
3550bool RISCVTTIImpl::shouldCopyAttributeWhenOutliningFrom(
3551 const Function *Caller, const Attribute &Attr) const {
3552 // "interrupt" controls the prolog/epilog of interrupt handlers (and includes
3553 // restrictions on their signatures). We can outline from the bodies of these
3554 // handlers, but when we do we need to make sure we don't mark the outlined
3555 // function as an interrupt handler too.
3556 if (Attr.isStringAttribute() && Attr.getKindAsString() == "interrupt")
3557 return false;
3558
3559 return BaseT::shouldCopyAttributeWhenOutliningFrom(Caller, Attr);
3560}