RISCVTargetTransformInfo.cpp
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
18#include "llvm/IR/IntrinsicsRISCV.h"
20#include <cmath>
21#include <optional>
22using namespace llvm;
23using namespace llvm::PatternMatch;
24
25#define DEBUG_TYPE "riscvtti"
26
28 "riscv-v-register-bit-width-lmul",
30 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
31 "by autovectorized code. Fractional LMULs are not supported."),
33
35 "riscv-v-slp-max-vf",
37 "Overrides result used for getMaximumVF query which is used "
38 "exclusively by SLP vectorizer."),
40
42 RVVMinTripCount("riscv-v-min-trip-count",
43 cl::desc("Set the lower bound of a trip count to decide on "
44 "vectorization while tail-folding."),
46
48RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
50 // Check if the type is valid for all CostKind
 51  if (!VT.isVector())
 52    return InstructionCost::getInvalid();
 53  size_t NumInstr = OpCodes.size();
 54  if (CostKind == TTI::TCK_CodeSize)
 55    return NumInstr;
 56  InstructionCost LMULCost = TLI->getLMULCost(VT);
 57  if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
 58    return LMULCost * NumInstr;
59 InstructionCost Cost = 0;
60 for (auto Op : OpCodes) {
61 switch (Op) {
62 case RISCV::VRGATHER_VI:
63 Cost += TLI->getVRGatherVICost(VT);
64 break;
65 case RISCV::VRGATHER_VV:
66 Cost += TLI->getVRGatherVVCost(VT);
67 break;
68 case RISCV::VSLIDEUP_VI:
69 case RISCV::VSLIDEDOWN_VI:
70 Cost += TLI->getVSlideVICost(VT);
71 break;
72 case RISCV::VSLIDEUP_VX:
73 case RISCV::VSLIDEDOWN_VX:
74 Cost += TLI->getVSlideVXCost(VT);
75 break;
76 case RISCV::VREDMAX_VS:
77 case RISCV::VREDMIN_VS:
78 case RISCV::VREDMAXU_VS:
79 case RISCV::VREDMINU_VS:
80 case RISCV::VREDSUM_VS:
81 case RISCV::VREDAND_VS:
82 case RISCV::VREDOR_VS:
83 case RISCV::VREDXOR_VS:
84 case RISCV::VFREDMAX_VS:
85 case RISCV::VFREDMIN_VS:
86 case RISCV::VFREDUSUM_VS: {
87 unsigned VL = VT.getVectorMinNumElements();
88 if (!VT.isFixedLengthVector())
89 VL *= *getVScaleForTuning();
90 Cost += Log2_32_Ceil(VL);
91 break;
92 }
93 case RISCV::VFREDOSUM_VS: {
94 unsigned VL = VT.getVectorMinNumElements();
95 if (!VT.isFixedLengthVector())
96 VL *= *getVScaleForTuning();
97 Cost += VL;
98 break;
99 }
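    // Illustrative arithmetic (not from the source): with VL = 16, an
    // unordered reduction such as vredsum.vs is modeled above as
    // ceil(log2(16)) = 4 reduction-tree steps, while the ordered
    // vfredosum.vs is modeled as 16 serial accumulation steps.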
100 case RISCV::VMV_X_S:
101 case RISCV::VMV_S_X:
102 case RISCV::VFMV_F_S:
103 case RISCV::VFMV_S_F:
104 case RISCV::VMOR_MM:
105 case RISCV::VMXOR_MM:
106 case RISCV::VMAND_MM:
107 case RISCV::VMANDN_MM:
108 case RISCV::VMNAND_MM:
109 case RISCV::VCPOP_M:
110 case RISCV::VFIRST_M:
111 Cost += 1;
112 break;
113 default:
114 Cost += LMULCost;
115 }
116 }
117 return Cost;
118}
119
121 const RISCVSubtarget *ST,
122 const APInt &Imm, Type *Ty,
124 bool FreeZeroes) {
125 assert(Ty->isIntegerTy() &&
126 "getIntImmCost can only estimate cost of materialising integers");
127
128 // We have a Zero register, so 0 is always free.
129 if (Imm == 0)
130 return TTI::TCC_Free;
131
132 // Otherwise, we check how many instructions it will take to materialise.
133 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
134 /*CompressionCost=*/false, FreeZeroes);
135}
136
140 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
141}
142
143// Look for patterns of shift followed by AND that can be turned into a pair of
144// shifts. We won't need to materialize an immediate for the AND so these can
145// be considered free.
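// Illustrative example (not taken from a test): for an i64 value x,
//   (and (shl x, 6), 0x3FC0)
// has Mask = 0x3FC0, an 8-bit mask shifted left by 6 with 50 leading zeros,
// and ShAmt == countr_zero(Mask) == 6, so it can be re-expressed as
//   (srli (slli x, 56), 50)
// without materializing the 0x3FC0 immediate.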
146static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
147 uint64_t Mask = Imm.getZExtValue();
148 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
149 if (!BO || !BO->hasOneUse())
150 return false;
151
152 if (BO->getOpcode() != Instruction::Shl)
153 return false;
154
155 if (!isa<ConstantInt>(BO->getOperand(1)))
156 return false;
157
158 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
159 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
160 // is a mask shifted by c2 bits with c3 leading zeros.
161 if (isShiftedMask_64(Mask)) {
162 unsigned Trailing = llvm::countr_zero(Mask);
163 if (ShAmt == Trailing)
164 return true;
165 }
166
167 return false;
168}
169
 170// If this i64 AND is part of ((X & -(1 << C1) & 0xffffffff) == C2 << C1),
171// DAGCombiner can convert this to (sraiw X, C1) == sext(C2) for RV64. On RV32,
172// the type will be split so only the lower 32 bits need to be compared using
173// (srai/srli X, C) == C2.
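// Illustrative example (hypothetical values): on RV64, the i64 comparison
//   (X & 0xFFFFFF00) == 0x4500
// has Mask = 0xFFFFFF00 = -(1 << 8) in the lower 32 bits and CmpC = 0x4500,
// so it can be folded to (sraiw X, 8) == 0x45; 0x45 fits an addi/xori
// immediate, and the 0xFFFFFF00 constant never needs to be materialized.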
174static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm) {
175 if (!Inst->hasOneUse())
176 return false;
177
178 // Look for equality comparison.
179 auto *Cmp = dyn_cast<ICmpInst>(*Inst->user_begin());
180 if (!Cmp || !Cmp->isEquality())
181 return false;
182
183 // Right hand side of comparison should be a constant.
184 auto *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
185 if (!C)
186 return false;
187
188 uint64_t Mask = Imm.getZExtValue();
189
190 // Mask should be of the form -(1 << C) in the lower 32 bits.
191 if (!isUInt<32>(Mask) || !isPowerOf2_32(-uint32_t(Mask)))
192 return false;
193
194 // Comparison constant should be a subset of Mask.
195 uint64_t CmpC = C->getZExtValue();
196 if ((CmpC & Mask) != CmpC)
197 return false;
198
199 // We'll need to sign extend the comparison constant and shift it right. Make
200 // sure the new constant can use addi/xori+seqz/snez.
201 unsigned ShiftBits = llvm::countr_zero(Mask);
202 int64_t NewCmpC = SignExtend64<32>(CmpC) >> ShiftBits;
203 return NewCmpC >= -2048 && NewCmpC <= 2048;
204}
205
207 const APInt &Imm, Type *Ty,
209 Instruction *Inst) const {
210 assert(Ty->isIntegerTy() &&
211 "getIntImmCost can only estimate cost of materialising integers");
212
213 // We have a Zero register, so 0 is always free.
214 if (Imm == 0)
215 return TTI::TCC_Free;
216
217 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
218 // commutative, in others the immediate comes from a specific argument index.
219 bool Takes12BitImm = false;
220 unsigned ImmArgIdx = ~0U;
221
222 switch (Opcode) {
223 case Instruction::GetElementPtr:
224 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
225 // split up large offsets in GEP into better parts than ConstantHoisting
226 // can.
227 return TTI::TCC_Free;
228 case Instruction::Store: {
 229    // Use the materialization cost regardless of whether it's the address or
 230    // the value that is constant, except when the store is misaligned and
231 // misaligned accesses are not legal (experience shows constant hoisting
232 // can sometimes be harmful in such cases).
233 if (Idx == 1 || !Inst)
234 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
235 /*FreeZeroes=*/true);
236
237 StoreInst *ST = cast<StoreInst>(Inst);
238 if (!getTLI()->allowsMemoryAccessForAlignment(
239 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
240 ST->getPointerAddressSpace(), ST->getAlign()))
241 return TTI::TCC_Free;
242
243 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
244 /*FreeZeroes=*/true);
245 }
246 case Instruction::Load:
247 // If the address is a constant, use the materialization cost.
248 return getIntImmCost(Imm, Ty, CostKind);
249 case Instruction::And:
250 // zext.h
251 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
252 return TTI::TCC_Free;
253 // zext.w
254 if (Imm == UINT64_C(0xffffffff) &&
255 ((ST->hasStdExtZba() && ST->isRV64()) || ST->isRV32()))
256 return TTI::TCC_Free;
257 // bclri
258 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
259 return TTI::TCC_Free;
260 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
261 canUseShiftPair(Inst, Imm))
262 return TTI::TCC_Free;
263 if (Inst && Idx == 1 && Imm.getBitWidth() == 64 &&
264 canUseShiftCmp(Inst, Imm))
265 return TTI::TCC_Free;
266 Takes12BitImm = true;
267 break;
268 case Instruction::Add:
269 Takes12BitImm = true;
270 break;
271 case Instruction::Or:
272 case Instruction::Xor:
273 // bseti/binvi
274 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
275 return TTI::TCC_Free;
276 Takes12BitImm = true;
277 break;
278 case Instruction::Mul:
279 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
280 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
281 return TTI::TCC_Free;
282 // One more or less than a power of 2 can use SLLI+ADD/SUB.
283 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
284 return TTI::TCC_Free;
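    // For instance (illustrative): x * 8 is a single slli, x * -8 is
    // slli + neg, and x * 9 is slli + add, so none of these immediates is
    // worth hoisting.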
285 // FIXME: There is no MULI instruction.
286 Takes12BitImm = true;
287 break;
288 case Instruction::Sub:
289 case Instruction::Shl:
290 case Instruction::LShr:
291 case Instruction::AShr:
292 Takes12BitImm = true;
293 ImmArgIdx = 1;
294 break;
295 default:
296 break;
297 }
298
299 if (Takes12BitImm) {
300 // Check immediate is the correct argument...
301 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
302 // ... and fits into the 12-bit immediate.
303 if (Imm.getSignificantBits() <= 64 &&
304 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
305 return TTI::TCC_Free;
306 }
307 }
308
309 // Otherwise, use the full materialisation cost.
310 return getIntImmCost(Imm, Ty, CostKind);
311 }
312
313 // By default, prevent hoisting.
314 return TTI::TCC_Free;
315}
316
319 const APInt &Imm, Type *Ty,
321 // Prevent hoisting in unknown cases.
322 return TTI::TCC_Free;
323}
324
326 return ST->hasVInstructions();
327}
328
330RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
331 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
332 return ST->hasCPOPLike() ? TTI::PSK_FastHardware : TTI::PSK_Software;
333}
334
336 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
338 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
340
341 // zve32x is broken for partial_reduce_umla, but let's make sure we
342 // don't generate them.
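  // Illustrative example (assumes +zvqdotq and ELEN >= 64): an i32 partial
  // reduction accumulating sext(i8) * sext(i8) products at a fixed VF of 16
  // is costed below as a single vqdot.vv over a <4 x i32> type.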
343 if (!ST->hasStdExtZvqdotq() || ST->getELen() < 64 ||
344 Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
345 InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
346 !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4))
348
349 Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
350 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
 351  // Note: Assuming all vqdot* variants are equal cost
352 return LT.first *
353 getRISCVInstructionCost(RISCV::VQDOT_VV, LT.second, CostKind);
354}
355
357 // Currently, the ExpandReductions pass can't expand scalable-vector
358 // reductions, but we still request expansion as RVV doesn't support certain
359 // reductions and the SelectionDAG can't legalize them either.
360 switch (II->getIntrinsicID()) {
361 default:
362 return false;
363 // These reductions have no equivalent in RVV
364 case Intrinsic::vector_reduce_mul:
365 case Intrinsic::vector_reduce_fmul:
366 return true;
367 }
368}
369
370std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
371 if (ST->hasVInstructions())
372 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
373 return BaseT::getMaxVScale();
374}
375
376std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
377 if (ST->hasVInstructions())
378 if (unsigned MinVLen = ST->getRealMinVLen();
379 MinVLen >= RISCV::RVVBitsPerBlock)
380 return MinVLen / RISCV::RVVBitsPerBlock;
 381  return BaseT::getVScaleForTuning();
 382}
383
386 unsigned LMUL =
387 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
388 switch (K) {
390 return TypeSize::getFixed(ST->getXLen());
392 return TypeSize::getFixed(
393 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
396 (ST->hasVInstructions() &&
397 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
399 : 0);
400 }
401
402 llvm_unreachable("Unsupported register kind");
403}
404
406RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty,
408 // Add a cost of address generation + the cost of the load. The address
409 // is expected to be a PC relative offset to a constant pool entry
410 // using auipc/addi.
411 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
412 /*AddressSpace=*/0, CostKind);
413}
414
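// Returns true if Mask repeatedly concatenates copies of its leading
// subvector, e.g. (illustrative) <0, 1, 0, 1, 0, 1, 0, 1> repeats a
// 2-element subvector four times, so SubVectorSize is set to 2. A plain
// identity mask is rejected.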
415static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
416 unsigned Size = Mask.size();
417 if (!isPowerOf2_32(Size))
418 return false;
419 for (unsigned I = 0; I != Size; ++I) {
420 if (static_cast<unsigned>(Mask[I]) == I)
421 continue;
422 if (Mask[I] != 0)
423 return false;
424 if (Size % I != 0)
425 return false;
426 for (unsigned J = I + 1; J != Size; ++J)
427 // Check the pattern is repeated.
428 if (static_cast<unsigned>(Mask[J]) != J % I)
429 return false;
430 SubVectorSize = I;
431 return true;
432 }
433 // That means Mask is <0, 1, 2, 3>. This is not a concatenation.
434 return false;
435}
436
438 LLVMContext &C) {
439 assert((DataVT.getScalarSizeInBits() != 8 ||
440 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
441 MVT IndexVT = DataVT.changeTypeToInteger();
442 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
443 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
444 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
445}
446
447/// Attempt to approximate the cost of a shuffle which will require splitting
448/// during legalization. Note that processShuffleMasks is not an exact proxy
449/// for the algorithm used in LegalizeVectorTypes, but hopefully it's a
 450/// reasonably close upper bound.
452 MVT LegalVT, VectorType *Tp,
453 ArrayRef<int> Mask,
455 assert(LegalVT.isFixedLengthVector() && !Mask.empty() &&
456 "Expected fixed vector type and non-empty mask");
457 unsigned LegalNumElts = LegalVT.getVectorNumElements();
458 // Number of destination vectors after legalization:
459 unsigned NumOfDests = divideCeil(Mask.size(), LegalNumElts);
460 // We are going to permute multiple sources and the result will be in
 461 // multiple destinations. We provide an accurate cost only for splits where
462 // the element type remains the same.
463 if (NumOfDests <= 1 ||
465 Tp->getElementType()->getPrimitiveSizeInBits() ||
466 LegalNumElts >= Tp->getElementCount().getFixedValue())
468
469 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
470 unsigned LegalVTSize = LegalVT.getStoreSize();
471 // Number of source vectors after legalization:
472 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
473
474 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(), LegalNumElts);
475
476 unsigned NormalizedVF = LegalNumElts * std::max(NumOfSrcs, NumOfDests);
477 unsigned NumOfSrcRegs = NormalizedVF / LegalNumElts;
478 unsigned NumOfDestRegs = NormalizedVF / LegalNumElts;
479 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
480 assert(NormalizedVF >= Mask.size() &&
481 "Normalized mask expected to be not shorter than original mask.");
482 copy(Mask, NormalizedMask.begin());
483 InstructionCost Cost = 0;
484 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
486 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
487 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
488 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
489 return;
490 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
491 .second)
492 return;
493 Cost += TTI.getShuffleCost(
495 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
496 SingleOpTy, RegMask, CostKind, 0, nullptr);
497 },
498 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
499 Cost += TTI.getShuffleCost(
501 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
502 SingleOpTy, RegMask, CostKind, 0, nullptr);
503 });
504 return Cost;
505}
506
507/// Try to perform better estimation of the permutation.
508/// 1. Split the source/destination vectors into real registers.
509/// 2. Do the mask analysis to identify which real registers are
 510/// permuted. If more than one source register is used to build the
 511/// destination register, the cost for this destination register
 512/// is (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one
513/// source register is used, build mask and calculate the cost as a cost
514/// of PermuteSingleSrc.
515/// Also, for the single register permute we try to identify if the
516/// destination register is just a copy of the source register or the
517/// copy of the previous destination register (the cost is
518/// TTI::TCC_Basic). If the source register is just reused, the cost for
519/// this operation is 0.
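/// Illustrative example (hypothetical VLEN = 128): an <8 x i64> shuffle is
/// split into four v2i64 registers; a destination register whose mask draws
/// from two source registers is costed as one two-source permute, while one
/// that merely copies a source register adds no extra cost.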
520static InstructionCost
522 std::optional<unsigned> VLen, VectorType *Tp,
524 assert(LegalVT.isFixedLengthVector());
525 if (!VLen || Mask.empty())
527 MVT ElemVT = LegalVT.getVectorElementType();
528 unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
529 LegalVT = TTI.getTypeLegalizationCost(
530 FixedVectorType::get(Tp->getElementType(), ElemsPerVReg))
531 .second;
532 // Number of destination vectors after legalization:
533 InstructionCost NumOfDests =
534 divideCeil(Mask.size(), LegalVT.getVectorNumElements());
535 if (NumOfDests <= 1 ||
537 Tp->getElementType()->getPrimitiveSizeInBits() ||
538 LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue())
540
541 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
542 unsigned LegalVTSize = LegalVT.getStoreSize();
543 // Number of source vectors after legalization:
544 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
545
546 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
547 LegalVT.getVectorNumElements());
548
549 unsigned E = NumOfDests.getValue();
550 unsigned NormalizedVF =
551 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
552 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
553 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
554 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
555 assert(NormalizedVF >= Mask.size() &&
556 "Normalized mask expected to be not shorter than original mask.");
557 copy(Mask, NormalizedMask.begin());
558 InstructionCost Cost = 0;
559 int NumShuffles = 0;
560 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
562 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
563 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
564 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
565 return;
566 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
567 .second)
568 return;
569 ++NumShuffles;
570 Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
571 SingleOpTy, RegMask, CostKind, 0, nullptr);
572 },
573 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
574 Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
575 SingleOpTy, RegMask, CostKind, 0, nullptr);
576 NumShuffles += 2;
577 });
578 // Note: check that we do not emit too many shuffles here to prevent code
579 // size explosion.
 580  // TODO: investigate whether this can be improved by extra analysis of the
 581  // masks to check if the resulting code is more profitable.
582 if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) ||
583 (NumOfDestRegs <= 2 && NumShuffles < 4))
584 return Cost;
586}
587
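// Costs shuffles that can be lowered as at most two vslideup/vslidedown ops,
// plus a constant mask (and possibly a vmerge) when both sources contribute.
// For example (illustrative), a single-source mask such as <2, 3, -1, -1>
// that just slides the source down by two elements is costed as one
// vslidedown.vi.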
588InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
589 ArrayRef<int> Mask,
591 // Avoid missing masks and length changing shuffles
592 if (Mask.size() <= 2 || Mask.size() != Tp->getNumElements())
594
595 int NumElts = Tp->getNumElements();
596 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
597 // Avoid scalarization cases
598 if (!LT.second.isFixedLengthVector())
600
601 // Requires moving elements between parts, which requires additional
602 // unmodeled instructions.
603 if (LT.first != 1)
605
606 auto GetSlideOpcode = [&](int SlideAmt) {
607 assert(SlideAmt != 0);
608 bool IsVI = isUInt<5>(std::abs(SlideAmt));
609 if (SlideAmt < 0)
610 return IsVI ? RISCV::VSLIDEDOWN_VI : RISCV::VSLIDEDOWN_VX;
611 return IsVI ? RISCV::VSLIDEUP_VI : RISCV::VSLIDEUP_VX;
612 };
613
614 std::array<std::pair<int, int>, 2> SrcInfo;
615 if (!isMaskedSlidePair(Mask, NumElts, SrcInfo))
617
618 if (SrcInfo[1].second == 0)
619 std::swap(SrcInfo[0], SrcInfo[1]);
620
621 InstructionCost FirstSlideCost = 0;
622 if (SrcInfo[0].second != 0) {
623 unsigned Opcode = GetSlideOpcode(SrcInfo[0].second);
624 FirstSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
625 }
626
627 if (SrcInfo[1].first == -1)
628 return FirstSlideCost;
629
630 InstructionCost SecondSlideCost = 0;
631 if (SrcInfo[1].second != 0) {
632 unsigned Opcode = GetSlideOpcode(SrcInfo[1].second);
633 SecondSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
634 } else {
635 SecondSlideCost =
636 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
637 }
638
639 auto EC = Tp->getElementCount();
640 VectorType *MaskTy =
642 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
643 return FirstSlideCost + SecondSlideCost + MaskCost;
644}
645
648 VectorType *SrcTy, ArrayRef<int> Mask,
649 TTI::TargetCostKind CostKind, int Index,
651 const Instruction *CxtI) const {
652 assert((Mask.empty() || DstTy->isScalableTy() ||
653 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
654 "Expected the Mask to match the return size if given");
655 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
656 "Expected the same scalar types");
657
658 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
659 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
660
661 // First, handle cases where having a fixed length vector enables us to
662 // give a more accurate cost than falling back to generic scalable codegen.
663 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
664 if (auto *FVTp = dyn_cast<FixedVectorType>(SrcTy);
665 FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) {
667 *this, LT.second, ST->getRealVLen(),
668 Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind);
669 if (VRegSplittingCost.isValid())
670 return VRegSplittingCost;
671 switch (Kind) {
672 default:
673 break;
675 if (Mask.size() >= 2) {
676 MVT EltTp = LT.second.getVectorElementType();
677 // If the size of the element is < ELEN then shuffles of interleaves and
678 // deinterleaves of 2 vectors can be lowered into the following
679 // sequences
680 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
681 // Example sequence:
682 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
683 // vwaddu.vv v10, v8, v9
684 // li a0, -1 (ignored)
685 // vwmaccu.vx v10, a0, v9
686 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
687 return 2 * LT.first * TLI->getLMULCost(LT.second);
688
689 if (Mask[0] == 0 || Mask[0] == 1) {
690 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
691 // Example sequence:
692 // vnsrl.wi v10, v8, 0
693 if (equal(DeinterleaveMask, Mask))
694 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
695 LT.second, CostKind);
696 }
697 }
698 int SubVectorSize;
699 if (LT.second.getScalarSizeInBits() != 1 &&
700 isRepeatedConcatMask(Mask, SubVectorSize)) {
702 unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
703 // The cost of extraction from a subvector is 0 if the index is 0.
704 for (unsigned I = 0; I != NumSlides; ++I) {
705 unsigned InsertIndex = SubVectorSize * (1 << I);
706 FixedVectorType *SubTp =
707 FixedVectorType::get(SrcTy->getElementType(), InsertIndex);
708 FixedVectorType *DestTp =
710 std::pair<InstructionCost, MVT> DestLT =
712 // Add the cost of whole vector register move because the
713 // destination vector register group for vslideup cannot overlap the
714 // source.
715 Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
716 Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {},
717 CostKind, InsertIndex, SubTp);
718 }
719 return Cost;
720 }
721 }
722
723 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
724 SlideCost.isValid())
725 return SlideCost;
726
727 // vrgather + cost of generating the mask constant.
728 // We model this for an unknown mask with a single vrgather.
729 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
730 LT.second.getVectorNumElements() <= 256)) {
731 VectorType *IdxTy =
732 getVRGatherIndexType(LT.second, *ST, SrcTy->getContext());
733 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
734 return IndexCost +
735 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
736 }
737 break;
738 }
741
742 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
743 SlideCost.isValid())
744 return SlideCost;
745
746 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
747 // register for the second vrgather. We model this for an unknown
748 // (shuffle) mask.
749 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
750 LT.second.getVectorNumElements() <= 256)) {
751 auto &C = SrcTy->getContext();
752 auto EC = SrcTy->getElementCount();
753 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
755 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
756 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
757 return 2 * IndexCost +
758 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
759 LT.second, CostKind) +
760 MaskCost;
761 }
762 break;
763 }
764 }
765
766 auto shouldSplit = [](TTI::ShuffleKind Kind) {
767 switch (Kind) {
768 default:
769 return false;
773 return true;
774 }
775 };
776
777 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
778 shouldSplit(Kind)) {
779 InstructionCost SplitCost =
780 costShuffleViaSplitting(*this, LT.second, FVTp, Mask, CostKind);
781 if (SplitCost.isValid())
782 return SplitCost;
783 }
784 }
785
786 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
787 switch (Kind) {
788 default:
789 // Fallthrough to generic handling.
790 // TODO: Most of these cases will return getInvalid in generic code, and
791 // must be implemented here.
792 break;
794 // Extract at zero is always a subregister extract
795 if (Index == 0)
796 return TTI::TCC_Free;
797
798 // If we're extracting a subvector of at most m1 size at a sub-register
799 // boundary - which unfortunately we need exact vlen to identify - this is
800 // a subregister extract at worst and thus won't require a vslidedown.
801 // TODO: Extend for aligned m2, m4 subvector extracts
 802  // TODO: Extend for misaligned (but contained) extracts
803 // TODO: Extend for scalable subvector types
804 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
805 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
806 if (std::optional<unsigned> VLen = ST->getRealVLen();
807 VLen && SubLT.second.getScalarSizeInBits() * Index % *VLen == 0 &&
808 SubLT.second.getSizeInBits() <= *VLen)
809 return TTI::TCC_Free;
810 }
811
812 // Example sequence:
813 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
814 // vslidedown.vi v8, v9, 2
815 return LT.first *
816 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
818 // Example sequence:
819 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
820 // vslideup.vi v8, v9, 2
821 LT = getTypeLegalizationCost(DstTy);
822 return LT.first *
823 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
824 case TTI::SK_Select: {
825 // Example sequence:
826 // li a0, 90
827 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
828 // vmv.s.x v0, a0
829 // vmerge.vvm v8, v9, v8, v0
830 // We use 2 for the cost of the mask materialization as this is the true
831 // cost for small masks and most shuffles are small. At worst, this cost
832 // should be a very small constant for the constant pool load. As such,
833 // we may bias towards large selects slightly more than truly warranted.
834 return LT.first *
835 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
836 LT.second, CostKind));
837 }
838 case TTI::SK_Broadcast: {
839 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
840 Instruction::InsertElement);
841 if (LT.second.getScalarSizeInBits() == 1) {
842 if (HasScalar) {
843 // Example sequence:
844 // andi a0, a0, 1
845 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
846 // vmv.v.x v8, a0
847 // vmsne.vi v0, v8, 0
848 return LT.first *
849 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
850 LT.second, CostKind));
851 }
852 // Example sequence:
853 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
854 // vmv.v.i v8, 0
855 // vmerge.vim v8, v8, 1, v0
856 // vmv.x.s a0, v8
857 // andi a0, a0, 1
858 // vmv.v.x v8, a0
859 // vmsne.vi v0, v8, 0
860
861 return LT.first *
862 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
863 RISCV::VMV_X_S, RISCV::VMV_V_X,
864 RISCV::VMSNE_VI},
865 LT.second, CostKind));
866 }
867
868 if (HasScalar) {
869 // Example sequence:
870 // vmv.v.x v8, a0
871 return LT.first *
872 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
873 }
874
875 // Example sequence:
876 // vrgather.vi v9, v8, 0
877 return LT.first *
878 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
879 }
880 case TTI::SK_Splice: {
881 // vslidedown+vslideup.
882 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
883 // of similar code, but I think we expand through memory.
884 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
885 if (Index >= 0 && Index < 32)
886 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
887 else if (Index < 0 && Index > -32)
888 Opcodes[1] = RISCV::VSLIDEUP_VI;
889 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
890 }
891 case TTI::SK_Reverse: {
892
893 if (!LT.second.isVector())
895
896 // TODO: Cases to improve here:
897 // * Illegal vector types
898 // * i64 on RV32
899 if (SrcTy->getElementType()->isIntegerTy(1)) {
900 VectorType *WideTy =
901 VectorType::get(IntegerType::get(SrcTy->getContext(), 8),
902 cast<VectorType>(SrcTy)->getElementCount());
903 return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy,
905 getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0,
906 nullptr) +
907 getCastInstrCost(Instruction::Trunc, SrcTy, WideTy,
909 }
910
911 MVT ContainerVT = LT.second;
912 if (LT.second.isFixedLengthVector())
913 ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
914 MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
915 if (ContainerVT.bitsLE(M1VT)) {
916 // Example sequence:
917 // csrr a0, vlenb
918 // srli a0, a0, 3
919 // addi a0, a0, -1
920 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
921 // vid.v v9
922 // vrsub.vx v10, v9, a0
923 // vrgather.vv v9, v8, v10
924 InstructionCost LenCost = 3;
925 if (LT.second.isFixedLengthVector())
926 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
927 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
928 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
929 if (LT.second.isFixedLengthVector() &&
930 isInt<5>(LT.second.getVectorNumElements() - 1))
931 Opcodes[1] = RISCV::VRSUB_VI;
932 InstructionCost GatherCost =
933 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
934 return LT.first * (LenCost + GatherCost);
935 }
936
937 // At high LMUL, we split into a series of M1 reverses (see
938 // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate
939 // the resulting gap at the bottom (for fixed vectors only). The important
940 // bit is that the cost scales linearly, not quadratically with LMUL.
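    // Illustrative arithmetic (hypothetical): at LMUL=8 this models 8 m1
    // vrgather.vv ops plus one slide, instead of a single LMUL=8 vrgather.vv
    // whose modeled cost grows roughly quadratically with LMUL.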
941 unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX};
942 InstructionCost FixedCost =
943 getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3;
944 unsigned Ratio =
946 InstructionCost GatherCost =
947 getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio;
948 InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 0 :
949 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind);
950 return FixedCost + LT.first * (GatherCost + SlideCost);
951 }
952 }
953 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
954 SubTp);
955}
956
 957static bool isM1OrSmaller(MVT VT) {
 958  RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
 959  return (LMUL == RISCVVType::VLMUL::LMUL_F8 ||
 960          LMUL == RISCVVType::VLMUL::LMUL_F4 ||
 961          LMUL == RISCVVType::VLMUL::LMUL_F2 ||
 962          LMUL == RISCVVType::VLMUL::LMUL_1);
 963}
964
966 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
967 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
968 ArrayRef<Value *> VL) const {
971
972 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
973 // For now, skip all fixed vector cost analysis when P extension is available
974 // to avoid crashes in getMinRVVVectorSizeInBits()
975 if (ST->enablePExtCodeGen() && isa<FixedVectorType>(Ty)) {
976 return 1; // Treat as single instruction cost for now
977 }
978
979 // A build_vector (which is m1 sized or smaller) can be done in no
980 // worse than one vslide1down.vx per element in the type. We could
981 // in theory do an explode_vector in the inverse manner, but our
982 // lowering today does not have a first class node for this pattern.
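  // Illustrative bound (not a measured cost): a <4 x i32> build_vector is
  // capped below at 4x the cost of vslide1down.vx when its container fits in
  // a single m1 register.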
 983  InstructionCost Cost = BaseT::getScalarizationOverhead(
 984      Ty, DemandedElts, Insert, Extract, CostKind);
985 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
986 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
987 if (Ty->getScalarSizeInBits() == 1) {
988 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
989 // Note: Implicit scalar anyextend is assumed to be free since the i1
990 // must be stored in a GPR.
991 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
992 CostKind) +
993 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
995 }
996
997 assert(LT.second.isFixedLengthVector());
998 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
999 if (isM1OrSmaller(ContainerVT)) {
1000 InstructionCost BV =
1001 cast<FixedVectorType>(Ty)->getNumElements() *
1002 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
1003 if (BV < Cost)
1004 Cost = BV;
1005 }
1006 }
1007 return Cost;
1008}
1009
1013 Type *DataTy = MICA.getDataType();
1014 Align Alignment = MICA.getAlignment();
1015 switch (MICA.getID()) {
1016 case Intrinsic::vp_load_ff: {
1017 EVT DataTypeVT = TLI->getValueType(DL, DataTy);
1018 if (!TLI->isLegalFirstFaultLoad(DataTypeVT, Alignment))
1020
1021 unsigned AS = MICA.getAddressSpace();
1022 return getMemoryOpCost(Instruction::Load, DataTy, Alignment, AS, CostKind,
1023 {TTI::OK_AnyValue, TTI::OP_None}, nullptr);
1024 }
1025 case Intrinsic::experimental_vp_strided_load:
1026 case Intrinsic::experimental_vp_strided_store:
1027 return getStridedMemoryOpCost(MICA, CostKind);
1028 case Intrinsic::masked_compressstore:
1029 case Intrinsic::masked_expandload:
1031 case Intrinsic::vp_scatter:
1032 case Intrinsic::vp_gather:
1033 case Intrinsic::masked_scatter:
1034 case Intrinsic::masked_gather:
1035 return getGatherScatterOpCost(MICA, CostKind);
1036 case Intrinsic::vp_load:
1037 case Intrinsic::vp_store:
1038 case Intrinsic::masked_load:
1039 case Intrinsic::masked_store:
1040 return getMaskedMemoryOpCost(MICA, CostKind);
1041 }
1043}
1044
1048 unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load
1049 : Instruction::Store;
1050 Type *Src = MICA.getDataType();
1051 Align Alignment = MICA.getAlignment();
1052 unsigned AddressSpace = MICA.getAddressSpace();
1053
1054 if (!isLegalMaskedLoadStore(Src, Alignment) ||
1057
1058 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
1059}
1060
1062 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1063 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1064 bool UseMaskForCond, bool UseMaskForGaps) const {
1065
1066 // The interleaved memory access pass will lower (de)interleave ops combined
 1067  // with an adjacent appropriate memory operation to vlseg/vsseg intrinsics. vlseg/vsseg
1068 // only support masking per-iteration (i.e. condition), not per-segment (i.e.
1069 // gap).
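  // Illustrative example (assumes +v and a legal type): a factor-2
  // deinterleaving load of <8 x i32> can be emitted as a single vlseg2e32.v
  // producing two <4 x i32> segments instead of a wide load plus shuffles.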
1070 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
1071 auto *VTy = cast<VectorType>(VecTy);
1072 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
 1073    // Need to make sure type hasn't been scalarized
1074 if (LT.second.isVector()) {
1075 auto *SubVecTy =
1076 VectorType::get(VTy->getElementType(),
1077 VTy->getElementCount().divideCoefficientBy(Factor));
1078 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
1079 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
1080 AddressSpace, DL)) {
1081
1082 // Some processors optimize segment loads/stores as one wide memory op +
1083 // Factor * LMUL shuffle ops.
1084 if (ST->hasOptimizedSegmentLoadStore(Factor)) {
1086 getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
1087 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
1088 Cost += Factor * TLI->getLMULCost(SubVecVT);
1089 return LT.first * Cost;
1090 }
1091
1092 // Otherwise, the cost is proportional to the number of elements (VL *
1093 // Factor ops).
1094 InstructionCost MemOpCost =
1095 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
1096 CostKind, {TTI::OK_AnyValue, TTI::OP_None});
1097 unsigned NumLoads = getEstimatedVLFor(VTy);
1098 return NumLoads * MemOpCost;
1099 }
1100 }
1101 }
1102
1103 // TODO: Return the cost of interleaved accesses for scalable vector when
 1104  // unable to convert to segment access instructions.
1105 if (isa<ScalableVectorType>(VecTy))
1107
1108 auto *FVTy = cast<FixedVectorType>(VecTy);
1109 InstructionCost MemCost =
1110 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
1111 unsigned VF = FVTy->getNumElements() / Factor;
1112
1113 // An interleaved load will look like this for Factor=3:
1114 // %wide.vec = load <12 x i32>, ptr %3, align 4
1115 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1116 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1117 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1118 if (Opcode == Instruction::Load) {
1119 InstructionCost Cost = MemCost;
1120 for (unsigned Index : Indices) {
1121 FixedVectorType *VecTy =
1122 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
1123 auto Mask = createStrideMask(Index, Factor, VF);
1124 Mask.resize(VF * Factor, -1);
1125 InstructionCost ShuffleCost =
1127 Mask, CostKind, 0, nullptr, {});
1128 Cost += ShuffleCost;
1129 }
1130 return Cost;
1131 }
1132
1133 // TODO: Model for NF > 2
1134 // We'll need to enhance getShuffleCost to model shuffles that are just
1135 // inserts and extracts into subvectors, since they won't have the full cost
1136 // of a vrgather.
1137 // An interleaved store for 3 vectors of 4 lanes will look like
1138 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
1139 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
1140 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
1141 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
1142 // store <12 x i32> %interleaved.vec, ptr %10, align 4
1143 if (Factor != 2)
1144 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1145 Alignment, AddressSpace, CostKind,
1146 UseMaskForCond, UseMaskForGaps);
1147
1148 assert(Opcode == Instruction::Store && "Opcode must be a store");
1149 // For an interleaving store of 2 vectors, we perform one large interleaving
1150 // shuffle that goes into the wide store
1151 auto Mask = createInterleaveMask(VF, Factor);
1152 InstructionCost ShuffleCost =
1154 CostKind, 0, nullptr, {});
1155 return MemCost + ShuffleCost;
1156}
1157
1161
1162 bool IsLoad = MICA.getID() == Intrinsic::masked_gather ||
1163 MICA.getID() == Intrinsic::vp_gather;
1164 unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
1165 Type *DataTy = MICA.getDataType();
1166 Align Alignment = MICA.getAlignment();
1167 const Instruction *I = MICA.getInst();
1170
1171 if ((Opcode == Instruction::Load &&
1172 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
1173 (Opcode == Instruction::Store &&
1174 !isLegalMaskedScatter(DataTy, Align(Alignment))))
1176
1177 // Cost is proportional to the number of memory operations implied. For
1178 // scalable vectors, we use an estimate on that number since we don't
1179 // know exactly what VL will be.
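  // Worked example (assumes getVScaleForTuning() returns 2): a gather of
  // <vscale x 4 x i32> is estimated as 4 * 2 = 8 element accesses, each
  // costed as a scalar i32 load.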
1180 auto &VTy = *cast<VectorType>(DataTy);
1181 InstructionCost MemOpCost =
1182 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1183 {TTI::OK_AnyValue, TTI::OP_None}, I);
1184 unsigned NumLoads = getEstimatedVLFor(&VTy);
1185 return NumLoads * MemOpCost;
1186}
1187
1189 const MemIntrinsicCostAttributes &MICA,
1191 unsigned Opcode = MICA.getID() == Intrinsic::masked_expandload
1192 ? Instruction::Load
1193 : Instruction::Store;
1194 Type *DataTy = MICA.getDataType();
1195 bool VariableMask = MICA.getVariableMask();
1196 Align Alignment = MICA.getAlignment();
1197 bool IsLegal = (Opcode == Instruction::Store &&
1198 isLegalMaskedCompressStore(DataTy, Alignment)) ||
1199 (Opcode == Instruction::Load &&
1200 isLegalMaskedExpandLoad(DataTy, Alignment));
1201 if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
1203 // Example compressstore sequence:
1204 // vsetivli zero, 8, e32, m2, ta, ma (ignored)
1205 // vcompress.vm v10, v8, v0
1206 // vcpop.m a1, v0
1207 // vsetvli zero, a1, e32, m2, ta, ma
1208 // vse32.v v10, (a0)
1209 // Example expandload sequence:
1210 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
1211 // vcpop.m a1, v0
1212 // vsetvli zero, a1, e32, m2, ta, ma
1213 // vle32.v v10, (a0)
1214 // vsetivli zero, 8, e32, m2, ta, ma
1215 // viota.m v12, v0
1216 // vrgather.vv v8, v10, v12, v0.t
1217 auto MemOpCost =
1218 getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
1219 auto LT = getTypeLegalizationCost(DataTy);
1220 SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
1221 if (VariableMask)
1222 Opcodes.push_back(RISCV::VCPOP_M);
1223 if (Opcode == Instruction::Store)
1224 Opcodes.append({RISCV::VCOMPRESS_VM});
1225 else
1226 Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
1227 return MemOpCost +
1228 LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1229}
1230
1234
1235 unsigned Opcode = MICA.getID() == Intrinsic::experimental_vp_strided_load
1236 ? Instruction::Load
1237 : Instruction::Store;
1238
1239 Type *DataTy = MICA.getDataType();
1240 Align Alignment = MICA.getAlignment();
1241 const Instruction *I = MICA.getInst();
1242
1243 if (!isLegalStridedLoadStore(DataTy, Alignment))
1245
1247 return TTI::TCC_Basic;
1248
1249 // Cost is proportional to the number of memory operations implied. For
1250 // scalable vectors, we use an estimate on that number since we don't
1251 // know exactly what VL will be.
1252 auto &VTy = *cast<VectorType>(DataTy);
1253 InstructionCost MemOpCost =
1254 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1255 {TTI::OK_AnyValue, TTI::OP_None}, I);
1256 unsigned NumLoads = getEstimatedVLFor(&VTy);
1257 return NumLoads * MemOpCost;
1258}
1259
1262 // FIXME: This is a property of the default vector convention, not
1263 // all possible calling conventions. Fixing that will require
1264 // some TTI API and SLP rework.
1267 for (auto *Ty : Tys) {
1268 if (!Ty->isVectorTy())
1269 continue;
1270 Align A = DL.getPrefTypeAlign(Ty);
1271 Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
1272 getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
1273 }
1274 return Cost;
1275}
1276
1277// Currently, these represent both throughput and codesize costs
1278// for the respective intrinsics. The costs in this table are simply
1279// instruction counts with the following adjustments made:
1280// * One vsetvli is considered free.
1282 {Intrinsic::floor, MVT::f32, 9},
1283 {Intrinsic::floor, MVT::f64, 9},
1284 {Intrinsic::ceil, MVT::f32, 9},
1285 {Intrinsic::ceil, MVT::f64, 9},
1286 {Intrinsic::trunc, MVT::f32, 7},
1287 {Intrinsic::trunc, MVT::f64, 7},
1288 {Intrinsic::round, MVT::f32, 9},
1289 {Intrinsic::round, MVT::f64, 9},
1290 {Intrinsic::roundeven, MVT::f32, 9},
1291 {Intrinsic::roundeven, MVT::f64, 9},
1292 {Intrinsic::rint, MVT::f32, 7},
1293 {Intrinsic::rint, MVT::f64, 7},
1294 {Intrinsic::nearbyint, MVT::f32, 9},
1295 {Intrinsic::nearbyint, MVT::f64, 9},
1296 {Intrinsic::bswap, MVT::i16, 3},
1297 {Intrinsic::bswap, MVT::i32, 12},
1298 {Intrinsic::bswap, MVT::i64, 31},
1299 {Intrinsic::vp_bswap, MVT::i16, 3},
1300 {Intrinsic::vp_bswap, MVT::i32, 12},
1301 {Intrinsic::vp_bswap, MVT::i64, 31},
1302 {Intrinsic::vp_fshl, MVT::i8, 7},
1303 {Intrinsic::vp_fshl, MVT::i16, 7},
1304 {Intrinsic::vp_fshl, MVT::i32, 7},
1305 {Intrinsic::vp_fshl, MVT::i64, 7},
1306 {Intrinsic::vp_fshr, MVT::i8, 7},
1307 {Intrinsic::vp_fshr, MVT::i16, 7},
1308 {Intrinsic::vp_fshr, MVT::i32, 7},
1309 {Intrinsic::vp_fshr, MVT::i64, 7},
1310 {Intrinsic::bitreverse, MVT::i8, 17},
1311 {Intrinsic::bitreverse, MVT::i16, 24},
1312 {Intrinsic::bitreverse, MVT::i32, 33},
1313 {Intrinsic::bitreverse, MVT::i64, 52},
1314 {Intrinsic::vp_bitreverse, MVT::i8, 17},
1315 {Intrinsic::vp_bitreverse, MVT::i16, 24},
1316 {Intrinsic::vp_bitreverse, MVT::i32, 33},
1317 {Intrinsic::vp_bitreverse, MVT::i64, 52},
1318 {Intrinsic::ctpop, MVT::i8, 12},
1319 {Intrinsic::ctpop, MVT::i16, 19},
1320 {Intrinsic::ctpop, MVT::i32, 20},
1321 {Intrinsic::ctpop, MVT::i64, 21},
1322 {Intrinsic::ctlz, MVT::i8, 19},
1323 {Intrinsic::ctlz, MVT::i16, 28},
1324 {Intrinsic::ctlz, MVT::i32, 31},
1325 {Intrinsic::ctlz, MVT::i64, 35},
1326 {Intrinsic::cttz, MVT::i8, 16},
1327 {Intrinsic::cttz, MVT::i16, 23},
1328 {Intrinsic::cttz, MVT::i32, 24},
1329 {Intrinsic::cttz, MVT::i64, 25},
1330 {Intrinsic::vp_ctpop, MVT::i8, 12},
1331 {Intrinsic::vp_ctpop, MVT::i16, 19},
1332 {Intrinsic::vp_ctpop, MVT::i32, 20},
1333 {Intrinsic::vp_ctpop, MVT::i64, 21},
1334 {Intrinsic::vp_ctlz, MVT::i8, 19},
1335 {Intrinsic::vp_ctlz, MVT::i16, 28},
1336 {Intrinsic::vp_ctlz, MVT::i32, 31},
1337 {Intrinsic::vp_ctlz, MVT::i64, 35},
1338 {Intrinsic::vp_cttz, MVT::i8, 16},
1339 {Intrinsic::vp_cttz, MVT::i16, 23},
1340 {Intrinsic::vp_cttz, MVT::i32, 24},
1341 {Intrinsic::vp_cttz, MVT::i64, 25},
1342};
1343
1347 auto *RetTy = ICA.getReturnType();
1348 switch (ICA.getID()) {
1349 case Intrinsic::lrint:
1350 case Intrinsic::llrint:
1351 case Intrinsic::lround:
1352 case Intrinsic::llround: {
1353 auto LT = getTypeLegalizationCost(RetTy);
1354 Type *SrcTy = ICA.getArgTypes().front();
1355 auto SrcLT = getTypeLegalizationCost(SrcTy);
1356 if (ST->hasVInstructions() && LT.second.isVector()) {
1358 unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType());
1359 unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType());
1360 if (LT.second.getVectorElementType() == MVT::bf16) {
1361 if (!ST->hasVInstructionsBF16Minimal())
1363 if (DstEltSz == 32)
1364 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
1365 else
1366 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
1367 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1368 !ST->hasVInstructionsF16()) {
1369 if (!ST->hasVInstructionsF16Minimal())
1371 if (DstEltSz == 32)
1372 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
1373 else
1374 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};
1375
1376 } else if (SrcEltSz > DstEltSz) {
1377 Ops = {RISCV::VFNCVT_X_F_W};
1378 } else if (SrcEltSz < DstEltSz) {
1379 Ops = {RISCV::VFWCVT_X_F_V};
1380 } else {
1381 Ops = {RISCV::VFCVT_X_F_V};
1382 }
1383
1384 // We need to use the source LMUL in the case of a narrowing op, and the
1385 // destination LMUL otherwise.
1386 if (SrcEltSz > DstEltSz)
1387 return SrcLT.first *
1388 getRISCVInstructionCost(Ops, SrcLT.second, CostKind);
1389 return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind);
1390 }
1391 break;
1392 }
1393 case Intrinsic::ceil:
1394 case Intrinsic::floor:
1395 case Intrinsic::trunc:
1396 case Intrinsic::rint:
1397 case Intrinsic::round:
1398 case Intrinsic::roundeven: {
1399 // These all use the same code.
1400 auto LT = getTypeLegalizationCost(RetTy);
1401 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1402 return LT.first * 8;
1403 break;
1404 }
1405 case Intrinsic::umin:
1406 case Intrinsic::umax:
1407 case Intrinsic::smin:
1408 case Intrinsic::smax: {
1409 auto LT = getTypeLegalizationCost(RetTy);
1410 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
1411 return LT.first;
1412
1413 if (ST->hasVInstructions() && LT.second.isVector()) {
1414 unsigned Op;
1415 switch (ICA.getID()) {
1416 case Intrinsic::umin:
1417 Op = RISCV::VMINU_VV;
1418 break;
1419 case Intrinsic::umax:
1420 Op = RISCV::VMAXU_VV;
1421 break;
1422 case Intrinsic::smin:
1423 Op = RISCV::VMIN_VV;
1424 break;
1425 case Intrinsic::smax:
1426 Op = RISCV::VMAX_VV;
1427 break;
1428 }
1429 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1430 }
1431 break;
1432 }
1433 case Intrinsic::sadd_sat:
1434 case Intrinsic::ssub_sat:
1435 case Intrinsic::uadd_sat:
1436 case Intrinsic::usub_sat: {
1437 auto LT = getTypeLegalizationCost(RetTy);
1438 if (ST->hasVInstructions() && LT.second.isVector()) {
1439 unsigned Op;
1440 switch (ICA.getID()) {
1441 case Intrinsic::sadd_sat:
1442 Op = RISCV::VSADD_VV;
1443 break;
1444 case Intrinsic::ssub_sat:
 1445        Op = RISCV::VSSUB_VV;
1446 break;
1447 case Intrinsic::uadd_sat:
1448 Op = RISCV::VSADDU_VV;
1449 break;
1450 case Intrinsic::usub_sat:
1451 Op = RISCV::VSSUBU_VV;
1452 break;
1453 }
1454 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1455 }
1456 break;
1457 }
1458 case Intrinsic::fma:
1459 case Intrinsic::fmuladd: {
1460 // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin
1461 auto LT = getTypeLegalizationCost(RetTy);
1462 if (ST->hasVInstructions() && LT.second.isVector())
1463 return LT.first *
1464 getRISCVInstructionCost(RISCV::VFMADD_VV, LT.second, CostKind);
1465 break;
1466 }
1467 case Intrinsic::fabs: {
1468 auto LT = getTypeLegalizationCost(RetTy);
1469 if (ST->hasVInstructions() && LT.second.isVector()) {
1470 // lui a0, 8
1471 // addi a0, a0, -1
1472 // vsetvli a1, zero, e16, m1, ta, ma
1473 // vand.vx v8, v8, a0
 1474      // f16 with zvfhmin and bf16 with zvfbfmin
1475 if (LT.second.getVectorElementType() == MVT::bf16 ||
1476 (LT.second.getVectorElementType() == MVT::f16 &&
1477 !ST->hasVInstructionsF16()))
1478 return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
1479 CostKind) +
1480 2;
1481 else
1482 return LT.first *
1483 getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
1484 }
1485 break;
1486 }
1487 case Intrinsic::sqrt: {
1488 auto LT = getTypeLegalizationCost(RetTy);
1489 if (ST->hasVInstructions() && LT.second.isVector()) {
1492 MVT ConvType = LT.second;
1493 MVT FsqrtType = LT.second;
1494 // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
 1495      // will be split.
1496 if (LT.second.getVectorElementType() == MVT::bf16) {
1497 if (LT.second == MVT::nxv32bf16) {
1498 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
1499 RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
1500 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1501 ConvType = MVT::nxv16f16;
1502 FsqrtType = MVT::nxv16f32;
1503 } else {
1504 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
1505 FsqrtOp = {RISCV::VFSQRT_V};
1506 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1507 }
1508 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1509 !ST->hasVInstructionsF16()) {
1510 if (LT.second == MVT::nxv32f16) {
1511 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
1512 RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
1513 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1514 ConvType = MVT::nxv16f16;
1515 FsqrtType = MVT::nxv16f32;
1516 } else {
1517 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
1518 FsqrtOp = {RISCV::VFSQRT_V};
1519 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1520 }
1521 } else {
1522 FsqrtOp = {RISCV::VFSQRT_V};
1523 }
1524
1525 return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) +
1526 getRISCVInstructionCost(ConvOp, ConvType, CostKind));
1527 }
1528 break;
1529 }
1530 case Intrinsic::cttz:
1531 case Intrinsic::ctlz:
1532 case Intrinsic::ctpop: {
1533 auto LT = getTypeLegalizationCost(RetTy);
1534 if (ST->hasStdExtZvbb() && LT.second.isVector()) {
1535 unsigned Op;
1536 switch (ICA.getID()) {
1537 case Intrinsic::cttz:
1538 Op = RISCV::VCTZ_V;
1539 break;
1540 case Intrinsic::ctlz:
1541 Op = RISCV::VCLZ_V;
1542 break;
1543 case Intrinsic::ctpop:
1544 Op = RISCV::VCPOP_V;
1545 break;
1546 }
1547 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1548 }
1549 break;
1550 }
1551 case Intrinsic::abs: {
1552 auto LT = getTypeLegalizationCost(RetTy);
1553 if (ST->hasVInstructions() && LT.second.isVector()) {
1554 // vrsub.vi v10, v8, 0
1555 // vmax.vv v8, v8, v10
1556 return LT.first *
1557 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
1558 LT.second, CostKind);
1559 }
1560 break;
1561 }
1562 case Intrinsic::fshl:
1563 case Intrinsic::fshr: {
1564 if (ICA.getArgs().empty())
1565 break;
1566
1567 // Funnel-shifts are ROTL/ROTR when the first and second operand are equal.
1568 // When Zbb/Zbkb is enabled we can use a single ROL(W)/ROR(I)(W)
1569 // instruction.
1570 if ((ST->hasStdExtZbb() || ST->hasStdExtZbkb()) && RetTy->isIntegerTy() &&
1571 ICA.getArgs()[0] == ICA.getArgs()[1] &&
1572 (RetTy->getIntegerBitWidth() == 32 ||
1573 RetTy->getIntegerBitWidth() == 64) &&
1574 RetTy->getIntegerBitWidth() <= ST->getXLen()) {
1575 return 1;
1576 }
1577 break;
1578 }
1579 case Intrinsic::get_active_lane_mask: {
1580 if (ST->hasVInstructions()) {
1581 Type *ExpRetTy = VectorType::get(
1582 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1583 auto LT = getTypeLegalizationCost(ExpRetTy);
1584
1585 // vid.v v8 // considered hoisted
1586 // vsaddu.vx v8, v8, a0
1587 // vmsltu.vx v0, v8, a1
1588 return LT.first *
1589 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
1590 LT.second, CostKind);
1591 }
1592 break;
1593 }
1594 // TODO: add more intrinsic
1595 case Intrinsic::stepvector: {
1596 auto LT = getTypeLegalizationCost(RetTy);
1597 // Legalisation of illegal types involves an `index' instruction plus
1598 // (LT.first - 1) vector adds.
1599 if (ST->hasVInstructions())
1600 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
1601 (LT.first - 1) *
1602 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
1603 return 1 + (LT.first - 1);
1604 }
1605 case Intrinsic::experimental_cttz_elts: {
1606 Type *ArgTy = ICA.getArgTypes()[0];
1607 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
1608 if (getTLI()->shouldExpandCttzElements(ArgType))
1609 break;
1610 InstructionCost Cost = getRISCVInstructionCost(
1611 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
1612
1613 // If zero_is_poison is false, then we will generate additional
1614 // cmp + select instructions to convert -1 to EVL.
1615 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
1616 if (ICA.getArgs().size() > 1 &&
1617 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
1618 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
1620 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
1622
1623 return Cost;
1624 }
1625 case Intrinsic::experimental_vp_splat: {
1626 auto LT = getTypeLegalizationCost(RetTy);
1627 // TODO: Lower i1 experimental_vp_splat
1628 if (!ST->hasVInstructions() || LT.second.getScalarType() == MVT::i1)
1630 return LT.first * getRISCVInstructionCost(LT.second.isFloatingPoint()
1631 ? RISCV::VFMV_V_F
1632 : RISCV::VMV_V_X,
1633 LT.second, CostKind);
1634 }
1635 case Intrinsic::experimental_vp_splice: {
 1636    // To support type-based queries from the vectorizer, set the index to 0.
 1637    // Note that the index only changes the cost from vslide.vx to vslide.vi,
 1638    // and in the current implementation they have the same cost.
1640 cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
1642 }
1643 case Intrinsic::fptoui_sat:
1644 case Intrinsic::fptosi_sat: {
1646 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1647 Type *SrcTy = ICA.getArgTypes()[0];
1648
1649 auto SrcLT = getTypeLegalizationCost(SrcTy);
1650 auto DstLT = getTypeLegalizationCost(RetTy);
1651 if (!SrcTy->isVectorTy())
1652 break;
1653
1654 if (!SrcLT.first.isValid() || !DstLT.first.isValid())
1656
1657 Cost +=
1658 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
1659 RetTy, SrcTy, TTI::CastContextHint::None, CostKind);
1660
1661 // Handle NaN.
1662 // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1.
1663 // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0.
1664 Type *CondTy = RetTy->getWithNewBitWidth(1);
1665 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, SrcTy, CondTy,
1666 CmpInst::FCMP_UNO, CostKind);
1667 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1668 CmpInst::FCMP_UNO, CostKind);
1669 return Cost;
1670 }
1671 }
1672
1673 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1674 if (auto LT = getTypeLegalizationCost(RetTy);
1675 LT.second.isVector()) {
1676 MVT EltTy = LT.second.getVectorElementType();
1677 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1678 ICA.getID(), EltTy))
1679 return LT.first * Entry->Cost;
1680 }
1681 }
1682
1683 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1684}
1685
1686InstructionCost
1687RISCVTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
1688 const SCEV *Ptr,
1689 TTI::TargetCostKind CostKind) const {
1690 // Address computations for vector indexed load/store likely require an offset
1691 // and/or scaling.
1692 if (ST->hasVInstructions() && PtrTy->isVectorTy())
1693 return getArithmeticInstrCost(Instruction::Add, PtrTy, CostKind);
1694
1695 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1696}
1697
1698InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1699 Type *Src,
1700 TTI::CastContextHint CCH,
1701 TTI::TargetCostKind CostKind,
1702 const Instruction *I) const {
1703 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1704 if (!IsVectorType)
1705 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1706
1707 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1708 // For now, skip all fixed vector cost analysis when P extension is available
1709 // to avoid crashes in getMinRVVVectorSizeInBits()
1710 if (ST->enablePExtCodeGen() &&
1711 (isa<FixedVectorType>(Dst) || isa<FixedVectorType>(Src))) {
1712 return 1; // Treat as single instruction cost for now
1713 }
1714
1715 // FIXME: Need to compute legalizing cost for illegal types. The current
1716 // code handles only legal types and those which can be trivially
1717 // promoted to legal.
1718 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1719 Dst->getScalarSizeInBits() > ST->getELen())
1720 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1721
1722 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1723 assert(ISD && "Invalid opcode");
1724 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1725 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1726
1727 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1728 // The shared implementation doesn't model vector widening during legalization
1729 // and instead assumes scalarization. In order to scalarize an <N x i1>
1730 // vector, we need to extend/trunc to/from i8. If we don't special case
1731 // this, we can get an infinite recursion cycle.
1732 switch (ISD) {
1733 default:
1734 break;
1735 case ISD::SIGN_EXTEND:
1736 case ISD::ZERO_EXTEND:
1737 if (Src->getScalarSizeInBits() == 1) {
1738 // We do not use vsext/vzext to extend from mask vector.
1739 // Instead we use the following instructions to extend from mask vector:
1740 // vmv.v.i v8, 0
1741 // vmerge.vim v8, v8, -1, v0 (repeated per split)
1742 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1743 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1744 DstLT.second, CostKind) +
1745 DstLT.first - 1;
1746 }
1747 break;
1748 case ISD::TRUNCATE:
1749 if (Dst->getScalarSizeInBits() == 1) {
1750 // We do not use several vncvt instructions to truncate to a mask vector,
1751 // so we cannot use PowDiff to calculate the cost.
1752 // Instead we use the following instructions to truncate to mask vector:
1753 // vand.vi v8, v8, 1
1754 // vmsne.vi v0, v8, 0
1755 return SrcLT.first *
1756 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1757 SrcLT.second, CostKind) +
1758 SrcLT.first - 1;
1759 }
1760 break;
1761 };
1762
1763 // Our actual lowering for the case where a wider legal type is available
1764 // uses promotion to the wider type. This is reflected in the result of
1765 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1766 // scalarized if the legalized Src and Dst are not equal sized.
1767 const DataLayout &DL = this->getDataLayout();
1768 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1769 !SrcLT.first.isValid() || !DstLT.first.isValid() ||
1770 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
1771 SrcLT.second.getSizeInBits()) ||
1772 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
1773 DstLT.second.getSizeInBits()) ||
1774 SrcLT.first > 1 || DstLT.first > 1)
1775 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1776
1777 // The split cost is handled by the base getCastInstrCost
1778 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1779
1780 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1781 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
1782 switch (ISD) {
1783 case ISD::SIGN_EXTEND:
1784 case ISD::ZERO_EXTEND: {
1785 if ((PowDiff < 1) || (PowDiff > 3))
1786 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
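 // For example, an i8 -> i32 extend gives PowDiff == 2, which selects
 // vsext.vf4/vzext.vf4 from the tables below.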
1787 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1788 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1789 unsigned Op =
1790 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1791 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1792 }
1793 case ISD::TRUNCATE:
1794 case ISD::FP_EXTEND:
1795 case ISD::FP_ROUND: {
1796 // Counts of narrow/widen instructions.
1797 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1798 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1799
1800 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1801 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1802 : RISCV::VFNCVT_F_F_W;
1803 InstructionCost Cost = 0;
1804 for (; SrcEltSize != DstEltSize;) {
1805 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1806 ? MVT::getIntegerVT(DstEltSize)
1807 : MVT::getFloatingPointVT(DstEltSize);
1808 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1809 DstEltSize =
1810 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1811 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1812 }
1813 return Cost;
1814 }
1815 case ISD::FP_TO_SINT:
1816 case ISD::FP_TO_UINT: {
1817 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1818 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1819 unsigned FWCVT =
1820 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1821 unsigned FNCVT =
1822 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1823 unsigned SrcEltSize = Src->getScalarSizeInBits();
1824 unsigned DstEltSize = Dst->getScalarSizeInBits();
1825 InstructionCost Cost = 0;
1826 if ((SrcEltSize == 16) &&
1827 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1828 // If the target only supports zvfhmin, or this is an fp16-to-i64
1829 // conversion, pre-widen to f32 and then convert f32 to integer.
1830 VectorType *VecF32Ty =
1831 VectorType::get(Type::getFloatTy(Dst->getContext()),
1832 cast<VectorType>(Dst)->getElementCount());
1833 std::pair<InstructionCost, MVT> VecF32LT =
1834 getTypeLegalizationCost(VecF32Ty);
1835 Cost +=
1836 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1837 VecF32LT.second, CostKind);
1838 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1839 return Cost;
1840 }
1841 if (DstEltSize == SrcEltSize)
1842 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1843 else if (DstEltSize > SrcEltSize)
1844 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1845 else { // (SrcEltSize > DstEltSize)
1846 // First do a narrowing conversion to an integer half the size, then
1847 // truncate if needed.
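 // For example, f64 -> i8 first narrows with vfncvt to i32; the remaining
 // i32 -> i8 truncate is costed recursively below.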
1848 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1849 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1850 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1851 if ((SrcEltSize / 2) > DstEltSize) {
1852 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1853 Cost +=
1854 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1855 }
1856 }
1857 return Cost;
1858 }
1859 case ISD::SINT_TO_FP:
1860 case ISD::UINT_TO_FP: {
1861 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
1862 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
1863 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
1864 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
1865 unsigned SrcEltSize = Src->getScalarSizeInBits();
1866 unsigned DstEltSize = Dst->getScalarSizeInBits();
1867
1868 InstructionCost Cost = 0;
1869 if ((DstEltSize == 16) &&
1870 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
1871 // If the target only supports zvfhmin, or this is an i64-to-fp16
1872 // conversion, convert to f32 first and then from f32 to f16.
1873 VectorType *VecF32Ty =
1874 VectorType::get(Type::getFloatTy(Dst->getContext()),
1875 cast<VectorType>(Dst)->getElementCount());
1876 std::pair<InstructionCost, MVT> VecF32LT =
1877 getTypeLegalizationCost(VecF32Ty);
1878 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
1879 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
1880 DstLT.second, CostKind);
1881 return Cost;
1882 }
1883
1884 if (DstEltSize == SrcEltSize)
1885 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1886 else if (DstEltSize > SrcEltSize) {
1887 if ((DstEltSize / 2) > SrcEltSize) {
1888 VectorType *VecTy =
1889 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
1890 cast<VectorType>(Dst)->getElementCount());
1891 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
1892 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
1893 }
1894 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1895 } else
1896 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
1897 return Cost;
1898 }
1899 }
1900 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1901}
1902
1903unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) const {
1904 if (isa<ScalableVectorType>(Ty)) {
1905 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1906 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1907 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1908 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1909 }
1910 return cast<FixedVectorType>(Ty)->getNumElements();
1911}
1912
1913InstructionCost
1914RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1915 FastMathFlags FMF,
1916 TTI::TargetCostKind CostKind) const {
1917 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1918 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1919
1920 // Skip if scalar size of Ty is bigger than ELEN.
1921 if (Ty->getScalarSizeInBits() > ST->getELen())
1922 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1923
1924 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1925 if (Ty->getElementType()->isIntegerTy(1)) {
1926 // SelectionDAGBuilder does following transforms:
1927 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1928 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
1929 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1930 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1931 else
1932 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1933 }
1934
1935 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1936 SmallVector<unsigned, 3> Opcodes;
1937 InstructionCost ExtraCost = 0;
1938 switch (IID) {
1939 case Intrinsic::maximum:
1940 if (FMF.noNaNs()) {
1941 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1942 } else {
1943 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1944 RISCV::VFMV_F_S};
1945 // Cost of canonical NaN + branch
1946 // lui a0, 523264
1947 // fmv.w.x fa0, a0
1948 Type *DstTy = Ty->getScalarType();
1949 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1950 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1951 ExtraCost = 1 +
1952 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1953 TTI::CastContextHint::None, CostKind) +
1954 getCFInstrCost(Instruction::Br, CostKind);
1955 }
1956 break;
1957
1958 case Intrinsic::minimum:
1959 if (FMF.noNaNs()) {
1960 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1961 } else {
1962 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1963 RISCV::VFMV_F_S};
1964 // Cost of canonical NaN + branch
1965 // lui a0, 523264
1966 // fmv.w.x fa0, a0
1967 Type *DstTy = Ty->getScalarType();
1968 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1969 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1970 ExtraCost = 1 +
1971 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1972 TTI::CastContextHint::None, CostKind) +
1973 getCFInstrCost(Instruction::Br, CostKind);
1974 }
1975 break;
1976 }
1977 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1978 }
1979
1980 // An IR reduction is composed of one RVV reduction instruction and a vmv.
1981 unsigned SplitOp;
1982 SmallVector<unsigned, 3> Opcodes;
1983 switch (IID) {
1984 default:
1985 llvm_unreachable("Unsupported intrinsic");
1986 case Intrinsic::smax:
1987 SplitOp = RISCV::VMAX_VV;
1988 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1989 break;
1990 case Intrinsic::smin:
1991 SplitOp = RISCV::VMIN_VV;
1992 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
1993 break;
1994 case Intrinsic::umax:
1995 SplitOp = RISCV::VMAXU_VV;
1996 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1997 break;
1998 case Intrinsic::umin:
1999 SplitOp = RISCV::VMINU_VV;
2000 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
2001 break;
2002 case Intrinsic::maxnum:
2003 SplitOp = RISCV::VFMAX_VV;
2004 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
2005 break;
2006 case Intrinsic::minnum:
2007 SplitOp = RISCV::VFMIN_VV;
2008 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
2009 break;
2010 }
2011 // Add a cost for data larger than LMUL8
2012 InstructionCost SplitCost =
2013 (LT.first > 1) ? (LT.first - 1) *
2014 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2015 : 0;
2016 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2017}
2018
2019InstructionCost
2020RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2021 std::optional<FastMathFlags> FMF,
2022 TTI::TargetCostKind CostKind) const {
2023 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2024 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2025
2026 // Skip if scalar size of Ty is bigger than ELEN.
2027 if (Ty->getScalarSizeInBits() > ST->getELen())
2028 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2029
2030 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2031 assert(ISD && "Invalid opcode");
2032
2033 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
2034 ISD != ISD::FADD)
2035 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2036
2037 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2038 Type *ElementTy = Ty->getElementType();
2039 if (ElementTy->isIntegerTy(1)) {
2040 // Example sequences:
2041 // vfirst.m a0, v0
2042 // seqz a0, a0
2043 if (LT.second == MVT::v1i1)
2044 return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
2045 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2046 CmpInst::ICMP_EQ, CostKind);
2047
2048 if (ISD == ISD::AND) {
2049 // Example sequences:
2050 // vmand.mm v8, v9, v8 ; needed every time type is split
2051 // vmnot.m v8, v0 ; alias for vmnand
2052 // vcpop.m a0, v8
2053 // seqz a0, a0
2054
2055 // See the discussion: https://github.com/llvm/llvm-project/pull/119160
2056 // For LMUL <= 8, there is no splitting,
2057 // the sequences are vmnot, vcpop and seqz.
2058 // When LMUL > 8 and split = 1,
2059 // the sequences are vmnand, vcpop and seqz.
2060 // When LMUL > 8 and split > 1,
2061 // the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
2062 return ((LT.first > 2) ? (LT.first - 2) : 0) *
2063 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
2064 getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
2065 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2066 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2067 CmpInst::ICMP_EQ, CostKind);
2068 } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
2069 // Example sequences:
2070 // vsetvli a0, zero, e8, mf8, ta, ma
2071 // vmxor.mm v8, v0, v8 ; needed every time type is split
2072 // vcpop.m a0, v8
2073 // andi a0, a0, 1
2074 return (LT.first - 1) *
2075 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
2076 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
2077 } else {
2078 assert(ISD == ISD::OR);
2079 // Example sequences:
2080 // vsetvli a0, zero, e8, mf8, ta, ma
2081 // vmor.mm v8, v9, v8 ; needed every time type is split
2082 // vcpop.m a0, v0
2083 // snez a0, a0
2084 return (LT.first - 1) *
2085 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
2086 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2087 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2088 CmpInst::ICMP_NE, CostKind);
2089 }
2090 }
2091
2092 // An or/and IR reduction is composed of one vmv and one RVV reduction
2093 // instruction, while the other reductions are composed of two vmv and one
2094 // RVV reduction instruction.
2095 unsigned SplitOp;
2096 SmallVector<unsigned, 3> Opcodes;
2097 switch (ISD) {
2098 case ISD::ADD:
2099 SplitOp = RISCV::VADD_VV;
2100 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
2101 break;
2102 case ISD::OR:
2103 SplitOp = RISCV::VOR_VV;
2104 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
2105 break;
2106 case ISD::XOR:
2107 SplitOp = RISCV::VXOR_VV;
2108 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
2109 break;
2110 case ISD::AND:
2111 SplitOp = RISCV::VAND_VV;
2112 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
2113 break;
2114 case ISD::FADD:
2115 // We can't promote f16/bf16 fadd reductions.
2116 if ((LT.second.getScalarType() == MVT::f16 && !ST->hasVInstructionsF16()) ||
2117 LT.second.getScalarType() == MVT::bf16)
2118 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2119 if (TTI::requiresOrderedReduction(FMF)) {
2120 Opcodes.push_back(RISCV::VFMV_S_F);
2121 for (unsigned i = 0; i < LT.first.getValue(); i++)
2122 Opcodes.push_back(RISCV::VFREDOSUM_VS);
2123 Opcodes.push_back(RISCV::VFMV_F_S);
2124 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2125 }
2126 SplitOp = RISCV::VFADD_VV;
2127 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
2128 break;
2129 }
2130 // Add a cost for data larger than LMUL8
2131 InstructionCost SplitCost =
2132 (LT.first > 1) ? (LT.first - 1) *
2133 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2134 : 0;
2135 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2136}
2137
2138InstructionCost RISCVTTIImpl::getExtendedReductionCost(
2139 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
2140 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
2141 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2142 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2143 FMF, CostKind);
2144
2145 // Skip if scalar size of ResTy is bigger than ELEN.
2146 if (ResTy->getScalarSizeInBits() > ST->getELen())
2147 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2148 FMF, CostKind);
2149
2150 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
2151 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2152 FMF, CostKind);
2153
2154 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2155
2156 if (IsUnsigned && Opcode == Instruction::Add &&
2157 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
2158 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2159 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
2160 return LT.first *
2161 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
2162 }
2163
2164 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
2165 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2166 FMF, CostKind);
2167
2168 return (LT.first - 1) +
2169 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2170}
2171
2172InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
2173 TTI::OperandValueInfo OpInfo,
2174 TTI::TargetCostKind CostKind) const {
2175 assert(OpInfo.isConstant() && "non constant operand?");
2176 if (!isa<VectorType>(Ty))
2177 // FIXME: We need to account for immediate materialization here, but doing
2178 // a decent job requires more knowledge about the immediate than we
2179 // currently have here.
2180 return 0;
2181
2182 if (OpInfo.isUniform())
2183 // vmv.v.i, vmv.v.x, or vfmv.v.f
2184 // We ignore the cost of the scalar constant materialization to be consistent
2185 // with how we treat scalar constants themselves just above.
2186 return 1;
2187
2188 return getConstantPoolLoadCost(Ty, CostKind);
2189}
2190
2191InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
2192 Align Alignment,
2193 unsigned AddressSpace,
2194 TTI::TargetCostKind CostKind,
2195 TTI::OperandValueInfo OpInfo,
2196 const Instruction *I) const {
2197 EVT VT = TLI->getValueType(DL, Src, true);
2198 // Type legalization can't handle structs
2199 if (VT == MVT::Other)
2200 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2201 CostKind, OpInfo, I);
2202
2203 InstructionCost Cost = 0;
2204 if (Opcode == Instruction::Store && OpInfo.isConstant())
2205 Cost += getStoreImmCost(Src, OpInfo, CostKind);
2206
2207 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
2208
2209 InstructionCost BaseCost = [&]() {
2210 InstructionCost Cost = LT.first;
2211 if (CostKind != TTI::TCK_RecipThroughput)
2212 return Cost;
2213
2214 // Our actual lowering for the case where a wider legal type is available
2215 // uses a VL-predicated load on the wider type. This is reflected in
2216 // the result of getTypeLegalizationCost, but BasicTTI assumes the
2217 // widened cases are scalarized.
2218 const DataLayout &DL = this->getDataLayout();
2219 if (Src->isVectorTy() && LT.second.isVector() &&
2220 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
2221 LT.second.getSizeInBits()))
2222 return Cost;
2223
2224 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2225 CostKind, OpInfo, I);
2226 }();
2227
2228 // Assume memory ops cost scale with the number of vector registers
2229 // possibly accessed by the instruction. Note that BasicTTI already
2230 // handles the LT.first term for us.
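 // For example, an LMUL=4 load is costed at roughly four times an LMUL=1
 // load via getLMULCost.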
2231 if (ST->hasVInstructions() && LT.second.isVector() &&
2232 CostKind != TTI::TCK_CodeSize)
2233 BaseCost *= TLI->getLMULCost(LT.second);
2234 return Cost + BaseCost;
2235}
2236
2237InstructionCost RISCVTTIImpl::getCmpSelInstrCost(
2238 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
2239 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
2240 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
2241 if (CostKind != TTI::TCK_RecipThroughput)
2242 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2243 Op1Info, Op2Info, I);
2244
2245 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2246 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2247 Op1Info, Op2Info, I);
2248
2249 // Skip if scalar size of ValTy is bigger than ELEN.
2250 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
2251 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2252 Op1Info, Op2Info, I);
2253
2254 auto GetConstantMatCost =
2255 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
2256 if (OpInfo.isUniform())
2257 // We return 0 because we currently ignore the cost of materializing
2258 // scalar constants in GPRs.
2259 return 0;
2260
2261 return getConstantPoolLoadCost(ValTy, CostKind);
2262 };
2263
2264 InstructionCost ConstantMatCost;
2265 if (Op1Info.isConstant())
2266 ConstantMatCost += GetConstantMatCost(Op1Info);
2267 if (Op2Info.isConstant())
2268 ConstantMatCost += GetConstantMatCost(Op2Info);
2269
2270 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2271 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
2272 if (CondTy->isVectorTy()) {
2273 if (ValTy->getScalarSizeInBits() == 1) {
2274 // vmandn.mm v8, v8, v9
2275 // vmand.mm v9, v0, v9
2276 // vmor.mm v0, v9, v8
2277 return ConstantMatCost +
2278 LT.first *
2279 getRISCVInstructionCost(
2280 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2281 LT.second, CostKind);
2282 }
2283 // vselect and max/min are supported natively.
2284 return ConstantMatCost +
2285 LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
2286 CostKind);
2287 }
2288
2289 if (ValTy->getScalarSizeInBits() == 1) {
2290 // vmv.v.x v9, a0
2291 // vmsne.vi v9, v9, 0
2292 // vmandn.mm v8, v8, v9
2293 // vmand.mm v9, v0, v9
2294 // vmor.mm v0, v9, v8
2295 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
2296 return ConstantMatCost +
2297 LT.first *
2298 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
2299 InterimVT, CostKind) +
2300 LT.first * getRISCVInstructionCost(
2301 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2302 LT.second, CostKind);
2303 }
2304
2305 // vmv.v.x v10, a0
2306 // vmsne.vi v0, v10, 0
2307 // vmerge.vvm v8, v9, v8, v0
2308 return ConstantMatCost +
2309 LT.first * getRISCVInstructionCost(
2310 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
2311 LT.second, CostKind);
2312 }
2313
2314 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
2315 CmpInst::isIntPredicate(VecPred)) {
2316 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
2317 // provided they incur the same cost across all implementations
2318 return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
2319 LT.second,
2320 CostKind);
2321 }
2322
2323 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
2324 CmpInst::isFPPredicate(VecPred)) {
2325
2326 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
2327 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
2328 return ConstantMatCost +
2329 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
2330
2331 // If we do not support the input floating point vector type, use the base
2332 // one which will calculate as:
2333 // ScalarizeCost + Num * Cost for fixed vector,
2334 // InvalidCost for scalable vector.
2335 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
2336 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
2337 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
2338 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2339 Op1Info, Op2Info, I);
2340
2341 // Assuming vector fp compare and mask instructions are all the same cost
2342 // until a need arises to differentiate them.
2343 switch (VecPred) {
2344 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
2345 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
2346 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
2347 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
2348 return ConstantMatCost +
2349 LT.first * getRISCVInstructionCost(
2350 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
2351 LT.second, CostKind);
2352
2353 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
2354 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
2355 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
2356 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
2357 return ConstantMatCost +
2358 LT.first *
2359 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
2360 LT.second, CostKind);
2361
2362 case CmpInst::FCMP_OEQ: // vmfeq.vv
2363 case CmpInst::FCMP_OGT: // vmflt.vv
2364 case CmpInst::FCMP_OGE: // vmfle.vv
2365 case CmpInst::FCMP_OLT: // vmflt.vv
2366 case CmpInst::FCMP_OLE: // vmfle.vv
2367 case CmpInst::FCMP_UNE: // vmfne.vv
2368 return ConstantMatCost +
2369 LT.first *
2370 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
2371 default:
2372 break;
2373 }
2374 }
2375
2376 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
2377 // instructions will lower to SELECT_CC and lower to PseudoCCMOVGPR which will
2378 // generate a conditional branch + mv. The cost of scalar (icmp + select) will
2379 // be (0 + select instr cost).
2380 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
2381 ValTy->isIntegerTy() && !I->user_empty()) {
2382 if (all_of(I->users(), [&](const User *U) {
2383 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
2384 U->getType()->isIntegerTy() &&
2385 !isa<ConstantData>(U->getOperand(1)) &&
2386 !isa<ConstantData>(U->getOperand(2));
2387 }))
2388 return 0;
2389 }
2390
2391 // TODO: Add cost for scalar type.
2392
2393 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2394 Op1Info, Op2Info, I);
2395}
2396
2397InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
2398 TTI::TargetCostKind CostKind,
2399 const Instruction *I) const {
2400 if (CostKind != TTI::TCK_RecipThroughput)
2401 return Opcode == Instruction::PHI ? 0 : 1;
2402 // Branches are assumed to be predicted.
2403 return 0;
2404}
2405
2406InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
2407 TTI::TargetCostKind CostKind,
2408 unsigned Index,
2409 const Value *Op0,
2410 const Value *Op1) const {
2411 assert(Val->isVectorTy() && "This must be a vector type");
2412
2413 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
2414 // For now, skip all fixed vector cost analysis when P extension is available
2415 // to avoid crashes in getMinRVVVectorSizeInBits()
2416 if (ST->enablePExtCodeGen() && isa<FixedVectorType>(Val)) {
2417 return 1; // Treat as single instruction cost for now
2418 }
2419
2420 if (Opcode != Instruction::ExtractElement &&
2421 Opcode != Instruction::InsertElement)
2422 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
2423
2424 // Legalize the type.
2425 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2426
2427 // This type is legalized to a scalar type.
2428 if (!LT.second.isVector()) {
2429 auto *FixedVecTy = cast<FixedVectorType>(Val);
2430 // If Index is a known constant, cost is zero.
2431 if (Index != -1U)
2432 return 0;
2433 // Extract/InsertElement with non-constant index is very costly when
2434 // scalarized; estimate cost of loads/stores sequence via the stack:
2435 // ExtractElement cost: store vector to stack, load scalar;
2436 // InsertElement cost: store vector to stack, store scalar, load vector.
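 // For example, extracting from a 4-element vector legalized to scalars is
 // modeled as 4 scalar stores plus 1 scalar load.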
2437 Type *ElemTy = FixedVecTy->getElementType();
2438 auto NumElems = FixedVecTy->getNumElements();
2439 auto Align = DL.getPrefTypeAlign(ElemTy);
2440 InstructionCost LoadCost =
2441 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
2442 InstructionCost StoreCost =
2443 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
2444 return Opcode == Instruction::ExtractElement
2445 ? StoreCost * NumElems + LoadCost
2446 : (StoreCost + LoadCost) * NumElems + StoreCost;
2447 }
2448
2449 // For unsupported scalable vector.
2450 if (LT.second.isScalableVector() && !LT.first.isValid())
2451 return LT.first;
2452
2453 // Mask vector extract/insert is expanded via e8.
2454 if (Val->getScalarSizeInBits() == 1) {
2455 VectorType *WideTy =
2456 VectorType::get(IntegerType::get(Val->getContext(), 8),
2457 cast<VectorType>(Val)->getElementCount());
2458 if (Opcode == Instruction::ExtractElement) {
2459 InstructionCost ExtendCost
2460 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2461 TTI::CastContextHint::None, CostKind);
2462 InstructionCost ExtractCost
2463 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2464 return ExtendCost + ExtractCost;
2465 }
2466 InstructionCost ExtendCost
2467 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2468 TTI::CastContextHint::None, CostKind);
2469 InstructionCost InsertCost
2470 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2471 InstructionCost TruncCost
2472 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
2473 TTI::CastContextHint::None, CostKind);
2474 return ExtendCost + InsertCost + TruncCost;
2475 }
2476
2477
2478 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
2479 // and vslideup + vmv.s.x to insert element to vector.
2480 unsigned BaseCost = 1;
2481 // For insertelement we also need an addi to compute index + 1 as the input of vslideup.
2482 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
2483
2484 if (Index != -1U) {
2485 // The type may be split. For fixed-width vectors we can normalize the
2486 // index to the new type.
2487 if (LT.second.isFixedLengthVector()) {
2488 unsigned Width = LT.second.getVectorNumElements();
2489 Index = Index % Width;
2490 }
2491
2492 // If exact VLEN is known, we will insert/extract into the appropriate
2493 // subvector with no additional subvector insert/extract cost.
2494 if (auto VLEN = ST->getRealVLen()) {
2495 unsigned EltSize = LT.second.getScalarSizeInBits();
2496 unsigned M1Max = *VLEN / EltSize;
2497 Index = Index % M1Max;
2498 }
2499
2500 if (Index == 0)
2501 // We can extract/insert the first element without vslidedown/vslideup.
2502 SlideCost = 0;
2503 else if (ST->hasVendorXRivosVisni() && isUInt<5>(Index) &&
2504 Val->getScalarType()->isIntegerTy())
2505 SlideCost = 0; // With ri.vinsert/ri.vextract there is no slide needed
2506 else if (Opcode == Instruction::InsertElement)
2507 SlideCost = 1; // With a constant index, we do not need to use addi.
2508 }
2509
2510 // When the vector needs to split into multiple register groups and the index
2511 // exceeds single vector register group, we need to insert/extract the element
2512 // via stack.
2513 if (LT.first > 1 &&
2514 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
2515 LT.second.isScalableVector()))) {
2516 Type *ScalarType = Val->getScalarType();
2517 Align VecAlign = DL.getPrefTypeAlign(Val);
2518 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
2519 // Extra addi for unknown index.
2520 InstructionCost IdxCost = Index == -1U ? 1 : 0;
2521
2522 // Store all split vectors into stack and load the target element.
2523 if (Opcode == Instruction::ExtractElement)
2524 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2525 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
2526 CostKind) +
2527 IdxCost;
2528
2529 // Store all split vectors into stack and store the target element and load
2530 // vectors back.
2531 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2532 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2533 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2534 CostKind) +
2535 IdxCost;
2536 }
2537
2538 // Extracting an i64 element on a target with XLEN=32 needs more instructions.
2539 if (Val->getScalarType()->isIntegerTy() &&
2540 ST->getXLen() < Val->getScalarSizeInBits()) {
2541 // For extractelement, we need the following instructions:
2542 // vsetivli zero, 1, e64, m1, ta, mu (not count)
2543 // vslidedown.vx v8, v8, a0
2544 // vmv.x.s a0, v8
2545 // li a1, 32
2546 // vsrl.vx v8, v8, a1
2547 // vmv.x.s a1, v8
2548
2549 // For insertelement, we need the following instructions:
2550 // vsetivli zero, 2, e32, m4, ta, mu (not count)
2551 // vmv.v.i v12, 0
2552 // vslide1up.vx v16, v12, a1
2553 // vslide1up.vx v12, v16, a0
2554 // addi a0, a2, 1
2555 // vsetvli zero, a0, e64, m4, tu, mu (not count)
2556 // vslideup.vx v8, v12, a2
2557
2558 // TODO: should we count these special vsetvlis?
2559 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2560 }
2561 return BaseCost + SlideCost;
2562}
2563
2564InstructionCost
2565RISCVTTIImpl::getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
2566 TTI::TargetCostKind CostKind,
2567 unsigned Index) const {
2568 if (isa<FixedVectorType>(Val))
2569 return BaseT::getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind,
2570 Index);
2571
2572 // TODO: This code replicates what LoopVectorize.cpp used to do when asking
2573 // for the cost of extracting the last lane of a scalable vector. It probably
2574 // needs a more accurate cost.
2575 ElementCount EC = cast<VectorType>(Val)->getElementCount();
2576 assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
2577 return getVectorInstrCost(Opcode, Val, CostKind,
2578 EC.getKnownMinValue() - 1 - Index, nullptr,
2579 nullptr);
2580}
2581
2582InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
2583 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2584 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2585 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
2586
2587 // TODO: Handle more cost kinds.
2588 if (CostKind != TTI::TCK_RecipThroughput)
2589 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2590 Args, CxtI);
2591
2592 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2593 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2594 Args, CxtI);
2595
2596 // Skip if scalar size of Ty is bigger than ELEN.
2597 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
2598 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2599 Args, CxtI);
2600
2601 // Legalize the type.
2602 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2603
2604 // TODO: Handle scalar type.
2605 if (!LT.second.isVector())
2606 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2607 Args, CxtI);
2608
2609 // f16 with zvfhmin and bf16 will be promoted to f32.
2610 // FIXME: nxv32[b]f16 will be custom lowered and split.
2611 unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
2612 InstructionCost CastCost = 0;
2613 if ((LT.second.getVectorElementType() == MVT::f16 ||
2614 LT.second.getVectorElementType() == MVT::bf16) &&
2615 TLI->getOperationAction(ISDOpcode, LT.second) ==
2616 TargetLoweringBase::LegalizeAction::Promote) {
2617 MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
2618 Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
2619 Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2620 // Add cost of extending arguments
2621 CastCost += LT.first * Args.size() *
2622 getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
2623 TTI::CastContextHint::None, CostKind);
2624 // Add cost of truncating result
2625 CastCost +=
2626 LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
2627 TTI::CastContextHint::None, CostKind);
2628 // Compute cost of op in promoted type
2629 LT.second = PromotedVT;
2630 }
2631
2632 auto getConstantMatCost =
2633 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
2634 if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
2635 // Two sub-cases:
2636 // * Has a 5 bit immediate operand which can be splatted.
2637 // * Has a larger immediate which must be materialized in scalar register
2638 // We return 0 for both as we currently ignore the cost of materializing
2639 // scalar constants in GPRs.
2640 return 0;
2641
2642 return getConstantPoolLoadCost(Ty, CostKind);
2643 };
2644
2645 // Add the cost of materializing any constant vectors required.
2646 InstructionCost ConstantMatCost = 0;
2647 if (Op1Info.isConstant())
2648 ConstantMatCost += getConstantMatCost(0, Op1Info);
2649 if (Op2Info.isConstant())
2650 ConstantMatCost += getConstantMatCost(1, Op2Info);
2651
2652 unsigned Op;
2653 switch (ISDOpcode) {
2654 case ISD::ADD:
2655 case ISD::SUB:
2656 Op = RISCV::VADD_VV;
2657 break;
2658 case ISD::SHL:
2659 case ISD::SRL:
2660 case ISD::SRA:
2661 Op = RISCV::VSLL_VV;
2662 break;
2663 case ISD::AND:
2664 case ISD::OR:
2665 case ISD::XOR:
2666 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
2667 break;
2668 case ISD::MUL:
2669 case ISD::MULHS:
2670 case ISD::MULHU:
2671 Op = RISCV::VMUL_VV;
2672 break;
2673 case ISD::SDIV:
2674 case ISD::UDIV:
2675 Op = RISCV::VDIV_VV;
2676 break;
2677 case ISD::SREM:
2678 case ISD::UREM:
2679 Op = RISCV::VREM_VV;
2680 break;
2681 case ISD::FADD:
2682 case ISD::FSUB:
2683 Op = RISCV::VFADD_VV;
2684 break;
2685 case ISD::FMUL:
2686 Op = RISCV::VFMUL_VV;
2687 break;
2688 case ISD::FDIV:
2689 Op = RISCV::VFDIV_VV;
2690 break;
2691 case ISD::FNEG:
2692 Op = RISCV::VFSGNJN_VV;
2693 break;
2694 default:
2695 // Assuming all other instructions have the same cost until a need arises to
2696 // differentiate them.
2697 return CastCost + ConstantMatCost +
2698 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2699 Args, CxtI);
2700 }
2701
2702 InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
2703 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
2704 // ops are twice as expensive as integer ops. Do the same for vectors so
2705 // scalar floating point ops aren't cheaper than their vector equivalents.
2706 if (Ty->isFPOrFPVectorTy())
2707 InstrCost *= 2;
2708 return CastCost + ConstantMatCost + LT.first * InstrCost;
2709}
2710
2711// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
2712InstructionCost RISCVTTIImpl::getPointersChainCost(
2713 ArrayRef<const Value *> Ptrs, const Value *Base,
2714 const TTI::PointersChainInfo &Info, Type *AccessTy,
2715 TTI::TargetCostKind CostKind) const {
2716 InstructionCost Cost = TTI::TCC_Free;
2717 // In the basic model we take into account GEP instructions only
2718 // (although here can come alloca instruction, a value, constants and/or
2719 // constant expressions, PHIs, bitcasts ... whatever is allowed to be used as a
2720 // pointer). Typically, if Base is a not a GEP-instruction and all the
2721 // pointers are relative to the same base address, all the rest are
2722 // either GEP instructions, PHIs, bitcasts or constants. When we have same
2723 // base, we just calculate the cost of each non-Base GEP as an ADD operation
2724 // if any of its indices is a non-constant.
2725 // If no known dependencies between the pointers cost is calculated as a sum
2726 // of costs of GEP instructions.
2727 for (auto [I, V] : enumerate(Ptrs)) {
2728 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
2729 if (!GEP)
2730 continue;
2731 if (Info.isSameBase() && V != Base) {
2732 if (GEP->hasAllConstantIndices())
2733 continue;
2734 // If the chain is unit-stride and BaseReg + stride*i is a legal
2735 // addressing mode, then presume the base GEP is sitting around in a
2736 // register somewhere and check if we can fold the offset relative to
2737 // it.
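 // For example, with AccessTy = i32 the I-th pointer is BaseReg + 4 * I,
 // which is treated as free when isLegalAddressingMode accepts that offset.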
2738 unsigned Stride = DL.getTypeStoreSize(AccessTy);
2739 if (Info.isUnitStride() &&
2740 isLegalAddressingMode(AccessTy,
2741 /* BaseGV */ nullptr,
2742 /* BaseOffset */ Stride * I,
2743 /* HasBaseReg */ true,
2744 /* Scale */ 0,
2745 GEP->getType()->getPointerAddressSpace()))
2746 continue;
2747 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
2748 {TTI::OK_AnyValue, TTI::OP_None},
2749 {TTI::OK_AnyValue, TTI::OP_None}, {});
2750 } else {
2751 SmallVector<const Value *> Indices(GEP->indices());
2752 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
2753 Indices, AccessTy, CostKind);
2754 }
2755 }
2756 return Cost;
2757}
2758
2759void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2760 TTI::UnrollingPreferences &UP,
2761 OptimizationRemarkEmitter *ORE) const {
2762 // TODO: More tuning on benchmarks and metrics with changes as needed
2763 // would apply to all settings below to enable performance.
2764
2765
2766 if (ST->enableDefaultUnroll())
2767 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2768
2769 // Enable Upper bound unrolling universally, not dependent upon the conditions
2770 // below.
2771 UP.UpperBound = true;
2772
2773 // Disable loop unrolling for Oz and Os.
2774 UP.OptSizeThreshold = 0;
2775 UP.PartialOptSizeThreshold = 0;
2776 if (L->getHeader()->getParent()->hasOptSize())
2777 return;
2778
2779 SmallVector<BasicBlock *, 4> ExitingBlocks;
2780 L->getExitingBlocks(ExitingBlocks);
2781 LLVM_DEBUG(dbgs() << "Loop has:\n"
2782 << "Blocks: " << L->getNumBlocks() << "\n"
2783 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2784
2785 // Only allow another exit other than the latch. This acts as an early exit
2786 // as it mirrors the profitability calculation of the runtime unroller.
2787 if (ExitingBlocks.size() > 2)
2788 return;
2789
2790 // Limit the CFG of the loop body for targets with a branch predictor.
2791 // Allowing 4 blocks permits if-then-else diamonds in the body.
2792 if (L->getNumBlocks() > 4)
2793 return;
2794
2795 // Scan the loop: don't unroll loops with calls as this could prevent
2796 // inlining. Don't unroll auto-vectorized loops either, though do allow
2797 // unrolling of the scalar remainder.
2798 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
2799 InstructionCost Cost = 0;
2800 for (auto *BB : L->getBlocks()) {
2801 for (auto &I : *BB) {
2802 // Both auto-vectorized loops and the scalar remainder have the
2803 // isvectorized attribute, so differentiate between them by the presence
2804 // of vector instructions.
2805 if (IsVectorized && I.getType()->isVectorTy())
2806 return;
2807
2808 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2809 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2810 if (!isLoweredToCall(F))
2811 continue;
2812 }
2813 return;
2814 }
2815
2816 SmallVector<const Value *> Operands(I.operand_values());
2817 Cost += getInstructionCost(&I, Operands,
2818 TargetTransformInfo::TCK_SizeAndLatency);
2819 }
2820 }
2821
2822 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2823
2824 UP.Partial = true;
2825 UP.Runtime = true;
2826 UP.UnrollRemainder = true;
2827 UP.UnrollAndJam = true;
2828
2829 // Forcing unrolling of small loops can be very useful because of the
2830 // branch-taken cost of the backedge.
2831 if (Cost < 12)
2832 UP.Force = true;
2833}
2834
2835void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2836 TTI::PeelingPreferences &PP) const {
2837 BaseT::getPeelingPreferences(L, SE, PP);
2838}
2839
2840bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
2841 MemIntrinsicInfo &Info) const {
2842 const DataLayout &DL = getDataLayout();
2843 Intrinsic::ID IID = Inst->getIntrinsicID();
2844 LLVMContext &C = Inst->getContext();
2845 bool HasMask = false;
2846
2847 auto getSegNum = [](const IntrinsicInst *II, unsigned PtrOperandNo,
2848 bool IsWrite) -> int64_t {
2849 if (auto *TarExtTy =
2850 dyn_cast<TargetExtType>(II->getArgOperand(0)->getType()))
2851 return TarExtTy->getIntParameter(0);
2852
2853 return 1;
2854 };
2855
2856 switch (IID) {
2857 case Intrinsic::riscv_vle_mask:
2858 case Intrinsic::riscv_vse_mask:
2859 case Intrinsic::riscv_vlseg2_mask:
2860 case Intrinsic::riscv_vlseg3_mask:
2861 case Intrinsic::riscv_vlseg4_mask:
2862 case Intrinsic::riscv_vlseg5_mask:
2863 case Intrinsic::riscv_vlseg6_mask:
2864 case Intrinsic::riscv_vlseg7_mask:
2865 case Intrinsic::riscv_vlseg8_mask:
2866 case Intrinsic::riscv_vsseg2_mask:
2867 case Intrinsic::riscv_vsseg3_mask:
2868 case Intrinsic::riscv_vsseg4_mask:
2869 case Intrinsic::riscv_vsseg5_mask:
2870 case Intrinsic::riscv_vsseg6_mask:
2871 case Intrinsic::riscv_vsseg7_mask:
2872 case Intrinsic::riscv_vsseg8_mask:
2873 HasMask = true;
2874 [[fallthrough]];
2875 case Intrinsic::riscv_vle:
2876 case Intrinsic::riscv_vse:
2877 case Intrinsic::riscv_vlseg2:
2878 case Intrinsic::riscv_vlseg3:
2879 case Intrinsic::riscv_vlseg4:
2880 case Intrinsic::riscv_vlseg5:
2881 case Intrinsic::riscv_vlseg6:
2882 case Intrinsic::riscv_vlseg7:
2883 case Intrinsic::riscv_vlseg8:
2884 case Intrinsic::riscv_vsseg2:
2885 case Intrinsic::riscv_vsseg3:
2886 case Intrinsic::riscv_vsseg4:
2887 case Intrinsic::riscv_vsseg5:
2888 case Intrinsic::riscv_vsseg6:
2889 case Intrinsic::riscv_vsseg7:
2890 case Intrinsic::riscv_vsseg8: {
2891 // Intrinsic interface:
2892 // riscv_vle(merge, ptr, vl)
2893 // riscv_vle_mask(merge, ptr, mask, vl, policy)
2894 // riscv_vse(val, ptr, vl)
2895 // riscv_vse_mask(val, ptr, mask, vl, policy)
2896 // riscv_vlseg#(merge, ptr, vl, sew)
2897 // riscv_vlseg#_mask(merge, ptr, mask, vl, policy, sew)
2898 // riscv_vsseg#(val, ptr, vl, sew)
2899 // riscv_vsseg#_mask(val, ptr, mask, vl, sew)
2900 bool IsWrite = Inst->getType()->isVoidTy();
2901 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
2902 // The results of segment loads are TargetExtType.
2903 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
2904 unsigned SEW =
2905 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
2906 ->getZExtValue();
2907 Ty = TarExtTy->getTypeParameter(0U);
2908 Ty = ScalableVectorType::get(
2909 IntegerType::get(C, SEW),
2910 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
2911 }
2912 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
2913 unsigned VLIndex = RVVIInfo->VLOperand;
2914 unsigned PtrOperandNo = VLIndex - 1 - HasMask;
2915 MaybeAlign Alignment =
2916 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
2917 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
2918 Value *Mask = ConstantInt::getTrue(MaskType);
2919 if (HasMask)
2920 Mask = Inst->getArgOperand(VLIndex - 1);
2921 Value *EVL = Inst->getArgOperand(VLIndex);
2922 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
2923 // RVV uses contiguous elements as a segment.
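 // For example, a two-field segment of i32 elements is modeled below as a
 // vector of i64 elements for this access.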
2924 if (SegNum > 1) {
2925 unsigned ElemSize = Ty->getScalarSizeInBits();
2926 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
2927 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
2928 }
2929 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
2930 Alignment, Mask, EVL);
2931 return true;
2932 }
2933 case Intrinsic::riscv_vlse_mask:
2934 case Intrinsic::riscv_vsse_mask:
2935 case Intrinsic::riscv_vlsseg2_mask:
2936 case Intrinsic::riscv_vlsseg3_mask:
2937 case Intrinsic::riscv_vlsseg4_mask:
2938 case Intrinsic::riscv_vlsseg5_mask:
2939 case Intrinsic::riscv_vlsseg6_mask:
2940 case Intrinsic::riscv_vlsseg7_mask:
2941 case Intrinsic::riscv_vlsseg8_mask:
2942 case Intrinsic::riscv_vssseg2_mask:
2943 case Intrinsic::riscv_vssseg3_mask:
2944 case Intrinsic::riscv_vssseg4_mask:
2945 case Intrinsic::riscv_vssseg5_mask:
2946 case Intrinsic::riscv_vssseg6_mask:
2947 case Intrinsic::riscv_vssseg7_mask:
2948 case Intrinsic::riscv_vssseg8_mask:
2949 HasMask = true;
2950 [[fallthrough]];
2951 case Intrinsic::riscv_vlse:
2952 case Intrinsic::riscv_vsse:
2953 case Intrinsic::riscv_vlsseg2:
2954 case Intrinsic::riscv_vlsseg3:
2955 case Intrinsic::riscv_vlsseg4:
2956 case Intrinsic::riscv_vlsseg5:
2957 case Intrinsic::riscv_vlsseg6:
2958 case Intrinsic::riscv_vlsseg7:
2959 case Intrinsic::riscv_vlsseg8:
2960 case Intrinsic::riscv_vssseg2:
2961 case Intrinsic::riscv_vssseg3:
2962 case Intrinsic::riscv_vssseg4:
2963 case Intrinsic::riscv_vssseg5:
2964 case Intrinsic::riscv_vssseg6:
2965 case Intrinsic::riscv_vssseg7:
2966 case Intrinsic::riscv_vssseg8: {
2967 // Intrinsic interface:
2968 // riscv_vlse(merge, ptr, stride, vl)
2969 // riscv_vlse_mask(merge, ptr, stride, mask, vl, policy)
2970 // riscv_vsse(val, ptr, stride, vl)
2971 // riscv_vsse_mask(val, ptr, stride, mask, vl, policy)
2972 // riscv_vlsseg#(merge, ptr, offset, vl, sew)
2973 // riscv_vlsseg#_mask(merge, ptr, offset, mask, vl, policy, sew)
2974 // riscv_vssseg#(val, ptr, offset, vl, sew)
2975 // riscv_vssseg#_mask(val, ptr, offset, mask, vl, sew)
2976 bool IsWrite = Inst->getType()->isVoidTy();
2977 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
2978 // The results of segment loads are TargetExtType.
2979 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
2980 unsigned SEW =
2981 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
2982 ->getZExtValue();
2983 Ty = TarExtTy->getTypeParameter(0U);
2984 Ty = ScalableVectorType::get(
2985 IntegerType::get(C, SEW),
2986 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
2987 }
2988 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
2989 unsigned VLIndex = RVVIInfo->VLOperand;
2990 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
2991 MaybeAlign Alignment =
2992 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
2993
2994 Value *Stride = Inst->getArgOperand(PtrOperandNo + 1);
2995 // Use the pointer alignment as the element alignment if the stride is a
2996 // multiple of the pointer alignment. Otherwise, the element alignment
2997 // should be the greatest common divisor of pointer alignment and stride.
2998 // For simplicity, just treat the elements as unaligned in that case.
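 // For example, a pointer aligned to 16 with a stride of 6 bytes is
 // conservatively given Align(1).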
2999 unsigned PointerAlign = Alignment.valueOrOne().value();
3000 if (!isa<ConstantInt>(Stride) ||
3001 cast<ConstantInt>(Stride)->getZExtValue() % PointerAlign != 0)
3002 Alignment = Align(1);
3003
3004 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3005 Value *Mask = ConstantInt::getTrue(MaskType);
3006 if (HasMask)
3007 Mask = Inst->getArgOperand(VLIndex - 1);
3008 Value *EVL = Inst->getArgOperand(VLIndex);
3009 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3010 // RVV uses contiguous elements as a segment.
3011 if (SegNum > 1) {
3012 unsigned ElemSize = Ty->getScalarSizeInBits();
3013 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3014 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3015 }
3016 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3017 Alignment, Mask, EVL, Stride);
3018 return true;
3019 }
3020 case Intrinsic::riscv_vloxei_mask:
3021 case Intrinsic::riscv_vluxei_mask:
3022 case Intrinsic::riscv_vsoxei_mask:
3023 case Intrinsic::riscv_vsuxei_mask:
3024 case Intrinsic::riscv_vloxseg2_mask:
3025 case Intrinsic::riscv_vloxseg3_mask:
3026 case Intrinsic::riscv_vloxseg4_mask:
3027 case Intrinsic::riscv_vloxseg5_mask:
3028 case Intrinsic::riscv_vloxseg6_mask:
3029 case Intrinsic::riscv_vloxseg7_mask:
3030 case Intrinsic::riscv_vloxseg8_mask:
3031 case Intrinsic::riscv_vluxseg2_mask:
3032 case Intrinsic::riscv_vluxseg3_mask:
3033 case Intrinsic::riscv_vluxseg4_mask:
3034 case Intrinsic::riscv_vluxseg5_mask:
3035 case Intrinsic::riscv_vluxseg6_mask:
3036 case Intrinsic::riscv_vluxseg7_mask:
3037 case Intrinsic::riscv_vluxseg8_mask:
3038 case Intrinsic::riscv_vsoxseg2_mask:
3039 case Intrinsic::riscv_vsoxseg3_mask:
3040 case Intrinsic::riscv_vsoxseg4_mask:
3041 case Intrinsic::riscv_vsoxseg5_mask:
3042 case Intrinsic::riscv_vsoxseg6_mask:
3043 case Intrinsic::riscv_vsoxseg7_mask:
3044 case Intrinsic::riscv_vsoxseg8_mask:
3045 case Intrinsic::riscv_vsuxseg2_mask:
3046 case Intrinsic::riscv_vsuxseg3_mask:
3047 case Intrinsic::riscv_vsuxseg4_mask:
3048 case Intrinsic::riscv_vsuxseg5_mask:
3049 case Intrinsic::riscv_vsuxseg6_mask:
3050 case Intrinsic::riscv_vsuxseg7_mask:
3051 case Intrinsic::riscv_vsuxseg8_mask:
3052 HasMask = true;
3053 [[fallthrough]];
3054 case Intrinsic::riscv_vloxei:
3055 case Intrinsic::riscv_vluxei:
3056 case Intrinsic::riscv_vsoxei:
3057 case Intrinsic::riscv_vsuxei:
3058 case Intrinsic::riscv_vloxseg2:
3059 case Intrinsic::riscv_vloxseg3:
3060 case Intrinsic::riscv_vloxseg4:
3061 case Intrinsic::riscv_vloxseg5:
3062 case Intrinsic::riscv_vloxseg6:
3063 case Intrinsic::riscv_vloxseg7:
3064 case Intrinsic::riscv_vloxseg8:
3065 case Intrinsic::riscv_vluxseg2:
3066 case Intrinsic::riscv_vluxseg3:
3067 case Intrinsic::riscv_vluxseg4:
3068 case Intrinsic::riscv_vluxseg5:
3069 case Intrinsic::riscv_vluxseg6:
3070 case Intrinsic::riscv_vluxseg7:
3071 case Intrinsic::riscv_vluxseg8:
3072 case Intrinsic::riscv_vsoxseg2:
3073 case Intrinsic::riscv_vsoxseg3:
3074 case Intrinsic::riscv_vsoxseg4:
3075 case Intrinsic::riscv_vsoxseg5:
3076 case Intrinsic::riscv_vsoxseg6:
3077 case Intrinsic::riscv_vsoxseg7:
3078 case Intrinsic::riscv_vsoxseg8:
3079 case Intrinsic::riscv_vsuxseg2:
3080 case Intrinsic::riscv_vsuxseg3:
3081 case Intrinsic::riscv_vsuxseg4:
3082 case Intrinsic::riscv_vsuxseg5:
3083 case Intrinsic::riscv_vsuxseg6:
3084 case Intrinsic::riscv_vsuxseg7:
3085 case Intrinsic::riscv_vsuxseg8: {
3086 // Intrinsic interface (only listed ordered version):
3087 // riscv_vloxei(merge, ptr, index, vl)
3088 // riscv_vloxei_mask(merge, ptr, index, mask, vl, policy)
3089 // riscv_vsoxei(val, ptr, index, vl)
3090 // riscv_vsoxei_mask(val, ptr, index, mask, vl, policy)
3091 // riscv_vloxseg#(merge, ptr, index, vl, sew)
3092 // riscv_vloxseg#_mask(merge, ptr, index, mask, vl, policy, sew)
3093 // riscv_vsoxseg#(val, ptr, index, vl, sew)
3094 // riscv_vsoxseg#_mask(val, ptr, index, mask, vl, sew)
3095 bool IsWrite = Inst->getType()->isVoidTy();
3096 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
3097 // The results of segment loads are TargetExtType.
3098 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
3099 unsigned SEW =
3100 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
3101 ->getZExtValue();
3102 Ty = TarExtTy->getTypeParameter(0U);
3103 Ty = ScalableVectorType::get(
3104 IntegerType::get(C, SEW),
3105 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
3106 }
3107 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
3108 unsigned VLIndex = RVVIInfo->VLOperand;
3109 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
3110 Value *Mask;
3111 if (HasMask) {
3112 Mask = Inst->getArgOperand(VLIndex - 1);
3113 } else {
3114 // Mask cannot be nullptr here: vector GEP produces <vscale x N x ptr>,
3115 // and casting that to scalar i64 triggers a vector/scalar mismatch
3116 // assertion in CreatePointerCast. Use an all-true mask so ASan lowers it
3117 // via extractelement instead.
3118 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3119 Mask = ConstantInt::getTrue(MaskType);
3120 }
3121 Value *EVL = Inst->getArgOperand(VLIndex);
3122 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3123 // RVV uses contiguous elements as a segment.
3124 if (SegNum > 1) {
3125 unsigned ElemSize = Ty->getScalarSizeInBits();
3126 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3127 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3128 }
3129 Value *OffsetOp = Inst->getArgOperand(PtrOperandNo + 1);
3130 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3131 Align(1), Mask, EVL,
3132 /* Stride */ nullptr, OffsetOp);
3133 return true;
3134 }
3135 }
3136 return false;
3137}
3138
3139unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) const {
3140 if (Ty->isVectorTy()) {
3141 // f16 with only zvfhmin and bf16 will be promoted to f32
3142 Type *EltTy = cast<VectorType>(Ty)->getElementType();
3143 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
3144 EltTy->isBFloatTy())
3145 Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
3146 cast<VectorType>(Ty));
3147
3148 TypeSize Size = DL.getTypeSizeInBits(Ty);
3149 if (Size.isScalable() && ST->hasVInstructions())
3150 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
3151
3152 if (ST->useRVVForFixedLengthVectors())
3153 return divideCeil(Size, ST->getRealMinVLen());
3154 }
3155
3156 return BaseT::getRegUsageForType(Ty);
3157}
3158
3159unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
3160 if (SLPMaxVF.getNumOccurrences())
3161 return SLPMaxVF;
3162
3163 // Return how many elements can fit in getRegisterBitwidth. This is the
3164 // same routine as used in LoopVectorizer. We should probably be
3165 // accounting for whether we actually have instructions with the right
3166 // lane type, but we don't have enough information to do that without
3167 // some additional plumbing which hasn't been justified yet.
3168 TypeSize RegWidth =
3169 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
3170 // If no vector registers, or absurd element widths, disable
3171 // vectorization by returning 1.
3172 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
3173}
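// The SLP maximum VF above is simply "how many ElemWidth lanes fit in one
// vector register group". A standalone sketch with example numbers; the
// register width already reflects the riscv-v-register-bit-width-lmul
// setting.
inline unsigned maxSLPLanes(unsigned RegGroupBits, unsigned ElemWidth) {
  unsigned Lanes = RegGroupBits / ElemWidth;
  return Lanes ? Lanes : 1; // disable vectorization rather than return 0
}
// VLEN=128 with LMUL=2 gives a 256-bit group: 8 lanes of i32, 32 lanes of i8.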
3174
3178
3180 return ST->enableUnalignedVectorMem();
3181}
3182
3183 TTI::AddressingModeKind
3184 RISCVTTIImpl::getPreferredAddressingMode(const Loop *L,
3185 ScalarEvolution *SE) const {
3186 if (ST->hasVendorXCVmem() && !ST->is64Bit())
3187 return TTI::AMK_PostIndexed;
3188
3190}
3191
3193 const TargetTransformInfo::LSRCost &C2) const {
3194 // RISC-V specific: the instruction count gets first priority.
3195 // If we need to emit adds inside the loop to add up base registers, then
3196 // we need at least one extra temporary register.
3197 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
3198 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
3199 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
3200 C1.NumIVMuls, C1.NumBaseAdds,
3201 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
3202 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
3203 C2.NumIVMuls, C2.NumBaseAdds,
3204 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
3205}
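// The comparison above is a plain lexicographic tie-break with the
// instruction count first; NumRegs is bumped by one when base-register adds
// must be emitted inside the loop, since that costs a temporary. A reduced
// sketch of the same ordering (the struct below is hypothetical and only
// mirrors the TargetTransformInfo::LSRCost fields that are compared):
struct LSRCostSketch {
  unsigned Insns, NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
      ImmCost, SetupCost;
};
inline bool lsrCostLessSketch(const LSRCostSketch &A, const LSRCostSketch &B) {
  unsigned ARegs = A.NumRegs + (A.NumBaseAdds != 0);
  unsigned BRegs = B.NumRegs + (B.NumBaseAdds != 0);
  return std::tie(A.Insns, ARegs, A.AddRecCost, A.NumIVMuls, A.NumBaseAdds,
                  A.ScaleCost, A.ImmCost, A.SetupCost) <
         std::tie(B.Insns, BRegs, B.AddRecCost, B.NumIVMuls, B.NumBaseAdds,
                  B.ScaleCost, B.ImmCost, B.SetupCost);
}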
3206
3208 Align Alignment) const {
3209 auto *VTy = dyn_cast<VectorType>(DataTy);
3210 if (!VTy || VTy->isScalableTy())
3211 return false;
3212
3213 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3214 return false;
3215
3216 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
3217 // scalarize these types with LMUL >= maximum fixed-length LMUL.
3218 if (VTy->getElementType()->isIntegerTy(8))
3219 if (VTy->getElementCount().getFixedValue() > 256)
3220 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
3221 ST->getMaxLMULForFixedLengthVectors();
3222 return true;
3223}
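// The i8 special case above only accepts the type while it still fits the
// largest fixed-length register group. Same check with plain integers
// (the VLEN and LMUL cap below are assumed example values):
inline bool fitsMaxFixedLMUL(unsigned NumI8Elts, unsigned RealMinVLen,
                             unsigned MaxFixedLMUL) {
  return (NumI8Elts * 8) / RealMinVLen < MaxFixedLMUL;
}
// 512 x i8 on VLEN=128: 4096/128 = 32, not < 8, so it is rejected;
// 384 x i8 on VLEN=1024: 3072/1024 = 3 < 8, so it is accepted.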
3224
3226 Align Alignment) const {
3227 auto *VTy = dyn_cast<VectorType>(DataTy);
3228 if (!VTy || VTy->isScalableTy())
3229 return false;
3230
3231 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3232 return false;
3233 return true;
3234}
3235
3236/// See if \p I should be considered for address type promotion. We check if \p
3237 /// I is a sext with the right type and used in memory accesses. If it is used in a
3238/// "complex" getelementptr, we allow it to be promoted without finding other
3239/// sext instructions that sign extended the same initial value. A getelementptr
3240/// is considered as "complex" if it has more than 2 operands.
3241 bool RISCVTTIImpl::shouldConsiderAddressTypePromotion(
3242 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
3243 bool Considerable = false;
3244 AllowPromotionWithoutCommonHeader = false;
3245 if (!isa<SExtInst>(&I))
3246 return false;
3247 Type *ConsideredSExtType =
3248 Type::getInt64Ty(I.getParent()->getParent()->getContext());
3249 if (I.getType() != ConsideredSExtType)
3250 return false;
3251 // See if the sext is the one with the right type and used in at least one
3252 // GetElementPtrInst.
3253 for (const User *U : I.users()) {
3254 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
3255 Considerable = true;
3256 // A getelementptr is considered as "complex" if it has more than 2
3257 // operands. We will promote a SExt used in such a complex GEP, as we
3258 // expect some of the computation to be merged if it is done on 64 bits.
3259 if (GEPInst->getNumOperands() > 2) {
3260 AllowPromotionWithoutCommonHeader = true;
3261 break;
3262 }
3263 }
3264 }
3265 return Considerable;
3266}
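// An IR shape this hook is looking for (hypothetical values): the sext result
// feeds a getelementptr with more than two operands, so the extension is
// considered for promotion even without other sexts of the same value.
//
//   %idx = sext i32 %i to i64
//   %p   = getelementptr inbounds [64 x [64 x i32]], ptr %a, i64 0, i64 %idx, i64 %j
//   %v   = load i32, ptr %p
//
// The GEP has four operands, so AllowPromotionWithoutCommonHeader is set.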
3267
3268bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
3269 switch (Opcode) {
3270 case Instruction::Add:
3271 case Instruction::Sub:
3272 case Instruction::Mul:
3273 case Instruction::And:
3274 case Instruction::Or:
3275 case Instruction::Xor:
3276 case Instruction::FAdd:
3277 case Instruction::FSub:
3278 case Instruction::FMul:
3279 case Instruction::FDiv:
3280 case Instruction::ICmp:
3281 case Instruction::FCmp:
3282 return true;
3283 case Instruction::Shl:
3284 case Instruction::LShr:
3285 case Instruction::AShr:
3286 case Instruction::UDiv:
3287 case Instruction::SDiv:
3288 case Instruction::URem:
3289 case Instruction::SRem:
3290 case Instruction::Select:
3291 return Operand == 1;
3292 default:
3293 return false;
3294 }
3295}
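// Example of the asymmetry for the shift/div group above (hypothetical
// values): a splatted shift amount on the RHS folds into vsll.vx, but there
// is no reversed scalar form for a splatted LHS, hence Operand == 1.
//
//   %head  = insertelement <4 x i32> poison, i32 %amt, i64 0
//   %splat = shufflevector <4 x i32> %head, <4 x i32> poison, <4 x i32> zeroinitializer
//   %r     = shl <4 x i32> %v, %splat        ; selects to vsll.vx
//
// Sub and FSub stay in the first group because vrsub.vx / vfrsub.vf cover
// the reversed operand order.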
3296
3297 bool RISCVTTIImpl::canSplatOperand(Instruction *I, int Operand) const {
3298 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3299 return false;
3300
3301 if (canSplatOperand(I->getOpcode(), Operand))
3302 return true;
3303
3304 auto *II = dyn_cast<IntrinsicInst>(I);
3305 if (!II)
3306 return false;
3307
3308 switch (II->getIntrinsicID()) {
3309 case Intrinsic::fma:
3310 case Intrinsic::vp_fma:
3311 case Intrinsic::fmuladd:
3312 case Intrinsic::vp_fmuladd:
3313 return Operand == 0 || Operand == 1;
3314 case Intrinsic::vp_shl:
3315 case Intrinsic::vp_lshr:
3316 case Intrinsic::vp_ashr:
3317 case Intrinsic::vp_udiv:
3318 case Intrinsic::vp_sdiv:
3319 case Intrinsic::vp_urem:
3320 case Intrinsic::vp_srem:
3321 case Intrinsic::ssub_sat:
3322 case Intrinsic::vp_ssub_sat:
3323 case Intrinsic::usub_sat:
3324 case Intrinsic::vp_usub_sat:
3325 case Intrinsic::vp_select:
3326 return Operand == 1;
3327 // These intrinsics are commutative.
3328 case Intrinsic::vp_add:
3329 case Intrinsic::vp_mul:
3330 case Intrinsic::vp_and:
3331 case Intrinsic::vp_or:
3332 case Intrinsic::vp_xor:
3333 case Intrinsic::vp_fadd:
3334 case Intrinsic::vp_fmul:
3335 case Intrinsic::vp_icmp:
3336 case Intrinsic::vp_fcmp:
3337 case Intrinsic::smin:
3338 case Intrinsic::vp_smin:
3339 case Intrinsic::umin:
3340 case Intrinsic::vp_umin:
3341 case Intrinsic::smax:
3342 case Intrinsic::vp_smax:
3343 case Intrinsic::umax:
3344 case Intrinsic::vp_umax:
3345 case Intrinsic::sadd_sat:
3346 case Intrinsic::vp_sadd_sat:
3347 case Intrinsic::uadd_sat:
3348 case Intrinsic::vp_uadd_sat:
3349 // These intrinsics have 'vr' versions.
3350 case Intrinsic::vp_sub:
3351 case Intrinsic::vp_fsub:
3352 case Intrinsic::vp_fdiv:
3353 return Operand == 0 || Operand == 1;
3354 default:
3355 return false;
3356 }
3357}
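// Example for the VP cases above (hypothetical values): a splatted operand of
// vp.mul may sit on either side because multiplication is commutative, while
// for vp.sdiv only the divisor (Operand == 1) has a .vx form.
//
//   %head  = insertelement <vscale x 4 x i32> poison, i32 %x, i64 0
//   %splat = shufflevector <vscale x 4 x i32> %head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
//   %m     = call <vscale x 4 x i32> @llvm.vp.mul.nxv4i32(<vscale x 4 x i32> %v, <vscale x 4 x i32> %splat, <vscale x 4 x i1> %mask, i32 %evl)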
3358
3359/// Check if sinking \p I's operands to I's basic block is profitable, because
3360/// the operands can be folded into a target instruction, e.g.
3361/// splats of scalars can fold into vector instructions.
3362 bool RISCVTTIImpl::isProfitableToSinkOperands(
3363 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
3364 using namespace llvm::PatternMatch;
3365
3366 if (I->isBitwiseLogicOp()) {
3367 if (!I->getType()->isVectorTy()) {
3368 if (ST->hasStdExtZbb() || ST->hasStdExtZbkb()) {
3369 for (auto &Op : I->operands()) {
3370 // (and/or/xor X, (not Y)) -> (andn/orn/xnor X, Y)
3371 if (match(Op.get(), m_Not(m_Value()))) {
3372 Ops.push_back(&Op);
3373 return true;
3374 }
3375 }
3376 }
3377 } else if (I->getOpcode() == Instruction::And && ST->hasStdExtZvkb()) {
3378 for (auto &Op : I->operands()) {
3379 // (and X, (not Y)) -> (vandn.vv X, Y)
3380 if (match(Op.get(), m_Not(m_Value()))) {
3381 Ops.push_back(&Op);
3382 return true;
3383 }
3384 // (and X, (splat (not Y))) -> (vandn.vx X, Y)
3385 if (match(Op.get(), m_Shuffle(m_InsertElt(m_Value(), m_Not(m_Value()),
3386 m_ZeroInt()),
3387 m_Value(), m_ZeroMask()))) {
3388 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
3389 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
3390 Ops.push_back(&Not);
3391 Ops.push_back(&InsertElt);
3392 Ops.push_back(&Op);
3393 return true;
3394 }
3395 }
3396 }
3397 }
3398
3399 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3400 return false;
3401
3402 // Only sink splat operands if the target prefers to. Some targets require
3403 // S2V transfer buffers, and we can run out of them copying the same value
3404 // repeatedly.
3405 // FIXME: It could still be worth doing if it would improve vector register
3406 // pressure and prevent a vector spill.
3407 if (!ST->sinkSplatOperands())
3408 return false;
3409
3410 for (auto OpIdx : enumerate(I->operands())) {
3411 if (!canSplatOperand(I, OpIdx.index()))
3412 continue;
3413
3414 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
3415 // Make sure we are not already sinking this operand
3416 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
3417 continue;
3418
3419 // We are looking for a splat/vp.splat that can be sunk.
3420 bool IsVPSplat = match(Op, m_Intrinsic<Intrinsic::experimental_vp_splat>(
3421 m_Value(), m_Value(), m_Value()));
3422 if (!IsVPSplat &&
3423 !match(Op, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
3424 m_Value(), m_ZeroMask())))
3425 continue;
3426
3427 // Don't sink i1 splats.
3428 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
3429 continue;
3430
3431 // All uses of the shuffle should be sunk to avoid duplicating it across GPR
3432 // and vector registers.
3433 for (Use &U : Op->uses()) {
3434 Instruction *Insn = cast<Instruction>(U.getUser());
3435 if (!canSplatOperand(Insn, U.getOperandNo()))
3436 return false;
3437 }
3438
3439 // Sink any fpexts since they might be used in a widening fp pattern.
3440 if (IsVPSplat) {
3441 if (isa<FPExtInst>(Op->getOperand(0)))
3442 Ops.push_back(&Op->getOperandUse(0));
3443 } else {
3444 Use *InsertEltUse = &Op->getOperandUse(0);
3445 auto *InsertElt = cast<InsertElementInst>(InsertEltUse);
3446 if (isa<FPExtInst>(InsertElt->getOperand(1)))
3447 Ops.push_back(&InsertElt->getOperandUse(1));
3448 Ops.push_back(InsertEltUse);
3449 }
3450 Ops.push_back(&OpIdx.value());
3451 }
3452 return true;
3453}
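// The splat shape that gets sunk above (hypothetical values): once the
// insertelement/shufflevector pair sits next to its user, instruction
// selection folds everything into a single vadd.vx instead of materialising
// the splat in a vector register in the defining block.
//
//   %head  = insertelement <vscale x 4 x i32> poison, i32 %x, i64 0
//   %splat = shufflevector <vscale x 4 x i32> %head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
//   ...
//   %sum   = add <vscale x 4 x i32> %v, %splat   ; selects to vadd.vx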
3454
3455 TTI::MemCmpExpansionOptions
3456 RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3457 TTI::MemCmpExpansionOptions Options;
3458 // TODO: Enable expansion when unaligned access is not supported after we fix
3459 // issues in ExpandMemcmp.
3460 if (!ST->enableUnalignedScalarMem())
3461 return Options;
3462
3463 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
3464 return Options;
3465
3466 Options.AllowOverlappingLoads = true;
3467 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3468 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3469 if (ST->is64Bit()) {
3470 Options.LoadSizes = {8, 4, 2, 1};
3471 Options.AllowedTailExpansions = {3, 5, 6};
3472 } else {
3473 Options.LoadSizes = {4, 2, 1};
3474 Options.AllowedTailExpansions = {3};
3475 }
3476
3477 if (IsZeroCmp && ST->hasVInstructions()) {
3478 unsigned VLenB = ST->getRealMinVLen() / 8;
3479 // The minimum size should be `XLen / 8 + 1`, and the maximum size should be
3480 // `VLenB * MaxLMUL` so that it fits in a single register group.
3481 unsigned MinSize = ST->getXLen() / 8 + 1;
3482 unsigned MaxSize = VLenB * ST->getMaxLMULForFixedLengthVectors();
3483 for (unsigned Size = MinSize; Size <= MaxSize; Size++)
3484 Options.LoadSizes.insert(Options.LoadSizes.begin(), Size);
3485 }
3486 return Options;
3487}
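// A standalone sketch of the resulting LoadSizes list for one example
// configuration (RV64 with Zbb, VLEN=128, max fixed-length LMUL=8, and an
// equality comparison, i.e. IsZeroCmp). The vector sizes are pushed to the
// front so the expansion prefers a single vectorized compare for lengths
// between XLen/8 + 1 and VLenB * MaxLMUL.
inline llvm::SmallVector<unsigned, 8> exampleMemcmpLoadSizes() {
  llvm::SmallVector<unsigned, 8> Sizes = {8, 4, 2, 1};     // scalar RV64 sizes
  const unsigned XLen = 64, VLenB = 128 / 8, MaxLMUL = 8;  // assumed config
  for (unsigned Size = XLen / 8 + 1; Size <= VLenB * MaxLMUL; ++Size)
    Sizes.insert(Sizes.begin(), Size);                     // 9, 10, ..., 128
  return Sizes; // {128, 127, ..., 9, 8, 4, 2, 1}
}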