RISCVTargetTransformInfo.cpp
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
18#include "llvm/IR/IntrinsicsRISCV.h"
20#include <cmath>
21#include <optional>
22using namespace llvm;
23using namespace llvm::PatternMatch;
24
25#define DEBUG_TYPE "riscvtti"
26
28 "riscv-v-register-bit-width-lmul",
30 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
31 "by autovectorized code. Fractional LMULs are not supported."),
33
35 "riscv-v-slp-max-vf",
37 "Overrides result used for getMaximumVF query which is used "
38 "exclusively by SLP vectorizer."),
40
42 RVVMinTripCount("riscv-v-min-trip-count",
43 cl::desc("Set the lower bound of a trip count to decide on "
44 "vectorization while tail-folding."),
46
48RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
50 // Check if the type is valid for all CostKind
51 if (!VT.isVector())
53 size_t NumInstr = OpCodes.size();
55 return NumInstr;
56 InstructionCost LMULCost = TLI->getLMULCost(VT);
58 return LMULCost * NumInstr;
59 InstructionCost Cost = 0;
60 for (auto Op : OpCodes) {
61 switch (Op) {
62 case RISCV::VRGATHER_VI:
63 Cost += TLI->getVRGatherVICost(VT);
64 break;
65 case RISCV::VRGATHER_VV:
66 Cost += TLI->getVRGatherVVCost(VT);
67 break;
68 case RISCV::VSLIDEUP_VI:
69 case RISCV::VSLIDEDOWN_VI:
70 Cost += TLI->getVSlideVICost(VT);
71 break;
72 case RISCV::VSLIDEUP_VX:
73 case RISCV::VSLIDEDOWN_VX:
74 Cost += TLI->getVSlideVXCost(VT);
75 break;
76 case RISCV::VREDMAX_VS:
77 case RISCV::VREDMIN_VS:
78 case RISCV::VREDMAXU_VS:
79 case RISCV::VREDMINU_VS:
80 case RISCV::VREDSUM_VS:
81 case RISCV::VREDAND_VS:
82 case RISCV::VREDOR_VS:
83 case RISCV::VREDXOR_VS:
84 case RISCV::VFREDMAX_VS:
85 case RISCV::VFREDMIN_VS:
86 case RISCV::VFREDUSUM_VS: {
87 unsigned VL = VT.getVectorMinNumElements();
88 if (!VT.isFixedLengthVector())
89 VL *= *getVScaleForTuning();
90 Cost += Log2_32_Ceil(VL);
91 break;
92 }
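    // For example (illustrative): for nxv4i32 with getVScaleForTuning() == 2,
    // VL = 4 * 2 = 8, so a tree reduction adds Log2_32_Ceil(8) == 3 to the
    // cost, whereas the ordered VFREDOSUM_VS case below adds VL itself.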
93 case RISCV::VFREDOSUM_VS: {
94 unsigned VL = VT.getVectorMinNumElements();
95 if (!VT.isFixedLengthVector())
96 VL *= *getVScaleForTuning();
97 Cost += VL;
98 break;
99 }
100 case RISCV::VMV_X_S:
101 case RISCV::VMV_S_X:
102 case RISCV::VFMV_F_S:
103 case RISCV::VFMV_S_F:
104 case RISCV::VMOR_MM:
105 case RISCV::VMXOR_MM:
106 case RISCV::VMAND_MM:
107 case RISCV::VMANDN_MM:
108 case RISCV::VMNAND_MM:
109 case RISCV::VCPOP_M:
110 case RISCV::VFIRST_M:
111 Cost += 1;
112 break;
113 default:
114 Cost += LMULCost;
115 }
116 }
117 return Cost;
118}
119
121 const RISCVSubtarget *ST,
122 const APInt &Imm, Type *Ty,
124 bool FreeZeroes) {
125 assert(Ty->isIntegerTy() &&
126 "getIntImmCost can only estimate cost of materialising integers");
127
128 // We have a Zero register, so 0 is always free.
129 if (Imm == 0)
130 return TTI::TCC_Free;
131
132 // Otherwise, we check how many instructions it will take to materialise.
133 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
134 /*CompressionCost=*/false, FreeZeroes);
135}
136
140 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
141}
142
143// Look for patterns of shift followed by AND that can be turned into a pair of
144// shifts. We won't need to materialize an immediate for the AND so these can
145// be considered free.
146static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
147 uint64_t Mask = Imm.getZExtValue();
148 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
149 if (!BO || !BO->hasOneUse())
150 return false;
151
152 if (BO->getOpcode() != Instruction::Shl)
153 return false;
154
155 if (!isa<ConstantInt>(BO->getOperand(1)))
156 return false;
157
158 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
159 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
160 // is a mask shifted by c2 bits with c3 leading zeros.
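  // For example (illustrative values): with c2 = 4 and c1 = 0x0ff0 (an 8-bit
  // mask shifted left by 4, giving c3 = 52 leading zeros on RV64),
  //   (and (shl x, 4), 0x0ff0) -> (srli (slli x, 56), 52)
  // and no immediate needs to be materialized for the AND.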
161 if (isShiftedMask_64(Mask)) {
162 unsigned Trailing = llvm::countr_zero(Mask);
163 if (ShAmt == Trailing)
164 return true;
165 }
166
167 return false;
168}
169
170// If this i64 AND is part of ((X & -(1 << C1)) & 0xffffffff) == (C2 << C1),
171// DAGCombiner can convert this to (sraiw X, C1) == sext(C2) for RV64. On RV32,
172// the type will be split so only the lower 32 bits need to be compared using
173// (srai/srli X, C) == C2.
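// For example (illustrative): with C1 = 8 the mask is 0xffffff00, and a
// compare such as (X & 0xffffff00) == 0x4500 becomes (sraiw X, 8) == 0x45 on
// RV64, so the AND immediate never needs to be materialized.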
174static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm) {
175 if (!Inst->hasOneUse())
176 return false;
177
178 // Look for equality comparison.
179 auto *Cmp = dyn_cast<ICmpInst>(*Inst->user_begin());
180 if (!Cmp || !Cmp->isEquality())
181 return false;
182
183 // Right hand side of comparison should be a constant.
184 auto *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
185 if (!C)
186 return false;
187
188 uint64_t Mask = Imm.getZExtValue();
189
190 // Mask should be of the form -(1 << C) in the lower 32 bits.
191 if (!isUInt<32>(Mask) || !isPowerOf2_32(-uint32_t(Mask)))
192 return false;
193
194 // Comparison constant should be a subset of Mask.
195 uint64_t CmpC = C->getZExtValue();
196 if ((CmpC & Mask) != CmpC)
197 return false;
198
199 // We'll need to sign extend the comparison constant and shift it right. Make
200 // sure the new constant can use addi/xori+seqz/snez.
201 unsigned ShiftBits = llvm::countr_zero(Mask);
202 int64_t NewCmpC = SignExtend64<32>(CmpC) >> ShiftBits;
203 return NewCmpC >= -2048 && NewCmpC <= 2048;
204}
205
207 const APInt &Imm, Type *Ty,
209 Instruction *Inst) const {
210 assert(Ty->isIntegerTy() &&
211 "getIntImmCost can only estimate cost of materialising integers");
212
213 // We have a Zero register, so 0 is always free.
214 if (Imm == 0)
215 return TTI::TCC_Free;
216
217 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
218 // commutative; in others, the immediate comes from a specific argument index.
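  // For example (illustrative): an `add` whose immediate is 2047 fits the
  // simm12 range and is reported as TCC_Free below, while an immediate of 2048
  // falls through to the full getIntImmCost materialization estimate.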
219 bool Takes12BitImm = false;
220 unsigned ImmArgIdx = ~0U;
221
222 switch (Opcode) {
223 case Instruction::GetElementPtr:
224 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
225 // split up large offsets in GEP into better parts than ConstantHoisting
226 // can.
227 return TTI::TCC_Free;
228 case Instruction::Store: {
229 // Use the materialization cost regardless of whether it's the address or
230 // the value that is constant, except when the store is misaligned and
231 // misaligned accesses are not legal (experience shows constant hoisting
232 // can sometimes be harmful in such cases).
233 if (Idx == 1 || !Inst)
234 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
235 /*FreeZeroes=*/true);
236
237 StoreInst *ST = cast<StoreInst>(Inst);
238 if (!getTLI()->allowsMemoryAccessForAlignment(
239 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
240 ST->getPointerAddressSpace(), ST->getAlign()))
241 return TTI::TCC_Free;
242
243 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
244 /*FreeZeroes=*/true);
245 }
246 case Instruction::Load:
247 // If the address is a constant, use the materialization cost.
248 return getIntImmCost(Imm, Ty, CostKind);
249 case Instruction::And:
250 // zext.h
251 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
252 return TTI::TCC_Free;
253 // zext.w
254 if (Imm == UINT64_C(0xffffffff) &&
255 ((ST->hasStdExtZba() && ST->isRV64()) || ST->isRV32()))
256 return TTI::TCC_Free;
257 // bclri
258 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
259 return TTI::TCC_Free;
260 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
261 canUseShiftPair(Inst, Imm))
262 return TTI::TCC_Free;
263 if (Inst && Idx == 1 && Imm.getBitWidth() == 64 &&
264 canUseShiftCmp(Inst, Imm))
265 return TTI::TCC_Free;
266 Takes12BitImm = true;
267 break;
268 case Instruction::Add:
269 Takes12BitImm = true;
270 break;
271 case Instruction::Or:
272 case Instruction::Xor:
273 // bseti/binvi
274 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
275 return TTI::TCC_Free;
276 Takes12BitImm = true;
277 break;
278 case Instruction::Mul:
279 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
280 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
281 return TTI::TCC_Free;
282 // One more or less than a power of 2 can use SLLI+ADD/SUB.
283 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
284 return TTI::TCC_Free;
285 // FIXME: There is no MULI instruction.
286 Takes12BitImm = true;
287 break;
288 case Instruction::Sub:
289 case Instruction::Shl:
290 case Instruction::LShr:
291 case Instruction::AShr:
292 Takes12BitImm = true;
293 ImmArgIdx = 1;
294 break;
295 default:
296 break;
297 }
298
299 if (Takes12BitImm) {
300 // Check immediate is the correct argument...
301 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
302 // ... and fits into the 12-bit immediate.
303 if (Imm.getSignificantBits() <= 64 &&
304 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
305 return TTI::TCC_Free;
306 }
307 }
308
309 // Otherwise, use the full materialisation cost.
310 return getIntImmCost(Imm, Ty, CostKind);
311 }
312
313 // By default, prevent hoisting.
314 return TTI::TCC_Free;
315}
316
319 const APInt &Imm, Type *Ty,
321 // Prevent hoisting in unknown cases.
322 return TTI::TCC_Free;
323}
324
326 return ST->hasVInstructions();
327}
328
330RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
331 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
332 return ST->hasCPOPLike() ? TTI::PSK_FastHardware : TTI::PSK_Software;
333}
334
336 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
338 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
340
341 // zve32x is broken for partial_reduce_umla, but let's make sure we
342 // don't generate them.
343 if (!ST->hasStdExtZvqdotq() || ST->getELen() < 64 ||
344 Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
345 InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
346 !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4))
348
349 Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
350 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
351 // Note: Assuming all vqdot* variants are equal cost
352 return LT.first *
353 getRISCVInstructionCost(RISCV::VQDOT_VV, LT.second, CostKind);
354}
355
357 // Currently, the ExpandReductions pass can't expand scalable-vector
358 // reductions, but we still request expansion as RVV doesn't support certain
359 // reductions and the SelectionDAG can't legalize them either.
360 switch (II->getIntrinsicID()) {
361 default:
362 return false;
363 // These reductions have no equivalent in RVV
364 case Intrinsic::vector_reduce_mul:
365 case Intrinsic::vector_reduce_fmul:
366 return true;
367 }
368}
369
370std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
371 if (ST->hasVInstructions())
372 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
373 return BaseT::getMaxVScale();
374}
375
376std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
377 if (ST->hasVInstructions())
378 if (unsigned MinVLen = ST->getRealMinVLen();
379 MinVLen >= RISCV::RVVBitsPerBlock)
380 return MinVLen / RISCV::RVVBitsPerBlock;
382}
383
386 unsigned LMUL =
387 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
388 switch (K) {
390 return TypeSize::getFixed(ST->getXLen());
392 return TypeSize::getFixed(
393 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
396 (ST->hasVInstructions() &&
397 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
399 : 0);
400 }
401
402 llvm_unreachable("Unsupported register kind");
403}
404
406RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty,
408 // Add a cost of address generation + the cost of the load. The address
409 // is expected to be a PC relative offset to a constant pool entry
410 // using auipc/addi.
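  // Example sequence (illustrative; the "2" below models the auipc/addi):
  //   auipc a0, %pcrel_hi(.LCPI0_0)
  //   addi a0, a0, %pcrel_lo(...)
  //   vle32.v v8, (a0)                (costed separately via getMemoryOpCost)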
411 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
412 /*AddressSpace=*/0, CostKind);
413}
414
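// Returns true if Mask repeatedly concatenates its leading SubVectorSize
// elements, e.g. (illustrative) <0, 1, 0, 1, 0, 1, 0, 1> yields
// SubVectorSize == 2.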
415static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
416 unsigned Size = Mask.size();
417 if (!isPowerOf2_32(Size))
418 return false;
419 for (unsigned I = 0; I != Size; ++I) {
420 if (static_cast<unsigned>(Mask[I]) == I)
421 continue;
422 if (Mask[I] != 0)
423 return false;
424 if (Size % I != 0)
425 return false;
426 for (unsigned J = I + 1; J != Size; ++J)
427 // Check the pattern is repeated.
428 if (static_cast<unsigned>(Mask[J]) != J % I)
429 return false;
430 SubVectorSize = I;
431 return true;
432 }
433 // The mask is the identity <0, 1, ..., Size-1>, which is not a concatenation.
434 return false;
435}
436
438 LLVMContext &C) {
439 assert((DataVT.getScalarSizeInBits() != 8 ||
440 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
441 MVT IndexVT = DataVT.changeTypeToInteger();
442 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
443 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
444 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
445}
446
447/// Attempt to approximate the cost of a shuffle which will require splitting
448/// during legalization. Note that processShuffleMasks is not an exact proxy
449/// for the algorithm used in LegalizeVectorTypes, but hopefully it's a
450/// reasonably close upper bound.
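/// For example (illustrative), an <8 x i32> shuffle on a target whose widest
/// legal fixed type is <4 x i32> is costed as the per-register shuffles that
/// processShuffleMasks produces for the two destination registers, with
/// identity sub-masks treated as free.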
452 MVT LegalVT, VectorType *Tp,
453 ArrayRef<int> Mask,
455 assert(LegalVT.isFixedLengthVector() && !Mask.empty() &&
456 "Expected fixed vector type and non-empty mask");
457 unsigned LegalNumElts = LegalVT.getVectorNumElements();
458 // Number of destination vectors after legalization:
459 unsigned NumOfDests = divideCeil(Mask.size(), LegalNumElts);
460 // We are going to permute multiple sources and the result will be in
461 // multiple destinations. Providing an accurate cost only for splits where
462 // the element type remains the same.
463 if (NumOfDests <= 1 ||
465 Tp->getElementType()->getPrimitiveSizeInBits() ||
466 LegalNumElts >= Tp->getElementCount().getFixedValue())
468
469 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
470 unsigned LegalVTSize = LegalVT.getStoreSize();
471 // Number of source vectors after legalization:
472 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
473
474 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(), LegalNumElts);
475
476 unsigned NormalizedVF = LegalNumElts * std::max(NumOfSrcs, NumOfDests);
477 unsigned NumOfSrcRegs = NormalizedVF / LegalNumElts;
478 unsigned NumOfDestRegs = NormalizedVF / LegalNumElts;
479 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
480 assert(NormalizedVF >= Mask.size() &&
481 "Normalized mask expected to be not shorter than original mask.");
482 copy(Mask, NormalizedMask.begin());
483 InstructionCost Cost = 0;
484 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
486 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
487 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
488 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
489 return;
490 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
491 .second)
492 return;
493 Cost += TTI.getShuffleCost(
495 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
496 SingleOpTy, RegMask, CostKind, 0, nullptr);
497 },
498 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
499 Cost += TTI.getShuffleCost(
501 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
502 SingleOpTy, RegMask, CostKind, 0, nullptr);
503 });
504 return Cost;
505}
506
507/// Try to perform a better estimation of the permutation.
508/// 1. Split the source/destination vectors into real registers.
509/// 2. Analyze the mask to identify which real registers are
510/// permuted. If more than one source register is used to build a
511/// destination register, the cost for that destination register
512/// is (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one
513/// source register is used, build the mask and calculate the cost as the
514/// cost of PermuteSingleSrc.
515/// Also, for a single-register permute we try to identify whether the
516/// destination register is just a copy of the source register or a
517/// copy of the previous destination register (the cost is
518/// TTI::TCC_Basic). If the source register is simply reused, the cost for
519/// this operation is 0.
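/// For example (illustrative), with VLEN == 128 an <8 x i32> operand spans two
/// vector registers; a mask such as <3, 2, 1, 0, 12, 13, 14, 15> builds the
/// first destination register with a single PermuteSingleSrc and the second as
/// a free identity copy of a source register.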
520static InstructionCost
522 std::optional<unsigned> VLen, VectorType *Tp,
524 assert(LegalVT.isFixedLengthVector());
525 if (!VLen || Mask.empty())
527 MVT ElemVT = LegalVT.getVectorElementType();
528 unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
529 LegalVT = TTI.getTypeLegalizationCost(
530 FixedVectorType::get(Tp->getElementType(), ElemsPerVReg))
531 .second;
532 // Number of destination vectors after legalization:
533 InstructionCost NumOfDests =
534 divideCeil(Mask.size(), LegalVT.getVectorNumElements());
535 if (NumOfDests <= 1 ||
537 Tp->getElementType()->getPrimitiveSizeInBits() ||
538 LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue())
540
541 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
542 unsigned LegalVTSize = LegalVT.getStoreSize();
543 // Number of source vectors after legalization:
544 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
545
546 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
547 LegalVT.getVectorNumElements());
548
549 unsigned E = NumOfDests.getValue();
550 unsigned NormalizedVF =
551 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
552 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
553 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
554 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
555 assert(NormalizedVF >= Mask.size() &&
556 "Normalized mask expected to be not shorter than original mask.");
557 copy(Mask, NormalizedMask.begin());
558 InstructionCost Cost = 0;
559 int NumShuffles = 0;
560 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
562 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
563 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
564 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
565 return;
566 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
567 .second)
568 return;
569 ++NumShuffles;
570 Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
571 SingleOpTy, RegMask, CostKind, 0, nullptr);
572 },
573 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
574 Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
575 SingleOpTy, RegMask, CostKind, 0, nullptr);
576 NumShuffles += 2;
577 });
578 // Note: check that we do not emit too many shuffles here to prevent code
579 // size explosion.
580 // TODO: investigate, if it can be improved by extra analysis of the masks
581 // to check if the code is more profitable.
582 if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) ||
583 (NumOfDestRegs <= 2 && NumShuffles < 4))
584 return Cost;
586}
587
588InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
589 ArrayRef<int> Mask,
591 // Avoid missing masks and length changing shuffles
592 if (Mask.size() <= 2 || Mask.size() != Tp->getNumElements())
594
595 int NumElts = Tp->getNumElements();
596 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
597 // Avoid scalarization cases
598 if (!LT.second.isFixedLengthVector())
600
601 // Requires moving elements between parts, which requires additional
602 // unmodeled instructions.
603 if (LT.first != 1)
605
606 auto GetSlideOpcode = [&](int SlideAmt) {
607 assert(SlideAmt != 0);
608 bool IsVI = isUInt<5>(std::abs(SlideAmt));
609 if (SlideAmt < 0)
610 return IsVI ? RISCV::VSLIDEDOWN_VI : RISCV::VSLIDEDOWN_VX;
611 return IsVI ? RISCV::VSLIDEUP_VI : RISCV::VSLIDEUP_VX;
612 };
613
614 std::array<std::pair<int, int>, 2> SrcInfo;
615 if (!isMaskedSlidePair(Mask, NumElts, SrcInfo))
617
618 if (SrcInfo[1].second == 0)
619 std::swap(SrcInfo[0], SrcInfo[1]);
620
621 InstructionCost FirstSlideCost = 0;
622 if (SrcInfo[0].second != 0) {
623 unsigned Opcode = GetSlideOpcode(SrcInfo[0].second);
624 FirstSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
625 }
626
627 if (SrcInfo[1].first == -1)
628 return FirstSlideCost;
629
630 InstructionCost SecondSlideCost = 0;
631 if (SrcInfo[1].second != 0) {
632 unsigned Opcode = GetSlideOpcode(SrcInfo[1].second);
633 SecondSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
634 } else {
635 SecondSlideCost =
636 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
637 }
638
639 auto EC = Tp->getElementCount();
640 VectorType *MaskTy =
642 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
643 return FirstSlideCost + SecondSlideCost + MaskCost;
644}
645
648 VectorType *SrcTy, ArrayRef<int> Mask,
649 TTI::TargetCostKind CostKind, int Index,
651 const Instruction *CxtI) const {
652 assert((Mask.empty() || DstTy->isScalableTy() ||
653 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
654 "Expected the Mask to match the return size if given");
655 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
656 "Expected the same scalar types");
657
658 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
659 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
660
661 // First, handle cases where having a fixed length vector enables us to
662 // give a more accurate cost than falling back to generic scalable codegen.
663 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
664 if (auto *FVTp = dyn_cast<FixedVectorType>(SrcTy);
665 FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) {
667 *this, LT.second, ST->getRealVLen(),
668 Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind);
669 if (VRegSplittingCost.isValid())
670 return VRegSplittingCost;
671 switch (Kind) {
672 default:
673 break;
675 if (Mask.size() >= 2) {
676 MVT EltTp = LT.second.getVectorElementType();
677 // If the size of the element is < ELEN then shuffles of interleaves and
678 // deinterleaves of 2 vectors can be lowered into the following
679 // sequences
680 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
681 // Example sequence:
682 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
683 // vwaddu.vv v10, v8, v9
684 // li a0, -1 (ignored)
685 // vwmaccu.vx v10, a0, v9
686 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
687 return 2 * LT.first * TLI->getLMULCost(LT.second);
688
689 if (Mask[0] == 0 || Mask[0] == 1) {
690 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
691 // Example sequence:
692 // vnsrl.wi v10, v8, 0
693 if (equal(DeinterleaveMask, Mask))
694 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
695 LT.second, CostKind);
696 }
697 }
698 int SubVectorSize;
699 if (LT.second.getScalarSizeInBits() != 1 &&
700 isRepeatedConcatMask(Mask, SubVectorSize)) {
702 unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
703 // The cost of extraction from a subvector is 0 if the index is 0.
704 for (unsigned I = 0; I != NumSlides; ++I) {
705 unsigned InsertIndex = SubVectorSize * (1 << I);
706 FixedVectorType *SubTp =
707 FixedVectorType::get(SrcTy->getElementType(), InsertIndex);
708 FixedVectorType *DestTp =
710 std::pair<InstructionCost, MVT> DestLT =
712 // Add the cost of whole vector register move because the
713 // destination vector register group for vslideup cannot overlap the
714 // source.
715 Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
716 Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {},
717 CostKind, InsertIndex, SubTp);
718 }
719 return Cost;
720 }
721 }
722
723 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
724 SlideCost.isValid())
725 return SlideCost;
726
727 // vrgather + cost of generating the mask constant.
728 // We model this for an unknown mask with a single vrgather.
729 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
730 LT.second.getVectorNumElements() <= 256)) {
731 VectorType *IdxTy =
732 getVRGatherIndexType(LT.second, *ST, SrcTy->getContext());
733 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
734 return IndexCost +
735 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
736 }
737 break;
738 }
741
742 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
743 SlideCost.isValid())
744 return SlideCost;
745
746 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
747 // register for the second vrgather. We model this for an unknown
748 // (shuffle) mask.
749 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
750 LT.second.getVectorNumElements() <= 256)) {
751 auto &C = SrcTy->getContext();
752 auto EC = SrcTy->getElementCount();
753 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
755 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
756 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
757 return 2 * IndexCost +
758 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
759 LT.second, CostKind) +
760 MaskCost;
761 }
762 break;
763 }
764 }
765
766 auto shouldSplit = [](TTI::ShuffleKind Kind) {
767 switch (Kind) {
768 default:
769 return false;
773 return true;
774 }
775 };
776
777 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
778 shouldSplit(Kind)) {
779 InstructionCost SplitCost =
780 costShuffleViaSplitting(*this, LT.second, FVTp, Mask, CostKind);
781 if (SplitCost.isValid())
782 return SplitCost;
783 }
784 }
785
786 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
787 switch (Kind) {
788 default:
789 // Fallthrough to generic handling.
790 // TODO: Most of these cases will return getInvalid in generic code, and
791 // must be implemented here.
792 break;
794 // Extract at zero is always a subregister extract
795 if (Index == 0)
796 return TTI::TCC_Free;
797
798 // If we're extracting a subvector of at most m1 size at a sub-register
799 // boundary - which unfortunately we need exact vlen to identify - this is
800 // a subregister extract at worst and thus won't require a vslidedown.
801 // TODO: Extend for aligned m2, m4 subvector extracts
802 // TODO: Extend for misaligned (but contained) extracts
803 // TODO: Extend for scalable subvector types
804 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
805 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
806 if (std::optional<unsigned> VLen = ST->getRealVLen();
807 VLen && SubLT.second.getScalarSizeInBits() * Index % *VLen == 0 &&
808 SubLT.second.getSizeInBits() <= *VLen)
809 return TTI::TCC_Free;
810 }
811
812 // Example sequence:
813 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
814 // vslidedown.vi v8, v9, 2
815 return LT.first *
816 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
818 // Example sequence:
819 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
820 // vslideup.vi v8, v9, 2
821 LT = getTypeLegalizationCost(DstTy);
822 return LT.first *
823 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
824 case TTI::SK_Select: {
825 // Example sequence:
826 // li a0, 90
827 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
828 // vmv.s.x v0, a0
829 // vmerge.vvm v8, v9, v8, v0
830 // We use 2 for the cost of the mask materialization as this is the true
831 // cost for small masks and most shuffles are small. At worst, this cost
832 // should be a very small constant for the constant pool load. As such,
833 // we may bias towards large selects slightly more than truly warranted.
834 return LT.first *
835 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
836 LT.second, CostKind));
837 }
838 case TTI::SK_Broadcast: {
839 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
840 Instruction::InsertElement);
841 if (LT.second.getScalarSizeInBits() == 1) {
842 if (HasScalar) {
843 // Example sequence:
844 // andi a0, a0, 1
845 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
846 // vmv.v.x v8, a0
847 // vmsne.vi v0, v8, 0
848 return LT.first *
849 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
850 LT.second, CostKind));
851 }
852 // Example sequence:
853 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
854 // vmv.v.i v8, 0
855 // vmerge.vim v8, v8, 1, v0
856 // vmv.x.s a0, v8
857 // andi a0, a0, 1
858 // vmv.v.x v8, a0
859 // vmsne.vi v0, v8, 0
860
861 return LT.first *
862 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
863 RISCV::VMV_X_S, RISCV::VMV_V_X,
864 RISCV::VMSNE_VI},
865 LT.second, CostKind));
866 }
867
868 if (HasScalar) {
869 // Example sequence:
870 // vmv.v.x v8, a0
871 return LT.first *
872 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
873 }
874
875 // Example sequence:
876 // vrgather.vi v9, v8, 0
877 return LT.first *
878 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
879 }
880 case TTI::SK_Splice: {
881 // vslidedown+vslideup.
882 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
883 // of similar code, but I think we expand through memory.
884 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
885 if (Index >= 0 && Index < 32)
886 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
887 else if (Index < 0 && Index > -32)
888 Opcodes[1] = RISCV::VSLIDEUP_VI;
889 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
890 }
891 case TTI::SK_Reverse: {
892
893 if (!LT.second.isVector())
895
896 // TODO: Cases to improve here:
897 // * Illegal vector types
898 // * i64 on RV32
899 if (SrcTy->getElementType()->isIntegerTy(1)) {
900 VectorType *WideTy =
901 VectorType::get(IntegerType::get(SrcTy->getContext(), 8),
902 cast<VectorType>(SrcTy)->getElementCount());
903 return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy,
905 getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0,
906 nullptr) +
907 getCastInstrCost(Instruction::Trunc, SrcTy, WideTy,
909 }
910
911 MVT ContainerVT = LT.second;
912 if (LT.second.isFixedLengthVector())
913 ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
914 MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
915 if (ContainerVT.bitsLE(M1VT)) {
916 // Example sequence:
917 // csrr a0, vlenb
918 // srli a0, a0, 3
919 // addi a0, a0, -1
920 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
921 // vid.v v9
922 // vrsub.vx v10, v9, a0
923 // vrgather.vv v9, v8, v10
924 InstructionCost LenCost = 3;
925 if (LT.second.isFixedLengthVector())
926 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
927 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
928 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
929 if (LT.second.isFixedLengthVector() &&
930 isInt<5>(LT.second.getVectorNumElements() - 1))
931 Opcodes[1] = RISCV::VRSUB_VI;
932 InstructionCost GatherCost =
933 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
934 return LT.first * (LenCost + GatherCost);
935 }
936
937 // At high LMUL, we split into a series of M1 reverses (see
938 // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate
939 // the resulting gap at the bottom (for fixed vectors only). The important
940 // bit is that the cost scales linearly, not quadratically with LMUL.
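    // For example (illustrative), reversing an m4 value is modeled as four m1
    // vrgathers (Ratio == 4) plus the fixed vid/vrsub index setup, rather than
    // a single vrgather at quadratic m4 cost.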
941 unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX};
942 InstructionCost FixedCost =
943 getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3;
944 unsigned Ratio =
946 InstructionCost GatherCost =
947 getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio;
948 InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 0 :
949 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind);
950 return FixedCost + LT.first * (GatherCost + SlideCost);
951 }
952 }
953 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
954 SubTp);
955}
956
957static unsigned isM1OrSmaller(MVT VT) {
959 return (LMUL == RISCVVType::VLMUL::LMUL_F8 ||
963}
964
966 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
967 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
968 ArrayRef<Value *> VL) const {
971
972 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
973 // For now, skip all fixed vector cost analysis when P extension is available
974 // to avoid crashes in getMinRVVVectorSizeInBits()
975 if (ST->enablePExtCodeGen() && isa<FixedVectorType>(Ty)) {
976 return 1; // Treat as single instruction cost for now
977 }
978
979 // A build_vector (which is m1 sized or smaller) can be done in no
980 // worse than one vslide1down.vx per element in the type. We could
981 // in theory do an explode_vector in the inverse manner, but our
982 // lowering today does not have a first class node for this pattern.
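  // For example (illustrative), building a <4 x i32> from scalars with
  // VLEN >= 128 fits in a single m1 register, so the cost is capped at
  // 4 * cost(vslide1down.vx) whenever that beats the generic scalarization
  // estimate computed below.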
984 Ty, DemandedElts, Insert, Extract, CostKind);
985 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
986 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
987 if (Ty->getScalarSizeInBits() == 1) {
988 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
989 // Note: Implicit scalar anyextend is assumed to be free since the i1
990 // must be stored in a GPR.
991 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
992 CostKind) +
993 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
995 }
996
997 assert(LT.second.isFixedLengthVector());
998 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
999 if (isM1OrSmaller(ContainerVT)) {
1000 InstructionCost BV =
1001 cast<FixedVectorType>(Ty)->getNumElements() *
1002 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
1003 if (BV < Cost)
1004 Cost = BV;
1005 }
1006 }
1007 return Cost;
1008}
1009
1013 Type *DataTy = MICA.getDataType();
1014 Align Alignment = MICA.getAlignment();
1015 switch (MICA.getID()) {
1016 case Intrinsic::vp_load_ff: {
1017 EVT DataTypeVT = TLI->getValueType(DL, DataTy);
1018 if (!TLI->isLegalFirstFaultLoad(DataTypeVT, Alignment))
1020
1021 unsigned AS = MICA.getAddressSpace();
1022 return getMemoryOpCost(Instruction::Load, DataTy, Alignment, AS, CostKind,
1023 {TTI::OK_AnyValue, TTI::OP_None}, nullptr);
1024 }
1025 }
1027}
1028
1032 unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load
1033 : Instruction::Store;
1034 Type *Src = MICA.getDataType();
1035 Align Alignment = MICA.getAlignment();
1036 unsigned AddressSpace = MICA.getAddressSpace();
1037
1038 if (!isLegalMaskedLoadStore(Src, Alignment) ||
1041
1042 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
1043}
1044
1046 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1047 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1048 bool UseMaskForCond, bool UseMaskForGaps) const {
1049
1050 // The interleaved memory access pass will lower (de)interleave ops combined
1051 // with an adjacent appropriate memory to vlseg/vsseg intrinsics. vlseg/vsseg
1052 // only support masking per-iteration (i.e. condition), not per-segment (i.e.
1053 // gap).
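  // For example (illustrative), a factor-2 interleaved load of <8 x i32> can
  // be lowered to a single vlseg2e32.v that directly produces the two
  // de-interleaved <4 x i32> results.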
1054 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
1055 auto *VTy = cast<VectorType>(VecTy);
1056 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
1057 // Need to make sure the type hasn't been scalarized
1058 if (LT.second.isVector()) {
1059 auto *SubVecTy =
1060 VectorType::get(VTy->getElementType(),
1061 VTy->getElementCount().divideCoefficientBy(Factor));
1062 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
1063 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
1064 AddressSpace, DL)) {
1065
1066 // Some processors optimize segment loads/stores as one wide memory op +
1067 // Factor * LMUL shuffle ops.
1068 if (ST->hasOptimizedSegmentLoadStore(Factor)) {
1070 getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
1071 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
1072 Cost += Factor * TLI->getLMULCost(SubVecVT);
1073 return LT.first * Cost;
1074 }
1075
1076 // Otherwise, the cost is proportional to the number of elements (VL *
1077 // Factor ops).
1078 InstructionCost MemOpCost =
1079 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
1080 CostKind, {TTI::OK_AnyValue, TTI::OP_None});
1081 unsigned NumLoads = getEstimatedVLFor(VTy);
1082 return NumLoads * MemOpCost;
1083 }
1084 }
1085 }
1086
1087 // TODO: Return the cost of interleaved accesses for scalable vector when
1088 // unable to convert to segment accesses instructions.
1089 if (isa<ScalableVectorType>(VecTy))
1091
1092 auto *FVTy = cast<FixedVectorType>(VecTy);
1093 InstructionCost MemCost =
1094 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
1095 unsigned VF = FVTy->getNumElements() / Factor;
1096
1097 // An interleaved load will look like this for Factor=3:
1098 // %wide.vec = load <12 x i32>, ptr %3, align 4
1099 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1100 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1101 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1102 if (Opcode == Instruction::Load) {
1103 InstructionCost Cost = MemCost;
1104 for (unsigned Index : Indices) {
1105 FixedVectorType *VecTy =
1106 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
1107 auto Mask = createStrideMask(Index, Factor, VF);
1108 Mask.resize(VF * Factor, -1);
1109 InstructionCost ShuffleCost =
1111 Mask, CostKind, 0, nullptr, {});
1112 Cost += ShuffleCost;
1113 }
1114 return Cost;
1115 }
1116
1117 // TODO: Model for NF > 2
1118 // We'll need to enhance getShuffleCost to model shuffles that are just
1119 // inserts and extracts into subvectors, since they won't have the full cost
1120 // of a vrgather.
1121 // An interleaved store for 3 vectors of 4 lanes will look like
1122 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
1123 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
1124 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
1125 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
1126 // store <12 x i32> %interleaved.vec, ptr %10, align 4
1127 if (Factor != 2)
1128 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1129 Alignment, AddressSpace, CostKind,
1130 UseMaskForCond, UseMaskForGaps);
1131
1132 assert(Opcode == Instruction::Store && "Opcode must be a store");
1133 // For an interleaving store of 2 vectors, we perform one large interleaving
1134 // shuffle that goes into the wide store
1135 auto Mask = createInterleaveMask(VF, Factor);
1136 InstructionCost ShuffleCost =
1138 CostKind, 0, nullptr, {});
1139 return MemCost + ShuffleCost;
1140}
1141
1145
1146 bool IsLoad = MICA.getID() == Intrinsic::masked_gather ||
1147 MICA.getID() == Intrinsic::vp_gather;
1148 unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
1149 Type *DataTy = MICA.getDataType();
1150 Align Alignment = MICA.getAlignment();
1151 const Instruction *I = MICA.getInst();
1154
1155 if ((Opcode == Instruction::Load &&
1156 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
1157 (Opcode == Instruction::Store &&
1158 !isLegalMaskedScatter(DataTy, Align(Alignment))))
1160
1161 // Cost is proportional to the number of memory operations implied. For
1162 // scalable vectors, we use an estimate of that number since we don't
1163 // know exactly what VL will be.
1164 auto &VTy = *cast<VectorType>(DataTy);
1165 InstructionCost MemOpCost =
1166 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1167 {TTI::OK_AnyValue, TTI::OP_None}, I);
1168 unsigned NumLoads = getEstimatedVLFor(&VTy);
1169 return NumLoads * MemOpCost;
1170}
1171
1173 const MemIntrinsicCostAttributes &MICA,
1175 unsigned Opcode = MICA.getID() == Intrinsic::masked_expandload
1176 ? Instruction::Load
1177 : Instruction::Store;
1178 Type *DataTy = MICA.getDataType();
1179 bool VariableMask = MICA.getVariableMask();
1180 Align Alignment = MICA.getAlignment();
1181 bool IsLegal = (Opcode == Instruction::Store &&
1182 isLegalMaskedCompressStore(DataTy, Alignment)) ||
1183 (Opcode == Instruction::Load &&
1184 isLegalMaskedExpandLoad(DataTy, Alignment));
1185 if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
1187 // Example compressstore sequence:
1188 // vsetivli zero, 8, e32, m2, ta, ma (ignored)
1189 // vcompress.vm v10, v8, v0
1190 // vcpop.m a1, v0
1191 // vsetvli zero, a1, e32, m2, ta, ma
1192 // vse32.v v10, (a0)
1193 // Example expandload sequence:
1194 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
1195 // vcpop.m a1, v0
1196 // vsetvli zero, a1, e32, m2, ta, ma
1197 // vle32.v v10, (a0)
1198 // vsetivli zero, 8, e32, m2, ta, ma
1199 // viota.m v12, v0
1200 // vrgather.vv v8, v10, v12, v0.t
1201 auto MemOpCost =
1202 getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
1203 auto LT = getTypeLegalizationCost(DataTy);
1204 SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
1205 if (VariableMask)
1206 Opcodes.push_back(RISCV::VCPOP_M);
1207 if (Opcode == Instruction::Store)
1208 Opcodes.append({RISCV::VCOMPRESS_VM});
1209 else
1210 Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
1211 return MemOpCost +
1212 LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1213}
1214
1218
1219 unsigned Opcode = MICA.getID() == Intrinsic::experimental_vp_strided_load
1220 ? Instruction::Load
1221 : Instruction::Store;
1222
1223 Type *DataTy = MICA.getDataType();
1224 Align Alignment = MICA.getAlignment();
1225 const Instruction *I = MICA.getInst();
1226
1227 if (!isLegalStridedLoadStore(DataTy, Alignment))
1229
1231 return TTI::TCC_Basic;
1232
1233 // Cost is proportional to the number of memory operations implied. For
1234 // scalable vectors, we use an estimate of that number since we don't
1235 // know exactly what VL will be.
1236 auto &VTy = *cast<VectorType>(DataTy);
1237 InstructionCost MemOpCost =
1238 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1239 {TTI::OK_AnyValue, TTI::OP_None}, I);
1240 unsigned NumLoads = getEstimatedVLFor(&VTy);
1241 return NumLoads * MemOpCost;
1242}
1243
1246 // FIXME: This is a property of the default vector convention, not
1247 // all possible calling conventions. Fixing that will require
1248 // some TTI API and SLP rework.
1251 for (auto *Ty : Tys) {
1252 if (!Ty->isVectorTy())
1253 continue;
1254 Align A = DL.getPrefTypeAlign(Ty);
1255 Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
1256 getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
1257 }
1258 return Cost;
1259}
1260
1261// Currently, these represent both throughput and codesize costs
1262// for the respective intrinsics. The costs in this table are simply
1263// instruction counts with the following adjustments made:
1264// * One vsetvli is considered free.
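// For example, the {Intrinsic::floor, MVT::f32, 9} entry below means a legal
// f32 vector floor is modeled as roughly nine instructions once the single
// free vsetvli is discounted.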
1266 {Intrinsic::floor, MVT::f32, 9},
1267 {Intrinsic::floor, MVT::f64, 9},
1268 {Intrinsic::ceil, MVT::f32, 9},
1269 {Intrinsic::ceil, MVT::f64, 9},
1270 {Intrinsic::trunc, MVT::f32, 7},
1271 {Intrinsic::trunc, MVT::f64, 7},
1272 {Intrinsic::round, MVT::f32, 9},
1273 {Intrinsic::round, MVT::f64, 9},
1274 {Intrinsic::roundeven, MVT::f32, 9},
1275 {Intrinsic::roundeven, MVT::f64, 9},
1276 {Intrinsic::rint, MVT::f32, 7},
1277 {Intrinsic::rint, MVT::f64, 7},
1278 {Intrinsic::nearbyint, MVT::f32, 9},
1279 {Intrinsic::nearbyint, MVT::f64, 9},
1280 {Intrinsic::bswap, MVT::i16, 3},
1281 {Intrinsic::bswap, MVT::i32, 12},
1282 {Intrinsic::bswap, MVT::i64, 31},
1283 {Intrinsic::vp_bswap, MVT::i16, 3},
1284 {Intrinsic::vp_bswap, MVT::i32, 12},
1285 {Intrinsic::vp_bswap, MVT::i64, 31},
1286 {Intrinsic::vp_fshl, MVT::i8, 7},
1287 {Intrinsic::vp_fshl, MVT::i16, 7},
1288 {Intrinsic::vp_fshl, MVT::i32, 7},
1289 {Intrinsic::vp_fshl, MVT::i64, 7},
1290 {Intrinsic::vp_fshr, MVT::i8, 7},
1291 {Intrinsic::vp_fshr, MVT::i16, 7},
1292 {Intrinsic::vp_fshr, MVT::i32, 7},
1293 {Intrinsic::vp_fshr, MVT::i64, 7},
1294 {Intrinsic::bitreverse, MVT::i8, 17},
1295 {Intrinsic::bitreverse, MVT::i16, 24},
1296 {Intrinsic::bitreverse, MVT::i32, 33},
1297 {Intrinsic::bitreverse, MVT::i64, 52},
1298 {Intrinsic::vp_bitreverse, MVT::i8, 17},
1299 {Intrinsic::vp_bitreverse, MVT::i16, 24},
1300 {Intrinsic::vp_bitreverse, MVT::i32, 33},
1301 {Intrinsic::vp_bitreverse, MVT::i64, 52},
1302 {Intrinsic::ctpop, MVT::i8, 12},
1303 {Intrinsic::ctpop, MVT::i16, 19},
1304 {Intrinsic::ctpop, MVT::i32, 20},
1305 {Intrinsic::ctpop, MVT::i64, 21},
1306 {Intrinsic::ctlz, MVT::i8, 19},
1307 {Intrinsic::ctlz, MVT::i16, 28},
1308 {Intrinsic::ctlz, MVT::i32, 31},
1309 {Intrinsic::ctlz, MVT::i64, 35},
1310 {Intrinsic::cttz, MVT::i8, 16},
1311 {Intrinsic::cttz, MVT::i16, 23},
1312 {Intrinsic::cttz, MVT::i32, 24},
1313 {Intrinsic::cttz, MVT::i64, 25},
1314 {Intrinsic::vp_ctpop, MVT::i8, 12},
1315 {Intrinsic::vp_ctpop, MVT::i16, 19},
1316 {Intrinsic::vp_ctpop, MVT::i32, 20},
1317 {Intrinsic::vp_ctpop, MVT::i64, 21},
1318 {Intrinsic::vp_ctlz, MVT::i8, 19},
1319 {Intrinsic::vp_ctlz, MVT::i16, 28},
1320 {Intrinsic::vp_ctlz, MVT::i32, 31},
1321 {Intrinsic::vp_ctlz, MVT::i64, 35},
1322 {Intrinsic::vp_cttz, MVT::i8, 16},
1323 {Intrinsic::vp_cttz, MVT::i16, 23},
1324 {Intrinsic::vp_cttz, MVT::i32, 24},
1325 {Intrinsic::vp_cttz, MVT::i64, 25},
1326};
1327
1331 auto *RetTy = ICA.getReturnType();
1332 switch (ICA.getID()) {
1333 case Intrinsic::lrint:
1334 case Intrinsic::llrint:
1335 case Intrinsic::lround:
1336 case Intrinsic::llround: {
1337 auto LT = getTypeLegalizationCost(RetTy);
1338 Type *SrcTy = ICA.getArgTypes().front();
1339 auto SrcLT = getTypeLegalizationCost(SrcTy);
1340 if (ST->hasVInstructions() && LT.second.isVector()) {
1342 unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType());
1343 unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType());
1344 if (LT.second.getVectorElementType() == MVT::bf16) {
1345 if (!ST->hasVInstructionsBF16Minimal())
1347 if (DstEltSz == 32)
1348 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
1349 else
1350 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
1351 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1352 !ST->hasVInstructionsF16()) {
1353 if (!ST->hasVInstructionsF16Minimal())
1355 if (DstEltSz == 32)
1356 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
1357 else
1358 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};
1359
1360 } else if (SrcEltSz > DstEltSz) {
1361 Ops = {RISCV::VFNCVT_X_F_W};
1362 } else if (SrcEltSz < DstEltSz) {
1363 Ops = {RISCV::VFWCVT_X_F_V};
1364 } else {
1365 Ops = {RISCV::VFCVT_X_F_V};
1366 }
1367
1368 // We need to use the source LMUL in the case of a narrowing op, and the
1369 // destination LMUL otherwise.
1370 if (SrcEltSz > DstEltSz)
1371 return SrcLT.first *
1372 getRISCVInstructionCost(Ops, SrcLT.second, CostKind);
1373 return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind);
1374 }
1375 break;
1376 }
1377 case Intrinsic::ceil:
1378 case Intrinsic::floor:
1379 case Intrinsic::trunc:
1380 case Intrinsic::rint:
1381 case Intrinsic::round:
1382 case Intrinsic::roundeven: {
1383 // These all use the same code.
1384 auto LT = getTypeLegalizationCost(RetTy);
1385 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1386 return LT.first * 8;
1387 break;
1388 }
1389 case Intrinsic::umin:
1390 case Intrinsic::umax:
1391 case Intrinsic::smin:
1392 case Intrinsic::smax: {
1393 auto LT = getTypeLegalizationCost(RetTy);
1394 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
1395 return LT.first;
1396
1397 if (ST->hasVInstructions() && LT.second.isVector()) {
1398 unsigned Op;
1399 switch (ICA.getID()) {
1400 case Intrinsic::umin:
1401 Op = RISCV::VMINU_VV;
1402 break;
1403 case Intrinsic::umax:
1404 Op = RISCV::VMAXU_VV;
1405 break;
1406 case Intrinsic::smin:
1407 Op = RISCV::VMIN_VV;
1408 break;
1409 case Intrinsic::smax:
1410 Op = RISCV::VMAX_VV;
1411 break;
1412 }
1413 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1414 }
1415 break;
1416 }
1417 case Intrinsic::sadd_sat:
1418 case Intrinsic::ssub_sat:
1419 case Intrinsic::uadd_sat:
1420 case Intrinsic::usub_sat: {
1421 auto LT = getTypeLegalizationCost(RetTy);
1422 if (ST->hasVInstructions() && LT.second.isVector()) {
1423 unsigned Op;
1424 switch (ICA.getID()) {
1425 case Intrinsic::sadd_sat:
1426 Op = RISCV::VSADD_VV;
1427 break;
1428 case Intrinsic::ssub_sat:
1429 Op = RISCV::VSSUBU_VV;
1430 break;
1431 case Intrinsic::uadd_sat:
1432 Op = RISCV::VSADDU_VV;
1433 break;
1434 case Intrinsic::usub_sat:
1435 Op = RISCV::VSSUBU_VV;
1436 break;
1437 }
1438 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1439 }
1440 break;
1441 }
1442 case Intrinsic::fma:
1443 case Intrinsic::fmuladd: {
1444 // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin
1445 auto LT = getTypeLegalizationCost(RetTy);
1446 if (ST->hasVInstructions() && LT.second.isVector())
1447 return LT.first *
1448 getRISCVInstructionCost(RISCV::VFMADD_VV, LT.second, CostKind);
1449 break;
1450 }
1451 case Intrinsic::fabs: {
1452 auto LT = getTypeLegalizationCost(RetTy);
1453 if (ST->hasVInstructions() && LT.second.isVector()) {
1454 // lui a0, 8
1455 // addi a0, a0, -1
1456 // vsetvli a1, zero, e16, m1, ta, ma
1457 // vand.vx v8, v8, a0
1458 // f16 with zvfhmin and bf16 with zvfbfmin
1459 if (LT.second.getVectorElementType() == MVT::bf16 ||
1460 (LT.second.getVectorElementType() == MVT::f16 &&
1461 !ST->hasVInstructionsF16()))
1462 return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
1463 CostKind) +
1464 2;
1465 else
1466 return LT.first *
1467 getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
1468 }
1469 break;
1470 }
1471 case Intrinsic::sqrt: {
1472 auto LT = getTypeLegalizationCost(RetTy);
1473 if (ST->hasVInstructions() && LT.second.isVector()) {
1476 MVT ConvType = LT.second;
1477 MVT FsqrtType = LT.second;
1478 // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
1479 // will be split.
1480 if (LT.second.getVectorElementType() == MVT::bf16) {
1481 if (LT.second == MVT::nxv32bf16) {
1482 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
1483 RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
1484 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1485 ConvType = MVT::nxv16f16;
1486 FsqrtType = MVT::nxv16f32;
1487 } else {
1488 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
1489 FsqrtOp = {RISCV::VFSQRT_V};
1490 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1491 }
1492 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1493 !ST->hasVInstructionsF16()) {
1494 if (LT.second == MVT::nxv32f16) {
1495 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
1496 RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
1497 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1498 ConvType = MVT::nxv16f16;
1499 FsqrtType = MVT::nxv16f32;
1500 } else {
1501 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
1502 FsqrtOp = {RISCV::VFSQRT_V};
1503 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1504 }
1505 } else {
1506 FsqrtOp = {RISCV::VFSQRT_V};
1507 }
1508
1509 return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) +
1510 getRISCVInstructionCost(ConvOp, ConvType, CostKind));
1511 }
1512 break;
1513 }
1514 case Intrinsic::cttz:
1515 case Intrinsic::ctlz:
1516 case Intrinsic::ctpop: {
1517 auto LT = getTypeLegalizationCost(RetTy);
1518 if (ST->hasStdExtZvbb() && LT.second.isVector()) {
1519 unsigned Op;
1520 switch (ICA.getID()) {
1521 case Intrinsic::cttz:
1522 Op = RISCV::VCTZ_V;
1523 break;
1524 case Intrinsic::ctlz:
1525 Op = RISCV::VCLZ_V;
1526 break;
1527 case Intrinsic::ctpop:
1528 Op = RISCV::VCPOP_V;
1529 break;
1530 }
1531 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1532 }
1533 break;
1534 }
1535 case Intrinsic::abs: {
1536 auto LT = getTypeLegalizationCost(RetTy);
1537 if (ST->hasVInstructions() && LT.second.isVector()) {
1538 // vrsub.vi v10, v8, 0
1539 // vmax.vv v8, v8, v10
1540 return LT.first *
1541 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
1542 LT.second, CostKind);
1543 }
1544 break;
1545 }
1546 case Intrinsic::get_active_lane_mask: {
1547 if (ST->hasVInstructions()) {
1548 Type *ExpRetTy = VectorType::get(
1549 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1550 auto LT = getTypeLegalizationCost(ExpRetTy);
1551
1552 // vid.v v8 // considered hoisted
1553 // vsaddu.vx v8, v8, a0
1554 // vmsltu.vx v0, v8, a1
1555 return LT.first *
1556 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
1557 LT.second, CostKind);
1558 }
1559 break;
1560 }
1561 // TODO: add more intrinsic
1562 case Intrinsic::stepvector: {
1563 auto LT = getTypeLegalizationCost(RetTy);
1564 // Legalisation of illegal types involves an `index' instruction plus
1565 // (LT.first - 1) vector adds.
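    // For example (illustrative), if RetTy legalizes to two registers
    // (LT.first == 2), the cost is one vid.v plus one vadd.vx for the second
    // half.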
1566 if (ST->hasVInstructions())
1567 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
1568 (LT.first - 1) *
1569 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
1570 return 1 + (LT.first - 1);
1571 }
1572 case Intrinsic::experimental_cttz_elts: {
1573 Type *ArgTy = ICA.getArgTypes()[0];
1574 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
1575 if (getTLI()->shouldExpandCttzElements(ArgType))
1576 break;
1577 InstructionCost Cost = getRISCVInstructionCost(
1578 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
1579
1580 // If zero_is_poison is false, then we will generate additional
1581 // cmp + select instructions to convert -1 to EVL.
1582 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
1583 if (ICA.getArgs().size() > 1 &&
1584 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
1585 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
1587 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
1589
1590 return Cost;
1591 }
1592 case Intrinsic::experimental_vp_splat: {
1593 auto LT = getTypeLegalizationCost(RetTy);
1594 // TODO: Lower i1 experimental_vp_splat
1595 if (!ST->hasVInstructions() || LT.second.getScalarType() == MVT::i1)
1597 return LT.first * getRISCVInstructionCost(LT.second.isFloatingPoint()
1598 ? RISCV::VFMV_V_F
1599 : RISCV::VMV_V_X,
1600 LT.second, CostKind);
1601 }
1602 case Intrinsic::experimental_vp_splice: {
1603 // To support type-based queries from the vectorizer, set the index to 0.
1604 // Note that the index only changes the cost from vslide.vx to vslide.vi,
1605 // and in the current implementation they have the same cost.
1607 cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
1609 }
1610 case Intrinsic::fptoui_sat:
1611 case Intrinsic::fptosi_sat: {
1613 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1614 Type *SrcTy = ICA.getArgTypes()[0];
1615
1616 auto SrcLT = getTypeLegalizationCost(SrcTy);
1617 auto DstLT = getTypeLegalizationCost(RetTy);
1618 if (!SrcTy->isVectorTy())
1619 break;
1620
1621 if (!SrcLT.first.isValid() || !DstLT.first.isValid())
1623
1624 Cost +=
1625 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
1626 RetTy, SrcTy, TTI::CastContextHint::None, CostKind);
1627
1628 // Handle NaN.
1629 // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1.
1630 // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0.
1631 Type *CondTy = RetTy->getWithNewBitWidth(1);
1632 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, SrcTy, CondTy,
1634 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1636 return Cost;
1637 }
1638 }
1639
1640 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1641 if (auto LT = getTypeLegalizationCost(RetTy);
1642 LT.second.isVector()) {
1643 MVT EltTy = LT.second.getVectorElementType();
1644 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1645 ICA.getID(), EltTy))
1646 return LT.first * Entry->Cost;
1647 }
1648 }
1649
1651}
1652
1655 const SCEV *Ptr,
1657 // Address computations for vector indexed load/store likely require an offset
1658 // and/or scaling.
1659 if (ST->hasVInstructions() && PtrTy->isVectorTy())
1660 return getArithmeticInstrCost(Instruction::Add, PtrTy, CostKind);
1661
1662 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1663}
1664
1666 Type *Src,
1669 const Instruction *I) const {
1670 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1671 if (!IsVectorType)
1672 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1673
1674 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1675 // For now, skip all fixed vector cost analysis when P extension is available
1676 // to avoid crashes in getMinRVVVectorSizeInBits()
1677 if (ST->enablePExtCodeGen() &&
1678 (isa<FixedVectorType>(Dst) || isa<FixedVectorType>(Src))) {
1679 return 1; // Treat as single instruction cost for now
1680 }
1681
1682 // FIXME: Need to compute legalizing cost for illegal types. The current
1683 // code handles only legal types and those which can be trivially
1684 // promoted to legal.
1685 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1686 Dst->getScalarSizeInBits() > ST->getELen())
1687 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1688
1689 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1690 assert(ISD && "Invalid opcode");
1691 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1692 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1693
1694 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1695 // The shared implementation doesn't model vector widening during legalization
1696 // and instead assumes scalarization. In order to scalarize an <N x i1>
1697 // vector, we need to extend/trunc to/from i8. If we don't special case
1698 // this, we can get an infinite recursion cycle.
1699 switch (ISD) {
1700 default:
1701 break;
1702 case ISD::SIGN_EXTEND:
1703 case ISD::ZERO_EXTEND:
1704 if (Src->getScalarSizeInBits() == 1) {
1705 // We do not use vsext/vzext to extend from mask vector.
1706 // Instead we use the following instructions to extend from mask vector:
1707 // vmv.v.i v8, 0
1708 // vmerge.vim v8, v8, -1, v0 (repeated per split)
1709 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1710 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1711 DstLT.second, CostKind) +
1712 DstLT.first - 1;
1713 }
1714 break;
1715 case ISD::TRUNCATE:
1716 if (Dst->getScalarSizeInBits() == 1) {
1717 // We do not use several vncvt to truncate to a mask vector, so we cannot
1718 // use PowDiff to calculate it.
1719 // Instead we use the following instructions to truncate to mask vector:
1720 // vand.vi v8, v8, 1
1721 // vmsne.vi v0, v8, 0
1722 return SrcLT.first *
1723 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1724 SrcLT.second, CostKind) +
1725 SrcLT.first - 1;
1726 }
1727 break;
1728 };
1729
1730 // Our actual lowering for the case where a wider legal type is available
1731 // uses promotion to the wider type. This is reflected in the result of
1732 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1733 // scalarized if the legalized Src and Dst are not equal sized.
1734 const DataLayout &DL = this->getDataLayout();
1735 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1736 !SrcLT.first.isValid() || !DstLT.first.isValid() ||
1737 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
1738 SrcLT.second.getSizeInBits()) ||
1739 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
1740 DstLT.second.getSizeInBits()) ||
1741 SrcLT.first > 1 || DstLT.first > 1)
1742 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1743
1744 // The split cost is handled by the base getCastInstrCost
1745 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1746
1747 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1748 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
1749 switch (ISD) {
1750 case ISD::SIGN_EXTEND:
1751 case ISD::ZERO_EXTEND: {
1752 if ((PowDiff < 1) || (PowDiff > 3))
1753 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1754 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1755 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1756 unsigned Op =
1757 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
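// Illustrative example (added note, not from the original source): an
// i8 -> i32 extend has PowDiff == log2(32) - log2(8) == 2 and therefore maps
// to a single vsext.vf4/vzext.vf4, while i8 -> i64 (PowDiff == 3) maps to
// vsext.vf8/vzext.vf8.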
1758 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1759 }
1760 case ISD::TRUNCATE:
1761 case ISD::FP_EXTEND:
1762 case ISD::FP_ROUND: {
1763 // Counts of narrow/widen instructions.
1764 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1765 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1766
1767 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1768 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1769 : RISCV::VFNCVT_F_F_W;
1770 InstructionCost Cost = 0;
1771 for (; SrcEltSize != DstEltSize;) {
1772 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1773 ? MVT::getIntegerVT(DstEltSize)
1774 : MVT::getFloatingPointVT(DstEltSize);
1775 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1776 DstEltSize =
1777 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1778 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1779 }
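// Illustrative example (added note, not from the original source): an
// f64 -> f16 fp_round takes two trips through the loop above and is costed as
// one vfncvt.f.f.w evaluated at the f16 element type plus one evaluated at the
// f32 element type.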
1780 return Cost;
1781 }
1782 case ISD::FP_TO_SINT:
1783 case ISD::FP_TO_UINT: {
1784 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1785 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1786 unsigned FWCVT =
1787 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1788 unsigned FNCVT =
1789 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1790 unsigned SrcEltSize = Src->getScalarSizeInBits();
1791 unsigned DstEltSize = Dst->getScalarSizeInBits();
1792 InstructionCost Cost = 0;
1793 if ((SrcEltSize == 16) &&
1794 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1795 // If the target only supports zvfhmin, or this is an fp16-to-i64
1796 // conversion, pre-widen to f32 and then convert f32 to integer.
1797 VectorType *VecF32Ty =
1798 VectorType::get(Type::getFloatTy(Dst->getContext()),
1799 cast<VectorType>(Dst)->getElementCount());
1800 std::pair<InstructionCost, MVT> VecF32LT =
1801 getTypeLegalizationCost(VecF32Ty);
1802 Cost +=
1803 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1804 VecF32LT.second, CostKind);
1805 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1806 return Cost;
1807 }
1808 if (DstEltSize == SrcEltSize)
1809 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1810 else if (DstEltSize > SrcEltSize)
1811 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1812 else { // (SrcEltSize > DstEltSize)
1813 // First do a narrowing conversion to an integer half the size, then
1814 // truncate if needed.
1815 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1816 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1817 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1818 if ((SrcEltSize / 2) > DstEltSize) {
1819 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1820 Cost +=
1821 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1822 }
1823 }
1824 return Cost;
1825 }
1826 case ISD::SINT_TO_FP:
1827 case ISD::UINT_TO_FP: {
1828 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
1829 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
1830 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
1831 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
1832 unsigned SrcEltSize = Src->getScalarSizeInBits();
1833 unsigned DstEltSize = Dst->getScalarSizeInBits();
1834
1835 InstructionCost Cost = 0;
1836 if ((DstEltSize == 16) &&
1837 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
1838 // If the target only supports zvfhmin, or this is an i64-to-fp16
1839 // conversion, the value is converted to f32 and then narrowed to f16.
1840 VectorType *VecF32Ty =
1841 VectorType::get(Type::getFloatTy(Dst->getContext()),
1842 cast<VectorType>(Dst)->getElementCount());
1843 std::pair<InstructionCost, MVT> VecF32LT =
1844 getTypeLegalizationCost(VecF32Ty);
1845 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
1846 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
1847 DstLT.second, CostKind);
1848 return Cost;
1849 }
1850
1851 if (DstEltSize == SrcEltSize)
1852 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1853 else if (DstEltSize > SrcEltSize) {
1854 if ((DstEltSize / 2) > SrcEltSize) {
1855 VectorType *VecTy =
1856 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
1857 cast<VectorType>(Dst)->getElementCount());
1858 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
1859 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
1860 }
1861 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1862 } else
1863 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
1864 return Cost;
1865 }
1866 }
1867 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1868}
1869
1870unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) const {
1871 if (isa<ScalableVectorType>(Ty)) {
1872 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1873 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1874 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1875 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1876 }
1877 return cast<FixedVectorType>(Ty)->getNumElements();
1878}
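// Illustrative example (added note, not from the original source): for
// <vscale x 4 x i32> with getVScaleForTuning() == 2, VectorBits is
// 2 * 64 == 128 and the estimated VL works out to 8 elements.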
1879
1880InstructionCost
1881RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1882 FastMathFlags FMF,
1883 TTI::TargetCostKind CostKind) const {
1884 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1885 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1886
1887 // Skip if scalar size of Ty is bigger than ELEN.
1888 if (Ty->getScalarSizeInBits() > ST->getELen())
1889 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1890
1891 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1892 if (Ty->getElementType()->isIntegerTy(1)) {
1893 // SelectionDAGBuilder does following transforms:
1894 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1895 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
1896 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1897 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1898 else
1899 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1900 }
1901
1902 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1903 SmallVector<unsigned, 3> Opcodes;
1904 InstructionCost ExtraCost = 0;
1905 switch (IID) {
1906 case Intrinsic::maximum:
1907 if (FMF.noNaNs()) {
1908 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1909 } else {
1910 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1911 RISCV::VFMV_F_S};
1912 // Cost of Canonical Nan + branch
1913 // lui a0, 523264
1914 // fmv.w.x fa0, a0
1915 Type *DstTy = Ty->getScalarType();
1916 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1917 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1918 ExtraCost = 1 +
1919 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1920 TTI::CastContextHint::None, CostKind) +
1921 getCFInstrCost(Instruction::Br, CostKind);
1922 }
1923 break;
1924
1925 case Intrinsic::minimum:
1926 if (FMF.noNaNs()) {
1927 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1928 } else {
1929 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1930 RISCV::VFMV_F_S};
1931 // Cost of Canonical Nan + branch
1932 // lui a0, 523264
1933 // fmv.w.x fa0, a0
1934 Type *DstTy = Ty->getScalarType();
1935 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1936 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1937 ExtraCost = 1 +
1938 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1939 TTI::CastContextHint::None, CostKind) +
1940 getCFInstrCost(Instruction::Br, CostKind);
1941 }
1942 break;
1943 }
1944 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1945 }
1946
1947 // An IR reduction is composed of one RVV reduction instruction and a vmv.
1948 unsigned SplitOp;
1949 SmallVector<unsigned, 3> Opcodes;
1950 switch (IID) {
1951 default:
1952 llvm_unreachable("Unsupported intrinsic");
1953 case Intrinsic::smax:
1954 SplitOp = RISCV::VMAX_VV;
1955 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1956 break;
1957 case Intrinsic::smin:
1958 SplitOp = RISCV::VMIN_VV;
1959 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
1960 break;
1961 case Intrinsic::umax:
1962 SplitOp = RISCV::VMAXU_VV;
1963 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1964 break;
1965 case Intrinsic::umin:
1966 SplitOp = RISCV::VMINU_VV;
1967 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
1968 break;
1969 case Intrinsic::maxnum:
1970 SplitOp = RISCV::VFMAX_VV;
1971 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1972 break;
1973 case Intrinsic::minnum:
1974 SplitOp = RISCV::VFMIN_VV;
1975 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1976 break;
1977 }
1978 // Add a cost for data larger than LMUL8
1979 InstructionCost SplitCost =
1980 (LT.first > 1) ? (LT.first - 1) *
1981 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1982 : 0;
1983 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1984}
1985
1986InstructionCost
1987RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
1988 std::optional<FastMathFlags> FMF,
1989 TTI::TargetCostKind CostKind) const {
1990 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1991 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1992
1993 // Skip if scalar size of Ty is bigger than ELEN.
1994 if (Ty->getScalarSizeInBits() > ST->getELen())
1995 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1996
1997 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1998 assert(ISD && "Invalid opcode");
1999
2000 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
2001 ISD != ISD::FADD)
2002 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2003
2004 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2005 Type *ElementTy = Ty->getElementType();
2006 if (ElementTy->isIntegerTy(1)) {
2007 // Example sequences:
2008 // vfirst.m a0, v0
2009 // seqz a0, a0
2010 if (LT.second == MVT::v1i1)
2011 return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
2012 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2013 CmpInst::ICMP_EQ, CostKind);
2014
2015 if (ISD == ISD::AND) {
2016 // Example sequences:
2017 // vmand.mm v8, v9, v8 ; needed every time type is split
2018 // vmnot.m v8, v0 ; alias for vmnand
2019 // vcpop.m a0, v8
2020 // seqz a0, a0
2021
2022 // See the discussion: https://github.com/llvm/llvm-project/pull/119160
2023 // For LMUL <= 8, there is no splitting,
2024 // the sequences are vmnot, vcpop and seqz.
2025 // When LMUL > 8 and split = 1,
2026 // the sequences are vmnand, vcpop and seqz.
2027 // When LMUL > 8 and split > 1,
2028 // the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
2029 return ((LT.first > 2) ? (LT.first - 2) : 0) *
2030 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
2031 getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
2032 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2033 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2034 CmpInst::ICMP_EQ, CostKind);
2035 } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
2036 // Example sequences:
2037 // vsetvli a0, zero, e8, mf8, ta, ma
2038 // vmxor.mm v8, v0, v8 ; needed every time type is split
2039 // vcpop.m a0, v8
2040 // andi a0, a0, 1
2041 return (LT.first - 1) *
2042 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
2043 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
2044 } else {
2045 assert(ISD == ISD::OR);
2046 // Example sequences:
2047 // vsetvli a0, zero, e8, mf8, ta, ma
2048 // vmor.mm v8, v9, v8 ; needed every time type is split
2049 // vcpop.m a0, v0
2050 // snez a0, a0
2051 return (LT.first - 1) *
2052 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
2053 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2054 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2055 CmpInst::ICMP_NE, CostKind);
2056 }
2057 }
2058
2059 // An IR reduction of or/and is composed of one vmv and one RVV reduction
2060 // instruction; the other reductions are composed of two vmv and one RVV
2061 // reduction instruction.
2062 unsigned SplitOp;
2063 SmallVector<unsigned, 3> Opcodes;
2064 switch (ISD) {
2065 case ISD::ADD:
2066 SplitOp = RISCV::VADD_VV;
2067 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
2068 break;
2069 case ISD::OR:
2070 SplitOp = RISCV::VOR_VV;
2071 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
2072 break;
2073 case ISD::XOR:
2074 SplitOp = RISCV::VXOR_VV;
2075 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
2076 break;
2077 case ISD::AND:
2078 SplitOp = RISCV::VAND_VV;
2079 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
2080 break;
2081 case ISD::FADD:
2082 // We can't promote f16/bf16 fadd reductions.
2083 if ((LT.second.getScalarType() == MVT::f16 && !ST->hasVInstructionsF16()) ||
2084 LT.second.getScalarType() == MVT::bf16)
2085 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2086 if (TTI::requiresOrderedReduction(FMF)) {
2087 Opcodes.push_back(RISCV::VFMV_S_F);
2088 for (unsigned i = 0; i < LT.first.getValue(); i++)
2089 Opcodes.push_back(RISCV::VFREDOSUM_VS);
2090 Opcodes.push_back(RISCV::VFMV_F_S);
2091 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2092 }
2093 SplitOp = RISCV::VFADD_VV;
2094 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
2095 break;
2096 }
2097 // Add a cost for data larger than LMUL8
2098 InstructionCost SplitCost =
2099 (LT.first > 1) ? (LT.first - 1) *
2100 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2101 : 0;
2102 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2103}
2104
2105InstructionCost RISCVTTIImpl::getExtendedReductionCost(
2106 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
2107 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
2108 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2109 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2110 FMF, CostKind);
2111
2112 // Skip if scalar size of ResTy is bigger than ELEN.
2113 if (ResTy->getScalarSizeInBits() > ST->getELen())
2114 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2115 FMF, CostKind);
2116
2117 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
2118 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2119 FMF, CostKind);
2120
2121 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2122
2123 if (IsUnsigned && Opcode == Instruction::Add &&
2124 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
2125 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2126 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
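// Illustrative example (added note, not from the original source):
//   vector_reduce_add(zext <16 x i1> %m to <16 x i32>)
// is costed here as LT.first copies of a single vcpop.m on the mask.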
2127 return LT.first *
2128 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
2129 }
2130
2131 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
2132 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2133 FMF, CostKind);
2134
2135 return (LT.first - 1) +
2136 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2137}
2138
2139InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
2140 TTI::OperandValueInfo OpInfo,
2141 TTI::TargetCostKind CostKind) const {
2142 assert(OpInfo.isConstant() && "non constant operand?");
2143 if (!isa<VectorType>(Ty))
2144 // FIXME: We need to account for immediate materialization here, but doing
2145 // a decent job requires more knowledge about the immediate than we
2146 // currently have here.
2147 return 0;
2148
2149 if (OpInfo.isUniform())
2150 // vmv.v.i, vmv.v.x, or vfmv.v.f
2151 // We ignore the cost of the scalar constant materialization to be consistent
2152 // with how we treat scalar constants themselves just above.
2153 return 1;
2154
2155 return getConstantPoolLoadCost(Ty, CostKind);
2156}
2157
2158InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
2159 Align Alignment,
2160 unsigned AddressSpace,
2161 TTI::TargetCostKind CostKind,
2162 TTI::OperandValueInfo OpInfo,
2163 const Instruction *I) const {
2164 EVT VT = TLI->getValueType(DL, Src, true);
2165 // Type legalization can't handle structs
2166 if (VT == MVT::Other)
2167 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2168 CostKind, OpInfo, I);
2169
2170 InstructionCost Cost = 0;
2171 if (Opcode == Instruction::Store && OpInfo.isConstant())
2172 Cost += getStoreImmCost(Src, OpInfo, CostKind);
2173
2174 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
2175
2176 InstructionCost BaseCost = [&]() {
2177 InstructionCost Cost = LT.first;
2178 if (CostKind != TTI::TCK_RecipThroughput)
2179 return Cost;
2180
2181 // Our actual lowering for the case where a wider legal type is available
2182 // uses a VL-predicated load on the wider type. This is reflected in
2183 // the result of getTypeLegalizationCost, but BasicTTI assumes the
2184 // widened cases are scalarized.
2185 const DataLayout &DL = this->getDataLayout();
2186 if (Src->isVectorTy() && LT.second.isVector() &&
2187 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
2188 LT.second.getSizeInBits()))
2189 return Cost;
2190
2191 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2192 CostKind, OpInfo, I);
2193 }();
2194
2195 // Assume memory ops cost scale with the number of vector registers
2196 // possibly accessed by the instruction. Note that BasicTTI already
2197 // handles the LT.first term for us.
2198 if (ST->hasVInstructions() && LT.second.isVector() &&
2199 CostKind != TTI::TCK_CodeSize)
2200 BaseCost *= TLI->getLMULCost(LT.second);
2201 return Cost + BaseCost;
2202}
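// Illustrative note for getMemoryOpCost above (added, not from the original
// source): under TCK_RecipThroughput a legal LMUL=4 vector access has its
// BaseCost scaled by the m4 register-group cost, i.e. roughly four times the
// cost of an equivalent m1 access, assuming the default LMUL cost model.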
2203
2204InstructionCost RISCVTTIImpl::getCmpSelInstrCost(
2205 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
2206 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
2207 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
2208 if (CostKind != TTI::TCK_RecipThroughput)
2209 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2210 Op1Info, Op2Info, I);
2211
2212 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2213 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2214 Op1Info, Op2Info, I);
2215
2216 // Skip if scalar size of ValTy is bigger than ELEN.
2217 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
2218 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2219 Op1Info, Op2Info, I);
2220
2221 auto GetConstantMatCost =
2222 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
2223 if (OpInfo.isUniform())
2224 // We return 0 because we currently ignore the cost of materializing
2225 // scalar constants in GPRs.
2226 return 0;
2227
2228 return getConstantPoolLoadCost(ValTy, CostKind);
2229 };
2230
2231 InstructionCost ConstantMatCost;
2232 if (Op1Info.isConstant())
2233 ConstantMatCost += GetConstantMatCost(Op1Info);
2234 if (Op2Info.isConstant())
2235 ConstantMatCost += GetConstantMatCost(Op2Info);
2236
2237 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2238 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
2239 if (CondTy->isVectorTy()) {
2240 if (ValTy->getScalarSizeInBits() == 1) {
2241 // vmandn.mm v8, v8, v9
2242 // vmand.mm v9, v0, v9
2243 // vmor.mm v0, v9, v8
2244 return ConstantMatCost +
2245 LT.first *
2246 getRISCVInstructionCost(
2247 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2248 LT.second, CostKind);
2249 }
2250 // vselect and max/min are supported natively.
2251 return ConstantMatCost +
2252 LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
2253 CostKind);
2254 }
2255
2256 if (ValTy->getScalarSizeInBits() == 1) {
2257 // vmv.v.x v9, a0
2258 // vmsne.vi v9, v9, 0
2259 // vmandn.mm v8, v8, v9
2260 // vmand.mm v9, v0, v9
2261 // vmor.mm v0, v9, v8
2262 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
2263 return ConstantMatCost +
2264 LT.first *
2265 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
2266 InterimVT, CostKind) +
2267 LT.first * getRISCVInstructionCost(
2268 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2269 LT.second, CostKind);
2270 }
2271
2272 // vmv.v.x v10, a0
2273 // vmsne.vi v0, v10, 0
2274 // vmerge.vvm v8, v9, v8, v0
2275 return ConstantMatCost +
2276 LT.first * getRISCVInstructionCost(
2277 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
2278 LT.second, CostKind);
2279 }
2280
2281 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
2282 CmpInst::isIntPredicate(VecPred)) {
2283 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
2284 // provided they incur the same cost across all implementations
2285 return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
2286 LT.second,
2287 CostKind);
2288 }
2289
2290 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
2291 CmpInst::isFPPredicate(VecPred)) {
2292
2293 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
2294 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
2295 return ConstantMatCost +
2296 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
2297
2298 // If we do not support the input floating point vector type, use the base
2299 // one which will calculate as:
2300 // ScalarizeCost + Num * Cost for fixed vector,
2301 // InvalidCost for scalable vector.
2302 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
2303 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
2304 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
2305 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2306 Op1Info, Op2Info, I);
2307
2308 // Assuming vector fp compare and mask instructions are all the same cost
2309 // until a need arises to differentiate them.
2310 switch (VecPred) {
2311 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
2312 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
2313 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
2314 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
2315 return ConstantMatCost +
2316 LT.first * getRISCVInstructionCost(
2317 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
2318 LT.second, CostKind);
2319
2320 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
2321 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
2322 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
2323 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
2324 return ConstantMatCost +
2325 LT.first *
2326 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
2327 LT.second, CostKind);
2328
2329 case CmpInst::FCMP_OEQ: // vmfeq.vv
2330 case CmpInst::FCMP_OGT: // vmflt.vv
2331 case CmpInst::FCMP_OGE: // vmfle.vv
2332 case CmpInst::FCMP_OLT: // vmflt.vv
2333 case CmpInst::FCMP_OLE: // vmfle.vv
2334 case CmpInst::FCMP_UNE: // vmfne.vv
2335 return ConstantMatCost +
2336 LT.first *
2337 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
2338 default:
2339 break;
2340 }
2341 }
2342
2343 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
2344 // instructions will lower to SELECT_CC and lower to PseudoCCMOVGPR which will
2345 // generate a conditional branch + mv. The cost of scalar (icmp + select) will
2346 // be (0 + select instr cost).
2347 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
2348 ValTy->isIntegerTy() && !I->user_empty()) {
2349 if (all_of(I->users(), [&](const User *U) {
2350 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
2351 U->getType()->isIntegerTy() &&
2352 !isa<ConstantData>(U->getOperand(1)) &&
2353 !isa<ConstantData>(U->getOperand(2));
2354 }))
2355 return 0;
2356 }
2357
2358 // TODO: Add cost for scalar type.
2359
2360 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2361 Op1Info, Op2Info, I);
2362}
2363
2364InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
2365 TTI::TargetCostKind CostKind,
2366 const Instruction *I) const {
2367 if (CostKind != TTI::TCK_RecipThroughput)
2368 return Opcode == Instruction::PHI ? 0 : 1;
2369 // Branches are assumed to be predicted.
2370 return 0;
2371}
2372
2373InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
2374 TTI::TargetCostKind CostKind,
2375 unsigned Index,
2376 const Value *Op0,
2377 const Value *Op1) const {
2378 assert(Val->isVectorTy() && "This must be a vector type");
2379
2380 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
2381 // For now, skip all fixed vector cost analysis when P extension is available
2382 // to avoid crashes in getMinRVVVectorSizeInBits()
2383 if (ST->enablePExtCodeGen() && isa<FixedVectorType>(Val)) {
2384 return 1; // Treat as single instruction cost for now
2385 }
2386
2387 if (Opcode != Instruction::ExtractElement &&
2388 Opcode != Instruction::InsertElement)
2389 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
2390
2391 // Legalize the type.
2392 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2393
2394 // This type is legalized to a scalar type.
2395 if (!LT.second.isVector()) {
2396 auto *FixedVecTy = cast<FixedVectorType>(Val);
2397 // If Index is a known constant, cost is zero.
2398 if (Index != -1U)
2399 return 0;
2400 // Extract/InsertElement with non-constant index is very costly when
2401 // scalarized; estimate cost of loads/stores sequence via the stack:
2402 // ExtractElement cost: store vector to stack, load scalar;
2403 // InsertElement cost: store vector to stack, store scalar, load vector.
2404 Type *ElemTy = FixedVecTy->getElementType();
2405 auto NumElems = FixedVecTy->getNumElements();
2406 auto Align = DL.getPrefTypeAlign(ElemTy);
2407 InstructionCost LoadCost =
2408 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
2409 InstructionCost StoreCost =
2410 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
2411 return Opcode == Instruction::ExtractElement
2412 ? StoreCost * NumElems + LoadCost
2413 : (StoreCost + LoadCost) * NumElems + StoreCost;
2414 }
2415
2416 // For unsupported scalable vector.
2417 if (LT.second.isScalableVector() && !LT.first.isValid())
2418 return LT.first;
2419
2420 // Mask vector extract/insert is expanded via e8.
2421 if (Val->getScalarSizeInBits() == 1) {
2422 VectorType *WideTy =
2423 VectorType::get(IntegerType::get(Val->getContext(), 8),
2424 cast<VectorType>(Val)->getElementCount());
2425 if (Opcode == Instruction::ExtractElement) {
2426 InstructionCost ExtendCost
2427 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2428 TTI::CastContextHint::None, CostKind);
2429 InstructionCost ExtractCost
2430 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2431 return ExtendCost + ExtractCost;
2432 }
2433 InstructionCost ExtendCost
2434 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2435 TTI::CastContextHint::None, CostKind);
2436 InstructionCost InsertCost
2437 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2438 InstructionCost TruncCost
2439 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
2440 TTI::CastContextHint::None, CostKind);
2441 return ExtendCost + InsertCost + TruncCost;
2442 }
2443
2444
2445 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
2446 // and vslideup + vmv.s.x to insert element to vector.
2447 unsigned BaseCost = 1;
2448 // For insertelement we also need an addi to add 1 to the index fed to vslideup.
2449 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
2450
2451 if (Index != -1U) {
2452 // The type may be split. For fixed-width vectors we can normalize the
2453 // index to the new type.
2454 if (LT.second.isFixedLengthVector()) {
2455 unsigned Width = LT.second.getVectorNumElements();
2456 Index = Index % Width;
2457 }
2458
2459 // If exact VLEN is known, we will insert/extract into the appropriate
2460 // subvector with no additional subvector insert/extract cost.
2461 if (auto VLEN = ST->getRealVLen()) {
2462 unsigned EltSize = LT.second.getScalarSizeInBits();
2463 unsigned M1Max = *VLEN / EltSize;
2464 Index = Index % M1Max;
2465 }
2466
2467 if (Index == 0)
2468 // We can extract/insert the first element without vslidedown/vslideup.
2469 SlideCost = 0;
2470 else if (ST->hasVendorXRivosVisni() && isUInt<5>(Index) &&
2471 Val->getScalarType()->isIntegerTy())
2472 SlideCost = 0; // With ri.vinsert/ri.vextract there is no slide needed
2473 else if (Opcode == Instruction::InsertElement)
2474 SlideCost = 1; // With a constant index, we do not need to use addi.
2475 }
2476
2477 // When the vector needs to be split into multiple register groups and the
2478 // index exceeds a single vector register group, we need to insert/extract
2479 // the element via the stack.
2480 if (LT.first > 1 &&
2481 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
2482 LT.second.isScalableVector()))) {
2483 Type *ScalarType = Val->getScalarType();
2484 Align VecAlign = DL.getPrefTypeAlign(Val);
2485 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
2486 // Extra addi for unknown index.
2487 InstructionCost IdxCost = Index == -1U ? 1 : 0;
2488
2489 // Store all split vectors into stack and load the target element.
2490 if (Opcode == Instruction::ExtractElement)
2491 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2492 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
2493 CostKind) +
2494 IdxCost;
2495
2496 // Store all split vectors into stack and store the target element and load
2497 // vectors back.
2498 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2499 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2500 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2501 CostKind) +
2502 IdxCost;
2503 }
2504
2505 // Extracting an i64 element on a target with XLEN=32 needs more instructions.
2506 if (Val->getScalarType()->isIntegerTy() &&
2507 ST->getXLen() < Val->getScalarSizeInBits()) {
2508 // For extractelement, we need the following instructions:
2509 // vsetivli zero, 1, e64, m1, ta, mu (not count)
2510 // vslidedown.vx v8, v8, a0
2511 // vmv.x.s a0, v8
2512 // li a1, 32
2513 // vsrl.vx v8, v8, a1
2514 // vmv.x.s a1, v8
2515
2516 // For insertelement, we need the following instructions:
2517 // vsetivli zero, 2, e32, m4, ta, mu (not count)
2518 // vmv.v.i v12, 0
2519 // vslide1up.vx v16, v12, a1
2520 // vslide1up.vx v12, v16, a0
2521 // addi a0, a2, 1
2522 // vsetvli zero, a0, e64, m4, tu, mu (not count)
2523 // vslideup.vx v8, v12, a2
2524
2525 // TODO: should we count these special vsetvlis?
2526 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2527 }
2528 return BaseCost + SlideCost;
2529}
2530
2534 unsigned Index) const {
2535 if (isa<FixedVectorType>(Val))
2537 Index);
2538
2539 // TODO: This code replicates what LoopVectorize.cpp used to do when asking
2540 // for the cost of extracting the last lane of a scalable vector. It probably
2541 // needs a more accurate cost.
2542 ElementCount EC = cast<VectorType>(Val)->getElementCount();
2543 assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
2544 return getVectorInstrCost(Opcode, Val, CostKind,
2545 EC.getKnownMinValue() - 1 - Index, nullptr,
2546 nullptr);
2547}
2548
2549InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
2550 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2551 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2552 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
2553
2554 // TODO: Handle more cost kinds.
2555 if (CostKind != TTI::TCK_RecipThroughput)
2556 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2557 Args, CxtI);
2558
2559 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2560 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2561 Args, CxtI);
2562
2563 // Skip if scalar size of Ty is bigger than ELEN.
2564 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
2565 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2566 Args, CxtI);
2567
2568 // Legalize the type.
2569 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2570
2571 // TODO: Handle scalar type.
2572 if (!LT.second.isVector())
2573 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2574 Args, CxtI);
2575
2576 // f16 with zvfhmin and bf16 will be promoted to f32.
2577 // FIXME: nxv32[b]f16 will be custom lowered and split.
2578 unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
2579 InstructionCost CastCost = 0;
2580 if ((LT.second.getVectorElementType() == MVT::f16 ||
2581 LT.second.getVectorElementType() == MVT::bf16) &&
2582 TLI->getOperationAction(ISDOpcode, LT.second) ==
2583 TargetLoweringBase::LegalizeAction::Promote) {
2584 MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
2585 Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
2586 Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2587 // Add cost of extending arguments
2588 CastCost += LT.first * Args.size() *
2589 getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
2590 TTI::CastContextHint::None, CostKind);
2591 // Add cost of truncating result
2592 CastCost +=
2593 LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
2594 TTI::CastContextHint::None, CostKind);
2595 // Compute cost of op in promoted type
2596 LT.second = PromotedVT;
2597 }
2598
2599 auto getConstantMatCost =
2600 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
2601 if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
2602 // Two sub-cases:
2603 // * Has a 5 bit immediate operand which can be splatted.
2604 // * Has a larger immediate which must be materialized in scalar register
2605 // We return 0 for both as we currently ignore the cost of materializing
2606 // scalar constants in GPRs.
2607 return 0;
2608
2609 return getConstantPoolLoadCost(Ty, CostKind);
2610 };
2611
2612 // Add the cost of materializing any constant vectors required.
2613 InstructionCost ConstantMatCost = 0;
2614 if (Op1Info.isConstant())
2615 ConstantMatCost += getConstantMatCost(0, Op1Info);
2616 if (Op2Info.isConstant())
2617 ConstantMatCost += getConstantMatCost(1, Op2Info);
2618
2619 unsigned Op;
2620 switch (ISDOpcode) {
2621 case ISD::ADD:
2622 case ISD::SUB:
2623 Op = RISCV::VADD_VV;
2624 break;
2625 case ISD::SHL:
2626 case ISD::SRL:
2627 case ISD::SRA:
2628 Op = RISCV::VSLL_VV;
2629 break;
2630 case ISD::AND:
2631 case ISD::OR:
2632 case ISD::XOR:
2633 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
2634 break;
2635 case ISD::MUL:
2636 case ISD::MULHS:
2637 case ISD::MULHU:
2638 Op = RISCV::VMUL_VV;
2639 break;
2640 case ISD::SDIV:
2641 case ISD::UDIV:
2642 Op = RISCV::VDIV_VV;
2643 break;
2644 case ISD::SREM:
2645 case ISD::UREM:
2646 Op = RISCV::VREM_VV;
2647 break;
2648 case ISD::FADD:
2649 case ISD::FSUB:
2650 Op = RISCV::VFADD_VV;
2651 break;
2652 case ISD::FMUL:
2653 Op = RISCV::VFMUL_VV;
2654 break;
2655 case ISD::FDIV:
2656 Op = RISCV::VFDIV_VV;
2657 break;
2658 case ISD::FNEG:
2659 Op = RISCV::VFSGNJN_VV;
2660 break;
2661 default:
2662 // Assuming all other instructions have the same cost until a need arises to
2663 // differentiate them.
2664 return CastCost + ConstantMatCost +
2665 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2666 Args, CxtI);
2667 }
2668
2669 InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
2670 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
2671 // ops are twice as expensive as integer ops. Do the same for vectors so
2672 // scalar floating point ops aren't cheaper than their vector equivalents.
2673 if (Ty->isFPOrFPVectorTy())
2674 InstrCost *= 2;
2675 return CastCost + ConstantMatCost + LT.first * InstrCost;
2676}
2677
2678// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
2679InstructionCost RISCVTTIImpl::getPointersChainCost(
2680 ArrayRef<const Value *> Ptrs, const Value *Base,
2681 const TTI::PointersChainInfo &Info, Type *AccessTy,
2682 TTI::TargetCostKind CostKind) const {
2683 InstructionCost Cost = 0;
2684 // In the basic model we take into account GEP instructions only
2685 // (although an alloca instruction, a value, constants and/or constant
2686 // expressions, PHIs, bitcasts ... whatever is allowed to be used as a
2687 // pointer may also appear here). Typically, if Base is not a GEP
2688 // instruction and all the pointers are relative to the same base address,
2689 // all the rest are either GEP instructions, PHIs, bitcasts or constants.
2690 // When we have the same base, we just calculate the cost of each non-Base
2691 // GEP as an ADD operation if any of its indices is a non-constant.
2692 // If there are no known dependencies between the pointers, the cost is
2693 // calculated as a sum of the costs of the GEP instructions.
2694 for (auto [I, V] : enumerate(Ptrs)) {
2695 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
2696 if (!GEP)
2697 continue;
2698 if (Info.isSameBase() && V != Base) {
2699 if (GEP->hasAllConstantIndices())
2700 continue;
2701 // If the chain is unit-stride and BaseReg + stride*i is a legal
2702 // addressing mode, then presume the base GEP is sitting around in a
2703 // register somewhere and check if we can fold the offset relative to
2704 // it.
2705 unsigned Stride = DL.getTypeStoreSize(AccessTy);
2706 if (Info.isUnitStride() &&
2707 isLegalAddressingMode(AccessTy,
2708 /* BaseGV */ nullptr,
2709 /* BaseOffset */ Stride * I,
2710 /* HasBaseReg */ true,
2711 /* Scale */ 0,
2712 GEP->getType()->getPointerAddressSpace()))
2713 continue;
2714 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
2715 {TTI::OK_AnyValue, TTI::OP_None},
2716 {TTI::OK_AnyValue, TTI::OP_None}, {});
2717 } else {
2718 SmallVector<const Value *> Indices(GEP->indices());
2719 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
2720 Indices, AccessTy, CostKind);
2721 }
2722 }
2723 return Cost;
2724}
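// Illustrative example for getPointersChainCost above (added, not from the
// original source): for a unit-stride chain of i32 loads p[0..3] sharing the
// same base, the byte offsets 0, 4, 8 and 12 typically fold into the reg+imm
// addressing mode, so the non-base GEPs contribute no extra cost here.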
2725
2726void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2727 TTI::UnrollingPreferences &UP,
2728 OptimizationRemarkEmitter *ORE) const {
2729 // TODO: More tuning on benchmarks and metrics with changes as needed
2730 // would apply to all settings below to enable performance.
2731
2732
2733 if (ST->enableDefaultUnroll())
2734 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2735
2736 // Enable Upper bound unrolling universally, not dependent upon the conditions
2737 // below.
2738 UP.UpperBound = true;
2739
2740 // Disable loop unrolling for Oz and Os.
2741 UP.OptSizeThreshold = 0;
2742 UP.PartialOptSizeThreshold = 0;
2743 if (L->getHeader()->getParent()->hasOptSize())
2744 return;
2745
2746 SmallVector<BasicBlock *, 4> ExitingBlocks;
2747 L->getExitingBlocks(ExitingBlocks);
2748 LLVM_DEBUG(dbgs() << "Loop has:\n"
2749 << "Blocks: " << L->getNumBlocks() << "\n"
2750 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2751
2752 // Only allow another exit other than the latch. This acts as an early exit
2753 // as it mirrors the profitability calculation of the runtime unroller.
2754 if (ExitingBlocks.size() > 2)
2755 return;
2756
2757 // Limit the CFG of the loop body for targets with a branch predictor.
2758 // Allowing 4 blocks permits if-then-else diamonds in the body.
2759 if (L->getNumBlocks() > 4)
2760 return;
2761
2762 // Scan the loop: don't unroll loops with calls as this could prevent
2763 // inlining. Don't unroll auto-vectorized loops either, though do allow
2764 // unrolling of the scalar remainder.
2765 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
2766 InstructionCost Cost = 0;
2767 for (auto *BB : L->getBlocks()) {
2768 for (auto &I : *BB) {
2769 // Both auto-vectorized loops and the scalar remainder have the
2770 // isvectorized attribute, so differentiate between them by the presence
2771 // of vector instructions.
2772 if (IsVectorized && I.getType()->isVectorTy())
2773 return;
2774
2775 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2776 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2777 if (!isLoweredToCall(F))
2778 continue;
2779 }
2780 return;
2781 }
2782
2783 SmallVector<const Value *> Operands(I.operand_values());
2784 Cost += getInstructionCost(&I, Operands,
2785 TargetTransformInfo::TCK_SizeAndLatency);
2786 }
2787 }
2788
2789 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2790
2791 UP.Partial = true;
2792 UP.Runtime = true;
2793 UP.UnrollRemainder = true;
2794 UP.UnrollAndJam = true;
2795
2796 // Forcing unrolling of small loops can be very useful because of the
2797 // branch-taken cost of the backedge.
2798 if (Cost < 12)
2799 UP.Force = true;
2800}
2801
2802void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2803 TTI::PeelingPreferences &PP) const {
2804 BaseT::getPeelingPreferences(L, SE, PP);
2805}
2806
2807bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
2808 MemIntrinsicInfo &Info) const {
2809 const DataLayout &DL = getDataLayout();
2810 Intrinsic::ID IID = Inst->getIntrinsicID();
2811 LLVMContext &C = Inst->getContext();
2812 bool HasMask = false;
2813
2814 auto getSegNum = [](const IntrinsicInst *II, unsigned PtrOperandNo,
2815 bool IsWrite) -> int64_t {
2816 if (auto *TarExtTy =
2817 dyn_cast<TargetExtType>(II->getArgOperand(0)->getType()))
2818 return TarExtTy->getIntParameter(0);
2819
2820 return 1;
2821 };
2822
2823 switch (IID) {
2824 case Intrinsic::riscv_vle_mask:
2825 case Intrinsic::riscv_vse_mask:
2826 case Intrinsic::riscv_vlseg2_mask:
2827 case Intrinsic::riscv_vlseg3_mask:
2828 case Intrinsic::riscv_vlseg4_mask:
2829 case Intrinsic::riscv_vlseg5_mask:
2830 case Intrinsic::riscv_vlseg6_mask:
2831 case Intrinsic::riscv_vlseg7_mask:
2832 case Intrinsic::riscv_vlseg8_mask:
2833 case Intrinsic::riscv_vsseg2_mask:
2834 case Intrinsic::riscv_vsseg3_mask:
2835 case Intrinsic::riscv_vsseg4_mask:
2836 case Intrinsic::riscv_vsseg5_mask:
2837 case Intrinsic::riscv_vsseg6_mask:
2838 case Intrinsic::riscv_vsseg7_mask:
2839 case Intrinsic::riscv_vsseg8_mask:
2840 HasMask = true;
2841 [[fallthrough]];
2842 case Intrinsic::riscv_vle:
2843 case Intrinsic::riscv_vse:
2844 case Intrinsic::riscv_vlseg2:
2845 case Intrinsic::riscv_vlseg3:
2846 case Intrinsic::riscv_vlseg4:
2847 case Intrinsic::riscv_vlseg5:
2848 case Intrinsic::riscv_vlseg6:
2849 case Intrinsic::riscv_vlseg7:
2850 case Intrinsic::riscv_vlseg8:
2851 case Intrinsic::riscv_vsseg2:
2852 case Intrinsic::riscv_vsseg3:
2853 case Intrinsic::riscv_vsseg4:
2854 case Intrinsic::riscv_vsseg5:
2855 case Intrinsic::riscv_vsseg6:
2856 case Intrinsic::riscv_vsseg7:
2857 case Intrinsic::riscv_vsseg8: {
2858 // Intrinsic interface:
2859 // riscv_vle(merge, ptr, vl)
2860 // riscv_vle_mask(merge, ptr, mask, vl, policy)
2861 // riscv_vse(val, ptr, vl)
2862 // riscv_vse_mask(val, ptr, mask, vl, policy)
2863 // riscv_vlseg#(merge, ptr, vl, sew)
2864 // riscv_vlseg#_mask(merge, ptr, mask, vl, policy, sew)
2865 // riscv_vsseg#(val, ptr, vl, sew)
2866 // riscv_vsseg#_mask(val, ptr, mask, vl, sew)
2867 bool IsWrite = Inst->getType()->isVoidTy();
2868 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
2869 // The results of segment loads are TargetExtType.
2870 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
2871 unsigned SEW =
2872 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
2873 ->getZExtValue();
2874 Ty = TarExtTy->getTypeParameter(0U);
2875 Ty = ScalableVectorType::get(
2876 IntegerType::get(C, SEW),
2877 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
2878 }
2879 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
2880 unsigned VLIndex = RVVIInfo->VLOperand;
2881 unsigned PtrOperandNo = VLIndex - 1 - HasMask;
2882 MaybeAlign Alignment =
2883 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
2884 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
2885 Value *Mask = ConstantInt::getTrue(MaskType);
2886 if (HasMask)
2887 Mask = Inst->getArgOperand(VLIndex - 1);
2888 Value *EVL = Inst->getArgOperand(VLIndex);
2889 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
2890 // RVV uses contiguous elements as a segment.
2891 if (SegNum > 1) {
2892 unsigned ElemSize = Ty->getScalarSizeInBits();
2893 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
2894 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
2895 }
2896 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
2897 Alignment, Mask, EVL);
2898 return true;
2899 }
2900 case Intrinsic::riscv_vlse_mask:
2901 case Intrinsic::riscv_vsse_mask:
2902 case Intrinsic::riscv_vlsseg2_mask:
2903 case Intrinsic::riscv_vlsseg3_mask:
2904 case Intrinsic::riscv_vlsseg4_mask:
2905 case Intrinsic::riscv_vlsseg5_mask:
2906 case Intrinsic::riscv_vlsseg6_mask:
2907 case Intrinsic::riscv_vlsseg7_mask:
2908 case Intrinsic::riscv_vlsseg8_mask:
2909 case Intrinsic::riscv_vssseg2_mask:
2910 case Intrinsic::riscv_vssseg3_mask:
2911 case Intrinsic::riscv_vssseg4_mask:
2912 case Intrinsic::riscv_vssseg5_mask:
2913 case Intrinsic::riscv_vssseg6_mask:
2914 case Intrinsic::riscv_vssseg7_mask:
2915 case Intrinsic::riscv_vssseg8_mask:
2916 HasMask = true;
2917 [[fallthrough]];
2918 case Intrinsic::riscv_vlse:
2919 case Intrinsic::riscv_vsse:
2920 case Intrinsic::riscv_vlsseg2:
2921 case Intrinsic::riscv_vlsseg3:
2922 case Intrinsic::riscv_vlsseg4:
2923 case Intrinsic::riscv_vlsseg5:
2924 case Intrinsic::riscv_vlsseg6:
2925 case Intrinsic::riscv_vlsseg7:
2926 case Intrinsic::riscv_vlsseg8:
2927 case Intrinsic::riscv_vssseg2:
2928 case Intrinsic::riscv_vssseg3:
2929 case Intrinsic::riscv_vssseg4:
2930 case Intrinsic::riscv_vssseg5:
2931 case Intrinsic::riscv_vssseg6:
2932 case Intrinsic::riscv_vssseg7:
2933 case Intrinsic::riscv_vssseg8: {
2934 // Intrinsic interface:
2935 // riscv_vlse(merge, ptr, stride, vl)
2936 // riscv_vlse_mask(merge, ptr, stride, mask, vl, policy)
2937 // riscv_vsse(val, ptr, stride, vl)
2938 // riscv_vsse_mask(val, ptr, stride, mask, vl, policy)
2939 // riscv_vlsseg#(merge, ptr, offset, vl, sew)
2940 // riscv_vlsseg#_mask(merge, ptr, offset, mask, vl, policy, sew)
2941 // riscv_vssseg#(val, ptr, offset, vl, sew)
2942 // riscv_vssseg#_mask(val, ptr, offset, mask, vl, sew)
2943 bool IsWrite = Inst->getType()->isVoidTy();
2944 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
2945 // The results of segment loads are TargetExtType.
2946 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
2947 unsigned SEW =
2948 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
2949 ->getZExtValue();
2950 Ty = TarExtTy->getTypeParameter(0U);
2951 Ty = ScalableVectorType::get(
2952 IntegerType::get(C, SEW),
2953 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
2954 }
2955 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
2956 unsigned VLIndex = RVVIInfo->VLOperand;
2957 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
2958 MaybeAlign Alignment =
2959 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
2960
2961 Value *Stride = Inst->getArgOperand(PtrOperandNo + 1);
2962 // Use the pointer alignment as the element alignment if the stride is a
2963 // multiple of the pointer alignment. Otherwise, the element alignment
2964 // should be the greatest common divisor of pointer alignment and stride.
2965 // For simplicity, just treat the elements as unaligned in that case.
2966 unsigned PointerAlign = Alignment.valueOrOne().value();
2967 if (!isa<ConstantInt>(Stride) ||
2968 cast<ConstantInt>(Stride)->getZExtValue() % PointerAlign != 0)
2969 Alignment = Align(1);
2970
2971 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
2972 Value *Mask = ConstantInt::getTrue(MaskType);
2973 if (HasMask)
2974 Mask = Inst->getArgOperand(VLIndex - 1);
2975 Value *EVL = Inst->getArgOperand(VLIndex);
2976 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
2977 // RVV uses contiguous elements as a segment.
2978 if (SegNum > 1) {
2979 unsigned ElemSize = Ty->getScalarSizeInBits();
2980 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
2981 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
2982 }
2983 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
2984 Alignment, Mask, EVL, Stride);
2985 return true;
2986 }
2987 case Intrinsic::riscv_vloxei_mask:
2988 case Intrinsic::riscv_vluxei_mask:
2989 case Intrinsic::riscv_vsoxei_mask:
2990 case Intrinsic::riscv_vsuxei_mask:
2991 case Intrinsic::riscv_vloxseg2_mask:
2992 case Intrinsic::riscv_vloxseg3_mask:
2993 case Intrinsic::riscv_vloxseg4_mask:
2994 case Intrinsic::riscv_vloxseg5_mask:
2995 case Intrinsic::riscv_vloxseg6_mask:
2996 case Intrinsic::riscv_vloxseg7_mask:
2997 case Intrinsic::riscv_vloxseg8_mask:
2998 case Intrinsic::riscv_vluxseg2_mask:
2999 case Intrinsic::riscv_vluxseg3_mask:
3000 case Intrinsic::riscv_vluxseg4_mask:
3001 case Intrinsic::riscv_vluxseg5_mask:
3002 case Intrinsic::riscv_vluxseg6_mask:
3003 case Intrinsic::riscv_vluxseg7_mask:
3004 case Intrinsic::riscv_vluxseg8_mask:
3005 case Intrinsic::riscv_vsoxseg2_mask:
3006 case Intrinsic::riscv_vsoxseg3_mask:
3007 case Intrinsic::riscv_vsoxseg4_mask:
3008 case Intrinsic::riscv_vsoxseg5_mask:
3009 case Intrinsic::riscv_vsoxseg6_mask:
3010 case Intrinsic::riscv_vsoxseg7_mask:
3011 case Intrinsic::riscv_vsoxseg8_mask:
3012 case Intrinsic::riscv_vsuxseg2_mask:
3013 case Intrinsic::riscv_vsuxseg3_mask:
3014 case Intrinsic::riscv_vsuxseg4_mask:
3015 case Intrinsic::riscv_vsuxseg5_mask:
3016 case Intrinsic::riscv_vsuxseg6_mask:
3017 case Intrinsic::riscv_vsuxseg7_mask:
3018 case Intrinsic::riscv_vsuxseg8_mask:
3019 HasMask = true;
3020 [[fallthrough]];
3021 case Intrinsic::riscv_vloxei:
3022 case Intrinsic::riscv_vluxei:
3023 case Intrinsic::riscv_vsoxei:
3024 case Intrinsic::riscv_vsuxei:
3025 case Intrinsic::riscv_vloxseg2:
3026 case Intrinsic::riscv_vloxseg3:
3027 case Intrinsic::riscv_vloxseg4:
3028 case Intrinsic::riscv_vloxseg5:
3029 case Intrinsic::riscv_vloxseg6:
3030 case Intrinsic::riscv_vloxseg7:
3031 case Intrinsic::riscv_vloxseg8:
3032 case Intrinsic::riscv_vluxseg2:
3033 case Intrinsic::riscv_vluxseg3:
3034 case Intrinsic::riscv_vluxseg4:
3035 case Intrinsic::riscv_vluxseg5:
3036 case Intrinsic::riscv_vluxseg6:
3037 case Intrinsic::riscv_vluxseg7:
3038 case Intrinsic::riscv_vluxseg8:
3039 case Intrinsic::riscv_vsoxseg2:
3040 case Intrinsic::riscv_vsoxseg3:
3041 case Intrinsic::riscv_vsoxseg4:
3042 case Intrinsic::riscv_vsoxseg5:
3043 case Intrinsic::riscv_vsoxseg6:
3044 case Intrinsic::riscv_vsoxseg7:
3045 case Intrinsic::riscv_vsoxseg8:
3046 case Intrinsic::riscv_vsuxseg2:
3047 case Intrinsic::riscv_vsuxseg3:
3048 case Intrinsic::riscv_vsuxseg4:
3049 case Intrinsic::riscv_vsuxseg5:
3050 case Intrinsic::riscv_vsuxseg6:
3051 case Intrinsic::riscv_vsuxseg7:
3052 case Intrinsic::riscv_vsuxseg8: {
3053 // Intrinsic interface (only listed ordered version):
3054 // riscv_vloxei(merge, ptr, index, vl)
3055 // riscv_vloxei_mask(merge, ptr, index, mask, vl, policy)
3056 // riscv_vsoxei(val, ptr, index, vl)
3057 // riscv_vsoxei_mask(val, ptr, index, mask, vl, policy)
3058 // riscv_vloxseg#(merge, ptr, index, vl, sew)
3059 // riscv_vloxseg#_mask(merge, ptr, index, mask, vl, policy, sew)
3060 // riscv_vsoxseg#(val, ptr, index, vl, sew)
3061 // riscv_vsoxseg#_mask(val, ptr, index, mask, vl, sew)
3062 bool IsWrite = Inst->getType()->isVoidTy();
3063 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
3064 // The results of segment loads are TargetExtType.
3065 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
3066 unsigned SEW =
3067 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
3068 ->getZExtValue();
3069 Ty = TarExtTy->getTypeParameter(0U);
3070 Ty = ScalableVectorType::get(
3071 IntegerType::get(C, SEW),
3072 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
3073 }
3074 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
3075 unsigned VLIndex = RVVIInfo->VLOperand;
3076 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
3077 Value *Mask;
3078 if (HasMask) {
3079 Mask = Inst->getArgOperand(VLIndex - 1);
3080 } else {
3081 // Mask cannot be nullptr here: vector GEP produces <vscale x N x ptr>,
3082 // and casting that to scalar i64 triggers a vector/scalar mismatch
3083 // assertion in CreatePointerCast. Use an all-true mask so ASan lowers it
3084 // via extractelement instead.
3085 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3086 Mask = ConstantInt::getTrue(MaskType);
3087 }
3088 Value *EVL = Inst->getArgOperand(VLIndex);
3089 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3090 // RVV uses contiguous elements as a segment.
3091 if (SegNum > 1) {
3092 unsigned ElemSize = Ty->getScalarSizeInBits();
3093 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3094 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3095 }
3096 Value *OffsetOp = Inst->getArgOperand(PtrOperandNo + 1);
3097 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3098 Align(1), Mask, EVL,
3099 /* Stride */ nullptr, OffsetOp);
3100 return true;
3101 }
3102 }
3103 return false;
3104}
3105
3106unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) const {
3107 if (Ty->isVectorTy()) {
3108 // f16 with only zvfhmin and bf16 will be promoted to f32
3109 Type *EltTy = cast<VectorType>(Ty)->getElementType();
3110 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
3111 EltTy->isBFloatTy())
3112 Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
3113 cast<VectorType>(Ty));
3114
3115 TypeSize Size = DL.getTypeSizeInBits(Ty);
3116 if (Size.isScalable() && ST->hasVInstructions())
3117 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
3118
3119 if (ST->useRVVForFixedLengthVectors())
3120 return divideCeil(Size, ST->getRealMinVLen());
3121 }
3122
3123 return BaseT::getRegUsageForType(Ty);
3124}
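// Illustrative example for getRegUsageForType above (added, not from the
// original source): <vscale x 8 x i32> has a known minimum size of 256 bits,
// so with RVVBitsPerBlock == 64 this reports divideCeil(256, 64) == 4 vector
// registers.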
3125
3126unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
3127 if (SLPMaxVF.getNumOccurrences())
3128 return SLPMaxVF;
3129
3130 // Return how many elements can fit in getRegisterBitWidth. This is the
3131 // same routine as used in LoopVectorizer. We should probably be
3132 // accounting for whether we actually have instructions with the right
3133 // lane type, but we don't have enough information to do that without
3134 // some additional plumbing which hasn't been justified yet.
3135 TypeSize RegWidth =
3136 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
3137 // If no vector registers, or absurd element widths, disable
3138 // vectorization by returning 1.
3139 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
3140}
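// Illustrative example for getMaximumVF above (added, not from the original
// source): with a 128-bit fixed-width vector register reported by
// getRegisterBitWidth, 32-bit elements give an SLP maximum VF of
// 128 / 32 == 4, unless the SLPMaxVF override is set.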
3141
3145
3147 return ST->enableUnalignedVectorMem();
3148}
3149
3150TTI::AddressingModeKind
3151RISCVTTIImpl::getPreferredAddressingMode(const Loop *L,
3152 ScalarEvolution *SE) const {
3153 if (ST->hasVendorXCVmem() && !ST->is64Bit())
3154 return TTI::AMK_PostIndexed;
3155
3157}
3158
3159bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
3160 const TargetTransformInfo::LSRCost &C2) const {
3161 // The RISC-V-specific policy here is "instruction count has 1st priority".
3162 // If we need to emit adds inside the loop to add up base registers, then
3163 // we need at least one extra temporary register.
3164 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
3165 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
3166 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
3167 C1.NumIVMuls, C1.NumBaseAdds,
3168 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
3169 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
3170 C2.NumIVMuls, C2.NumBaseAdds,
3171 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
3172}
3173
3174bool RISCVTTIImpl::isLegalMaskedExpandLoad(Type *DataTy,
3175 Align Alignment) const {
3176 auto *VTy = dyn_cast<VectorType>(DataTy);
3177 if (!VTy || VTy->isScalableTy())
3178 return false;
3179
3180 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3181 return false;
3182
3183 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
3184 // scalarize these types with LMUL >= maximum fixed-length LMUL.
3185 if (VTy->getElementType()->isIntegerTy(8))
3186 if (VTy->getElementCount().getFixedValue() > 256)
3187 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
3188 ST->getMaxLMULForFixedLengthVectors();
3189 return true;
3190}
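// Worked example for the i8 special case above, assuming a real minimum VLEN
// of 128 and a maximum fixed-length LMUL of 8:
//   <512 x i8> -> 4096 bits / 128 = LMUL 32 -> 32 < 8 is false -> rejected
//   <320 x i8> -> 2560 bits / 128 = LMUL 20 -> 20 < 8 is false -> rejected
// On a wider implementation (say VLEN = 1024), <512 x i8> only needs LMUL 4,
// so the same type is accepted.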
3191
3192bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy,
3193 Align Alignment) const {
3194 auto *VTy = dyn_cast<VectorType>(DataTy);
3195 if (!VTy || VTy->isScalableTy())
3196 return false;
3197
3198 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3199 return false;
3200 return true;
3201}
3202
3203/// See if \p I should be considered for address type promotion. We check if \p
3204/// I is a sext with the right type that is used in memory accesses. If it is
3205/// used in a "complex" getelementptr, we allow it to be promoted without
3206/// finding other sext instructions that sign extended the same initial value.
3207/// A getelementptr is considered "complex" if it has more than 2 operands.
3208bool RISCVTTIImpl::shouldConsiderAddressTypePromotion(
3209 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
3210 bool Considerable = false;
3211 AllowPromotionWithoutCommonHeader = false;
3212 if (!isa<SExtInst>(&I))
3213 return false;
3214 Type *ConsideredSExtType =
3215 Type::getInt64Ty(I.getParent()->getParent()->getContext());
3216 if (I.getType() != ConsideredSExtType)
3217 return false;
3218 // See if the sext is the one with the right type and used in at least one
3219 // GetElementPtrInst.
3220 for (const User *U : I.users()) {
3221 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
3222 Considerable = true;
3223 // A getelementptr is considered as "complex" if it has more than 2
3224 // operands. We will promote a SExt used in such complex GEP as we
3225 // expect some computation to be merged if they are done on 64 bits.
3226 if (GEPInst->getNumOperands() > 2) {
3227 AllowPromotionWithoutCommonHeader = true;
3228 break;
3229 }
3230 }
3231 }
3232 return Considerable;
3233}
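// Illustrative IR (hypothetical values) for the "complex" GEP case above: the
// sext feeds a getelementptr with more than two operands, so promotion is
// allowed without a common header.
//   %idx.ext = sext i32 %idx to i64
//   %p = getelementptr [16 x i32], ptr %base, i64 0, i64 %idx.ext ; 3 operands
// A plain "getelementptr i32, ptr %base, i64 %idx.ext" has only two operands,
// so it is still considerable, but only together with other sexts of %idx.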
3234
3235bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
3236 switch (Opcode) {
3237 case Instruction::Add:
3238 case Instruction::Sub:
3239 case Instruction::Mul:
3240 case Instruction::And:
3241 case Instruction::Or:
3242 case Instruction::Xor:
3243 case Instruction::FAdd:
3244 case Instruction::FSub:
3245 case Instruction::FMul:
3246 case Instruction::FDiv:
3247 case Instruction::ICmp:
3248 case Instruction::FCmp:
3249 return true;
3250 case Instruction::Shl:
3251 case Instruction::LShr:
3252 case Instruction::AShr:
3253 case Instruction::UDiv:
3254 case Instruction::SDiv:
3255 case Instruction::URem:
3256 case Instruction::SRem:
3257 case Instruction::Select:
3258 return Operand == 1;
3259 default:
3260 return false;
3261 }
3262}
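// Example of why shifts and divisions only accept a splat in operand 1: RVV
// provides vsll.vx, vsrl.vx, vdiv.vx, etc., where the scalar register is the
// second source, but no form where the value being shifted or divided comes
// from a scalar register, so a splat in operand 0 must stay a vector.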
3263
3264bool RISCVTTIImpl::canSplatOperand(Instruction *I, int Operand) const {
3265 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3266 return false;
3267
3268 if (canSplatOperand(I->getOpcode(), Operand))
3269 return true;
3270
3271 auto *II = dyn_cast<IntrinsicInst>(I);
3272 if (!II)
3273 return false;
3274
3275 switch (II->getIntrinsicID()) {
3276 case Intrinsic::fma:
3277 case Intrinsic::vp_fma:
3278 case Intrinsic::fmuladd:
3279 case Intrinsic::vp_fmuladd:
3280 return Operand == 0 || Operand == 1;
3281 case Intrinsic::vp_shl:
3282 case Intrinsic::vp_lshr:
3283 case Intrinsic::vp_ashr:
3284 case Intrinsic::vp_udiv:
3285 case Intrinsic::vp_sdiv:
3286 case Intrinsic::vp_urem:
3287 case Intrinsic::vp_srem:
3288 case Intrinsic::ssub_sat:
3289 case Intrinsic::vp_ssub_sat:
3290 case Intrinsic::usub_sat:
3291 case Intrinsic::vp_usub_sat:
3292 case Intrinsic::vp_select:
3293 return Operand == 1;
3294 // These intrinsics are commutative.
3295 case Intrinsic::vp_add:
3296 case Intrinsic::vp_mul:
3297 case Intrinsic::vp_and:
3298 case Intrinsic::vp_or:
3299 case Intrinsic::vp_xor:
3300 case Intrinsic::vp_fadd:
3301 case Intrinsic::vp_fmul:
3302 case Intrinsic::vp_icmp:
3303 case Intrinsic::vp_fcmp:
3304 case Intrinsic::smin:
3305 case Intrinsic::vp_smin:
3306 case Intrinsic::umin:
3307 case Intrinsic::vp_umin:
3308 case Intrinsic::smax:
3309 case Intrinsic::vp_smax:
3310 case Intrinsic::umax:
3311 case Intrinsic::vp_umax:
3312 case Intrinsic::sadd_sat:
3313 case Intrinsic::vp_sadd_sat:
3314 case Intrinsic::uadd_sat:
3315 case Intrinsic::vp_uadd_sat:
3316 // These intrinsics have 'vr' versions.
3317 case Intrinsic::vp_sub:
3318 case Intrinsic::vp_fsub:
3319 case Intrinsic::vp_fdiv:
3320 return Operand == 0 || Operand == 1;
3321 default:
3322 return false;
3323 }
3324}
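// Example for the "'vr' versions" note above: vp.sub tolerates a splat in
// either position because (sub (splat x), v) can select vrsub.vx while
// (sub v, (splat x)) selects vsub.vx; vfrsub.vf and vfrdiv.vf play the same
// role for vp.fsub and vp.fdiv.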
3325
3326/// Check if sinking \p I's operands to I's basic block is profitable, because
3327/// the operands can be folded into a target instruction, e.g.
3328/// splats of scalars can fold into vector instructions.
3329bool RISCVTTIImpl::isProfitableToSinkOperands(
3330 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
3331 using namespace llvm::PatternMatch;
3332
3333 if (I->isBitwiseLogicOp()) {
3334 if (!I->getType()->isVectorTy()) {
3335 if (ST->hasStdExtZbb() || ST->hasStdExtZbkb()) {
3336 for (auto &Op : I->operands()) {
3337 // (and/or/xor X, (not Y)) -> (andn/orn/xnor X, Y)
3338 if (match(Op.get(), m_Not(m_Value()))) {
3339 Ops.push_back(&Op);
3340 return true;
3341 }
3342 }
3343 }
3344 } else if (I->getOpcode() == Instruction::And && ST->hasStdExtZvkb()) {
3345 for (auto &Op : I->operands()) {
3346 // (and X, (not Y)) -> (vandn.vv X, Y)
3347 if (match(Op.get(), m_Not(m_Value()))) {
3348 Ops.push_back(&Op);
3349 return true;
3350 }
3351 // (and X, (splat (not Y))) -> (vandn.vx X, Y)
3352 if (match(Op.get(), m_Shuffle(m_InsertElt(m_Value(), m_Not(m_Value()),
3353 m_ZeroInt()),
3354 m_Value(), m_ZeroMask()))) {
3355 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
3356 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
3357 Ops.push_back(&Not);
3358 Ops.push_back(&InsertElt);
3359 Ops.push_back(&Op);
3360 return true;
3361 }
3362 }
3363 }
3364 }
3365
3366 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3367 return false;
3368
3369 // Don't sink splat operands if the target prefers not to. Some targets
3370 // require S2V transfer buffers, and we can run out of them copying the same
3371 // value repeatedly.
3372 // FIXME: It could still be worth doing if it would improve vector register
3373 // pressure and prevent a vector spill.
3374 if (!ST->sinkSplatOperands())
3375 return false;
3376
3377 for (auto OpIdx : enumerate(I->operands())) {
3378 if (!canSplatOperand(I, OpIdx.index()))
3379 continue;
3380
3381 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
3382 // Make sure we are not already sinking this operand
3383 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
3384 continue;
3385
3386 // We are looking for a splat/vp.splat that can be sunk.
3387 bool IsVPSplat = match(Op, m_Intrinsic<Intrinsic::experimental_vp_splat>(
3388 m_Value(), m_Value(), m_Value()));
3389 if (!IsVPSplat &&
3390 !match(Op, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
3391 m_Value(), m_ZeroMask())))
3392 continue;
3393
3394 // Don't sink i1 splats.
3395 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
3396 continue;
3397
3398 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
3399 // and vector registers
3400 for (Use &U : Op->uses()) {
3401 Instruction *Insn = cast<Instruction>(U.getUser());
3402 if (!canSplatOperand(Insn, U.getOperandNo()))
3403 return false;
3404 }
3405
3406 // Sink any fpexts since they might be used in a widening fp pattern.
3407 if (IsVPSplat) {
3408 if (isa<FPExtInst>(Op->getOperand(0)))
3409 Ops.push_back(&Op->getOperandUse(0));
3410 } else {
3411 Use *InsertEltUse = &Op->getOperandUse(0);
3412 auto *InsertElt = cast<InsertElementInst>(InsertEltUse);
3413 if (isa<FPExtInst>(InsertElt->getOperand(1)))
3414 Ops.push_back(&InsertElt->getOperandUse(1));
3415 Ops.push_back(InsertEltUse);
3416 }
3417 Ops.push_back(&OpIdx.value());
3418 }
3419 return true;
3420}
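// Illustrative IR (hypothetical values) for the splat pattern handled above:
//   %ins   = insertelement <vscale x 4 x i32> poison, i32 %x, i64 0
//   %splat = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> poison,
//                          <vscale x 4 x i32> zeroinitializer
//   %r     = add <vscale x 4 x i32> %v, %splat
// Sinking %ins and %splat next to the add lets instruction selection fold the
// whole sequence into vadd.vx, keeping %x in a scalar register instead of
// broadcasting it in the defining block.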
3421
3422TTI::MemCmpExpansionOptions
3423RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3424 TTI::MemCmpExpansionOptions Options;
3425 // TODO: Enable expansion when unaligned access is not supported after we fix
3426 // issues in ExpandMemcmp.
3427 if (!ST->enableUnalignedScalarMem())
3428 return Options;
3429
3430 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
3431 return Options;
3432
3433 Options.AllowOverlappingLoads = true;
3434 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3435 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3436 if (ST->is64Bit()) {
3437 Options.LoadSizes = {8, 4, 2, 1};
3438 Options.AllowedTailExpansions = {3, 5, 6};
3439 } else {
3440 Options.LoadSizes = {4, 2, 1};
3441 Options.AllowedTailExpansions = {3};
3442 }
3443
3444 if (IsZeroCmp && ST->hasVInstructions()) {
3445 unsigned VLenB = ST->getRealMinVLen() / 8;
3446 // The minimum size should be `XLen / 8 + 1`, and the maximum size should be
3447 // `VLenB * MaxLMUL` so that it fits in a single register group.
3448 unsigned MinSize = ST->getXLen() / 8 + 1;
3449 unsigned MaxSize = VLenB * ST->getMaxLMULForFixedLengthVectors();
3450 for (unsigned Size = MinSize; Size <= MaxSize; Size++)
3451 Options.LoadSizes.insert(Options.LoadSizes.begin(), Size);
3452 }
3453 return Options;
3454}
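// Worked example, assuming RV64 with a real minimum VLEN of 128 and a maximum
// fixed-length LMUL of 8: the scalar path offers load sizes {8, 4, 2, 1} with
// overlapping loads and 3/5/6-byte tail expansions; for equality-only
// comparisons the vector path additionally accepts every size from
// XLen / 8 + 1 = 9 bytes up to VLenB * MaxLMUL = 16 * 8 = 128 bytes, i.e.
// whatever still fits in a single register group.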