LLVM 23.0.0git
RISCVTargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
18#include "llvm/IR/IntrinsicsRISCV.h"
20#include <cmath>
21#include <optional>
22using namespace llvm;
23using namespace llvm::PatternMatch;
24
25#define DEBUG_TYPE "riscvtti"
26
28 "riscv-v-register-bit-width-lmul",
30 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
31 "by autovectorized code. Fractional LMULs are not supported."),
33
35 "riscv-v-slp-max-vf",
37 "Overrides result used for getMaximumVF query which is used "
38 "exclusively by SLP vectorizer."),
40
42 RVVMinTripCount("riscv-v-min-trip-count",
43 cl::desc("Set the lower bound of a trip count to decide on "
44 "vectorization while tail-folding."),
46
48RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
50 // Check if the type is valid for all CostKind
51 if (!VT.isVector())
53 size_t NumInstr = OpCodes.size();
55 return NumInstr;
56 InstructionCost LMULCost = TLI->getLMULCost(VT);
58 return LMULCost * NumInstr;
59 InstructionCost Cost = 0;
60 for (auto Op : OpCodes) {
61 switch (Op) {
62 case RISCV::VRGATHER_VI:
63 Cost += TLI->getVRGatherVICost(VT);
64 break;
65 case RISCV::VRGATHER_VV:
66 Cost += TLI->getVRGatherVVCost(VT);
67 break;
68 case RISCV::VSLIDEUP_VI:
69 case RISCV::VSLIDEDOWN_VI:
70 Cost += TLI->getVSlideVICost(VT);
71 break;
72 case RISCV::VSLIDEUP_VX:
73 case RISCV::VSLIDEDOWN_VX:
74 Cost += TLI->getVSlideVXCost(VT);
75 break;
76 case RISCV::VREDMAX_VS:
77 case RISCV::VREDMIN_VS:
78 case RISCV::VREDMAXU_VS:
79 case RISCV::VREDMINU_VS:
80 case RISCV::VREDSUM_VS:
81 case RISCV::VREDAND_VS:
82 case RISCV::VREDOR_VS:
83 case RISCV::VREDXOR_VS:
84 case RISCV::VFREDMAX_VS:
85 case RISCV::VFREDMIN_VS:
86 case RISCV::VFREDUSUM_VS: {
87 unsigned VL = VT.getVectorMinNumElements();
88 if (!VT.isFixedLengthVector())
89 VL *= *getVScaleForTuning();
90 Cost += Log2_32_Ceil(VL);
91 break;
92 }
93 case RISCV::VFREDOSUM_VS: {
94 unsigned VL = VT.getVectorMinNumElements();
95 if (!VT.isFixedLengthVector())
96 VL *= *getVScaleForTuning();
97 Cost += VL;
98 break;
99 }
100 case RISCV::VMV_X_S:
101 case RISCV::VMV_S_X:
102 case RISCV::VFMV_F_S:
103 case RISCV::VFMV_S_F:
104 case RISCV::VMOR_MM:
105 case RISCV::VMXOR_MM:
106 case RISCV::VMAND_MM:
107 case RISCV::VMANDN_MM:
108 case RISCV::VMNAND_MM:
109 case RISCV::VCPOP_M:
110 case RISCV::VFIRST_M:
111 Cost += 1;
112 break;
113 default:
114 Cost += LMULCost;
115 }
116 }
117 return Cost;
118}
119
121 const RISCVSubtarget *ST,
122 const APInt &Imm, Type *Ty,
124 bool FreeZeroes) {
125 assert(Ty->isIntegerTy() &&
126 "getIntImmCost can only estimate cost of materialising integers");
127
128 // We have a Zero register, so 0 is always free.
129 if (Imm == 0)
130 return TTI::TCC_Free;
131
132 // Otherwise, we check how many instructions it will take to materialise.
133 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
134 /*CompressionCost=*/false, FreeZeroes);
135}
136
140 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
141}
142
143// Look for patterns of shift followed by AND that can be turned into a pair of
144// shifts. We won't need to materialize an immediate for the AND so these can
145// be considered free.
146static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
147 uint64_t Mask = Imm.getZExtValue();
148 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
149 if (!BO || !BO->hasOneUse())
150 return false;
151
152 if (BO->getOpcode() != Instruction::Shl)
153 return false;
154
155 if (!isa<ConstantInt>(BO->getOperand(1)))
156 return false;
157
158 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
159 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
160 // is a mask shifted by c2 bits with c3 leading zeros.
161 if (isShiftedMask_64(Mask)) {
162 unsigned Trailing = llvm::countr_zero(Mask);
163 if (ShAmt == Trailing)
164 return true;
165 }
166
167 return false;
168}
169
170// If this is i64 AND is part of (X & -(1 << C1) & 0xffffffff) == C2 << C1),
171// DAGCombiner can convert this to (sraiw X, C1) == sext(C2) for RV64. On RV32,
172// the type will be split so only the lower 32 bits need to be compared using
173// (srai/srli X, C) == C2.
174static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm) {
175 if (!Inst->hasOneUse())
176 return false;
177
178 // Look for equality comparison.
179 auto *Cmp = dyn_cast<ICmpInst>(*Inst->user_begin());
180 if (!Cmp || !Cmp->isEquality())
181 return false;
182
183 // Right hand side of comparison should be a constant.
184 auto *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
185 if (!C)
186 return false;
187
188 uint64_t Mask = Imm.getZExtValue();
189
190 // Mask should be of the form -(1 << C) in the lower 32 bits.
191 if (!isUInt<32>(Mask) || !isPowerOf2_32(-uint32_t(Mask)))
192 return false;
193
194 // Comparison constant should be a subset of Mask.
195 uint64_t CmpC = C->getZExtValue();
196 if ((CmpC & Mask) != CmpC)
197 return false;
198
199 // We'll need to sign extend the comparison constant and shift it right. Make
200 // sure the new constant can use addi/xori+seqz/snez.
201 unsigned ShiftBits = llvm::countr_zero(Mask);
202 int64_t NewCmpC = SignExtend64<32>(CmpC) >> ShiftBits;
203 return NewCmpC >= -2048 && NewCmpC <= 2048;
204}
205
207 const APInt &Imm, Type *Ty,
209 Instruction *Inst) const {
210 assert(Ty->isIntegerTy() &&
211 "getIntImmCost can only estimate cost of materialising integers");
212
213 // We have a Zero register, so 0 is always free.
214 if (Imm == 0)
215 return TTI::TCC_Free;
216
217 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
218 // commutative, in others the immediate comes from a specific argument index.
219 bool Takes12BitImm = false;
220 unsigned ImmArgIdx = ~0U;
221
222 switch (Opcode) {
223 case Instruction::GetElementPtr:
224 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
225 // split up large offsets in GEP into better parts than ConstantHoisting
226 // can.
227 return TTI::TCC_Free;
228 case Instruction::Store: {
229 // Use the materialization cost regardless of if it's the address or the
230 // value that is constant, except for if the store is misaligned and
231 // misaligned accesses are not legal (experience shows constant hoisting
232 // can sometimes be harmful in such cases).
233 if (Idx == 1 || !Inst)
234 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
235 /*FreeZeroes=*/true);
236
237 StoreInst *ST = cast<StoreInst>(Inst);
238 if (!getTLI()->allowsMemoryAccessForAlignment(
239 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
240 ST->getPointerAddressSpace(), ST->getAlign()))
241 return TTI::TCC_Free;
242
243 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
244 /*FreeZeroes=*/true);
245 }
246 case Instruction::Load:
247 // If the address is a constant, use the materialization cost.
248 return getIntImmCost(Imm, Ty, CostKind);
249 case Instruction::And:
250 // zext.h
251 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
252 return TTI::TCC_Free;
253 // zext.w
254 if (Imm == UINT64_C(0xffffffff) &&
255 ((ST->hasStdExtZba() && ST->isRV64()) || ST->isRV32()))
256 return TTI::TCC_Free;
257 // bclri
258 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
259 return TTI::TCC_Free;
260 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
261 canUseShiftPair(Inst, Imm))
262 return TTI::TCC_Free;
263 if (Inst && Idx == 1 && Imm.getBitWidth() == 64 &&
264 canUseShiftCmp(Inst, Imm))
265 return TTI::TCC_Free;
266 Takes12BitImm = true;
267 break;
268 case Instruction::Add:
269 Takes12BitImm = true;
270 break;
271 case Instruction::Or:
272 case Instruction::Xor:
273 // bseti/binvi
274 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
275 return TTI::TCC_Free;
276 Takes12BitImm = true;
277 break;
278 case Instruction::Mul:
279 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
280 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
281 return TTI::TCC_Free;
282 // One more or less than a power of 2 can use SLLI+ADD/SUB.
283 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
284 return TTI::TCC_Free;
285 // FIXME: There is no MULI instruction.
286 Takes12BitImm = true;
287 break;
288 case Instruction::Sub:
289 case Instruction::Shl:
290 case Instruction::LShr:
291 case Instruction::AShr:
292 Takes12BitImm = true;
293 ImmArgIdx = 1;
294 break;
295 default:
296 break;
297 }
298
299 if (Takes12BitImm) {
300 // Check immediate is the correct argument...
301 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
302 // ... and fits into the 12-bit immediate.
303 if (Imm.getSignificantBits() <= 64 &&
304 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
305 return TTI::TCC_Free;
306 }
307 }
308
309 // Otherwise, use the full materialisation cost.
310 return getIntImmCost(Imm, Ty, CostKind);
311 }
312
313 // By default, prevent hoisting.
314 return TTI::TCC_Free;
315}
316
319 const APInt &Imm, Type *Ty,
321 // Prevent hoisting in unknown cases.
322 return TTI::TCC_Free;
323}
324
326 return ST->hasVInstructions();
327}
328
330RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
331 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
332 return ST->hasCPOPLike() ? TTI::PSK_FastHardware : TTI::PSK_Software;
333}
334
336 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
338 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
340
341 // zve32x is broken for partial_reduce_umla, but let's make sure we
342 // don't generate them.
343 if (!ST->hasStdExtZvqdotq() || ST->getELen() < 64 ||
344 Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
345 InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
346 !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4))
348
349 Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
350 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
351 // Note: Asuming all vqdot* variants are equal cost
352 return LT.first *
353 getRISCVInstructionCost(RISCV::VQDOT_VV, LT.second, CostKind);
354}
355
357 // Currently, the ExpandReductions pass can't expand scalable-vector
358 // reductions, but we still request expansion as RVV doesn't support certain
359 // reductions and the SelectionDAG can't legalize them either.
360 switch (II->getIntrinsicID()) {
361 default:
362 return false;
363 // These reductions have no equivalent in RVV
364 case Intrinsic::vector_reduce_mul:
365 case Intrinsic::vector_reduce_fmul:
366 return true;
367 }
368}
369
370std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
371 if (ST->hasVInstructions())
372 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
373 return BaseT::getMaxVScale();
374}
375
376std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
377 if (ST->hasVInstructions())
378 if (unsigned MinVLen = ST->getRealMinVLen();
379 MinVLen >= RISCV::RVVBitsPerBlock)
380 return MinVLen / RISCV::RVVBitsPerBlock;
382}
383
386 unsigned LMUL =
387 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
388 switch (K) {
390 return TypeSize::getFixed(ST->getXLen());
392 return TypeSize::getFixed(
393 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
396 (ST->hasVInstructions() &&
397 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
399 : 0);
400 }
401
402 llvm_unreachable("Unsupported register kind");
403}
404
405InstructionCost RISCVTTIImpl::getStaticDataAddrGenerationCost(
406 const TTI::TargetCostKind CostKind) const {
407 switch (CostKind) {
410 // Always 2 instructions
411 return 2;
412 case TTI::TCK_Latency:
414 // Depending on the memory model the address generation will
415 // require AUIPC + ADDI (medany) or LUI + ADDI (medlow). Don't
416 // have a way of getting this information here, so conservatively
417 // require both.
418 // In practice, these are generally implemented together.
419 return (ST->hasAUIPCADDIFusion() && ST->hasLUIADDIFusion()) ? 1 : 2;
420 }
421 llvm_unreachable("Unsupported cost kind");
422}
423
425RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty,
427 // Add a cost of address generation + the cost of the load. The address
428 // is expected to be a PC relative offset to a constant pool entry
429 // using auipc/addi.
430 return getStaticDataAddrGenerationCost(CostKind) +
431 getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
432 /*AddressSpace=*/0, CostKind);
433}
434
435static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
436 unsigned Size = Mask.size();
437 if (!isPowerOf2_32(Size))
438 return false;
439 for (unsigned I = 0; I != Size; ++I) {
440 if (static_cast<unsigned>(Mask[I]) == I)
441 continue;
442 if (Mask[I] != 0)
443 return false;
444 if (Size % I != 0)
445 return false;
446 for (unsigned J = I + 1; J != Size; ++J)
447 // Check the pattern is repeated.
448 if (static_cast<unsigned>(Mask[J]) != J % I)
449 return false;
450 SubVectorSize = I;
451 return true;
452 }
453 // That means Mask is <0, 1, 2, 3>. This is not a concatenation.
454 return false;
455}
456
458 LLVMContext &C) {
459 assert((DataVT.getScalarSizeInBits() != 8 ||
460 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
461 MVT IndexVT = DataVT.changeTypeToInteger();
462 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
463 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
464 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
465}
466
467/// Attempt to approximate the cost of a shuffle which will require splitting
468/// during legalization. Note that processShuffleMasks is not an exact proxy
469/// for the algorithm used in LegalizeVectorTypes, but hopefully it's a
470/// reasonably close upperbound.
472 MVT LegalVT, VectorType *Tp,
473 ArrayRef<int> Mask,
475 assert(LegalVT.isFixedLengthVector() && !Mask.empty() &&
476 "Expected fixed vector type and non-empty mask");
477 unsigned LegalNumElts = LegalVT.getVectorNumElements();
478 // Number of destination vectors after legalization:
479 unsigned NumOfDests = divideCeil(Mask.size(), LegalNumElts);
480 // We are going to permute multiple sources and the result will be in
481 // multiple destinations. Providing an accurate cost only for splits where
482 // the element type remains the same.
483 if (NumOfDests <= 1 ||
485 Tp->getElementType()->getPrimitiveSizeInBits() ||
486 LegalNumElts >= Tp->getElementCount().getFixedValue())
488
489 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
490 unsigned LegalVTSize = LegalVT.getStoreSize();
491 // Number of source vectors after legalization:
492 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
493
494 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(), LegalNumElts);
495
496 unsigned NormalizedVF = LegalNumElts * std::max(NumOfSrcs, NumOfDests);
497 unsigned NumOfSrcRegs = NormalizedVF / LegalNumElts;
498 unsigned NumOfDestRegs = NormalizedVF / LegalNumElts;
499 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
500 assert(NormalizedVF >= Mask.size() &&
501 "Normalized mask expected to be not shorter than original mask.");
502 copy(Mask, NormalizedMask.begin());
503 InstructionCost Cost = 0;
504 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
506 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
507 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
508 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
509 return;
510 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
511 .second)
512 return;
513 Cost += TTI.getShuffleCost(
515 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
516 SingleOpTy, RegMask, CostKind, 0, nullptr);
517 },
518 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
519 Cost += TTI.getShuffleCost(
521 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
522 SingleOpTy, RegMask, CostKind, 0, nullptr);
523 });
524 return Cost;
525}
526
527/// Try to perform better estimation of the permutation.
528/// 1. Split the source/destination vectors into real registers.
529/// 2. Do the mask analysis to identify which real registers are
530/// permuted. If more than 1 source registers are used for the
531/// destination register building, the cost for this destination register
532/// is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
533/// source register is used, build mask and calculate the cost as a cost
534/// of PermuteSingleSrc.
535/// Also, for the single register permute we try to identify if the
536/// destination register is just a copy of the source register or the
537/// copy of the previous destination register (the cost is
538/// TTI::TCC_Basic). If the source register is just reused, the cost for
539/// this operation is 0.
540static InstructionCost
542 std::optional<unsigned> VLen, VectorType *Tp,
544 assert(LegalVT.isFixedLengthVector());
545 if (!VLen || Mask.empty())
547 MVT ElemVT = LegalVT.getVectorElementType();
548 unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
549 LegalVT = TTI.getTypeLegalizationCost(
550 FixedVectorType::get(Tp->getElementType(), ElemsPerVReg))
551 .second;
552 // Number of destination vectors after legalization:
553 InstructionCost NumOfDests =
554 divideCeil(Mask.size(), LegalVT.getVectorNumElements());
555 if (NumOfDests <= 1 ||
557 Tp->getElementType()->getPrimitiveSizeInBits() ||
558 LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue())
560
561 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
562 unsigned LegalVTSize = LegalVT.getStoreSize();
563 // Number of source vectors after legalization:
564 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
565
566 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
567 LegalVT.getVectorNumElements());
568
569 unsigned E = NumOfDests.getValue();
570 unsigned NormalizedVF =
571 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
572 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
573 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
574 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
575 assert(NormalizedVF >= Mask.size() &&
576 "Normalized mask expected to be not shorter than original mask.");
577 copy(Mask, NormalizedMask.begin());
578 InstructionCost Cost = 0;
579 int NumShuffles = 0;
580 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
582 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
583 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
584 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
585 return;
586 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
587 .second)
588 return;
589 ++NumShuffles;
590 Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
591 SingleOpTy, RegMask, CostKind, 0, nullptr);
592 },
593 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
594 Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
595 SingleOpTy, RegMask, CostKind, 0, nullptr);
596 NumShuffles += 2;
597 });
598 // Note: check that we do not emit too many shuffles here to prevent code
599 // size explosion.
600 // TODO: investigate, if it can be improved by extra analysis of the masks
601 // to check if the code is more profitable.
602 if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) ||
603 (NumOfDestRegs <= 2 && NumShuffles < 4))
604 return Cost;
606}
607
608InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
609 ArrayRef<int> Mask,
611 // Avoid missing masks and length changing shuffles
612 if (Mask.size() <= 2 || Mask.size() != Tp->getNumElements())
614
615 int NumElts = Tp->getNumElements();
616 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
617 // Avoid scalarization cases
618 if (!LT.second.isFixedLengthVector())
620
621 // Requires moving elements between parts, which requires additional
622 // unmodeled instructions.
623 if (LT.first != 1)
625
626 auto GetSlideOpcode = [&](int SlideAmt) {
627 assert(SlideAmt != 0);
628 bool IsVI = isUInt<5>(std::abs(SlideAmt));
629 if (SlideAmt < 0)
630 return IsVI ? RISCV::VSLIDEDOWN_VI : RISCV::VSLIDEDOWN_VX;
631 return IsVI ? RISCV::VSLIDEUP_VI : RISCV::VSLIDEUP_VX;
632 };
633
634 std::array<std::pair<int, int>, 2> SrcInfo;
635 if (!isMaskedSlidePair(Mask, NumElts, SrcInfo))
637
638 if (SrcInfo[1].second == 0)
639 std::swap(SrcInfo[0], SrcInfo[1]);
640
641 InstructionCost FirstSlideCost = 0;
642 if (SrcInfo[0].second != 0) {
643 unsigned Opcode = GetSlideOpcode(SrcInfo[0].second);
644 FirstSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
645 }
646
647 if (SrcInfo[1].first == -1)
648 return FirstSlideCost;
649
650 InstructionCost SecondSlideCost = 0;
651 if (SrcInfo[1].second != 0) {
652 unsigned Opcode = GetSlideOpcode(SrcInfo[1].second);
653 SecondSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
654 } else {
655 SecondSlideCost =
656 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
657 }
658
659 auto EC = Tp->getElementCount();
660 VectorType *MaskTy =
662 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
663 return FirstSlideCost + SecondSlideCost + MaskCost;
664}
665
668 VectorType *SrcTy, ArrayRef<int> Mask,
669 TTI::TargetCostKind CostKind, int Index,
671 const Instruction *CxtI) const {
672 assert((Mask.empty() || DstTy->isScalableTy() ||
673 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
674 "Expected the Mask to match the return size if given");
675 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
676 "Expected the same scalar types");
677
678 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
679 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
680
681 // First, handle cases where having a fixed length vector enables us to
682 // give a more accurate cost than falling back to generic scalable codegen.
683 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
684 if (auto *FVTp = dyn_cast<FixedVectorType>(SrcTy);
685 FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) {
687 *this, LT.second, ST->getRealVLen(),
688 Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind);
689 if (VRegSplittingCost.isValid())
690 return VRegSplittingCost;
691 switch (Kind) {
692 default:
693 break;
695 if (Mask.size() >= 2) {
696 MVT EltTp = LT.second.getVectorElementType();
697 // If the size of the element is < ELEN then shuffles of interleaves and
698 // deinterleaves of 2 vectors can be lowered into the following
699 // sequences
700 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
701 // Example sequence:
702 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
703 // vwaddu.vv v10, v8, v9
704 // li a0, -1 (ignored)
705 // vwmaccu.vx v10, a0, v9
706 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
707 return 2 * LT.first * TLI->getLMULCost(LT.second);
708
709 if (Mask[0] == 0 || Mask[0] == 1) {
710 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
711 // Example sequence:
712 // vnsrl.wi v10, v8, 0
713 if (equal(DeinterleaveMask, Mask))
714 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
715 LT.second, CostKind);
716 }
717 }
718 int SubVectorSize;
719 if (LT.second.getScalarSizeInBits() != 1 &&
720 isRepeatedConcatMask(Mask, SubVectorSize)) {
722 unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
723 // The cost of extraction from a subvector is 0 if the index is 0.
724 for (unsigned I = 0; I != NumSlides; ++I) {
725 unsigned InsertIndex = SubVectorSize * (1 << I);
726 FixedVectorType *SubTp =
727 FixedVectorType::get(SrcTy->getElementType(), InsertIndex);
728 FixedVectorType *DestTp =
730 std::pair<InstructionCost, MVT> DestLT =
732 // Add the cost of whole vector register move because the
733 // destination vector register group for vslideup cannot overlap the
734 // source.
735 Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
736 Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {},
737 CostKind, InsertIndex, SubTp);
738 }
739 return Cost;
740 }
741 }
742
743 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
744 SlideCost.isValid())
745 return SlideCost;
746
747 // vrgather + cost of generating the mask constant.
748 // We model this for an unknown mask with a single vrgather.
749 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
750 LT.second.getVectorNumElements() <= 256)) {
751 VectorType *IdxTy =
752 getVRGatherIndexType(LT.second, *ST, SrcTy->getContext());
753 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
754 return IndexCost +
755 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
756 }
757 break;
758 }
761
762 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
763 SlideCost.isValid())
764 return SlideCost;
765
766 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
767 // register for the second vrgather. We model this for an unknown
768 // (shuffle) mask.
769 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
770 LT.second.getVectorNumElements() <= 256)) {
771 auto &C = SrcTy->getContext();
772 auto EC = SrcTy->getElementCount();
773 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
775 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
776 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
777 return 2 * IndexCost +
778 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
779 LT.second, CostKind) +
780 MaskCost;
781 }
782 break;
783 }
784 }
785
786 auto shouldSplit = [](TTI::ShuffleKind Kind) {
787 switch (Kind) {
788 default:
789 return false;
793 return true;
794 }
795 };
796
797 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
798 shouldSplit(Kind)) {
799 InstructionCost SplitCost =
800 costShuffleViaSplitting(*this, LT.second, FVTp, Mask, CostKind);
801 if (SplitCost.isValid())
802 return SplitCost;
803 }
804 }
805
806 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
807 switch (Kind) {
808 default:
809 // Fallthrough to generic handling.
810 // TODO: Most of these cases will return getInvalid in generic code, and
811 // must be implemented here.
812 break;
814 // Extract at zero is always a subregister extract
815 if (Index == 0)
816 return TTI::TCC_Free;
817
818 // If we're extracting a subvector of at most m1 size at a sub-register
819 // boundary - which unfortunately we need exact vlen to identify - this is
820 // a subregister extract at worst and thus won't require a vslidedown.
821 // TODO: Extend for aligned m2, m4 subvector extracts
822 // TODO: Extend for misalgined (but contained) extracts
823 // TODO: Extend for scalable subvector types
824 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
825 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
826 if (std::optional<unsigned> VLen = ST->getRealVLen();
827 VLen && SubLT.second.getScalarSizeInBits() * Index % *VLen == 0 &&
828 SubLT.second.getSizeInBits() <= *VLen)
829 return TTI::TCC_Free;
830 }
831
832 // Example sequence:
833 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
834 // vslidedown.vi v8, v9, 2
835 return LT.first *
836 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
838 // Example sequence:
839 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
840 // vslideup.vi v8, v9, 2
841 LT = getTypeLegalizationCost(DstTy);
842 return LT.first *
843 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
844 case TTI::SK_Select: {
845 // Example sequence:
846 // li a0, 90
847 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
848 // vmv.s.x v0, a0
849 // vmerge.vvm v8, v9, v8, v0
850 // We use 2 for the cost of the mask materialization as this is the true
851 // cost for small masks and most shuffles are small. At worst, this cost
852 // should be a very small constant for the constant pool load. As such,
853 // we may bias towards large selects slightly more than truly warranted.
854 return LT.first *
855 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
856 LT.second, CostKind));
857 }
858 case TTI::SK_Broadcast: {
859 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
860 Instruction::InsertElement);
861 if (LT.second.getScalarSizeInBits() == 1) {
862 if (HasScalar) {
863 // Example sequence:
864 // andi a0, a0, 1
865 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
866 // vmv.v.x v8, a0
867 // vmsne.vi v0, v8, 0
868 return LT.first *
869 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
870 LT.second, CostKind));
871 }
872 // Example sequence:
873 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
874 // vmv.v.i v8, 0
875 // vmerge.vim v8, v8, 1, v0
876 // vmv.x.s a0, v8
877 // andi a0, a0, 1
878 // vmv.v.x v8, a0
879 // vmsne.vi v0, v8, 0
880
881 return LT.first *
882 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
883 RISCV::VMV_X_S, RISCV::VMV_V_X,
884 RISCV::VMSNE_VI},
885 LT.second, CostKind));
886 }
887
888 if (HasScalar) {
889 // Example sequence:
890 // vmv.v.x v8, a0
891 return LT.first *
892 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
893 }
894
895 // Example sequence:
896 // vrgather.vi v9, v8, 0
897 return LT.first *
898 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
899 }
900 case TTI::SK_Splice: {
901 // vslidedown+vslideup.
902 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
903 // of similar code, but I think we expand through memory.
904 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
905 if (Index >= 0 && Index < 32)
906 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
907 else if (Index < 0 && Index > -32)
908 Opcodes[1] = RISCV::VSLIDEUP_VI;
909 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
910 }
911 case TTI::SK_Reverse: {
912
913 if (!LT.second.isVector())
915
916 // TODO: Cases to improve here:
917 // * Illegal vector types
918 // * i64 on RV32
919 if (SrcTy->getElementType()->isIntegerTy(1)) {
920 VectorType *WideTy =
921 VectorType::get(IntegerType::get(SrcTy->getContext(), 8),
922 cast<VectorType>(SrcTy)->getElementCount());
923 return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy,
925 getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0,
926 nullptr) +
927 getCastInstrCost(Instruction::Trunc, SrcTy, WideTy,
929 }
930
931 MVT ContainerVT = LT.second;
932 if (LT.second.isFixedLengthVector())
933 ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
934 MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
935 if (ContainerVT.bitsLE(M1VT)) {
936 // Example sequence:
937 // csrr a0, vlenb
938 // srli a0, a0, 3
939 // addi a0, a0, -1
940 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
941 // vid.v v9
942 // vrsub.vx v10, v9, a0
943 // vrgather.vv v9, v8, v10
944 InstructionCost LenCost = 3;
945 if (LT.second.isFixedLengthVector())
946 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
947 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
948 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
949 if (LT.second.isFixedLengthVector() &&
950 isInt<5>(LT.second.getVectorNumElements() - 1))
951 Opcodes[1] = RISCV::VRSUB_VI;
952 InstructionCost GatherCost =
953 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
954 return LT.first * (LenCost + GatherCost);
955 }
956
957 // At high LMUL, we split into a series of M1 reverses (see
958 // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate
959 // the resulting gap at the bottom (for fixed vectors only). The important
960 // bit is that the cost scales linearly, not quadratically with LMUL.
961 unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX};
962 InstructionCost FixedCost =
963 getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3;
964 unsigned Ratio =
966 InstructionCost GatherCost =
967 getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio;
968 InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 0 :
969 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind);
970 return FixedCost + LT.first * (GatherCost + SlideCost);
971 }
972 }
973 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
974 SubTp);
975}
976
// Returns nonzero when VT's register group is at most one vector register
// wide (LMUL <= 1, including fractional LMULs). Used to bound build_vector
// costing to types where a per-element vslide1down sequence is meaningful.
// NOTE(review): extraction gap — only the LMUL_F8 comparison survived in this
// view; the getLMUL() call and the remaining LMUL_F4/LMUL_F2/LMUL_1
// comparisons are elided. Confirm against the upstream file before editing.
static unsigned isM1OrSmaller(MVT VT) {
  return (LMUL == RISCVVType::VLMUL::LMUL_F8 ||
}
984
    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
    TTI::TargetCostKind CostKind, bool ForPoisonSrc,
    ArrayRef<Value *> VL) const {

  // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
  // For now, skip all fixed vector cost analysis when P extension is available
  // to avoid crashes in getMinRVVVectorSizeInBits()
  if (ST->enablePExtSIMDCodeGen() && isa<FixedVectorType>(Ty)) {
    return 1; // Treat as single instruction cost for now
  }

  // A build_vector (which is m1 sized or smaller) can be done in no
  // worse than one vslide1down.vx per element in the type. We could
  // in theory do an explode_vector in the inverse manner, but our
  // lowering today does not have a first class node for this pattern.
  // NOTE(review): extraction gap — the `InstructionCost Cost =
  // BaseT::getScalarizationOverhead(` line preceding this continuation is
  // elided in this view.
      Ty, DemandedElts, Insert, Extract, CostKind);
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
    if (Ty->getScalarSizeInBits() == 1) {
      // i1 vectors are materialized via a widened i8 build_vector followed by
      // a truncate back to the mask type.
      auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
      // Note: Implicit scalar anyextend is assumed to be free since the i1
      // must be stored in a GPR.
      return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
                                      CostKind) +
             getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
    }

    assert(LT.second.isFixedLengthVector());
    MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
    if (isM1OrSmaller(ContainerVT)) {
      // Clamp the base-class estimate by the vslide1down-per-element bound.
      InstructionCost BV =
          cast<FixedVectorType>(Ty)->getNumElements() *
          getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
      if (BV < Cost)
        Cost = BV;
    }
  }
  return Cost;
}
1029
  // Dispatch a memory-intrinsic cost query to the specialized helper for its
  // intrinsic family. NOTE(review): the enclosing function signature (lines
  // preceding this view) was elided by extraction — presumably
  // getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &, CostKind);
  // confirm against upstream.
  Type *DataTy = MICA.getDataType();
  Align Alignment = MICA.getAlignment();
  switch (MICA.getID()) {
  case Intrinsic::vp_load_ff: {
    // Fault-first load: costed as a plain load when legal.
    EVT DataTypeVT = TLI->getValueType(DL, DataTy);
    if (!TLI->isLegalFirstFaultLoad(DataTypeVT, Alignment))
    // NOTE(review): the statement guarded by the `if` above (presumably
    // `return InstructionCost::getInvalid();`) is elided in this view.

    unsigned AS = MICA.getAddressSpace();
    return getMemoryOpCost(Instruction::Load, DataTy, Alignment, AS, CostKind,
                           {TTI::OK_AnyValue, TTI::OP_None}, nullptr);
  }
  case Intrinsic::experimental_vp_strided_load:
  case Intrinsic::experimental_vp_strided_store:
    return getStridedMemoryOpCost(MICA, CostKind);
  case Intrinsic::masked_compressstore:
  case Intrinsic::masked_expandload:
  // NOTE(review): the return for compress/expand (presumably
  // getExpandCompressMemoryOpCost) is elided in this view.
  case Intrinsic::vp_scatter:
  case Intrinsic::vp_gather:
  case Intrinsic::masked_scatter:
  case Intrinsic::masked_gather:
    return getGatherScatterOpCost(MICA, CostKind);
  case Intrinsic::vp_load:
  case Intrinsic::vp_store:
  case Intrinsic::masked_load:
  case Intrinsic::masked_store:
    return getMaskedMemoryOpCost(MICA, CostKind);
  }
  // NOTE(review): the fall-through return to the base implementation is
  // elided in this view.
}
1064
  // Masked load/store: when legal, costed the same as an ordinary memory op
  // of the data type (the mask is carried by the instruction encoding).
  unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load
                                                           : Instruction::Store;
  Type *Src = MICA.getDataType();
  Align Alignment = MICA.getAlignment();
  unsigned AddressSpace = MICA.getAddressSpace();

  if (!isLegalMaskedLoadStore(Src, Alignment) ||
  // NOTE(review): the remainder of this condition and its fallback return
  // (presumably deferring to BaseT) are elided in this view.

  return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}
1080
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) const {

  // The interleaved memory access pass will lower (de)interleave ops combined
  // with an adjacent appropriate memory to vlseg/vsseg intrinsics. vlseg/vsseg
  // only support masking per-iteration (i.e. condition), not per-segment (i.e.
  // gap).
  if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
    auto *VTy = cast<VectorType>(VecTy);
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
    // Need to make sure type hasn't been scalarized
    if (LT.second.isVector()) {
      auto *SubVecTy =
          VectorType::get(VTy->getElementType(),
                          VTy->getElementCount().divideCoefficientBy(Factor));
      if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
          TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
                                            AddressSpace, DL)) {

        // Some processors optimize segment loads/stores as one wide memory op +
        // Factor * LMUL shuffle ops.
        if (ST->hasOptimizedSegmentLoadStore(Factor)) {
          // NOTE(review): the `InstructionCost Cost =` declaration preceding
          // this continuation line is elided in this view.
              getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
          MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
          Cost += Factor * TLI->getLMULCost(SubVecVT);
          return LT.first * Cost;
        }

        // Otherwise, the cost is proportional to the number of elements (VL *
        // Factor ops).
        InstructionCost MemOpCost =
            getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
                            CostKind, {TTI::OK_AnyValue, TTI::OP_None});
        unsigned NumLoads = getEstimatedVLFor(VTy);
        return NumLoads * MemOpCost;
      }
    }
  }

  // TODO: Return the cost of interleaved accesses for scalable vector when
  // unable to convert to segment accesses instructions.
  if (isa<ScalableVectorType>(VecTy))
  // NOTE(review): the statement guarded above (presumably
  // `return InstructionCost::getInvalid();`) is elided in this view.

  auto *FVTy = cast<FixedVectorType>(VecTy);
  InstructionCost MemCost =
      getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
  unsigned VF = FVTy->getNumElements() / Factor;

  // An interleaved load will look like this for Factor=3:
  // %wide.vec = load <12 x i32>, ptr %3, align 4
  // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  if (Opcode == Instruction::Load) {
    InstructionCost Cost = MemCost;
    for (unsigned Index : Indices) {
      // NOTE(review): this local shadows the `VecTy` function parameter —
      // consider renaming (e.g. WideVecTy) upstream.
      FixedVectorType *VecTy =
          FixedVectorType::get(FVTy->getElementType(), VF * Factor);
      auto Mask = createStrideMask(Index, Factor, VF);
      Mask.resize(VF * Factor, -1);
      InstructionCost ShuffleCost =
      // NOTE(review): the getShuffleCost(...) call line preceding this
      // continuation is elided in this view.
              Mask, CostKind, 0, nullptr, {});
      Cost += ShuffleCost;
    }
    return Cost;
  }

  // TODO: Model for NF > 2
  // We'll need to enhance getShuffleCost to model shuffles that are just
  // inserts and extracts into subvectors, since they won't have the full cost
  // of a vrgather.
  // An interleaved store for 3 vectors of 4 lanes will look like
  // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
  // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
  // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
  // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
  // store <12 x i32> %interleaved.vec, ptr %10, align 4
  if (Factor != 2)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  assert(Opcode == Instruction::Store && "Opcode must be a store");
  // For an interleaving store of 2 vectors, we perform one large interleaving
  // shuffle that goes into the wide store
  auto Mask = createInterleaveMask(VF, Factor);
  InstructionCost ShuffleCost =
  // NOTE(review): the getShuffleCost(...) call line preceding this
  // continuation is elided in this view.
          CostKind, 0, nullptr, {});
  return MemCost + ShuffleCost;
}
1177
  // Gather/scatter cost: proportional to the (estimated) element count, since
  // indexed accesses are executed element-by-element on most implementations.
  // NOTE(review): the enclosing function signature is elided by extraction.

  bool IsLoad = MICA.getID() == Intrinsic::masked_gather ||
                MICA.getID() == Intrinsic::vp_gather;
  unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
  Type *DataTy = MICA.getDataType();
  Align Alignment = MICA.getAlignment();

  if ((Opcode == Instruction::Load &&
       !isLegalMaskedGather(DataTy, Align(Alignment))) ||
      (Opcode == Instruction::Store &&
       !isLegalMaskedScatter(DataTy, Align(Alignment))))
  // NOTE(review): the guarded statement (presumably deferring to the base
  // implementation or returning an invalid cost) is elided in this view.

  // Cost is proportional to the number of memory operations implied. For
  // scalable vectors, we use an estimate on that number since we don't
  // know exactly what VL will be.
  auto &VTy = *cast<VectorType>(DataTy);
  unsigned NumLoads = getEstimatedVLFor(&VTy);
  return NumLoads * TTI::TCC_Basic;
}
1203
    // Cost of masked.compressstore / masked.expandload, modeled on the actual
    // vcompress/viota+vrgather lowering sequences shown below.
    const MemIntrinsicCostAttributes &MICA,
  unsigned Opcode = MICA.getID() == Intrinsic::masked_expandload
                        ? Instruction::Load
                        : Instruction::Store;
  Type *DataTy = MICA.getDataType();
  bool VariableMask = MICA.getVariableMask();
  Align Alignment = MICA.getAlignment();
  bool IsLegal = (Opcode == Instruction::Store &&
                  isLegalMaskedCompressStore(DataTy, Alignment)) ||
                 (Opcode == Instruction::Load &&
                  isLegalMaskedExpandLoad(DataTy, Alignment));
  if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
  // NOTE(review): the guarded statement (presumably falling back to the base
  // implementation) is elided in this view.
  // Example compressstore sequence:
  // vsetivli zero, 8, e32, m2, ta, ma (ignored)
  // vcompress.vm v10, v8, v0
  // vcpop.m a1, v0
  // vsetvli zero, a1, e32, m2, ta, ma
  // vse32.v v10, (a0)
  // Example expandload sequence:
  // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
  // vcpop.m a1, v0
  // vsetvli zero, a1, e32, m2, ta, ma
  // vle32.v v10, (a0)
  // vsetivli zero, 8, e32, m2, ta, ma
  // viota.m v12, v0
  // vrgather.vv v8, v10, v12, v0.t
  auto MemOpCost =
      getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
  auto LT = getTypeLegalizationCost(DataTy);
  // Accumulate the non-memory instructions of the sequence, then scale by the
  // legalization split count (LT.first).
  SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
  if (VariableMask)
    Opcodes.push_back(RISCV::VCPOP_M);
  if (Opcode == Instruction::Store)
    Opcodes.append({RISCV::VCOMPRESS_VM});
  else
    Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
  return MemOpCost +
         LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
}
1246
  // Strided load/store (experimental.vp.strided.*): costed per element, since
  // the access touches one element per VL iteration.
  // NOTE(review): the enclosing function signature is elided by extraction.

  unsigned Opcode = MICA.getID() == Intrinsic::experimental_vp_strided_load
                        ? Instruction::Load
                        : Instruction::Store;

  Type *DataTy = MICA.getDataType();
  Align Alignment = MICA.getAlignment();
  const Instruction *I = MICA.getInst();

  if (!isLegalStridedLoadStore(DataTy, Alignment))
  // NOTE(review): the guarded statement (presumably
  // `return InstructionCost::getInvalid();`) and the condition preceding the
  // next return (presumably a CodeSize cost-kind check) are elided here.
    return TTI::TCC_Basic;

  // Cost is proportional to the number of memory operations implied. For
  // scalable vectors, we use an estimate on that number since we don't
  // know exactly what VL will be.
  // FIXME: This will overcost for i64 on rv32 with +zve64x.
  auto &VTy = *cast<VectorType>(DataTy);
  InstructionCost MemOpCost =
      getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
                      {TTI::OK_AnyValue, TTI::OP_None}, I);
  unsigned NumLoads = getEstimatedVLFor(&VTy);
  return NumLoads * MemOpCost;
}
1276
  // Cost of keeping vector values live across a call: modeled as a spill plus
  // reload (store + load) of each vector-typed value, since the default
  // vector calling convention has no callee-saved vector registers.
  // NOTE(review): the enclosing function signature and the `Cost`
  // initialization are elided by extraction.
  // FIXME: This is a property of the default vector convention, not
  // all possible calling conventions. Fixing that will require
  // some TTI API and SLP rework.
  for (auto *Ty : Tys) {
    if (!Ty->isVectorTy())
      continue;
    Align A = DL.getPrefTypeAlign(Ty);
    Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
            getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
  }
  return Cost;
}
1293
// Currently, these represent both throughput and codesize costs
// for the respective intrinsics. The costs in this table are simply
// instruction counts with the following adjustments made:
// * One vsetvli is considered free.
// Keyed by (intrinsic, element MVT); the lookup site scales each entry by the
// legalization split count (LT.first).
    {Intrinsic::floor, MVT::f32, 9},
    {Intrinsic::floor, MVT::f64, 9},
    {Intrinsic::ceil, MVT::f32, 9},
    {Intrinsic::ceil, MVT::f64, 9},
    {Intrinsic::trunc, MVT::f32, 7},
    {Intrinsic::trunc, MVT::f64, 7},
    {Intrinsic::round, MVT::f32, 9},
    {Intrinsic::round, MVT::f64, 9},
    {Intrinsic::roundeven, MVT::f32, 9},
    {Intrinsic::roundeven, MVT::f64, 9},
    {Intrinsic::rint, MVT::f32, 7},
    {Intrinsic::rint, MVT::f64, 7},
    {Intrinsic::nearbyint, MVT::f32, 9},
    {Intrinsic::nearbyint, MVT::f64, 9},
    {Intrinsic::bswap, MVT::i16, 3},
    {Intrinsic::bswap, MVT::i32, 12},
    {Intrinsic::bswap, MVT::i64, 31},
    {Intrinsic::vp_bswap, MVT::i16, 3},
    {Intrinsic::vp_bswap, MVT::i32, 12},
    {Intrinsic::vp_bswap, MVT::i64, 31},
    {Intrinsic::vp_fshl, MVT::i8, 7},
    {Intrinsic::vp_fshl, MVT::i16, 7},
    {Intrinsic::vp_fshl, MVT::i32, 7},
    {Intrinsic::vp_fshl, MVT::i64, 7},
    {Intrinsic::vp_fshr, MVT::i8, 7},
    {Intrinsic::vp_fshr, MVT::i16, 7},
    {Intrinsic::vp_fshr, MVT::i32, 7},
    {Intrinsic::vp_fshr, MVT::i64, 7},
    {Intrinsic::bitreverse, MVT::i8, 17},
    {Intrinsic::bitreverse, MVT::i16, 24},
    {Intrinsic::bitreverse, MVT::i32, 33},
    {Intrinsic::bitreverse, MVT::i64, 52},
    {Intrinsic::vp_bitreverse, MVT::i8, 17},
    {Intrinsic::vp_bitreverse, MVT::i16, 24},
    {Intrinsic::vp_bitreverse, MVT::i32, 33},
    {Intrinsic::vp_bitreverse, MVT::i64, 52},
    {Intrinsic::ctpop, MVT::i8, 12},
    {Intrinsic::ctpop, MVT::i16, 19},
    {Intrinsic::ctpop, MVT::i32, 20},
    {Intrinsic::ctpop, MVT::i64, 21},
    {Intrinsic::ctlz, MVT::i8, 19},
    {Intrinsic::ctlz, MVT::i16, 28},
    {Intrinsic::ctlz, MVT::i32, 31},
    {Intrinsic::ctlz, MVT::i64, 35},
    {Intrinsic::cttz, MVT::i8, 16},
    {Intrinsic::cttz, MVT::i16, 23},
    {Intrinsic::cttz, MVT::i32, 24},
    {Intrinsic::cttz, MVT::i64, 25},
    {Intrinsic::vp_ctpop, MVT::i8, 12},
    {Intrinsic::vp_ctpop, MVT::i16, 19},
    {Intrinsic::vp_ctpop, MVT::i32, 20},
    {Intrinsic::vp_ctpop, MVT::i64, 21},
    {Intrinsic::vp_ctlz, MVT::i8, 19},
    {Intrinsic::vp_ctlz, MVT::i16, 28},
    {Intrinsic::vp_ctlz, MVT::i32, 31},
    {Intrinsic::vp_ctlz, MVT::i64, 35},
    {Intrinsic::vp_cttz, MVT::i8, 16},
    {Intrinsic::vp_cttz, MVT::i16, 23},
    {Intrinsic::vp_cttz, MVT::i32, 24},
    {Intrinsic::vp_cttz, MVT::i64, 25},
};
1360
  // Intrinsic cost model: per-intrinsic special cases first, then a generic
  // table lookup on the legalized element type.
  // NOTE(review): the enclosing function signature is elided by extraction.
  auto *RetTy = ICA.getReturnType();
  switch (ICA.getID()) {
  case Intrinsic::lrint:
  case Intrinsic::llrint:
  case Intrinsic::lround:
  case Intrinsic::llround: {
    auto LT = getTypeLegalizationCost(RetTy);
    Type *SrcTy = ICA.getArgTypes().front();
    auto SrcLT = getTypeLegalizationCost(SrcTy);
    if (ST->hasVInstructions() && LT.second.isVector()) {
      // NOTE(review): the declaration of `Ops` (presumably a SmallVector of
      // opcodes) is elided in this view.
      unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType());
      unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType());
      if (LT.second.getVectorElementType() == MVT::bf16) {
        // bf16 sources must first be widened to f32 before converting.
        if (!ST->hasVInstructionsBF16Minimal())
        // NOTE(review): the guarded statement (presumably
        // `return InstructionCost::getInvalid();`) is elided in this view.
        if (DstEltSz == 32)
          Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
        else
          Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
      } else if (LT.second.getVectorElementType() == MVT::f16 &&
                 !ST->hasVInstructionsF16()) {
        // f16 with only zvfhmin: widen to f32 first, as above.
        if (!ST->hasVInstructionsF16Minimal())
        // NOTE(review): the guarded statement (presumably
        // `return InstructionCost::getInvalid();`) is elided in this view.
        if (DstEltSz == 32)
          Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
        else
          Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};

      } else if (SrcEltSz > DstEltSz) {
        Ops = {RISCV::VFNCVT_X_F_W};
      } else if (SrcEltSz < DstEltSz) {
        Ops = {RISCV::VFWCVT_X_F_V};
      } else {
        Ops = {RISCV::VFCVT_X_F_V};
      }

      // We need to use the source LMUL in the case of a narrowing op, and the
      // destination LMUL otherwise.
      if (SrcEltSz > DstEltSz)
        return SrcLT.first *
               getRISCVInstructionCost(Ops, SrcLT.second, CostKind);
      return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind);
    }
    break;
  }
  case Intrinsic::ceil:
  case Intrinsic::floor:
  case Intrinsic::trunc:
  case Intrinsic::rint:
  case Intrinsic::round:
  case Intrinsic::roundeven: {
    // These all use the same code.
    auto LT = getTypeLegalizationCost(RetTy);
    if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
      return LT.first * 8;
    break;
  }
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    auto LT = getTypeLegalizationCost(RetTy);
    // Scalar min/max is a single instruction with Zbb.
    if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
      return LT.first;

    if (ST->hasVInstructions() && LT.second.isVector()) {
      unsigned Op;
      switch (ICA.getID()) {
      case Intrinsic::umin:
        Op = RISCV::VMINU_VV;
        break;
      case Intrinsic::umax:
        Op = RISCV::VMAXU_VV;
        break;
      case Intrinsic::smin:
        Op = RISCV::VMIN_VV;
        break;
      case Intrinsic::smax:
        Op = RISCV::VMAX_VV;
        break;
      }
      return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
    }
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector()) {
      unsigned Op;
      switch (ICA.getID()) {
      case Intrinsic::sadd_sat:
        Op = RISCV::VSADD_VV;
        break;
      case Intrinsic::ssub_sat:
        // NOTE(review): uses the unsigned vssubu opcode for the signed
        // ssub.sat; the cost is presumably identical to vssub, but verify
        // this is intentional rather than a copy/paste of the usub_sat case.
        Op = RISCV::VSSUBU_VV;
        break;
      case Intrinsic::uadd_sat:
        Op = RISCV::VSADDU_VV;
        break;
      case Intrinsic::usub_sat:
        Op = RISCV::VSSUBU_VV;
        break;
      }
      return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
    }
    break;
  }
  case Intrinsic::fma:
  case Intrinsic::fmuladd: {
    // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector())
      return LT.first *
             getRISCVInstructionCost(RISCV::VFMADD_VV, LT.second, CostKind);
    break;
  }
  case Intrinsic::fabs: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector()) {
      // lui a0, 8
      // addi a0, a0, -1
      // vsetvli a1, zero, e16, m1, ta, ma
      // vand.vx v8, v8, a0
      // f16 with zvfhmin and bf16 with zvfbfmin
      if (LT.second.getVectorElementType() == MVT::bf16 ||
          (LT.second.getVectorElementType() == MVT::f16 &&
           !ST->hasVInstructionsF16()))
        // No vfsgnjx for these element types: mask off the sign bit with a
        // vand.vx plus two scalar instructions to build the mask constant.
        return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
                                                  CostKind) +
               2;
      else
        return LT.first *
               getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
    }
    break;
  }
  case Intrinsic::sqrt: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector()) {
      // NOTE(review): the declarations of `ConvOp` and `FsqrtOp` (presumably
      // SmallVectors of opcodes) are elided in this view.
      MVT ConvType = LT.second;
      MVT FsqrtType = LT.second;
      // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
      // will be split.
      if (LT.second.getVectorElementType() == MVT::bf16) {
        if (LT.second == MVT::nxv32bf16) {
          ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
                    RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
          FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
          ConvType = MVT::nxv16f16;
          FsqrtType = MVT::nxv16f32;
        } else {
          ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
          FsqrtOp = {RISCV::VFSQRT_V};
          FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
        }
      } else if (LT.second.getVectorElementType() == MVT::f16 &&
                 !ST->hasVInstructionsF16()) {
        if (LT.second == MVT::nxv32f16) {
          ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
                    RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
          FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
          ConvType = MVT::nxv16f16;
          FsqrtType = MVT::nxv16f32;
        } else {
          ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
          FsqrtOp = {RISCV::VFSQRT_V};
          FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
        }
      } else {
        FsqrtOp = {RISCV::VFSQRT_V};
      }

      return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) +
                         getRISCVInstructionCost(ConvOp, ConvType, CostKind));
    }
    break;
  }
  case Intrinsic::cttz:
  case Intrinsic::ctlz:
  case Intrinsic::ctpop: {
    auto LT = getTypeLegalizationCost(RetTy);
    // With Zvbb these are single native vector instructions.
    if (ST->hasStdExtZvbb() && LT.second.isVector()) {
      unsigned Op;
      switch (ICA.getID()) {
      case Intrinsic::cttz:
        Op = RISCV::VCTZ_V;
        break;
      case Intrinsic::ctlz:
        Op = RISCV::VCLZ_V;
        break;
      case Intrinsic::ctpop:
        Op = RISCV::VCPOP_V;
        break;
      }
      return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
    }
    break;
  }
  case Intrinsic::abs: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector()) {
      // vrsub.vi v10, v8, 0
      // vmax.vv v8, v8, v10
      return LT.first *
             getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
                                     LT.second, CostKind);
    }
    break;
  }
  case Intrinsic::fshl:
  case Intrinsic::fshr: {
    if (ICA.getArgs().empty())
      break;

    // Funnel-shifts are ROTL/ROTR when the first and second operand are equal.
    // When Zbb/Zbkb is enabled we can use a single ROL(W)/ROR(I)(W)
    // instruction.
    if ((ST->hasStdExtZbb() || ST->hasStdExtZbkb()) && RetTy->isIntegerTy() &&
        ICA.getArgs()[0] == ICA.getArgs()[1] &&
        (RetTy->getIntegerBitWidth() == 32 ||
         RetTy->getIntegerBitWidth() == 64) &&
        RetTy->getIntegerBitWidth() <= ST->getXLen()) {
      return 1;
    }
    break;
  }
  case Intrinsic::get_active_lane_mask: {
    if (ST->hasVInstructions()) {
      Type *ExpRetTy = VectorType::get(
          ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
      auto LT = getTypeLegalizationCost(ExpRetTy);

      // vid.v v8 // considered hoisted
      // vsaddu.vx v8, v8, a0
      // vmsltu.vx v0, v8, a1
      return LT.first *
             getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
                                     LT.second, CostKind);
    }
    break;
  }
  // TODO: add more intrinsics
  case Intrinsic::stepvector: {
    auto LT = getTypeLegalizationCost(RetTy);
    // Legalisation of illegal types involves an `index' instruction plus
    // (LT.first - 1) vector adds.
    if (ST->hasVInstructions())
      return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
             (LT.first - 1) *
                 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
    return 1 + (LT.first - 1);
  }
  case Intrinsic::experimental_cttz_elts: {
    Type *ArgTy = ICA.getArgTypes()[0];
    EVT ArgType = TLI->getValueType(DL, ArgTy, true);
    if (getTLI()->shouldExpandCttzElements(ArgType))
      break;
    InstructionCost Cost = getRISCVInstructionCost(
        RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);

    // If zero_is_poison is false, then we will generate additional
    // cmp + select instructions to convert -1 to EVL.
    Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
    if (ICA.getArgs().size() > 1 &&
        cast<ConstantInt>(ICA.getArgs()[1])->isZero())
      // NOTE(review): the continuation lines of these two calls (predicate
      // and cost-kind arguments) are elided in this view.
      Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
             getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,

    return Cost;
  }
  case Intrinsic::experimental_vp_splice: {
    // To support type-based query from vectorizer, set the index to 0.
    // Note that index only change the cost from vslide.vx to vslide.vi and in
    // current implementations they have same costs.
    // NOTE(review): the surrounding getShuffleCost(TTI::SK_Splice, ...) call
    // lines are elided in this view.
        cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
  }
  case Intrinsic::fptoui_sat:
  case Intrinsic::fptosi_sat: {
    // NOTE(review): the `Cost` initialization preceding this line is elided
    // in this view.
    bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
    Type *SrcTy = ICA.getArgTypes()[0];

    auto SrcLT = getTypeLegalizationCost(SrcTy);
    auto DstLT = getTypeLegalizationCost(RetTy);
    if (!SrcTy->isVectorTy())
      break;

    if (!SrcLT.first.isValid() || !DstLT.first.isValid())
    // NOTE(review): the guarded statement (presumably
    // `return InstructionCost::getInvalid();`) is elided in this view.

    Cost +=
        getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
                         RetTy, SrcTy, TTI::CastContextHint::None, CostKind);

    // Handle NaN.
    // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1.
    // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0.
    Type *CondTy = RetTy->getWithNewBitWidth(1);
    // NOTE(review): the continuation lines of these two calls are elided in
    // this view.
    Cost += getCmpSelInstrCost(BinaryOperator::FCmp, SrcTy, CondTy,
    Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
    return Cost;
  }
  }

  // Fall back to the per-element-type cost table for anything not special
  // cased above.
  if (ST->hasVInstructions() && RetTy->isVectorTy()) {
    if (auto LT = getTypeLegalizationCost(RetTy);
        LT.second.isVector()) {
      MVT EltTy = LT.second.getVectorElementType();
      if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
                                              ICA.getID(), EltTy))
        return LT.first * Entry->Cost;
    }
  }

  // NOTE(review): the final fall-through return (presumably deferring to
  // BaseT::getIntrinsicInstrCost) is elided in this view.
}
1692
                                                      const SCEV *Ptr,
  // Address computations for vector indexed load/store likely require an offset
  // and/or scaling.
  if (ST->hasVInstructions() && PtrTy->isVectorTy())
    return getArithmeticInstrCost(Instruction::Add, PtrTy, CostKind);

  // Scalar addresses: defer to the generic model.
  return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
}
1704
1706 Type *Src,
1709 const Instruction *I) const {
1710 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1711 if (!IsVectorType)
1712 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1713
1714 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1715 // For now, skip all fixed vector cost analysis when P extension is available
1716 // to avoid crashes in getMinRVVVectorSizeInBits()
1717 if (ST->enablePExtSIMDCodeGen() &&
1719 return 1; // Treat as single instruction cost for now
1720 }
1721
1722 // FIXME: Need to compute legalizing cost for illegal types. The current
1723 // code handles only legal types and those which can be trivially
1724 // promoted to legal.
1725 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1726 Dst->getScalarSizeInBits() > ST->getELen())
1727 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1728
1729 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1730 assert(ISD && "Invalid opcode");
1731 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1732 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1733
1734 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1735 // The shared implementation doesn't model vector widening during legalization
1736 // and instead assumes scalarization. In order to scalarize an <N x i1>
1737 // vector, we need to extend/trunc to/from i8. If we don't special case
1738 // this, we can get an infinite recursion cycle.
1739 switch (ISD) {
1740 default:
1741 break;
1742 case ISD::SIGN_EXTEND:
1743 case ISD::ZERO_EXTEND:
1744 if (Src->getScalarSizeInBits() == 1) {
1745 // We do not use vsext/vzext to extend from mask vector.
1746 // Instead we use the following instructions to extend from mask vector:
1747 // vmv.v.i v8, 0
1748 // vmerge.vim v8, v8, -1, v0 (repeated per split)
1749 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1750 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1751 DstLT.second, CostKind) +
1752 DstLT.first - 1;
1753 }
1754 break;
1755 case ISD::TRUNCATE:
1756 if (Dst->getScalarSizeInBits() == 1) {
1757 // We do not use several vncvt to truncate to mask vector. So we could
1758 // not use PowDiff to calculate it.
1759 // Instead we use the following instructions to truncate to mask vector:
1760 // vand.vi v8, v8, 1
1761 // vmsne.vi v0, v8, 0
1762 return SrcLT.first *
1763 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1764 SrcLT.second, CostKind) +
1765 SrcLT.first - 1;
1766 }
1767 break;
1768 };
1769
1770 // Our actual lowering for the case where a wider legal type is available
1771 // uses promotion to the wider type. This is reflected in the result of
1772 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1773 // scalarized if the legalized Src and Dst are not equal sized.
1774 const DataLayout &DL = this->getDataLayout();
1775 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1776 !SrcLT.first.isValid() || !DstLT.first.isValid() ||
1777 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
1778 SrcLT.second.getSizeInBits()) ||
1779 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
1780 DstLT.second.getSizeInBits()) ||
1781 SrcLT.first > 1 || DstLT.first > 1)
1782 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1783
1784 // The split cost is handled by the base getCastInstrCost
1785 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1786
1787 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1788 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
1789 switch (ISD) {
1790 case ISD::SIGN_EXTEND:
1791 case ISD::ZERO_EXTEND: {
1792 if ((PowDiff < 1) || (PowDiff > 3))
1793 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1794 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1795 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1796 unsigned Op =
1797 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1798 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1799 }
1800 case ISD::TRUNCATE:
1801 case ISD::FP_EXTEND:
1802 case ISD::FP_ROUND: {
1803 // Counts of narrow/widen instructions.
1804 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1805 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1806
1807 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1808 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1809 : RISCV::VFNCVT_F_F_W;
1811 for (; SrcEltSize != DstEltSize;) {
1812 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1813 ? MVT::getIntegerVT(DstEltSize)
1814 : MVT::getFloatingPointVT(DstEltSize);
1815 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1816 DstEltSize =
1817 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1818 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1819 }
1820 return Cost;
1821 }
1822 case ISD::FP_TO_SINT:
1823 case ISD::FP_TO_UINT: {
1824 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1825 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1826 unsigned FWCVT =
1827 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1828 unsigned FNCVT =
1829 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1830 unsigned SrcEltSize = Src->getScalarSizeInBits();
1831 unsigned DstEltSize = Dst->getScalarSizeInBits();
1833 if ((SrcEltSize == 16) &&
1834 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1835 // If the target only supports zvfhmin or it is fp16-to-i64 conversion
1836 // pre-widening to f32 and then convert f32 to integer
1837 VectorType *VecF32Ty =
1838 VectorType::get(Type::getFloatTy(Dst->getContext()),
1839 cast<VectorType>(Dst)->getElementCount());
1840 std::pair<InstructionCost, MVT> VecF32LT =
1841 getTypeLegalizationCost(VecF32Ty);
1842 Cost +=
1843 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1844 VecF32LT.second, CostKind);
1845 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1846 return Cost;
1847 }
1848 if (DstEltSize == SrcEltSize)
1849 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1850 else if (DstEltSize > SrcEltSize)
1851 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1852 else { // (SrcEltSize > DstEltSize)
1853 // First do a narrowing conversion to an integer half the size, then
1854 // truncate if needed.
1855 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1856 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1857 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1858 if ((SrcEltSize / 2) > DstEltSize) {
1859 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1860 Cost +=
1861 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1862 }
1863 }
1864 return Cost;
1865 }
1866 case ISD::SINT_TO_FP:
1867 case ISD::UINT_TO_FP: {
1868 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
1869 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
1870 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
1871 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
1872 unsigned SrcEltSize = Src->getScalarSizeInBits();
1873 unsigned DstEltSize = Dst->getScalarSizeInBits();
1874
1876 if ((DstEltSize == 16) &&
1877 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
1878 // If the target only supports zvfhmin or it is i64-to-fp16 conversion
1879 // it is converted to f32 and then converted to f16
1880 VectorType *VecF32Ty =
1881 VectorType::get(Type::getFloatTy(Dst->getContext()),
1882 cast<VectorType>(Dst)->getElementCount());
1883 std::pair<InstructionCost, MVT> VecF32LT =
1884 getTypeLegalizationCost(VecF32Ty);
1885 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
1886 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
1887 DstLT.second, CostKind);
1888 return Cost;
1889 }
1890
1891 if (DstEltSize == SrcEltSize)
1892 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1893 else if (DstEltSize > SrcEltSize) {
1894 if ((DstEltSize / 2) > SrcEltSize) {
1895 VectorType *VecTy =
1896 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
1897 cast<VectorType>(Dst)->getElementCount());
1898 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
1899 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
1900 }
1901 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1902 } else
1903 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
1904 return Cost;
1905 }
1906 }
1907 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1908}
1909
1910unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) const {
1911 if (isa<ScalableVectorType>(Ty)) {
1912 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1913 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1914 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1915 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1916 }
1917 return cast<FixedVectorType>(Ty)->getNumElements();
1918}
1919
// Cost model for vector min/max reduction intrinsics (smin/smax/umin/umax,
// minnum/maxnum, minimum/maximum) lowered to RVV reduction instructions.
// Falls back to the base-class estimate when fixed-length vectors cannot use
// RVV or the element type is wider than ELEN.
1922                                     FastMathFlags FMF,
1924  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1925    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1926
1927  // Skip if scalar size of Ty is bigger than ELEN.
1928  if (Ty->getScalarSizeInBits() > ST->getELen())
1929    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1930
1931  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  // i1 reductions are rewritten as or/and reductions by SelectionDAGBuilder,
  // so cost them as such.
1932  if (Ty->getElementType()->isIntegerTy(1)) {
1933    // SelectionDAGBuilder does following transforms:
1934    //   vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1935    //   vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
1936    if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1937      return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1938    else
1939      return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1940  }
1941
  // maximum/minimum must propagate NaNs: without no-NaNs FMF we additionally
  // pay for an explicit NaN check (vmfne + vcpop) plus materializing a
  // canonical NaN and a branch.
1942  if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1944    InstructionCost ExtraCost = 0;
1945    switch (IID) {
1946    case Intrinsic::maximum:
1947      if (FMF.noNaNs()) {
1948        Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1949      } else {
1950        Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1951                   RISCV::VFMV_F_S};
1952        // Cost of Canonical Nan + branch
1953        // lui a0, 523264
1954        // fmv.w.x fa0, a0
1955        Type *DstTy = Ty->getScalarType();
1956        const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1957        Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1958        ExtraCost = 1 +
1959                    getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1961                    getCFInstrCost(Instruction::Br, CostKind);
1962      }
1963      break;
1964
1965    case Intrinsic::minimum:
1966      if (FMF.noNaNs()) {
1967        Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1968      } else {
1969        Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1970                   RISCV::VFMV_F_S};
1971        // Cost of Canonical Nan + branch
1972        // lui a0, 523264
1973        // fmv.w.x fa0, a0
1974        Type *DstTy = Ty->getScalarType();
1975        const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1976        Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1977        ExtraCost = 1 +
1978                    getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1980                    getCFInstrCost(Instruction::Br, CostKind);
1981      }
1982      break;
1983    }
1984    return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1985  }
1986
1987  // IR Reduction is composed by one rvv reduction instruction and vmv
1988  unsigned SplitOp;
1990  switch (IID) {
1991  default:
1992    llvm_unreachable("Unsupported intrinsic");
1993  case Intrinsic::smax:
1994    SplitOp = RISCV::VMAX_VV;
1995    Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1996    break;
1997  case Intrinsic::smin:
1998    SplitOp = RISCV::VMIN_VV;
1999    Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
2000    break;
2001  case Intrinsic::umax:
2002    SplitOp = RISCV::VMAXU_VV;
2003    Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
2004    break;
2005  case Intrinsic::umin:
2006    SplitOp = RISCV::VMINU_VV;
2007    Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
2008    break;
2009  case Intrinsic::maxnum:
2010    SplitOp = RISCV::VFMAX_VV;
2011    Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
2012    break;
2013  case Intrinsic::minnum:
2014    SplitOp = RISCV::VFMIN_VV;
2015    Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
2016    break;
2017  }
2018  // Add a cost for data larger than LMUL8
  // When the legalized type splits (LT.first > 1), each extra register group
  // costs one pairwise VV min/max before the final reduction.
2019  InstructionCost SplitCost =
2020      (LT.first > 1) ? (LT.first - 1) *
2021                           getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2022                     : 0;
2023  return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2024}
2025
// Cost model for arithmetic reductions (vector.reduce.{add,or,xor,and,fadd})
// on RVV. i1 reductions are costed as mask-register sequences
// (vcpop/vfirst based); other element types use one RVV reduction plus a
// scalar move, with an extra VV op per register group beyond LMUL8.
2028                                         std::optional<FastMathFlags> FMF,
2030  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2031    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2032
2033  // Skip if scalar size of Ty is bigger than ELEN.
2034  if (Ty->getScalarSizeInBits() > ST->getELen())
2035    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2036
2037  int ISD = TLI->InstructionOpcodeToISD(Opcode);
2038  assert(ISD && "Invalid opcode");
2039
  // Only add/or/xor/and/fadd have dedicated RVV reduction costs here.
2040  if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
2041      ISD != ISD::FADD)
2042    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2043
2044  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2045  Type *ElementTy = Ty->getElementType();
2046  if (ElementTy->isIntegerTy(1)) {
2047    // Example sequences:
2048    //   vfirst.m a0, v0
2049    //   seqz a0, a0
2050    if (LT.second == MVT::v1i1)
2051      return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
2052             getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2054
2055    if (ISD == ISD::AND) {
2056      // Example sequences:
2057      //   vmand.mm v8, v9, v8  ; needed every time type is split
2058      //   vmnot.m v8, v0       ; alias for vmnand
2059      //   vcpop.m a0, v8
2060      //   seqz a0, a0
2061
2062      // See the discussion: https://github.com/llvm/llvm-project/pull/119160
2063      // For LMUL <= 8, there is no splitting,
2064      //   the sequences are vmnot, vcpop and seqz.
2065      // When LMUL > 8 and split = 1,
2066      //   the sequences are vmnand, vcpop and seqz.
2067      // When LMUL > 8 and split > 1,
2068      //   the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
2069      return ((LT.first > 2) ? (LT.first - 2) : 0) *
2070                 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
2071             getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
2072             getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2073             getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2075    } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
2076      // Example sequences:
2077      //   vsetvli a0, zero, e8, mf8, ta, ma
2078      //   vmxor.mm v8, v0, v8 ; needed every time type is split
2079      //   vcpop.m a0, v8
2080      //   andi a0, a0, 1
2081      return (LT.first - 1) *
2082                 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
2083             getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
2084    } else {
2085      assert(ISD == ISD::OR);
2086      // Example sequences:
2087      //   vsetvli a0, zero, e8, mf8, ta, ma
2088      //   vmor.mm v8, v9, v8 ; needed every time type is split
2089      //   vcpop.m a0, v0
2090      //   snez a0, a0
2091      return (LT.first - 1) *
2092                 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
2093             getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2094             getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2096    }
2097  }
2098
2099  // IR Reduction of or/and is composed by one vmv and one rvv reduction
2100  // instruction, and others is composed by two vmv and one rvv reduction
2101  // instruction
2102  unsigned SplitOp;
2104  switch (ISD) {
2105  case ISD::ADD:
2106    SplitOp = RISCV::VADD_VV;
2107    Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
2108    break;
2109  case ISD::OR:
2110    SplitOp = RISCV::VOR_VV;
2111    Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
2112    break;
2113  case ISD::XOR:
2114    SplitOp = RISCV::VXOR_VV;
2115    Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
2116    break;
2117  case ISD::AND:
2118    SplitOp = RISCV::VAND_VV;
2119    Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
2120    break;
2121  case ISD::FADD:
2122    // We can't promote f16/bf16 fadd reductions.
2123    if ((LT.second.getScalarType() == MVT::f16 && !ST->hasVInstructionsF16()) ||
2124        LT.second.getScalarType() == MVT::bf16)
2125      return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
      // Ordered FP reduction: one vfredosum per legalized register group,
      // bracketed by scalar<->vector moves.
2127      Opcodes.push_back(RISCV::VFMV_S_F);
2128      for (unsigned i = 0; i < LT.first.getValue(); i++)
2129        Opcodes.push_back(RISCV::VFREDOSUM_VS);
2130      Opcodes.push_back(RISCV::VFMV_F_S);
2131      return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2132    }
2133    SplitOp = RISCV::VFADD_VV;
2134    Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
2135    break;
2136  }
2137  // Add a cost for data larger than LMUL8
2138  InstructionCost SplitCost =
2139      (LT.first > 1) ? (LT.first - 1) *
2140                           getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2141                     : 0;
2142  return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2143}
2144
// Cost of an extending reduction, i.e. reduce(ext(x)). Only add/fadd are
// handled specially: an unsigned i1 zext-add is a single vcpop per register
// group, and widening by exactly 2x costs one extra step per split plus the
// plain reduction. Everything else defers to the base implementation.
2146    unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
2147    std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
2148  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2149    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2150                                           FMF, CostKind);
2151
2152  // Skip if scalar size of ResTy is bigger than ELEN.
2153  if (ResTy->getScalarSizeInBits() > ST->getELen())
2154    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2155                                           FMF, CostKind);
2156
2157  if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
2158    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2159                                           FMF, CostKind);
2160
2161  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2162
2163  if (IsUnsigned && Opcode == Instruction::Add &&
2164      LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
2165    // Represent vector_reduce_add(ZExt(<n x i1>)) as
2166    // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
2167    return LT.first *
2168           getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
2169  }
2170
  // Only the "widen by exactly one step" (2x element width) form is modeled.
2171  if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
2172    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2173                                           FMF, CostKind);
2174
2175  return (LT.first - 1) +
2176         getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2177}
2178
// Cost of materializing the constant operand of a store. Scalar constants
// are treated as free; a uniform (splat) vector constant costs one splat
// instruction; any other vector constant is assumed to come from a constant
// pool load.
2182  assert(OpInfo.isConstant() && "non constant operand?");
2183  if (!isa<VectorType>(Ty))
2184    // FIXME: We need to account for immediate materialization here, but doing
2185    // a decent job requires more knowledge about the immediate than we
2186    // currently have here.
2187    return 0;
2188
2189  if (OpInfo.isUniform())
2190    // vmv.v.i, vmv.v.x, or vfmv.v.f
2191    // We ignore the cost of the scalar constant materialization to be consistent
2192    // with how we treat scalar constants themselves just above.
2193    return 1;
2194
2195  return getConstantPoolLoadCost(Ty, CostKind);
2196}
2197
// Memory operation cost. Adds the cost of materializing a stored constant,
// then scales the base per-access cost by the LMUL cost of the legalized
// vector type (BasicTTI already accounts for the LT.first split factor).
2199                                              Align Alignment,
2200                                              unsigned AddressSpace,
2202                                              TTI::OperandValueInfo OpInfo,
2203                                              const Instruction *I) const {
2204  EVT VT = TLI->getValueType(DL, Src, true);
2205  // Type legalization can't handle structs
2206  if (VT == MVT::Other)
2207    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2208                                  CostKind, OpInfo, I);
2209
2211  if (Opcode == Instruction::Store && OpInfo.isConstant())
2212    Cost += getStoreImmCost(Src, OpInfo, CostKind);
2213
2214  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
2215
2216  InstructionCost BaseCost = [&]() {
2217    InstructionCost Cost = LT.first;
2219      return Cost;
2220
2221    // Our actual lowering for the case where a wider legal type is available
2222    // uses a VL predicated load on the wider type.  This is reflected in
2223    // the result of getTypeLegalizationCost, but BasicTTI assumes the
2224    // widened cases are scalarized.
2225    const DataLayout &DL = this->getDataLayout();
2226    if (Src->isVectorTy() && LT.second.isVector() &&
2227        TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
2228                            LT.second.getSizeInBits()))
2229      return Cost;
2230
2231    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2232                                  CostKind, OpInfo, I);
2233  }();
2234
2235  // Assume memory ops cost scale with the number of vector registers
2236  // possible accessed by the instruction.  Note that BasicTTI already
2237  // handles the LT.first term for us.
2238  if (ST->hasVInstructions() && LT.second.isVector() &&
2240    BaseCost *= TLI->getLMULCost(LT.second);
2241  return Cost + BaseCost;
2242}
2243
// Compare/select cost on RVV. Adds constant-materialization cost for
// constant operands, then costs vector selects as vmerge/mask-op sequences,
// vector integer compares as a single representative compare (vmslt), and
// vector FP compares per predicate. Unsupported cases, and scalar types,
// defer to the base implementation.
2245    unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
2247    TTI::OperandValueInfo Op2Info, const Instruction *I) const {
2249    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2250                                     Op1Info, Op2Info, I);
2251
2252  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2253    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2254                                     Op1Info, Op2Info, I);
2255
2256  // Skip if scalar size of ValTy is bigger than ELEN.
2257  if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
2258    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2259                                     Op1Info, Op2Info, I);
2260
  // Cost of materializing a constant operand: splats are treated as free
  // (consistent with scalar constants), everything else is a constant-pool
  // load.
2261  auto GetConstantMatCost =
2262      [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
2263    if (OpInfo.isUniform())
2264      // We return 0 because we currently ignore the cost of materializing
2265      // scalar constants in GPRs.
2266      return 0;
2267
2268    return getConstantPoolLoadCost(ValTy, CostKind);
2269  };
2270
2271  InstructionCost ConstantMatCost;
2272  if (Op1Info.isConstant())
2273    ConstantMatCost += GetConstantMatCost(Op1Info);
2274  if (Op2Info.isConstant())
2275    ConstantMatCost += GetConstantMatCost(Op2Info);
2276
2277  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2278  if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
2279    if (CondTy->isVectorTy()) {
2280      if (ValTy->getScalarSizeInBits() == 1) {
2281        // vmandn.mm v8, v8, v9
2282        // vmand.mm v9, v0, v9
2283        // vmor.mm v0, v9, v8
2284        return ConstantMatCost +
2285               LT.first *
2286                   getRISCVInstructionCost(
2287                       {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2288                       LT.second, CostKind);
2289      }
2290      // vselect and max/min are supported natively.
2291      return ConstantMatCost +
2292             LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
2293                                                CostKind);
2294    }
2295
    // Scalar condition selecting vector values: broadcast the condition and
    // compare it to build a mask first.
2296    if (ValTy->getScalarSizeInBits() == 1) {
2297      //  vmv.v.x v9, a0
2298      //  vmsne.vi v9, v9, 0
2299      //  vmandn.mm v8, v8, v9
2300      //  vmand.mm v9, v0, v9
2301      //  vmor.mm v0, v9, v8
2302      MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
2303      return ConstantMatCost +
2304             LT.first *
2305                 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
2306                                         InterimVT, CostKind) +
2307             LT.first * getRISCVInstructionCost(
2308                            {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2309                            LT.second, CostKind);
2310    }
2311
2312    // vmv.v.x v10, a0
2313    // vmsne.vi v0, v10, 0
2314    // vmerge.vvm v8, v9, v8, v0
2315    return ConstantMatCost +
2316           LT.first * getRISCVInstructionCost(
2317                          {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
2318                          LT.second, CostKind);
2319  }
2320
2321  if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
2322      CmpInst::isIntPredicate(VecPred)) {
2323    // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
2324    // provided they incur the same cost across all implementations
2325    return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
2326                                                                LT.second,
2327                                                                CostKind);
2328  }
2329
2330  if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
2331      CmpInst::isFPPredicate(VecPred)) {
2332
2333    // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
2334    if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
2335      return ConstantMatCost +
2336             getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
2337
2338    // If we do not support the input floating point vector type, use the base
2339    // one which will calculate as:
2340    // ScalarizeCost + Num * Cost for fixed vector,
2341    // InvalidCost for scalable vector.
2342    if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
2343        (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
2344        (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
2345      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2346                                       Op1Info, Op2Info, I);
2347
2348    // Assuming vector fp compare and mask instructions are all the same cost
2349    // until a need arises to differentiate them.
2350    switch (VecPred) {
2351    case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
2352    case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
2353    case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
2354    case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
2355      return ConstantMatCost +
2356             LT.first * getRISCVInstructionCost(
2357                            {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
2358                            LT.second, CostKind);
2359
2360    case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
2361    case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
2362    case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
2363    case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
2364      return ConstantMatCost +
2365             LT.first *
2366                 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
2367                                         LT.second, CostKind);
2368
2369    case CmpInst::FCMP_OEQ: // vmfeq.vv
2370    case CmpInst::FCMP_OGT: // vmflt.vv
2371    case CmpInst::FCMP_OGE: // vmfle.vv
2372    case CmpInst::FCMP_OLT: // vmflt.vv
2373    case CmpInst::FCMP_OLE: // vmfle.vv
2374    case CmpInst::FCMP_UNE: // vmfne.vv
2375      return ConstantMatCost +
2376             LT.first *
2377                 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
2378    default:
2379      break;
2380    }
2381  }
2382
2383  // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
2384  // instructions will lower to SELECT_CC and lower to PseudoCCMOVGPR which will
2385  // generate a conditional branch + mv. The cost of scalar (icmp + select) will
2386  // be (0 + select instr cost).
2387  if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
2388      ValTy->isIntegerTy() && !I->user_empty()) {
2389    if (all_of(I->users(), [&](const User *U) {
2390          return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
2391                 U->getType()->isIntegerTy() &&
2392                 !isa<ConstantData>(U->getOperand(1)) &&
2393                 !isa<ConstantData>(U->getOperand(2));
2394        }))
2395      return 0;
2396  }
2397
2398  // TODO: Add cost for scalar type.
2399
2400  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2401                                   Op1Info, Op2Info, I);
2402}
2403
// Control-flow instruction cost: PHIs are free and other control flow costs
// 1; for the throughput path, branches are assumed predicted and cost 0.
2406                                             const Instruction *I) const {
2408    return Opcode == Instruction::PHI ? 0 : 1;
2409  // Branches are assumed to be predicted.
2410  return 0;
2411}
2412
// Cost of insertelement/extractelement. Models the vslide + vmv.x.s /
// vmv.s.x lowering, with special handling for: P-extension fixed vectors,
// types legalized to scalars, i1 masks (expanded via e8), split register
// groups that go through the stack, and i64 elements on RV32.
2415                                                 unsigned Index,
2416                                                 const Value *Op0,
2417                                                 const Value *Op1) const {
2418  assert(Val->isVectorTy() && "This must be a vector type");
2419
2420  // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
2421  // For now, skip all fixed vector cost analysis when P extension is available
2422  // to avoid crashes in getMinRVVVectorSizeInBits()
2423  if (ST->enablePExtSIMDCodeGen() && isa<FixedVectorType>(Val)) {
2424    return 1; // Treat as single instruction cost for now
2425  }
2426
2427  if (Opcode != Instruction::ExtractElement &&
2428      Opcode != Instruction::InsertElement)
2429    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
2430
2431  // Legalize the type.
2432  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2433
2434  // This type is legalized to a scalar type.
2435  if (!LT.second.isVector()) {
2436    auto *FixedVecTy = cast<FixedVectorType>(Val);
2437    // If Index is a known constant, cost is zero.
2438    if (Index != -1U)
2439      return 0;
2440    // Extract/InsertElement with non-constant index is very costly when
2441    // scalarized; estimate cost of loads/stores sequence via the stack:
2442    // ExtractElement cost: store vector to stack, load scalar;
2443    // InsertElement cost: store vector to stack, store scalar, load vector.
2444    Type *ElemTy = FixedVecTy->getElementType();
2445    auto NumElems = FixedVecTy->getNumElements();
2446    auto Align = DL.getPrefTypeAlign(ElemTy);
2447    InstructionCost LoadCost =
2448        getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
2449    InstructionCost StoreCost =
2450        getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
2451    return Opcode == Instruction::ExtractElement
2452               ? StoreCost * NumElems + LoadCost
2453               : (StoreCost + LoadCost) * NumElems + StoreCost;
2454  }
2455
2456  // For unsupported scalable vector.
2457  if (LT.second.isScalableVector() && !LT.first.isValid())
2458    return LT.first;
2459
2460  // Mask vector extract/insert is expanded via e8.
  // Cost the zext to the wide type, the insert/extract on it, and (for
  // insert) the trunc back to the mask type.
2461  if (Val->getScalarSizeInBits() == 1) {
2462    VectorType *WideTy =
2464        cast<VectorType>(Val)->getElementCount());
2465    if (Opcode == Instruction::ExtractElement) {
2466      InstructionCost ExtendCost
2467        = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2469      InstructionCost ExtractCost
2470        = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2471      return ExtendCost + ExtractCost;
2472    }
2473    InstructionCost ExtendCost
2474      = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2476    InstructionCost InsertCost
2477      = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2478    InstructionCost TruncCost
2479      = getCastInstrCost(Instruction::Trunc, Val, WideTy,
2481    return ExtendCost + InsertCost + TruncCost;
2482  }
2483
2484
2485  // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
2486  // and vslideup + vmv.s.x to insert element to vector.
2487  unsigned BaseCost = 1;
2488  // When insertelement we should add the index with 1 as the input of vslideup.
2489  unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
2490
2491  if (Index != -1U) {
2492    // The type may be split. For fixed-width vectors we can normalize the
2493    // index to the new type.
2494    if (LT.second.isFixedLengthVector()) {
2495      unsigned Width = LT.second.getVectorNumElements();
2496      Index = Index % Width;
2497    }
2498
2499    // If exact VLEN is known, we will insert/extract into the appropriate
2500    // subvector with no additional subvector insert/extract cost.
2501    if (auto VLEN = ST->getRealVLen()) {
2502      unsigned EltSize = LT.second.getScalarSizeInBits();
2503      unsigned M1Max = *VLEN / EltSize;
2504      Index = Index % M1Max;
2505    }
2506
2507    if (Index == 0)
2508      // We can extract/insert the first element without vslidedown/vslideup.
2509      SlideCost = 0;
2510    else if (ST->hasVendorXRivosVisni() && isUInt<5>(Index) &&
2511             Val->getScalarType()->isIntegerTy())
2512      SlideCost = 0; // With ri.vinsert/ri.vextract there is no slide needed
2513    else if (Opcode == Instruction::InsertElement)
2514      SlideCost = 1; // With a constant index, we do not need to use addi.
2515  }
2516
2517  // When the vector needs to split into multiple register groups and the index
2518  // exceeds single vector register group, we need to insert/extract the element
2519  // via stack.
2520  if (LT.first > 1 &&
2521      ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
2522                          LT.second.isScalableVector()))) {
2523    Type *ScalarType = Val->getScalarType();
2524    Align VecAlign = DL.getPrefTypeAlign(Val);
2525    Align SclAlign = DL.getPrefTypeAlign(ScalarType);
2526    // Extra addi for unknown index.
2527    InstructionCost IdxCost = Index == -1U ? 1 : 0;
2528
2529    // Store all split vectors into stack and load the target element.
2530    if (Opcode == Instruction::ExtractElement)
2531      return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2532             getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
2533                             CostKind) +
2534             IdxCost;
2535
2536    // Store all split vectors into stack and store the target element and load
2537    // vectors back.
2538    return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2539           getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2540           getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2541                           CostKind) +
2542           IdxCost;
2543  }
2544
2545  // Extract i64 in the target that has XLEN=32 need more instruction.
2546  if (Val->getScalarType()->isIntegerTy() &&
2547      ST->getXLen() < Val->getScalarSizeInBits()) {
2548    // For extractelement, we need the following instructions:
2549    // vsetivli zero, 1, e64, m1, ta, mu (not count)
2550    // vslidedown.vx v8, v8, a0
2551    // vmv.x.s a0, v8
2552    // li a1, 32
2553    // vsrl.vx v8, v8, a1
2554    // vmv.x.s a1, v8
2555
2556    // For insertelement, we need the following instructions:
2557    // vsetivli zero, 2, e32, m4, ta, mu (not count)
2558    // vmv.v.i v12, 0
2559    // vslide1up.vx v16, v12, a1
2560    // vslide1up.vx v12, v16, a0
2561    // addi a0, a2, 1
2562    // vsetvli zero, a0, e64, m4, tu, mu (not count)
2563    // vslideup.vx v8, v12, a2
2564
2565    // TODO: should we count these special vsetvlis?
2566    BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2567  }
2568  return BaseCost + SlideCost;
2569}
2570
// Cost of a vector element access addressed from the end of the vector.
// For scalable vectors, the reverse index is converted to a forward index
// using the known-minimum element count and costed via the regular
// insert/extract path.
2574                                        unsigned Index) const {
2575  if (isa<FixedVectorType>(Val))
2577                                        Index);
2578
2579  // TODO: This code replicates what LoopVectorize.cpp used to do when asking
2580  // for the cost of extracting the last lane of a scalable vector. It probably
2581  // needs a more accurate cost.
2582  ElementCount EC = cast<VectorType>(Val)->getElementCount();
2583  assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
2584  return getVectorInstrCost(Opcode, Val, CostKind,
2585                            EC.getKnownMinValue() - 1 - Index, nullptr,
2586                            nullptr);
2587}
2588
// Vector arithmetic cost. Maps the IR opcode to one representative RVV
// instruction, adds f16/bf16->f32 promotion cast costs where the operation
// is promoted, adds constant-materialization cost for constant operands, and
// doubles the FP instruction cost to stay consistent with BasicTTI's scalar
// model (which treats FP ops as 2x integer ops).
2590    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2592    ArrayRef<const Value *> Args, const Instruction *CxtI) const {
2593
2594  // TODO: Handle more cost kinds.
2596    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2597                                         Args, CxtI);
2598
2599  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2600    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2601                                         Args, CxtI);
2602
2603  // Skip if scalar size of Ty is bigger than ELEN.
2604  if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
2605    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2606                                         Args, CxtI);
2607
2608  // Legalize the type.
2609  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2610
2611  // TODO: Handle scalar type.
2612  if (!LT.second.isVector())
2613    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2614                                         Args, CxtI);
2615
2616  // f16 with zvfhmin and bf16 will be promoted to f32.
2617  // FIXME: nxv32[b]f16 will be custom lowered and split.
2618  unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
2619  InstructionCost CastCost = 0;
2620  if ((LT.second.getVectorElementType() == MVT::f16 ||
2621       LT.second.getVectorElementType() == MVT::bf16) &&
2622      TLI->getOperationAction(ISDOpcode, LT.second) ==
2624    MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
2625    Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
2626    Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2627    // Add cost of extending arguments
2628    CastCost += LT.first * Args.size() *
2629                getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
2631    // Add cost of truncating result
2632    CastCost +=
2633        LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
2635    // Compute cost of op in promoted type
2636    LT.second = PromotedVT;
2637  }
2638
  // Cost of materializing a constant operand: splattable constants are free,
  // everything else is a constant-pool load.
2639  auto getConstantMatCost =
2640      [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
2641    if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
2642      // Two sub-cases:
2643      // * Has a 5 bit immediate operand which can be splatted.
2644      // * Has a larger immediate which must be materialized in scalar register
2645      // We return 0 for both as we currently ignore the cost of materializing
2646      // scalar constants in GPRs.
2647      return 0;
2648
2649    return getConstantPoolLoadCost(Ty, CostKind);
2650  };
2651
2652  // Add the cost of materializing any constant vectors required.
2653  InstructionCost ConstantMatCost = 0;
2654  if (Op1Info.isConstant())
2655    ConstantMatCost += getConstantMatCost(0, Op1Info);
2656  if (Op2Info.isConstant())
2657    ConstantMatCost += getConstantMatCost(1, Op2Info);
2658
  // Pick one representative RVV instruction per opcode class; cost-equivalent
  // variants share a representative (e.g. all shifts cost like vsll).
2659  unsigned Op;
2660  switch (ISDOpcode) {
2661  case ISD::ADD:
2662  case ISD::SUB:
2663    Op = RISCV::VADD_VV;
2664    break;
2665  case ISD::SHL:
2666  case ISD::SRL:
2667  case ISD::SRA:
2668    Op = RISCV::VSLL_VV;
2669    break;
2670  case ISD::AND:
2671  case ISD::OR:
2672  case ISD::XOR:
2673    Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
2674    break;
2675  case ISD::MUL:
2676  case ISD::MULHS:
2677  case ISD::MULHU:
2678    Op = RISCV::VMUL_VV;
2679    break;
2680  case ISD::SDIV:
2681  case ISD::UDIV:
2682    Op = RISCV::VDIV_VV;
2683    break;
2684  case ISD::SREM:
2685  case ISD::UREM:
2686    Op = RISCV::VREM_VV;
2687    break;
2688  case ISD::FADD:
2689  case ISD::FSUB:
2690    Op = RISCV::VFADD_VV;
2691    break;
2692  case ISD::FMUL:
2693    Op = RISCV::VFMUL_VV;
2694    break;
2695  case ISD::FDIV:
2696    Op = RISCV::VFDIV_VV;
2697    break;
2698  case ISD::FNEG:
2699    Op = RISCV::VFSGNJN_VV;
2700    break;
2701  default:
2702    // Assuming all other instructions have the same cost until a need arises to
2703    // differentiate them.
2704    return CastCost + ConstantMatCost +
2705           BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2706                                         Args, CxtI);
2707  }
2708
2709  InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
2710  // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
2711  // ops are twice as expensive as integer ops. Do the same for vectors so
2712  // scalar floating point ops aren't cheaper than their vector equivalents.
2713  if (Ty->isFPOrFPVectorTy())
2714    InstrCost *= 2;
2715  return CastCost + ConstantMatCost + LT.first * InstrCost;
2716}
2717
2718// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
2720 ArrayRef<const Value *> Ptrs, const Value *Base,
2721 const TTI::PointersChainInfo &Info, Type *AccessTy,
2724 // In the basic model we take into account GEP instructions only
2725 // (although here can come alloca instruction, a value, constants and/or
2726 // constant expressions, PHIs, bitcasts ... whatever allowed to be used as a
2727 // pointer). Typically, if Base is a not a GEP-instruction and all the
2728 // pointers are relative to the same base address, all the rest are
2729 // either GEP instructions, PHIs, bitcasts or constants. When we have same
2730 // base, we just calculate cost of each non-Base GEP as an ADD operation if
2731 // any their index is a non-const.
2732 // If no known dependencies between the pointers cost is calculated as a sum
2733 // of costs of GEP instructions.
2734 for (auto [I, V] : enumerate(Ptrs)) {
2735 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
2736 if (!GEP)
2737 continue;
2738 if (Info.isSameBase() && V != Base) {
2739 if (GEP->hasAllConstantIndices())
2740 continue;
2741 // If the chain is unit-stride and BaseReg + stride*i is a legal
2742 // addressing mode, then presume the base GEP is sitting around in a
2743 // register somewhere and check if we can fold the offset relative to
2744 // it.
2745 unsigned Stride = DL.getTypeStoreSize(AccessTy);
2746 if (Info.isUnitStride() &&
2747 isLegalAddressingMode(AccessTy,
2748 /* BaseGV */ nullptr,
2749 /* BaseOffset */ Stride * I,
2750 /* HasBaseReg */ true,
2751 /* Scale */ 0,
2752 GEP->getType()->getPointerAddressSpace()))
2753 continue;
2754 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
2755 {TTI::OK_AnyValue, TTI::OP_None},
2756 {TTI::OK_AnyValue, TTI::OP_None}, {});
2757 } else {
2758 SmallVector<const Value *> Indices(GEP->indices());
2759 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
2760 Indices, AccessTy, CostKind);
2761 }
2762 }
2763 return Cost;
2764}
2765
2768 OptimizationRemarkEmitter *ORE) const {
2769 // TODO: More tuning on benchmarks and metrics with changes as needed
2770 // would apply to all settings below to enable performance.
2771
2772
2773 if (ST->enableDefaultUnroll())
2774 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2775
2776 // Enable Upper bound unrolling universally, not dependent upon the conditions
2777 // below.
2778 UP.UpperBound = true;
2779
2780 // Disable loop unrolling for Oz and Os.
2781 UP.OptSizeThreshold = 0;
2783 if (L->getHeader()->getParent()->hasOptSize())
2784 return;
2785
2786 SmallVector<BasicBlock *, 4> ExitingBlocks;
2787 L->getExitingBlocks(ExitingBlocks);
2788 LLVM_DEBUG(dbgs() << "Loop has:\n"
2789 << "Blocks: " << L->getNumBlocks() << "\n"
2790 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2791
2792 // Only allow another exit other than the latch. This acts as an early exit
2793 // as it mirrors the profitability calculation of the runtime unroller.
2794 if (ExitingBlocks.size() > 2)
2795 return;
2796
2797 // Limit the CFG of the loop body for targets with a branch predictor.
2798 // Allowing 4 blocks permits if-then-else diamonds in the body.
2799 if (L->getNumBlocks() > 4)
2800 return;
2801
2802 // Scan the loop: don't unroll loops with calls as this could prevent
2803 // inlining. Don't unroll auto-vectorized loops either, though do allow
2804 // unrolling of the scalar remainder.
2805 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
2807 for (auto *BB : L->getBlocks()) {
2808 for (auto &I : *BB) {
2809 // Both auto-vectorized loops and the scalar remainder have the
2810 // isvectorized attribute, so differentiate between them by the presence
2811 // of vector instructions.
2812 if (IsVectorized && (I.getType()->isVectorTy() ||
2813 llvm::any_of(I.operand_values(), [](Value *V) {
2814 return V->getType()->isVectorTy();
2815 })))
2816 return;
2817
2818 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2819 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2820 if (!isLoweredToCall(F))
2821 continue;
2822 }
2823 return;
2824 }
2825
2826 SmallVector<const Value *> Operands(I.operand_values());
2827 Cost += getInstructionCost(&I, Operands,
2829 }
2830 }
2831
2832 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2833
2834 UP.Partial = true;
2835 UP.Runtime = true;
2836 UP.UnrollRemainder = true;
2837 UP.UnrollAndJam = true;
2838
2839 // Force unrolling small loops can be very useful because of the branch
2840 // taken cost of the backedge.
2841 if (Cost < 12)
2842 UP.Force = true;
2843}
2844
2849
2851 MemIntrinsicInfo &Info) const {
2852 const DataLayout &DL = getDataLayout();
2853 Intrinsic::ID IID = Inst->getIntrinsicID();
2854 LLVMContext &C = Inst->getContext();
2855 bool HasMask = false;
2856
2857 auto getSegNum = [](const IntrinsicInst *II, unsigned PtrOperandNo,
2858 bool IsWrite) -> int64_t {
2859 if (auto *TarExtTy =
2860 dyn_cast<TargetExtType>(II->getArgOperand(0)->getType()))
2861 return TarExtTy->getIntParameter(0);
2862
2863 return 1;
2864 };
2865
2866 switch (IID) {
2867 case Intrinsic::riscv_vle_mask:
2868 case Intrinsic::riscv_vse_mask:
2869 case Intrinsic::riscv_vlseg2_mask:
2870 case Intrinsic::riscv_vlseg3_mask:
2871 case Intrinsic::riscv_vlseg4_mask:
2872 case Intrinsic::riscv_vlseg5_mask:
2873 case Intrinsic::riscv_vlseg6_mask:
2874 case Intrinsic::riscv_vlseg7_mask:
2875 case Intrinsic::riscv_vlseg8_mask:
2876 case Intrinsic::riscv_vsseg2_mask:
2877 case Intrinsic::riscv_vsseg3_mask:
2878 case Intrinsic::riscv_vsseg4_mask:
2879 case Intrinsic::riscv_vsseg5_mask:
2880 case Intrinsic::riscv_vsseg6_mask:
2881 case Intrinsic::riscv_vsseg7_mask:
2882 case Intrinsic::riscv_vsseg8_mask:
2883 HasMask = true;
2884 [[fallthrough]];
2885 case Intrinsic::riscv_vle:
2886 case Intrinsic::riscv_vse:
2887 case Intrinsic::riscv_vlseg2:
2888 case Intrinsic::riscv_vlseg3:
2889 case Intrinsic::riscv_vlseg4:
2890 case Intrinsic::riscv_vlseg5:
2891 case Intrinsic::riscv_vlseg6:
2892 case Intrinsic::riscv_vlseg7:
2893 case Intrinsic::riscv_vlseg8:
2894 case Intrinsic::riscv_vsseg2:
2895 case Intrinsic::riscv_vsseg3:
2896 case Intrinsic::riscv_vsseg4:
2897 case Intrinsic::riscv_vsseg5:
2898 case Intrinsic::riscv_vsseg6:
2899 case Intrinsic::riscv_vsseg7:
2900 case Intrinsic::riscv_vsseg8: {
2901 // Intrinsic interface:
2902 // riscv_vle(merge, ptr, vl)
2903 // riscv_vle_mask(merge, ptr, mask, vl, policy)
2904 // riscv_vse(val, ptr, vl)
2905 // riscv_vse_mask(val, ptr, mask, vl, policy)
2906 // riscv_vlseg#(merge, ptr, vl, sew)
2907 // riscv_vlseg#_mask(merge, ptr, mask, vl, policy, sew)
2908 // riscv_vsseg#(val, ptr, vl, sew)
2909 // riscv_vsseg#_mask(val, ptr, mask, vl, sew)
2910 bool IsWrite = Inst->getType()->isVoidTy();
2911 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
2912 // The results of segment loads are TargetExtType.
2913 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
2914 unsigned SEW =
2915 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
2916 ->getZExtValue();
2917 Ty = TarExtTy->getTypeParameter(0U);
2919 IntegerType::get(C, SEW),
2920 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
2921 }
2922 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
2923 unsigned VLIndex = RVVIInfo->VLOperand;
2924 unsigned PtrOperandNo = VLIndex - 1 - HasMask;
2925 MaybeAlign Alignment =
2926 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
2927 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
2928 Value *Mask = ConstantInt::getTrue(MaskType);
2929 if (HasMask)
2930 Mask = Inst->getArgOperand(VLIndex - 1);
2931 Value *EVL = Inst->getArgOperand(VLIndex);
2932 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
2933 // RVV uses contiguous elements as a segment.
2934 if (SegNum > 1) {
2935 unsigned ElemSize = Ty->getScalarSizeInBits();
2936 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
2937 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
2938 }
2939 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
2940 Alignment, Mask, EVL);
2941 return true;
2942 }
2943 case Intrinsic::riscv_vlse_mask:
2944 case Intrinsic::riscv_vsse_mask:
2945 case Intrinsic::riscv_vlsseg2_mask:
2946 case Intrinsic::riscv_vlsseg3_mask:
2947 case Intrinsic::riscv_vlsseg4_mask:
2948 case Intrinsic::riscv_vlsseg5_mask:
2949 case Intrinsic::riscv_vlsseg6_mask:
2950 case Intrinsic::riscv_vlsseg7_mask:
2951 case Intrinsic::riscv_vlsseg8_mask:
2952 case Intrinsic::riscv_vssseg2_mask:
2953 case Intrinsic::riscv_vssseg3_mask:
2954 case Intrinsic::riscv_vssseg4_mask:
2955 case Intrinsic::riscv_vssseg5_mask:
2956 case Intrinsic::riscv_vssseg6_mask:
2957 case Intrinsic::riscv_vssseg7_mask:
2958 case Intrinsic::riscv_vssseg8_mask:
2959 HasMask = true;
2960 [[fallthrough]];
2961 case Intrinsic::riscv_vlse:
2962 case Intrinsic::riscv_vsse:
2963 case Intrinsic::riscv_vlsseg2:
2964 case Intrinsic::riscv_vlsseg3:
2965 case Intrinsic::riscv_vlsseg4:
2966 case Intrinsic::riscv_vlsseg5:
2967 case Intrinsic::riscv_vlsseg6:
2968 case Intrinsic::riscv_vlsseg7:
2969 case Intrinsic::riscv_vlsseg8:
2970 case Intrinsic::riscv_vssseg2:
2971 case Intrinsic::riscv_vssseg3:
2972 case Intrinsic::riscv_vssseg4:
2973 case Intrinsic::riscv_vssseg5:
2974 case Intrinsic::riscv_vssseg6:
2975 case Intrinsic::riscv_vssseg7:
2976 case Intrinsic::riscv_vssseg8: {
2977 // Intrinsic interface:
2978 // riscv_vlse(merge, ptr, stride, vl)
2979 // riscv_vlse_mask(merge, ptr, stride, mask, vl, policy)
2980 // riscv_vsse(val, ptr, stride, vl)
2981 // riscv_vsse_mask(val, ptr, stride, mask, vl, policy)
2982 // riscv_vlsseg#(merge, ptr, offset, vl, sew)
2983 // riscv_vlsseg#_mask(merge, ptr, offset, mask, vl, policy, sew)
2984 // riscv_vssseg#(val, ptr, offset, vl, sew)
2985 // riscv_vssseg#_mask(val, ptr, offset, mask, vl, sew)
2986 bool IsWrite = Inst->getType()->isVoidTy();
2987 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
2988 // The results of segment loads are TargetExtType.
2989 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
2990 unsigned SEW =
2991 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
2992 ->getZExtValue();
2993 Ty = TarExtTy->getTypeParameter(0U);
2995 IntegerType::get(C, SEW),
2996 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
2997 }
2998 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
2999 unsigned VLIndex = RVVIInfo->VLOperand;
3000 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
3001 MaybeAlign Alignment =
3002 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
3003
3004 Value *Stride = Inst->getArgOperand(PtrOperandNo + 1);
3005 // Use the pointer alignment as the element alignment if the stride is a
3006 // multiple of the pointer alignment. Otherwise, the element alignment
3007 // should be the greatest common divisor of pointer alignment and stride.
3008 // For simplicity, just consider unalignment for elements.
3009 unsigned PointerAlign = Alignment.valueOrOne().value();
3010 if (!isa<ConstantInt>(Stride) ||
3011 cast<ConstantInt>(Stride)->getZExtValue() % PointerAlign != 0)
3012 Alignment = Align(1);
3013
3014 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3015 Value *Mask = ConstantInt::getTrue(MaskType);
3016 if (HasMask)
3017 Mask = Inst->getArgOperand(VLIndex - 1);
3018 Value *EVL = Inst->getArgOperand(VLIndex);
3019 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3020 // RVV uses contiguous elements as a segment.
3021 if (SegNum > 1) {
3022 unsigned ElemSize = Ty->getScalarSizeInBits();
3023 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3024 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3025 }
3026 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3027 Alignment, Mask, EVL, Stride);
3028 return true;
3029 }
3030 case Intrinsic::riscv_vloxei_mask:
3031 case Intrinsic::riscv_vluxei_mask:
3032 case Intrinsic::riscv_vsoxei_mask:
3033 case Intrinsic::riscv_vsuxei_mask:
3034 case Intrinsic::riscv_vloxseg2_mask:
3035 case Intrinsic::riscv_vloxseg3_mask:
3036 case Intrinsic::riscv_vloxseg4_mask:
3037 case Intrinsic::riscv_vloxseg5_mask:
3038 case Intrinsic::riscv_vloxseg6_mask:
3039 case Intrinsic::riscv_vloxseg7_mask:
3040 case Intrinsic::riscv_vloxseg8_mask:
3041 case Intrinsic::riscv_vluxseg2_mask:
3042 case Intrinsic::riscv_vluxseg3_mask:
3043 case Intrinsic::riscv_vluxseg4_mask:
3044 case Intrinsic::riscv_vluxseg5_mask:
3045 case Intrinsic::riscv_vluxseg6_mask:
3046 case Intrinsic::riscv_vluxseg7_mask:
3047 case Intrinsic::riscv_vluxseg8_mask:
3048 case Intrinsic::riscv_vsoxseg2_mask:
3049 case Intrinsic::riscv_vsoxseg3_mask:
3050 case Intrinsic::riscv_vsoxseg4_mask:
3051 case Intrinsic::riscv_vsoxseg5_mask:
3052 case Intrinsic::riscv_vsoxseg6_mask:
3053 case Intrinsic::riscv_vsoxseg7_mask:
3054 case Intrinsic::riscv_vsoxseg8_mask:
3055 case Intrinsic::riscv_vsuxseg2_mask:
3056 case Intrinsic::riscv_vsuxseg3_mask:
3057 case Intrinsic::riscv_vsuxseg4_mask:
3058 case Intrinsic::riscv_vsuxseg5_mask:
3059 case Intrinsic::riscv_vsuxseg6_mask:
3060 case Intrinsic::riscv_vsuxseg7_mask:
3061 case Intrinsic::riscv_vsuxseg8_mask:
3062 HasMask = true;
3063 [[fallthrough]];
3064 case Intrinsic::riscv_vloxei:
3065 case Intrinsic::riscv_vluxei:
3066 case Intrinsic::riscv_vsoxei:
3067 case Intrinsic::riscv_vsuxei:
3068 case Intrinsic::riscv_vloxseg2:
3069 case Intrinsic::riscv_vloxseg3:
3070 case Intrinsic::riscv_vloxseg4:
3071 case Intrinsic::riscv_vloxseg5:
3072 case Intrinsic::riscv_vloxseg6:
3073 case Intrinsic::riscv_vloxseg7:
3074 case Intrinsic::riscv_vloxseg8:
3075 case Intrinsic::riscv_vluxseg2:
3076 case Intrinsic::riscv_vluxseg3:
3077 case Intrinsic::riscv_vluxseg4:
3078 case Intrinsic::riscv_vluxseg5:
3079 case Intrinsic::riscv_vluxseg6:
3080 case Intrinsic::riscv_vluxseg7:
3081 case Intrinsic::riscv_vluxseg8:
3082 case Intrinsic::riscv_vsoxseg2:
3083 case Intrinsic::riscv_vsoxseg3:
3084 case Intrinsic::riscv_vsoxseg4:
3085 case Intrinsic::riscv_vsoxseg5:
3086 case Intrinsic::riscv_vsoxseg6:
3087 case Intrinsic::riscv_vsoxseg7:
3088 case Intrinsic::riscv_vsoxseg8:
3089 case Intrinsic::riscv_vsuxseg2:
3090 case Intrinsic::riscv_vsuxseg3:
3091 case Intrinsic::riscv_vsuxseg4:
3092 case Intrinsic::riscv_vsuxseg5:
3093 case Intrinsic::riscv_vsuxseg6:
3094 case Intrinsic::riscv_vsuxseg7:
3095 case Intrinsic::riscv_vsuxseg8: {
3096 // Intrinsic interface (only listed ordered version):
3097 // riscv_vloxei(merge, ptr, index, vl)
3098 // riscv_vloxei_mask(merge, ptr, index, mask, vl, policy)
3099 // riscv_vsoxei(val, ptr, index, vl)
3100 // riscv_vsoxei_mask(val, ptr, index, mask, vl, policy)
3101 // riscv_vloxseg#(merge, ptr, index, vl, sew)
3102 // riscv_vloxseg#_mask(merge, ptr, index, mask, vl, policy, sew)
3103 // riscv_vsoxseg#(val, ptr, index, vl, sew)
3104 // riscv_vsoxseg#_mask(val, ptr, index, mask, vl, sew)
3105 bool IsWrite = Inst->getType()->isVoidTy();
3106 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
3107 // The results of segment loads are TargetExtType.
3108 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
3109 unsigned SEW =
3110 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
3111 ->getZExtValue();
3112 Ty = TarExtTy->getTypeParameter(0U);
3114 IntegerType::get(C, SEW),
3115 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
3116 }
3117 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
3118 unsigned VLIndex = RVVIInfo->VLOperand;
3119 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
3120 Value *Mask;
3121 if (HasMask) {
3122 Mask = Inst->getArgOperand(VLIndex - 1);
3123 } else {
3124 // Mask cannot be nullptr here: vector GEP produces <vscale x N x ptr>,
3125 // and casting that to scalar i64 triggers a vector/scalar mismatch
3126 // assertion in CreatePointerCast. Use an all-true mask so ASan lowers it
3127 // via extractelement instead.
3128 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3129 Mask = ConstantInt::getTrue(MaskType);
3130 }
3131 Value *EVL = Inst->getArgOperand(VLIndex);
3132 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3133 // RVV uses contiguous elements as a segment.
3134 if (SegNum > 1) {
3135 unsigned ElemSize = Ty->getScalarSizeInBits();
3136 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3137 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3138 }
3139 Value *OffsetOp = Inst->getArgOperand(PtrOperandNo + 1);
3140 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3141 Align(1), Mask, EVL,
3142 /* Stride */ nullptr, OffsetOp);
3143 return true;
3144 }
3145 }
3146 return false;
3147}
3148
3150 if (Ty->isVectorTy()) {
3151 // f16 with only zvfhmin and bf16 will be promoted to f32
3152 Type *EltTy = cast<VectorType>(Ty)->getElementType();
3153 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
3154 EltTy->isBFloatTy())
3155 Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
3156 cast<VectorType>(Ty));
3157
3158 TypeSize Size = DL.getTypeSizeInBits(Ty);
3159 if (Size.isScalable() && ST->hasVInstructions())
3160 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
3161
3162 if (ST->useRVVForFixedLengthVectors())
3163 return divideCeil(Size, ST->getRealMinVLen());
3164 }
3165
3166 return BaseT::getRegUsageForType(Ty);
3167}
3168
3169unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
3170 if (SLPMaxVF.getNumOccurrences())
3171 return SLPMaxVF;
3172
3173 // Return how many elements can fit in getRegisterBitwidth. This is the
3174 // same routine as used in LoopVectorizer. We should probably be
3175 // accounting for whether we actually have instructions with the right
3176 // lane type, but we don't have enough information to do that without
3177 // some additional plumbing which hasn't been justified yet.
3178 TypeSize RegWidth =
3180 // If no vector registers, or absurd element widths, disable
3181 // vectorization by returning 1.
3182 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
3183}
3184
3188
3190 return ST->enableUnalignedVectorMem();
3191}
3192
3195 ScalarEvolution *SE) const {
3196 if (ST->hasVendorXCVmem() && !ST->is64Bit())
3197 return TTI::AMK_PostIndexed;
3198
3200}
3201
3203 const TargetTransformInfo::LSRCost &C2) const {
3204 // RISC-V specific here are "instruction number 1st priority".
3205 // If we need to emit adds inside the loop to add up base registers, then
3206 // we need at least one extra temporary register.
// Fold that one-extra-temporary penalty into the register counts before
// comparing, so a solution needing base adds looks one register costlier.
3207 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
3208 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
// Lexicographic comparison: instruction count first, then adjusted register
// count, then the remaining LSR cost components in decreasing priority.
3209 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
3210 C1.NumIVMuls, C1.NumBaseAdds,
3211 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
3212 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
3213 C2.NumIVMuls, C2.NumBaseAdds,
3214 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
3215 }
3216
3218 Align Alignment) const {
// Only fixed-length vector types are supported here; bail out for scalars
// and scalable vectors.
3219 auto *VTy = dyn_cast<VectorType>(DataTy);
3220 if (!VTy || VTy->isScalableTy())
3221 return false;
3222
// The underlying masked load/store must itself be legal for this type and
// alignment.
3223 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3224 return false;
3225
3226 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
3227 // scalarize these types with LMUL >= maximum fixed-length LMUL.
3228 if (VTy->getElementType()->isIntegerTy(8))
3229 if (VTy->getElementCount().getFixedValue() > 256)
3230 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
3231 ST->getMaxLMULForFixedLengthVectors();
3232 return true;
3233 }
3234
3236 Align Alignment) const {
// Only fixed-length vector types are supported; reject scalars and
// scalable vectors.
3237 auto *VTy = dyn_cast<VectorType>(DataTy);
3238 if (!VTy || VTy->isScalableTy())
3239 return false;
3240
// Legal only if the corresponding masked load/store is legal for this
// type/alignment combination.
3241 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3242 return false;
3243 return true;
3244 }
3245
3246 /// See if \p I should be considered for address type promotion. We check if \p
3247 /// I is a sext with right type and used in memory accesses. If it used in a
3248 /// "complex" getelementptr, we allow it to be promoted without finding other
3249 /// sext instructions that sign extended the same initial value. A getelementptr
3250 /// is considered as "complex" if it has more than 2 operands.
3252 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
3253 bool Considerable = false;
3254 AllowPromotionWithoutCommonHeader = false;
// Only sign-extension instructions are candidates.
3255 if (!isa<SExtInst>(&I))
3256 return false;
// Only sexts producing i64 are considered (address width for promotion).
3257 Type *ConsideredSExtType =
3258 Type::getInt64Ty(I.getParent()->getParent()->getContext());
3259 if (I.getType() != ConsideredSExtType)
3260 return false;
3261 // See if the sext is the one with the right type and used in at least one
3262 // GetElementPtrInst.
3263 for (const User *U : I.users()) {
3264 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
3265 Considerable = true;
3266 // A getelementptr is considered as "complex" if it has more than 2
3267 // operands. We will promote a SExt used in such complex GEP as we
3268 // expect some computation to be merged if they are done on 64 bits.
3269 if (GEPInst->getNumOperands() > 2) {
3270 AllowPromotionWithoutCommonHeader = true;
3271 break;
3272 }
3273 }
3274 }
3275 return Considerable;
3276 }
3277
3278bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
3279 switch (Opcode) {
3280 case Instruction::Add:
3281 case Instruction::Sub:
3282 case Instruction::Mul:
3283 case Instruction::And:
3284 case Instruction::Or:
3285 case Instruction::Xor:
3286 case Instruction::FAdd:
3287 case Instruction::FSub:
3288 case Instruction::FMul:
3289 case Instruction::FDiv:
3290 case Instruction::ICmp:
3291 case Instruction::FCmp:
3292 return true;
3293 case Instruction::Shl:
3294 case Instruction::LShr:
3295 case Instruction::AShr:
3296 case Instruction::UDiv:
3297 case Instruction::SDiv:
3298 case Instruction::URem:
3299 case Instruction::SRem:
3300 case Instruction::Select:
3301 return Operand == 1;
3302 default:
3303 return false;
3304 }
3305}
3306
// Instruction-based overload: additionally requires a vector result and
// vector-instruction support, and extends the opcode-based check to
// intrinsics (including VP intrinsics).
3308 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3309 return false;
3310
// Plain IR opcodes are handled by the opcode-based overload above.
3311 if (canSplatOperand(I->getOpcode(), Operand))
3312 return true;
3313
3314 auto *II = dyn_cast<IntrinsicInst>(I);
3315 if (!II)
3316 return false;
3317
3318 switch (II->getIntrinsicID()) {
// FMA-style intrinsics: either multiplicand may be a splat.
3319 case Intrinsic::fma:
3320 case Intrinsic::vp_fma:
3321 case Intrinsic::fmuladd:
3322 case Intrinsic::vp_fmuladd:
3323 return Operand == 0 || Operand == 1;
// Shift/division/remainder/saturating-subtract/select intrinsics: only the
// second operand may be a splat.
3324 case Intrinsic::vp_shl:
3325 case Intrinsic::vp_lshr:
3326 case Intrinsic::vp_ashr:
3327 case Intrinsic::vp_udiv:
3328 case Intrinsic::vp_sdiv:
3329 case Intrinsic::vp_urem:
3330 case Intrinsic::vp_srem:
3331 case Intrinsic::ssub_sat:
3332 case Intrinsic::vp_ssub_sat:
3333 case Intrinsic::usub_sat:
3334 case Intrinsic::vp_usub_sat:
3335 case Intrinsic::vp_select:
3336 return Operand == 1;
3337 // These intrinsics are commutative.
3338 case Intrinsic::vp_add:
3339 case Intrinsic::vp_mul:
3340 case Intrinsic::vp_and:
3341 case Intrinsic::vp_or:
3342 case Intrinsic::vp_xor:
3343 case Intrinsic::vp_fadd:
3344 case Intrinsic::vp_fmul:
3345 case Intrinsic::vp_icmp:
3346 case Intrinsic::vp_fcmp:
3347 case Intrinsic::smin:
3348 case Intrinsic::vp_smin:
3349 case Intrinsic::umin:
3350 case Intrinsic::vp_umin:
3351 case Intrinsic::smax:
3352 case Intrinsic::vp_smax:
3353 case Intrinsic::umax:
3354 case Intrinsic::vp_umax:
3355 case Intrinsic::sadd_sat:
3356 case Intrinsic::vp_sadd_sat:
3357 case Intrinsic::uadd_sat:
3358 case Intrinsic::vp_uadd_sat:
3359 // These intrinsics have 'vr' versions.
3360 case Intrinsic::vp_sub:
3361 case Intrinsic::vp_fsub:
3362 case Intrinsic::vp_fdiv:
3363 return Operand == 0 || Operand == 1;
3364 default:
3365 return false;
3366 }
3367 }
3368
3369/// Check if sinking \p I's operands to I's basic block is profitable, because
3370/// the operands can be folded into a target instruction, e.g.
3371/// splats of scalars can fold into vector instructions.
3374 using namespace llvm::PatternMatch;
3375
3376 if (I->isBitwiseLogicOp()) {
3377 if (!I->getType()->isVectorTy()) {
3378 if (ST->hasStdExtZbb() || ST->hasStdExtZbkb()) {
3379 for (auto &Op : I->operands()) {
3380 // (and/or/xor X, (not Y)) -> (andn/orn/xnor X, Y)
3381 if (match(Op.get(), m_Not(m_Value()))) {
3382 Ops.push_back(&Op);
3383 return true;
3384 }
3385 }
3386 }
3387 } else if (I->getOpcode() == Instruction::And && ST->hasStdExtZvkb()) {
3388 for (auto &Op : I->operands()) {
3389 // (and X, (not Y)) -> (vandn.vv X, Y)
3390 if (match(Op.get(), m_Not(m_Value()))) {
3391 Ops.push_back(&Op);
3392 return true;
3393 }
3394 // (and X, (splat (not Y))) -> (vandn.vx X, Y)
3396 m_ZeroInt()),
3397 m_Value(), m_ZeroMask()))) {
3398 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
3399 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
3400 Ops.push_back(&Not);
3401 Ops.push_back(&InsertElt);
3402 Ops.push_back(&Op);
3403 return true;
3404 }
3405 }
3406 }
3407 }
3408
3409 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3410 return false;
3411
3412 // Don't sink splat operands if the target prefers it. Some targets requires
3413 // S2V transfer buffers and we can run out of them copying the same value
3414 // repeatedly.
3415 // FIXME: It could still be worth doing if it would improve vector register
3416 // pressure and prevent a vector spill.
3417 if (!ST->sinkSplatOperands())
3418 return false;
3419
3420 for (auto OpIdx : enumerate(I->operands())) {
3421 if (!canSplatOperand(I, OpIdx.index()))
3422 continue;
3423
3424 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
3425 // Make sure we are not already sinking this operand
3426 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
3427 continue;
3428
3429 // We are looking for a splat that can be sunk.
3431 m_Value(), m_ZeroMask())))
3432 continue;
3433
3434 // Don't sink i1 splats.
3435 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
3436 continue;
3437
3438 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
3439 // and vector registers
3440 for (Use &U : Op->uses()) {
3441 Instruction *Insn = cast<Instruction>(U.getUser());
3442 if (!canSplatOperand(Insn, U.getOperandNo()))
3443 return false;
3444 }
3445
3446 // Sink any fpexts since they might be used in a widening fp pattern.
3447 Use *InsertEltUse = &Op->getOperandUse(0);
3448 auto *InsertElt = cast<InsertElementInst>(InsertEltUse);
3449 if (isa<FPExtInst>(InsertElt->getOperand(1)))
3450 Ops.push_back(&InsertElt->getOperandUse(1));
3451 Ops.push_back(InsertEltUse);
3452 Ops.push_back(&OpIdx.value());
3453 }
3454 return true;
3455}
3456
3458RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3460 // TODO: Enable expansion when unaligned access is not supported after we fix
3461 // issues in ExpandMemcmp.
3462 if (!ST->enableUnalignedScalarMem())
3463 return Options;
3464
3465 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
3466 return Options;
3467
3468 Options.AllowOverlappingLoads = true;
3469 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3470 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3471 if (ST->is64Bit()) {
3472 Options.LoadSizes = {8, 4, 2, 1};
3473 Options.AllowedTailExpansions = {3, 5, 6};
3474 } else {
3475 Options.LoadSizes = {4, 2, 1};
3476 Options.AllowedTailExpansions = {3};
3477 }
3478
3479 if (IsZeroCmp && ST->hasVInstructions()) {
3480 unsigned VLenB = ST->getRealMinVLen() / 8;
3481 // The minimum size should be `XLen / 8 + 1`, and the maxinum size should be
3482 // `VLenB * MaxLMUL` so that it fits in a single register group.
3483 unsigned MinSize = ST->getXLen() / 8 + 1;
3484 unsigned MaxSize = VLenB * ST->getMaxLMULForFixedLengthVectors();
3485 for (unsigned Size = MinSize; Size <= MaxSize; Size++)
3486 Options.LoadSizes.insert(Options.LoadSizes.begin(), Size);
3487 }
3488 return Options;
3489}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static bool shouldSplit(Instruction *InsertPoint, DenseSet< Value * > &PrevConditionValues, DenseSet< Value * > &ConditionValues, DominatorTree &DT, DenseSet< Instruction * > &Unhoistables)
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
Hexagon Common GEP
static cl::opt< int > InstrCost("inline-instr-cost", cl::Hidden, cl::init(5), cl::desc("Cost of a single instruction when inlining"))
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
static InstructionCost costShuffleViaVRegSplitting(const RISCVTTIImpl &TTI, MVT LegalVT, std::optional< unsigned > VLen, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind)
Try to perform better estimation of the permutation.
static InstructionCost costShuffleViaSplitting(const RISCVTTIImpl &TTI, MVT LegalVT, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind)
Attempt to approximate the cost of a shuffle which will require splitting during legalization.
static bool isRepeatedConcatMask(ArrayRef< int > Mask, int &SubVectorSize)
static unsigned isM1OrSmaller(MVT VT)
static cl::opt< unsigned > SLPMaxVF("riscv-v-slp-max-vf", cl::desc("Overrides result used for getMaximumVF query which is used " "exclusively by SLP vectorizer."), cl::Hidden)
static cl::opt< unsigned > RVVRegisterWidthLMUL("riscv-v-register-bit-width-lmul", cl::desc("The LMUL to use for getRegisterBitWidth queries. Affects LMUL used " "by autovectorized code. Fractional LMULs are not supported."), cl::init(2), cl::Hidden)
static cl::opt< unsigned > RVVMinTripCount("riscv-v-min-trip-count", cl::desc("Set the lower bound of a trip count to decide on " "vectorization while tail-folding."), cl::init(5), cl::Hidden)
static InstructionCost getIntImmCostImpl(const DataLayout &DL, const RISCVSubtarget *ST, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, bool FreeZeroes)
static VectorType * getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST, LLVMContext &C)
static const CostTblEntry VectorIntrinsicCostTable[]
static bool canUseShiftPair(Instruction *Inst, const APInt &Imm)
static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm)
This file defines a TargetTransformInfoImplBase conforming object specific to the RISC-V target machi...
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
This file contains some templates that are useful if you are working with the STL at all.
#define LLVM_DEBUG(...)
Definition Debug.h:114
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition APInt.h:78
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
std::optional< unsigned > getMaxVScale() const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
bool isLegalAddImmediate(int64_t imm) const override
std::optional< unsigned > getVScaleForTuning() const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
unsigned getRegUsageForType(Type *Ty) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
Definition InstrTypes.h:693
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:691
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:681
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition InstrTypes.h:690
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition InstrTypes.h:688
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:692
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:689
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition InstrTypes.h:678
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:770
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool noNaNs() const
Definition FMF.h:65
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
Information for memory intrinsic cost model.
const Instruction * getInst() const
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) const override
InstructionCost getStridedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) const
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
unsigned getMinTripCountTailFoldingThreshold() const override
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const override
InstructionCost getAddressComputationCost(Type *PTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
InstructionCost getStoreImmCost(Type *VecTy, TTI::OperandValueInfo OpInfo, TTI::TargetCostKind CostKind) const
Return the cost of materializing an immediate for a value operand of a store instruction.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
bool hasActiveVectorLength() const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool canSplatOperand(Instruction *I, int Operand) const
Return true if the (vector) instruction I will be lowered to an instruction with a scalar splat operand.
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const override
unsigned getRegUsageForType(Type *Ty) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
Estimate the overhead of scalarizing an instruction.
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const override
bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment) const override
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getExpandCompressMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool preferAlternateOpcodeVectorization() const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
std::optional< unsigned > getMaxVScale() const override
bool shouldExpandReduction(const IntrinsicInst *II) const override
std::optional< unsigned > getVScaleForTuning() const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Get memory intrinsic cost based on arguments.
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpdInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
static MVT getM1VT(MVT VT)
Given a vector (either fixed or scalable), return the scalable vector corresponding to a vector regis...
InstructionCost getVRGatherVVCost(MVT VT) const
Return the cost of a vrgather.vv instruction for the type VT.
InstructionCost getVRGatherVICost(MVT VT) const
Return the cost of a vrgather.vi (or vx) instruction for the type VT.
static unsigned computeVLMAX(unsigned VectorBits, unsigned EltSize, unsigned MinSize)
InstructionCost getLMULCost(MVT VT) const
Return the cost of LMUL for linear operations.
InstructionCost getVSlideVICost(MVT VT) const
Return the cost of a vslidedown.vi or vslideup.vi instruction for the type VT.
InstructionCost getVSlideVXCost(MVT VT) const
Return the cost of a vslidedown.vx or vslideup.vx instruction for the type VT.
static RISCVVType::VLMUL getLMUL(MVT VT)
This class represents an analyzed expression in the program.
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:824
The main scalar evolution driver.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossings.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
virtual const DataLayout & getDataLayout() const
virtual TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
virtual bool isLoweredToCall(const Function *F) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of FastMathFlags FMF.
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old number of lanes.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:284
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:962
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
constexpr bool isKnownMultipleOf(ScalarTy RHS) const
This function tells the caller whether the element count is known at compile time to be a multiple of the scalar value RHS.
Definition TypeSize.h:180
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:230
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
Definition ISDOpcodes.h:879
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:843
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of type i2N, then return the top part.
Definition ISDOpcodes.h:703
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:764
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:849
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:977
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:925
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:738
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:958
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:855
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
bool match(Val *V, const Pattern &P)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
int getIntMatCost(const APInt &Val, unsigned Size, const MCSubtargetInfo &STI, bool CompressionCost, bool FreeZeroes)
static constexpr unsigned RVVBitsPerBlock
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2544
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit version).
Definition MathExtras.h:273
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
TargetTransformInfo TTI
LLVM_ABI bool isMaskedSlidePair(ArrayRef< int > Mask, int NumElts, std::array< std::pair< int, int >, 2 > &SrcInfo)
Does this shuffle mask represent either one slide shuffle or a pair of two slide shuffles,...
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1883
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
LLVM_ABI void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2136
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:330
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
Extended Value Type.
Definition ValueTypes.h:35
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
Align valueOrOne() const
For convenience, returns a valid alignment or 1 if undefined.
Definition Alignment.h:130
Information about a load/store intrinsic defined by the target.
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is.
Describe known properties for a set of pointers.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number of loop iterations is not known at compile time).
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminate compare and branch instructions).
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).