1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
#include "RISCVTargetTransformInfo.h"
#include "MCTargetDesc/RISCVMatInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"
19#include <cmath>
20#include <optional>
21using namespace llvm;
22using namespace llvm::PatternMatch;
23
24#define DEBUG_TYPE "riscvtti"
25
static cl::opt<unsigned> RVVRegisterWidthLMUL(
    "riscv-v-register-bit-width-lmul",
    cl::desc(
        "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
        "by autovectorized code. Fractional LMULs are not supported."),
    cl::init(2), cl::Hidden);

static cl::opt<unsigned> SLPMaxVF(
    "riscv-v-slp-max-vf",
    cl::desc(
        "Overrides result used for getMaximumVF query which is used "
        "exclusively by SLP vectorizer."),
    cl::Hidden);

InstructionCost
RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
                                      TTI::TargetCostKind CostKind) {
  // Check if the type is valid for all CostKind
  if (!VT.isVector())
    return InstructionCost::getInvalid();
  size_t NumInstr = OpCodes.size();
  if (CostKind == TTI::TCK_CodeSize)
    return NumInstr;
  InstructionCost LMULCost = TLI->getLMULCost(VT);
  if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
    return LMULCost * NumInstr;
  InstructionCost Cost = 0;
  for (auto Op : OpCodes) {
54 switch (Op) {
55 case RISCV::VRGATHER_VI:
56 Cost += TLI->getVRGatherVICost(VT);
57 break;
58 case RISCV::VRGATHER_VV:
59 Cost += TLI->getVRGatherVVCost(VT);
60 break;
61 case RISCV::VSLIDEUP_VI:
62 case RISCV::VSLIDEDOWN_VI:
63 Cost += TLI->getVSlideVICost(VT);
64 break;
65 case RISCV::VSLIDEUP_VX:
66 case RISCV::VSLIDEDOWN_VX:
67 Cost += TLI->getVSlideVXCost(VT);
68 break;
69 case RISCV::VREDMAX_VS:
70 case RISCV::VREDMIN_VS:
71 case RISCV::VREDMAXU_VS:
72 case RISCV::VREDMINU_VS:
73 case RISCV::VREDSUM_VS:
74 case RISCV::VREDAND_VS:
75 case RISCV::VREDOR_VS:
76 case RISCV::VREDXOR_VS:
77 case RISCV::VFREDMAX_VS:
78 case RISCV::VFREDMIN_VS:
79 case RISCV::VFREDUSUM_VS: {
80 unsigned VL = VT.getVectorMinNumElements();
81 if (!VT.isFixedLengthVector())
82 VL *= *getVScaleForTuning();
83 Cost += Log2_32_Ceil(VL);
84 break;
85 }
86 case RISCV::VFREDOSUM_VS: {
87 unsigned VL = VT.getVectorMinNumElements();
88 if (!VT.isFixedLengthVector())
89 VL *= *getVScaleForTuning();
90 Cost += VL;
91 break;
92 }
93 case RISCV::VMV_X_S:
94 case RISCV::VMV_S_X:
95 case RISCV::VFMV_F_S:
96 case RISCV::VFMV_S_F:
97 case RISCV::VMOR_MM:
98 case RISCV::VMXOR_MM:
99 case RISCV::VMAND_MM:
100 case RISCV::VMANDN_MM:
101 case RISCV::VMNAND_MM:
102 case RISCV::VCPOP_M:
103 case RISCV::VFIRST_M:
104 Cost += 1;
105 break;
106 default:
107 Cost += LMULCost;
108 }
109 }
110 return Cost;
111}
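// For example, on a target where getVScaleForTuning() is 2 (VLEN = 128),
// costing {RISCV::VREDSUM_VS} for MVT::nxv4i32 under TCK_RecipThroughput adds
// Log2_32_Ceil(4 * 2) = 3, modelling the log2(VL)-deep reduction tree, whereas
// the ordered RISCV::VFREDOSUM_VS on a comparable FP type adds the full VL
// because the accumulation is serial.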
112
static InstructionCost getIntImmCostImpl(const DataLayout &DL,
                                         const RISCVSubtarget *ST,
                                         const APInt &Imm, Type *Ty,
                                         TTI::TargetCostKind CostKind,
                                         bool FreeZeroes) {
118 assert(Ty->isIntegerTy() &&
119 "getIntImmCost can only estimate cost of materialising integers");
120
121 // We have a Zero register, so 0 is always free.
122 if (Imm == 0)
123 return TTI::TCC_Free;
124
125 // Otherwise, we check how many instructions it will take to materialise.
126 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
127 /*CompressionCost=*/false, FreeZeroes);
128}
129
InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                            TTI::TargetCostKind CostKind) {
  return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
}
134
135// Look for patterns of shift followed by AND that can be turned into a pair of
136// shifts. We won't need to materialize an immediate for the AND so these can
137// be considered free.
138static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
139 uint64_t Mask = Imm.getZExtValue();
140 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
141 if (!BO || !BO->hasOneUse())
142 return false;
143
144 if (BO->getOpcode() != Instruction::Shl)
145 return false;
146
147 if (!isa<ConstantInt>(BO->getOperand(1)))
148 return false;
149
150 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
151 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
152 // is a mask shifted by c2 bits with c3 leading zeros.
153 if (isShiftedMask_64(Mask)) {
154 unsigned Trailing = llvm::countr_zero(Mask);
155 if (ShAmt == Trailing)
156 return true;
157 }
158
159 return false;
160}
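// As an illustration, the kind of IR this matches is:
//   %s = shl i64 %x, 11
//   %m = and i64 %s, 8386560   ; 0x7FF800, a shifted mask whose 11 trailing
//                              ; zeros equal the shift amount
// The AND then folds into a slli+srli pair, so its immediate never needs to
// be materialized.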
161
InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind,
                                                Instruction *Inst) {
166 assert(Ty->isIntegerTy() &&
167 "getIntImmCost can only estimate cost of materialising integers");
168
169 // We have a Zero register, so 0 is always free.
170 if (Imm == 0)
171 return TTI::TCC_Free;
172
173 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
174 // commutative, in others the immediate comes from a specific argument index.
175 bool Takes12BitImm = false;
176 unsigned ImmArgIdx = ~0U;
177
178 switch (Opcode) {
179 case Instruction::GetElementPtr:
180 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
181 // split up large offsets in GEP into better parts than ConstantHoisting
182 // can.
183 return TTI::TCC_Free;
184 case Instruction::Store: {
    // Use the materialization cost regardless of whether it's the address or
    // the value that is constant, except when the store is misaligned and
    // misaligned accesses are not legal (experience shows constant hoisting
    // can sometimes be harmful in such cases).
189 if (Idx == 1 || !Inst)
190 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
191 /*FreeZeroes=*/true);
192
193 StoreInst *ST = cast<StoreInst>(Inst);
194 if (!getTLI()->allowsMemoryAccessForAlignment(
195 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
196 ST->getPointerAddressSpace(), ST->getAlign()))
197 return TTI::TCC_Free;
198
199 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
200 /*FreeZeroes=*/true);
201 }
202 case Instruction::Load:
203 // If the address is a constant, use the materialization cost.
204 return getIntImmCost(Imm, Ty, CostKind);
205 case Instruction::And:
206 // zext.h
207 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
208 return TTI::TCC_Free;
209 // zext.w
210 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
211 return TTI::TCC_Free;
212 // bclri
213 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
214 return TTI::TCC_Free;
215 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
216 canUseShiftPair(Inst, Imm))
217 return TTI::TCC_Free;
218 Takes12BitImm = true;
219 break;
220 case Instruction::Add:
221 Takes12BitImm = true;
222 break;
223 case Instruction::Or:
224 case Instruction::Xor:
225 // bseti/binvi
226 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
227 return TTI::TCC_Free;
228 Takes12BitImm = true;
229 break;
230 case Instruction::Mul:
231 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
232 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
233 return TTI::TCC_Free;
234 // One more or less than a power of 2 can use SLLI+ADD/SUB.
235 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
236 return TTI::TCC_Free;
237 // FIXME: There is no MULI instruction.
238 Takes12BitImm = true;
239 break;
240 case Instruction::Sub:
241 case Instruction::Shl:
242 case Instruction::LShr:
243 case Instruction::AShr:
244 Takes12BitImm = true;
245 ImmArgIdx = 1;
246 break;
247 default:
248 break;
249 }
250
251 if (Takes12BitImm) {
252 // Check immediate is the correct argument...
253 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
254 // ... and fits into the 12-bit immediate.
255 if (Imm.getSignificantBits() <= 64 &&
256 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
257 return TTI::TCC_Free;
258 }
259 }
260
261 // Otherwise, use the full materialisation cost.
262 return getIntImmCost(Imm, Ty, CostKind);
263 }
264
265 // By default, prevent hoisting.
266 return TTI::TCC_Free;
267}
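// A few concrete outcomes of the logic above, assuming RV64 with Zba/Zbb/Zbs:
//   and %x, 0xffffffff -> TCC_Free (zext.w via Zba)
//   xor %x, 0x800      -> TCC_Free (binvi via Zbs, power-of-two immediate)
//   add %x, 2047       -> TCC_Free (fits the signed 12-bit addi immediate)
//   and %x, 0x12345    -> charged the RISCVMatInt materialization cost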
268
InstructionCost
RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                  const APInt &Imm, Type *Ty,
                                  TTI::TargetCostKind CostKind) {
  // Prevent hoisting in unknown cases.
  return TTI::TCC_Free;
}
276
277bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
278 return ST->hasVInstructions();
279}
280
TargetTransformInfo::PopcntSupportKind
RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  return ST->hasStdExtZbb() || (ST->hasVendorXCVbitmanip() && !ST->is64Bit())
             ? TTI::PSK_FastHardware
             : TTI::PSK_Software;
}
288
bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  // Currently, the ExpandReductions pass can't expand scalable-vector
291 // reductions, but we still request expansion as RVV doesn't support certain
292 // reductions and the SelectionDAG can't legalize them either.
293 switch (II->getIntrinsicID()) {
294 default:
295 return false;
296 // These reductions have no equivalent in RVV
297 case Intrinsic::vector_reduce_mul:
298 case Intrinsic::vector_reduce_fmul:
299 return true;
300 }
301}
302
std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
  if (ST->hasVInstructions())
    return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
  return BaseT::getMaxVScale();
}
308
std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
  if (ST->hasVInstructions())
    if (unsigned MinVLen = ST->getRealMinVLen();
        MinVLen >= RISCV::RVVBitsPerBlock)
      return MinVLen / RISCV::RVVBitsPerBlock;
  return BaseT::getVScaleForTuning();
}
316
TypeSize
RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned LMUL =
      llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->getXLen());
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(
        ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(
        (ST->hasVInstructions() &&
         ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
            ? LMUL * RISCV::RVVBitsPerBlock
            : 0);
  }

  llvm_unreachable("Unsupported register kind");
}
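// For example, with -riscv-v-register-bit-width-lmul=2 on a core whose minimum
// VLEN is 128, RGK_FixedWidthVector reports a fixed 256 bits (2 x 128) when
// RVV is used for fixed-length vectors, while RGK_ScalableVector reports a
// scalable 128 bits (2 x RVVBitsPerBlock).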
337
InstructionCost
RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
340 // Add a cost of address generation + the cost of the load. The address
341 // is expected to be a PC relative offset to a constant pool entry
342 // using auipc/addi.
343 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
344 /*AddressSpace=*/0, CostKind);
345}
346
347static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
348 unsigned Size = Mask.size();
349 if (!isPowerOf2_32(Size))
350 return false;
351 for (unsigned I = 0; I != Size; ++I) {
352 if (static_cast<unsigned>(Mask[I]) == I)
353 continue;
354 if (Mask[I] != 0)
355 return false;
356 if (Size % I != 0)
357 return false;
358 for (unsigned J = I + 1; J != Size; ++J)
359 // Check the pattern is repeated.
360 if (static_cast<unsigned>(Mask[J]) != J % I)
361 return false;
362 SubVectorSize = I;
363 return true;
364 }
365 // That means Mask is <0, 1, 2, 3>. This is not a concatenation.
366 return false;
367}
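// Example: the mask <0, 1, 0, 1, 0, 1, 0, 1> repeats its 2-element prefix four
// times, so SubVectorSize is set to 2 and the function returns true, whereas
// the identity mask <0, 1, 2, 3> never repeats and returns false.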
368
static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
                                        LLVMContext &C) {
371 assert((DataVT.getScalarSizeInBits() != 8 ||
372 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
373 MVT IndexVT = DataVT.changeTypeToInteger();
374 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
375 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
376 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
377}
378
InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                             VectorType *Tp, ArrayRef<int> Mask,
                                             TTI::TargetCostKind CostKind,
                                             int Index, VectorType *SubTp,
                                             ArrayRef<const Value *> Args,
                                             const Instruction *CxtI) {
385 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
386
387 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
388
389 // First, handle cases where having a fixed length vector enables us to
390 // give a more accurate cost than falling back to generic scalable codegen.
391 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
392 if (isa<FixedVectorType>(Tp)) {
393 switch (Kind) {
394 default:
395 break;
    case TTI::SK_PermuteSingleSrc: {
      if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
398 MVT EltTp = LT.second.getVectorElementType();
399 // If the size of the element is < ELEN then shuffles of interleaves and
400 // deinterleaves of 2 vectors can be lowered into the following
401 // sequences
402 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
403 // Example sequence:
404 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
405 // vwaddu.vv v10, v8, v9
406 // li a0, -1 (ignored)
407 // vwmaccu.vx v10, a0, v9
408 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
409 return 2 * LT.first * TLI->getLMULCost(LT.second);
410
411 if (Mask[0] == 0 || Mask[0] == 1) {
412 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
413 // Example sequence:
414 // vnsrl.wi v10, v8, 0
415 if (equal(DeinterleaveMask, Mask))
416 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
417 LT.second, CostKind);
418 }
419 }
      int SubVectorSize;
      if (LT.second.getScalarSizeInBits() != 1 &&
          isRepeatedConcatMask(Mask, SubVectorSize)) {
        InstructionCost Cost = 0;
        unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
        // The cost of extraction from a subvector is 0 if the index is 0.
        for (unsigned I = 0; I != NumSlides; ++I) {
          unsigned InsertIndex = SubVectorSize * (1 << I);
          FixedVectorType *SubTp =
              FixedVectorType::get(Tp->getElementType(), InsertIndex);
          FixedVectorType *DestTp =
              FixedVectorType::getDoubleElementsVectorType(SubTp);
          std::pair<InstructionCost, MVT> DestLT =
              getTypeLegalizationCost(DestTp);
          // Add the cost of whole vector register move because the
          // destination vector register group for vslideup cannot overlap the
          // source.
          Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
          Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, {},
                                 CostKind, InsertIndex, SubTp);
        }
        return Cost;
442 }
443 }
444 // vrgather + cost of generating the mask constant.
445 // We model this for an unknown mask with a single vrgather.
446 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
447 (LT.second.getScalarSizeInBits() != 8 ||
448 LT.second.getVectorNumElements() <= 256)) {
449 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());
450 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
451 return IndexCost +
452 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
453 }
454 [[fallthrough]];
455 }
    case TTI::SK_Transpose:
    case TTI::SK_PermuteTwoSrc: {
      // 2 x (vrgather + cost of generating the mask constant) + cost of mask
459 // register for the second vrgather. We model this for an unknown
460 // (shuffle) mask.
461 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
462 (LT.second.getScalarSizeInBits() != 8 ||
463 LT.second.getVectorNumElements() <= 256)) {
464 auto &C = Tp->getContext();
465 auto EC = Tp->getElementCount();
466 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
      VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
      InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
469 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
470 return 2 * IndexCost +
471 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
472 LT.second, CostKind) +
473 MaskCost;
474 }
475 [[fallthrough]];
476 }
477 case TTI::SK_Select: {
478 // We are going to permute multiple sources and the result will be in
479 // multiple destinations. Providing an accurate cost only for splits where
480 // the element type remains the same.
481 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
482 LT.second.isFixedLengthVector() &&
        LT.second.getVectorElementType().getSizeInBits() ==
            Tp->getElementType()->getPrimitiveSizeInBits() &&
        LT.second.getVectorNumElements() <
486 cast<FixedVectorType>(Tp)->getNumElements() &&
487 divideCeil(Mask.size(),
488 cast<FixedVectorType>(Tp)->getNumElements()) ==
489 static_cast<unsigned>(*LT.first.getValue())) {
490 unsigned NumRegs = *LT.first.getValue();
491 unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
492 unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
493 auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);
494
      InstructionCost Cost = 0;
      for (unsigned I = 0, NumSrcRegs = divideCeil(Mask.size(), SubVF);
497 I < NumSrcRegs; ++I) {
498 bool IsSingleVector = true;
499 SmallVector<int> SubMask(SubVF, PoisonMaskElem);
500 transform(
501 Mask.slice(I * SubVF,
502 I == NumSrcRegs - 1 ? Mask.size() % SubVF : SubVF),
503 SubMask.begin(), [&](int I) -> int {
504 if (I == PoisonMaskElem)
505 return PoisonMaskElem;
506 bool SingleSubVector = I / VF == 0;
507 IsSingleVector &= SingleSubVector;
508 return (SingleSubVector ? 0 : 1) * SubVF + (I % VF) % SubVF;
509 });
510 if (all_of(enumerate(SubMask), [](auto &&P) {
511 return P.value() == PoisonMaskElem ||
512 static_cast<unsigned>(P.value()) == P.index();
513 }))
514 continue;
        Cost += getShuffleCost(IsSingleVector ? TTI::SK_PermuteSingleSrc
                                              : TTI::SK_PermuteTwoSrc,
                               SubVecTy, SubMask, CostKind, 0, nullptr);
518 }
519 return Cost;
520 }
521 break;
522 }
523 }
524 };
525
526 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
527 switch (Kind) {
528 default:
529 // Fallthrough to generic handling.
530 // TODO: Most of these cases will return getInvalid in generic code, and
531 // must be implemented here.
532 break;
  case TTI::SK_ExtractSubvector:
    // Extract at zero is always a subregister extract
535 if (Index == 0)
536 return TTI::TCC_Free;
537
538 // If we're extracting a subvector of at most m1 size at a sub-register
539 // boundary - which unfortunately we need exact vlen to identify - this is
540 // a subregister extract at worst and thus won't require a vslidedown.
541 // TODO: Extend for aligned m2, m4 subvector extracts
    // TODO: Extend for misaligned (but contained) extracts
543 // TODO: Extend for scalable subvector types
544 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
545 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
546 const unsigned MinVLen = ST->getRealMinVLen();
547 const unsigned MaxVLen = ST->getRealMaxVLen();
548 if (MinVLen == MaxVLen &&
549 SubLT.second.getScalarSizeInBits() * Index % MinVLen == 0 &&
550 SubLT.second.getSizeInBits() <= MinVLen)
551 return TTI::TCC_Free;
552 }
553
554 // Example sequence:
555 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
556 // vslidedown.vi v8, v9, 2
557 return LT.first *
558 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
  case TTI::SK_InsertSubvector:
    // Example sequence:
561 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
562 // vslideup.vi v8, v9, 2
563 return LT.first *
564 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
565 case TTI::SK_Select: {
566 // Example sequence:
567 // li a0, 90
568 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
569 // vmv.s.x v0, a0
570 // vmerge.vvm v8, v9, v8, v0
571 // We use 2 for the cost of the mask materialization as this is the true
572 // cost for small masks and most shuffles are small. At worst, this cost
573 // should be a very small constant for the constant pool load. As such,
    // we may bias towards large selects slightly more than truly warranted.
575 return LT.first *
576 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
577 LT.second, CostKind));
578 }
579 case TTI::SK_Broadcast: {
580 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
581 Instruction::InsertElement);
582 if (LT.second.getScalarSizeInBits() == 1) {
583 if (HasScalar) {
584 // Example sequence:
585 // andi a0, a0, 1
586 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
587 // vmv.v.x v8, a0
588 // vmsne.vi v0, v8, 0
589 return LT.first *
590 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
591 LT.second, CostKind));
592 }
593 // Example sequence:
594 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
595 // vmv.v.i v8, 0
596 // vmerge.vim v8, v8, 1, v0
597 // vmv.x.s a0, v8
598 // andi a0, a0, 1
599 // vmv.v.x v8, a0
600 // vmsne.vi v0, v8, 0
601
602 return LT.first *
603 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
604 RISCV::VMV_X_S, RISCV::VMV_V_X,
605 RISCV::VMSNE_VI},
606 LT.second, CostKind));
607 }
608
609 if (HasScalar) {
610 // Example sequence:
611 // vmv.v.x v8, a0
612 return LT.first *
613 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
614 }
615
616 // Example sequence:
617 // vrgather.vi v9, v8, 0
618 return LT.first *
619 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
620 }
621 case TTI::SK_Splice: {
622 // vslidedown+vslideup.
623 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
624 // of similar code, but I think we expand through memory.
625 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
626 if (Index >= 0 && Index < 32)
627 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
628 else if (Index < 0 && Index > -32)
629 Opcodes[1] = RISCV::VSLIDEUP_VI;
630 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
631 }
632 case TTI::SK_Reverse: {
633 // TODO: Cases to improve here:
634 // * Illegal vector types
635 // * i64 on RV32
636 // * i1 vector
637 // At low LMUL, most of the cost is producing the vrgather index register.
638 // At high LMUL, the cost of the vrgather itself will dominate.
639 // Example sequence:
640 // csrr a0, vlenb
641 // srli a0, a0, 3
642 // addi a0, a0, -1
643 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
644 // vid.v v9
645 // vrsub.vx v10, v9, a0
646 // vrgather.vv v9, v8, v10
647 InstructionCost LenCost = 3;
648 if (LT.second.isFixedLengthVector())
649 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
650 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
651 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
652 if (LT.second.isFixedLengthVector() &&
653 isInt<5>(LT.second.getVectorNumElements() - 1))
654 Opcodes[1] = RISCV::VRSUB_VI;
655 InstructionCost GatherCost =
656 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
    // A mask (i1) vector additionally requires an extend and a truncate
658 InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
659 return LT.first * (LenCost + GatherCost + ExtendCost);
660 }
661 }
662 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
663}
664
static bool isM1OrSmaller(MVT VT) {
  RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
  return (LMUL == RISCVII::VLMUL::LMUL_F8 || LMUL == RISCVII::VLMUL::LMUL_F4 ||
          LMUL == RISCVII::VLMUL::LMUL_F2 || LMUL == RISCVII::VLMUL::LMUL_1);
}
670
InstructionCost RISCVTTIImpl::getScalarizationOverhead(
    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
    TTI::TargetCostKind CostKind) {
  if (isa<ScalableVectorType>(Ty))
    return InstructionCost::getInvalid();

677 // A build_vector (which is m1 sized or smaller) can be done in no
678 // worse than one vslide1down.vx per element in the type. We could
679 // in theory do an explode_vector in the inverse manner, but our
680 // lowering today does not have a first class node for this pattern.
  InstructionCost Cost = BaseT::getScalarizationOverhead(
      Ty, DemandedElts, Insert, Extract, CostKind);
683 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
684 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
685 if (Ty->getScalarSizeInBits() == 1) {
686 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
687 // Note: Implicit scalar anyextend is assumed to be free since the i1
688 // must be stored in a GPR.
689 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
690 CostKind) +
             getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                              TTI::CastContextHint::None, CostKind);
    }
694
695 assert(LT.second.isFixedLengthVector());
696 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
697 if (isM1OrSmaller(ContainerVT)) {
698 InstructionCost BV =
699 cast<FixedVectorType>(Ty)->getNumElements() *
700 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
701 if (BV < Cost)
702 Cost = BV;
703 }
704 }
705 return Cost;
706}
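// Example: building a <4 x i16> from scalars (Insert only, no Extract) on a
// target where the legalized container is M1 or smaller is costed as at most
// 4 x vslide1down.vx, and the cheaper of that and the generic
// insert/extract-element estimate is what gets returned.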
707
InstructionCost
RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                                    unsigned AddressSpace,
                                    TTI::TargetCostKind CostKind) {
  if (!isLegalMaskedLoadStore(Src, Alignment) ||
      CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
715 CostKind);
716
717 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
718}
719
InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
722 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
723 bool UseMaskForCond, bool UseMaskForGaps) {
724
725 // The interleaved memory access pass will lower interleaved memory ops (i.e
726 // a load and store followed by a specific shuffle) to vlseg/vsseg
727 // intrinsics.
728 if (!UseMaskForCond && !UseMaskForGaps &&
729 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
730 auto *VTy = cast<VectorType>(VecTy);
731 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
    // Need to make sure the type hasn't been scalarized
733 if (LT.second.isVector()) {
734 auto *SubVecTy =
735 VectorType::get(VTy->getElementType(),
736 VTy->getElementCount().divideCoefficientBy(Factor));
737 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
738 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
739 AddressSpace, DL)) {
740
741 // Some processors optimize segment loads/stores as one wide memory op +
742 // Factor * LMUL shuffle ops.
743 if (ST->hasOptimizedSegmentLoadStore(Factor)) {
          InstructionCost Cost =
              getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
746 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
747 Cost += Factor * TLI->getLMULCost(SubVecVT);
748 return LT.first * Cost;
749 }
750
751 // Otherwise, the cost is proportional to the number of elements (VL *
752 // Factor ops).
753 InstructionCost MemOpCost =
754 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
755 CostKind, {TTI::OK_AnyValue, TTI::OP_None});
756 unsigned NumLoads = getEstimatedVLFor(VTy);
757 return NumLoads * MemOpCost;
758 }
759 }
760 }
761
762 // TODO: Return the cost of interleaved accesses for scalable vector when
763 // unable to convert to segment accesses instructions.
  if (isa<ScalableVectorType>(VecTy))
    return InstructionCost::getInvalid();

767 auto *FVTy = cast<FixedVectorType>(VecTy);
768 InstructionCost MemCost =
769 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
770 unsigned VF = FVTy->getNumElements() / Factor;
771
772 // An interleaved load will look like this for Factor=3:
773 // %wide.vec = load <12 x i32>, ptr %3, align 4
774 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
775 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
776 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
777 if (Opcode == Instruction::Load) {
778 InstructionCost Cost = MemCost;
779 for (unsigned Index : Indices) {
780 FixedVectorType *SubVecTy =
781 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
782 auto Mask = createStrideMask(Index, Factor, VF);
      InstructionCost ShuffleCost =
          getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask,
                         CostKind, 0, nullptr, {});
786 Cost += ShuffleCost;
787 }
788 return Cost;
789 }
790
791 // TODO: Model for NF > 2
792 // We'll need to enhance getShuffleCost to model shuffles that are just
793 // inserts and extracts into subvectors, since they won't have the full cost
794 // of a vrgather.
795 // An interleaved store for 3 vectors of 4 lanes will look like
796 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
797 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
798 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
799 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
800 // store <12 x i32> %interleaved.vec, ptr %10, align 4
801 if (Factor != 2)
802 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
803 Alignment, AddressSpace, CostKind,
804 UseMaskForCond, UseMaskForGaps);
805
806 assert(Opcode == Instruction::Store && "Opcode must be a store");
807 // For an interleaving store of 2 vectors, we perform one large interleaving
808 // shuffle that goes into the wide store
809 auto Mask = createInterleaveMask(VF, Factor);
  InstructionCost ShuffleCost =
      getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask,
                     CostKind, 0, nullptr, {});
813 return MemCost + ShuffleCost;
814}
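// Example: a factor-2 interleaved store of a <16 x i32> wide value (two
// <8 x i32> operands) that is not lowered to vsseg2 is modelled as one
// interleave shuffle producing the wide vector plus the single wide store,
// i.e. MemCost + ShuffleCost above.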
815
InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
821 Alignment, CostKind, I);
822
823 if ((Opcode == Instruction::Load &&
824 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
825 (Opcode == Instruction::Store &&
826 !isLegalMaskedScatter(DataTy, Align(Alignment))))
827 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
828 Alignment, CostKind, I);
829
830 // Cost is proportional to the number of memory operations implied. For
831 // scalable vectors, we use an estimate on that number since we don't
832 // know exactly what VL will be.
833 auto &VTy = *cast<VectorType>(DataTy);
834 InstructionCost MemOpCost =
835 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
836 {TTI::OK_AnyValue, TTI::OP_None}, I);
837 unsigned NumLoads = getEstimatedVLFor(&VTy);
838 return NumLoads * MemOpCost;
839}
840
InstructionCost RISCVTTIImpl::getStridedMemoryOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
843 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
844 if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
845 !isLegalStridedLoadStore(DataTy, Alignment)) ||
846 (Opcode != Instruction::Load && Opcode != Instruction::Store))
847 return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  if (CostKind == TTI::TCK_CodeSize)
    return TTI::TCC_Basic;
852
853 // Cost is proportional to the number of memory operations implied. For
854 // scalable vectors, we use an estimate on that number since we don't
855 // know exactly what VL will be.
856 auto &VTy = *cast<VectorType>(DataTy);
857 InstructionCost MemOpCost =
858 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
859 {TTI::OK_AnyValue, TTI::OP_None}, I);
860 unsigned NumLoads = getEstimatedVLFor(&VTy);
861 return NumLoads * MemOpCost;
862}
863
InstructionCost
RISCVTTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
  // FIXME: This is a property of the default vector convention, not
  // all possible calling conventions. Fixing that will require
  // some TTI API and SLP rework.
  InstructionCost Cost = 0;
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  for (auto *Ty : Tys) {
    if (!Ty->isVectorTy())
      continue;
    Align A = DL.getPrefTypeAlign(Ty);
    Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
            getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
  }
  return Cost;
}
880
881// Currently, these represent both throughput and codesize costs
882// for the respective intrinsics. The costs in this table are simply
883// instruction counts with the following adjustments made:
884// * One vsetvli is considered free.
static const CostTblEntry VectorIntrinsicCostTable[]{
    {Intrinsic::floor, MVT::f32, 9},
887 {Intrinsic::floor, MVT::f64, 9},
888 {Intrinsic::ceil, MVT::f32, 9},
889 {Intrinsic::ceil, MVT::f64, 9},
890 {Intrinsic::trunc, MVT::f32, 7},
891 {Intrinsic::trunc, MVT::f64, 7},
892 {Intrinsic::round, MVT::f32, 9},
893 {Intrinsic::round, MVT::f64, 9},
894 {Intrinsic::roundeven, MVT::f32, 9},
895 {Intrinsic::roundeven, MVT::f64, 9},
896 {Intrinsic::rint, MVT::f32, 7},
897 {Intrinsic::rint, MVT::f64, 7},
898 {Intrinsic::lrint, MVT::i32, 1},
899 {Intrinsic::lrint, MVT::i64, 1},
900 {Intrinsic::llrint, MVT::i64, 1},
901 {Intrinsic::nearbyint, MVT::f32, 9},
902 {Intrinsic::nearbyint, MVT::f64, 9},
903 {Intrinsic::bswap, MVT::i16, 3},
904 {Intrinsic::bswap, MVT::i32, 12},
905 {Intrinsic::bswap, MVT::i64, 31},
906 {Intrinsic::vp_bswap, MVT::i16, 3},
907 {Intrinsic::vp_bswap, MVT::i32, 12},
908 {Intrinsic::vp_bswap, MVT::i64, 31},
909 {Intrinsic::vp_fshl, MVT::i8, 7},
910 {Intrinsic::vp_fshl, MVT::i16, 7},
911 {Intrinsic::vp_fshl, MVT::i32, 7},
912 {Intrinsic::vp_fshl, MVT::i64, 7},
913 {Intrinsic::vp_fshr, MVT::i8, 7},
914 {Intrinsic::vp_fshr, MVT::i16, 7},
915 {Intrinsic::vp_fshr, MVT::i32, 7},
916 {Intrinsic::vp_fshr, MVT::i64, 7},
917 {Intrinsic::bitreverse, MVT::i8, 17},
918 {Intrinsic::bitreverse, MVT::i16, 24},
919 {Intrinsic::bitreverse, MVT::i32, 33},
920 {Intrinsic::bitreverse, MVT::i64, 52},
921 {Intrinsic::vp_bitreverse, MVT::i8, 17},
922 {Intrinsic::vp_bitreverse, MVT::i16, 24},
923 {Intrinsic::vp_bitreverse, MVT::i32, 33},
924 {Intrinsic::vp_bitreverse, MVT::i64, 52},
925 {Intrinsic::ctpop, MVT::i8, 12},
926 {Intrinsic::ctpop, MVT::i16, 19},
927 {Intrinsic::ctpop, MVT::i32, 20},
928 {Intrinsic::ctpop, MVT::i64, 21},
929 {Intrinsic::ctlz, MVT::i8, 19},
930 {Intrinsic::ctlz, MVT::i16, 28},
931 {Intrinsic::ctlz, MVT::i32, 31},
932 {Intrinsic::ctlz, MVT::i64, 35},
933 {Intrinsic::cttz, MVT::i8, 16},
934 {Intrinsic::cttz, MVT::i16, 23},
935 {Intrinsic::cttz, MVT::i32, 24},
936 {Intrinsic::cttz, MVT::i64, 25},
937 {Intrinsic::vp_ctpop, MVT::i8, 12},
938 {Intrinsic::vp_ctpop, MVT::i16, 19},
939 {Intrinsic::vp_ctpop, MVT::i32, 20},
940 {Intrinsic::vp_ctpop, MVT::i64, 21},
941 {Intrinsic::vp_ctlz, MVT::i8, 19},
942 {Intrinsic::vp_ctlz, MVT::i16, 28},
943 {Intrinsic::vp_ctlz, MVT::i32, 31},
944 {Intrinsic::vp_ctlz, MVT::i64, 35},
945 {Intrinsic::vp_cttz, MVT::i8, 16},
946 {Intrinsic::vp_cttz, MVT::i16, 23},
947 {Intrinsic::vp_cttz, MVT::i32, 24},
948 {Intrinsic::vp_cttz, MVT::i64, 25},
949};
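// The table is keyed on the intrinsic ID and the *element* type of the
// legalized vector (see the CostTableLookup call in getIntrinsicInstrCost
// below), so e.g. a ctpop over i32 elements without Zvbb is charged 20 per
// legalized register group, scaled by LT.first when the type is split.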
950
static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
  switch (ID) {
953#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
954 case Intrinsic::VPID: \
955 return ISD::VPSD;
956#include "llvm/IR/VPIntrinsics.def"
957#undef HELPER_MAP_VPID_TO_VPSD
958 }
959 return ISD::DELETED_NODE;
960}
961
InstructionCost
RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                    TTI::TargetCostKind CostKind) {
  auto *RetTy = ICA.getReturnType();
966 switch (ICA.getID()) {
967 case Intrinsic::lrint:
968 case Intrinsic::llrint:
969 // We can't currently lower half or bfloat vector lrint/llrint.
970 if (auto *VecTy = dyn_cast<VectorType>(ICA.getArgTypes()[0]);
        VecTy && VecTy->getElementType()->is16bitFPTy())
      return InstructionCost::getInvalid();
    [[fallthrough]];
974 case Intrinsic::ceil:
975 case Intrinsic::floor:
976 case Intrinsic::trunc:
977 case Intrinsic::rint:
978 case Intrinsic::round:
979 case Intrinsic::roundeven: {
980 // These all use the same code.
    auto LT = getTypeLegalizationCost(RetTy);
    if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
983 return LT.first * 8;
984 break;
985 }
986 case Intrinsic::umin:
987 case Intrinsic::umax:
988 case Intrinsic::smin:
989 case Intrinsic::smax: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
992 return LT.first;
993
994 if (ST->hasVInstructions() && LT.second.isVector()) {
995 unsigned Op;
996 switch (ICA.getID()) {
997 case Intrinsic::umin:
998 Op = RISCV::VMINU_VV;
999 break;
1000 case Intrinsic::umax:
1001 Op = RISCV::VMAXU_VV;
1002 break;
1003 case Intrinsic::smin:
1004 Op = RISCV::VMIN_VV;
1005 break;
1006 case Intrinsic::smax:
1007 Op = RISCV::VMAX_VV;
1008 break;
1009 }
1010 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1011 }
1012 break;
1013 }
1014 case Intrinsic::sadd_sat:
1015 case Intrinsic::ssub_sat:
1016 case Intrinsic::uadd_sat:
1017 case Intrinsic::usub_sat: {
1018 auto LT = getTypeLegalizationCost(RetTy);
1019 if (ST->hasVInstructions() && LT.second.isVector()) {
1020 unsigned Op;
1021 switch (ICA.getID()) {
1022 case Intrinsic::sadd_sat:
1023 Op = RISCV::VSADD_VV;
1024 break;
1025 case Intrinsic::ssub_sat:
        Op = RISCV::VSSUB_VV;
1027 break;
1028 case Intrinsic::uadd_sat:
1029 Op = RISCV::VSADDU_VV;
1030 break;
1031 case Intrinsic::usub_sat:
1032 Op = RISCV::VSSUBU_VV;
1033 break;
1034 }
1035 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1036 }
1037 break;
1038 }
1039 case Intrinsic::fabs: {
1040 auto LT = getTypeLegalizationCost(RetTy);
1041 if (ST->hasVInstructions() && LT.second.isVector()) {
1042 // lui a0, 8
1043 // addi a0, a0, -1
1044 // vsetvli a1, zero, e16, m1, ta, ma
1045 // vand.vx v8, v8, a0
1046 // f16 with zvfhmin and bf16 with zvfhbmin
1047 if (LT.second.getVectorElementType() == MVT::bf16 ||
1048 (LT.second.getVectorElementType() == MVT::f16 &&
1049 !ST->hasVInstructionsF16()))
1050 return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
1051 CostKind) +
1052 2;
1053 else
1054 return LT.first *
1055 getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
1056 }
1057 break;
1058 }
1059 case Intrinsic::sqrt: {
1060 auto LT = getTypeLegalizationCost(RetTy);
1061 if (ST->hasVInstructions() && LT.second.isVector()) {
      SmallVector<unsigned, 4> ConvOp;
      SmallVector<unsigned, 2> FsqrtOp;
      MVT ConvType = LT.second;
1065 MVT FsqrtType = LT.second;
1066 // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
      // will be split.
1068 if (LT.second.getVectorElementType() == MVT::bf16) {
1069 if (LT.second == MVT::nxv32bf16) {
1070 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
1071 RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
1072 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1073 ConvType = MVT::nxv16f16;
1074 FsqrtType = MVT::nxv16f32;
1075 } else {
1076 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
1077 FsqrtOp = {RISCV::VFSQRT_V};
1078 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1079 }
1080 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1081 !ST->hasVInstructionsF16()) {
1082 if (LT.second == MVT::nxv32f16) {
1083 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
1084 RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
1085 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1086 ConvType = MVT::nxv16f16;
1087 FsqrtType = MVT::nxv16f32;
1088 } else {
1089 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
1090 FsqrtOp = {RISCV::VFSQRT_V};
1091 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1092 }
1093 } else {
1094 FsqrtOp = {RISCV::VFSQRT_V};
1095 }
1096
1097 return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) +
1098 getRISCVInstructionCost(ConvOp, ConvType, CostKind));
1099 }
1100 break;
1101 }
1102 case Intrinsic::cttz:
1103 case Intrinsic::ctlz:
1104 case Intrinsic::ctpop: {
1105 auto LT = getTypeLegalizationCost(RetTy);
1106 if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector()) {
1107 unsigned Op;
1108 switch (ICA.getID()) {
1109 case Intrinsic::cttz:
1110 Op = RISCV::VCTZ_V;
1111 break;
1112 case Intrinsic::ctlz:
1113 Op = RISCV::VCLZ_V;
1114 break;
1115 case Intrinsic::ctpop:
1116 Op = RISCV::VCPOP_V;
1117 break;
1118 }
1119 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1120 }
1121 break;
1122 }
1123 case Intrinsic::abs: {
1124 auto LT = getTypeLegalizationCost(RetTy);
1125 if (ST->hasVInstructions() && LT.second.isVector()) {
1126 // vrsub.vi v10, v8, 0
1127 // vmax.vv v8, v8, v10
1128 return LT.first *
1129 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
1130 LT.second, CostKind);
1131 }
1132 break;
1133 }
1134 case Intrinsic::get_active_lane_mask: {
1135 if (ST->hasVInstructions()) {
1136 Type *ExpRetTy = VectorType::get(
1137 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1138 auto LT = getTypeLegalizationCost(ExpRetTy);
1139
1140 // vid.v v8 // considered hoisted
1141 // vsaddu.vx v8, v8, a0
1142 // vmsltu.vx v0, v8, a1
1143 return LT.first *
1144 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
1145 LT.second, CostKind);
1146 }
1147 break;
1148 }
1149 // TODO: add more intrinsic
1150 case Intrinsic::stepvector: {
1151 auto LT = getTypeLegalizationCost(RetTy);
1152 // Legalisation of illegal types involves an `index' instruction plus
1153 // (LT.first - 1) vector adds.
1154 if (ST->hasVInstructions())
1155 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
1156 (LT.first - 1) *
1157 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
1158 return 1 + (LT.first - 1);
1159 }
1160 case Intrinsic::experimental_cttz_elts: {
1161 Type *ArgTy = ICA.getArgTypes()[0];
1162 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
1163 if (getTLI()->shouldExpandCttzElements(ArgType))
1164 break;
1165 InstructionCost Cost = getRISCVInstructionCost(
1166 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
1167
1168 // If zero_is_poison is false, then we will generate additional
1169 // cmp + select instructions to convert -1 to EVL.
1170 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
1171 if (ICA.getArgs().size() > 1 &&
1172 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
      Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
                                 CmpInst::ICMP_SLT, CostKind) +
              getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
                                 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1177
1178 return Cost;
1179 }
1180 case Intrinsic::vp_rint: {
1181 // RISC-V target uses at least 5 instructions to lower rounding intrinsics.
1182 unsigned Cost = 5;
1183 auto LT = getTypeLegalizationCost(RetTy);
1184 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
1185 return Cost * LT.first;
1186 break;
1187 }
1188 case Intrinsic::vp_nearbyint: {
    // One more read and one write of fflags compared to vp_rint.
1190 unsigned Cost = 7;
1191 auto LT = getTypeLegalizationCost(RetTy);
1192 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
1193 return Cost * LT.first;
1194 break;
1195 }
1196 case Intrinsic::vp_ceil:
1197 case Intrinsic::vp_floor:
1198 case Intrinsic::vp_round:
1199 case Intrinsic::vp_roundeven:
1200 case Intrinsic::vp_roundtozero: {
1201 // Rounding with static rounding mode needs two more instructions to
1202 // swap/write FRM than vp_rint.
1203 unsigned Cost = 7;
1204 auto LT = getTypeLegalizationCost(RetTy);
1205 unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
1206 if (TLI->isOperationCustom(VPISD, LT.second))
1207 return Cost * LT.first;
1208 break;
1209 }
1210 case Intrinsic::vp_fneg: {
    std::optional<unsigned> FOp =
        VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
1213 assert(FOp.has_value());
1214 return getArithmeticInstrCost(*FOp, ICA.getReturnType(), CostKind);
1215 break;
1216 }
1217 case Intrinsic::vp_select: {
1218 Intrinsic::ID IID = ICA.getID();
1219 std::optional<unsigned> FOp = VPIntrinsic::getFunctionalOpcodeForVP(IID);
1220 assert(FOp.has_value());
    return getCmpSelInstrCost(*FOp, ICA.getReturnType(), ICA.getArgTypes()[0],
                              CmpInst::BAD_ICMP_PREDICATE, CostKind);
  }
1224 case Intrinsic::vp_merge:
    return getCmpSelInstrCost(Instruction::Select, ICA.getReturnType(),
                              ICA.getArgTypes()[0], CmpInst::BAD_ICMP_PREDICATE,
                              CostKind);
1228 case Intrinsic::experimental_vp_splat: {
1229 auto LT = getTypeLegalizationCost(RetTy);
1230 // TODO: Lower i1 experimental_vp_splat
    if (!ST->hasVInstructions() || LT.second.getScalarType() == MVT::i1)
      return InstructionCost::getInvalid();
1233 return LT.first * getRISCVInstructionCost(LT.second.isFloatingPoint()
1234 ? RISCV::VFMV_V_F
1235 : RISCV::VMV_V_X,
1236 LT.second, CostKind);
1237 }
1238 }
1239
1240 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1241 if (auto LT = getTypeLegalizationCost(RetTy);
1242 LT.second.isVector()) {
1243 MVT EltTy = LT.second.getVectorElementType();
1244 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1245 ICA.getID(), EltTy))
1246 return LT.first * Entry->Cost;
1247 }
1248 }
1249
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
1252
InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                               Type *Src,
                                               TTI::CastContextHint CCH,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
1258 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1259 if (!IsVectorType)
1260 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1261
1262 // FIXME: Need to compute legalizing cost for illegal types. The current
1263 // code handles only legal types and those which can be trivially
1264 // promoted to legal.
1265 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1266 Dst->getScalarSizeInBits() > ST->getELen())
1267 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1268
1269 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1270 assert(ISD && "Invalid opcode");
1271 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1272 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1273
1274 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1275 // The shared implementation doesn't model vector widening during legalization
1276 // and instead assumes scalarization. In order to scalarize an <N x i1>
1277 // vector, we need to extend/trunc to/from i8. If we don't special case
1278 // this, we can get an infinite recursion cycle.
1279 switch (ISD) {
1280 default:
1281 break;
1282 case ISD::SIGN_EXTEND:
1283 case ISD::ZERO_EXTEND:
1284 if (Src->getScalarSizeInBits() == 1) {
1285 // We do not use vsext/vzext to extend from mask vector.
1286 // Instead we use the following instructions to extend from mask vector:
1287 // vmv.v.i v8, 0
1288 // vmerge.vim v8, v8, -1, v0 (repeated per split)
1289 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1290 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1291 DstLT.second, CostKind) +
1292 DstLT.first - 1;
1293 }
1294 break;
1295 case ISD::TRUNCATE:
1296 if (Dst->getScalarSizeInBits() == 1) {
      // We do not use a series of vncvt instructions to truncate to a mask
      // vector, so we cannot use PowDiff to calculate the cost.
1299 // Instead we use the following instructions to truncate to mask vector:
1300 // vand.vi v8, v8, 1
1301 // vmsne.vi v0, v8, 0
1302 return SrcLT.first *
1303 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1304 SrcLT.second, CostKind) +
1305 SrcLT.first - 1;
1306 }
1307 break;
1308 };
1309
1310 // Our actual lowering for the case where a wider legal type is available
1311 // uses promotion to the wider type. This is reflected in the result of
1312 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1313 // scalarized if the legalized Src and Dst are not equal sized.
1314 const DataLayout &DL = this->getDataLayout();
  if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
      !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
                           SrcLT.second.getSizeInBits()) ||
      !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
                           DstLT.second.getSizeInBits()))
1320 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1321
1322 // The split cost is handled by the base getCastInstrCost
1323 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1324
1325 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1326 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
1327 switch (ISD) {
1328 case ISD::SIGN_EXTEND:
1329 case ISD::ZERO_EXTEND: {
1330 if ((PowDiff < 1) || (PowDiff > 3))
1331 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1332 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1333 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1334 unsigned Op =
1335 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1336 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1337 }
1338 case ISD::TRUNCATE:
1339 case ISD::FP_EXTEND:
1340 case ISD::FP_ROUND: {
1341 // Counts of narrow/widen instructions.
1342 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1343 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1344
1345 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1346 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1347 : RISCV::VFNCVT_F_F_W;
    InstructionCost Cost = 0;
    for (; SrcEltSize != DstEltSize;) {
1350 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1351 ? MVT::getIntegerVT(DstEltSize)
1352 : MVT::getFloatingPointVT(DstEltSize);
1353 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1354 DstEltSize =
1355 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1356 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1357 }
1358 return Cost;
1359 }
1360 case ISD::FP_TO_SINT:
1361 case ISD::FP_TO_UINT: {
1362 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1363 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1364 unsigned FWCVT =
1365 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1366 unsigned FNCVT =
1367 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1368 unsigned SrcEltSize = Src->getScalarSizeInBits();
1369 unsigned DstEltSize = Dst->getScalarSizeInBits();
    InstructionCost Cost = 0;
    if ((SrcEltSize == 16) &&
1372 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1373 // If the target only supports zvfhmin or it is fp16-to-i64 conversion
1374 // pre-widening to f32 and then convert f32 to integer
1375 VectorType *VecF32Ty =
1376 VectorType::get(Type::getFloatTy(Dst->getContext()),
1377 cast<VectorType>(Dst)->getElementCount());
1378 std::pair<InstructionCost, MVT> VecF32LT =
1379 getTypeLegalizationCost(VecF32Ty);
1380 Cost +=
1381 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1382 VecF32LT.second, CostKind);
1383 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1384 return Cost;
1385 }
1386 if (DstEltSize == SrcEltSize)
1387 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1388 else if (DstEltSize > SrcEltSize)
1389 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1390 else { // (SrcEltSize > DstEltSize)
1391 // First do a narrowing conversion to an integer half the size, then
1392 // truncate if needed.
1393 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1394 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1395 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1396 if ((SrcEltSize / 2) > DstEltSize) {
1397 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1398 Cost +=
1399 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1400 }
1401 }
1402 return Cost;
1403 }
1404 case ISD::SINT_TO_FP:
1405 case ISD::UINT_TO_FP: {
1406 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
1407 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
1408 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
1409 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
1410 unsigned SrcEltSize = Src->getScalarSizeInBits();
1411 unsigned DstEltSize = Dst->getScalarSizeInBits();
1412
    InstructionCost Cost = 0;
    if ((DstEltSize == 16) &&
1415 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
1416 // If the target only supports zvfhmin or it is i64-to-fp16 conversion
1417 // it is converted to f32 and then converted to f16
1418 VectorType *VecF32Ty =
1419 VectorType::get(Type::getFloatTy(Dst->getContext()),
1420 cast<VectorType>(Dst)->getElementCount());
1421 std::pair<InstructionCost, MVT> VecF32LT =
1422 getTypeLegalizationCost(VecF32Ty);
1423 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
1424 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
1425 DstLT.second, CostKind);
1426 return Cost;
1427 }
1428
1429 if (DstEltSize == SrcEltSize)
1430 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1431 else if (DstEltSize > SrcEltSize) {
1432 if ((DstEltSize / 2) > SrcEltSize) {
1433 VectorType *VecTy =
1434 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
1435 cast<VectorType>(Dst)->getElementCount());
1436 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
1437 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
1438 }
1439 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1440 } else
1441 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
1442 return Cost;
1443 }
1444 }
1445 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1446}
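// Example: a sign extension from <vscale x 4 x i8> to <vscale x 4 x i32> has
// both types legal and PowDiff == 2, so it is costed as a single vsext.vf4 on
// the destination type, whereas an i8 -> i1 truncate takes the
// vand.vi + vmsne.vi path modelled earlier in this function.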
1447
1448unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
1449 if (isa<ScalableVectorType>(Ty)) {
1450 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1451 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1452 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1453 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1454 }
1455 return cast<FixedVectorType>(Ty)->getNumElements();
1456}
1457
InstructionCost
RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                     FastMathFlags FMF,
                                     TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1463 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1464
1465 // Skip if scalar size of Ty is bigger than ELEN.
1466 if (Ty->getScalarSizeInBits() > ST->getELen())
1467 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1468
1469 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1470 if (Ty->getElementType()->isIntegerTy(1)) {
1471 // SelectionDAGBuilder does following transforms:
1472 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1473 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
1474 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1475 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1476 else
1477 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1478 }
1479
1480 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
    SmallVector<unsigned, 3> Opcodes;
    InstructionCost ExtraCost = 0;
1483 switch (IID) {
1484 case Intrinsic::maximum:
1485 if (FMF.noNaNs()) {
1486 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1487 } else {
1488 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1489 RISCV::VFMV_F_S};
1490 // Cost of Canonical Nan + branch
1491 // lui a0, 523264
1492 // fmv.w.x fa0, a0
1493 Type *DstTy = Ty->getScalarType();
1494 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1495 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1496 ExtraCost = 1 +
                      getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
                                       TTI::CastContextHint::None, CostKind) +
                      getCFInstrCost(Instruction::Br, CostKind);
1500 }
1501 break;
1502
1503 case Intrinsic::minimum:
1504 if (FMF.noNaNs()) {
1505 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1506 } else {
1507 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1508 RISCV::VFMV_F_S};
1509 // Cost of Canonical Nan + branch
1510 // lui a0, 523264
1511 // fmv.w.x fa0, a0
1512 Type *DstTy = Ty->getScalarType();
1513 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1514 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1515 ExtraCost = 1 +
                      getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
                                       TTI::CastContextHint::None, CostKind) +
                      getCFInstrCost(Instruction::Br, CostKind);
1519 }
1520 break;
1521 }
1522 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1523 }
1524
  // An IR reduction is composed of one RVV reduction instruction plus a vmv
  unsigned SplitOp;
  SmallVector<unsigned, 3> Opcodes;
  switch (IID) {
1529 default:
1530 llvm_unreachable("Unsupported intrinsic");
1531 case Intrinsic::smax:
1532 SplitOp = RISCV::VMAX_VV;
1533 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1534 break;
1535 case Intrinsic::smin:
1536 SplitOp = RISCV::VMIN_VV;
1537 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
1538 break;
1539 case Intrinsic::umax:
1540 SplitOp = RISCV::VMAXU_VV;
1541 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1542 break;
1543 case Intrinsic::umin:
1544 SplitOp = RISCV::VMINU_VV;
1545 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
1546 break;
1547 case Intrinsic::maxnum:
1548 SplitOp = RISCV::VFMAX_VV;
1549 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1550 break;
1551 case Intrinsic::minnum:
1552 SplitOp = RISCV::VFMIN_VV;
1553 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1554 break;
1555 }
1556 // Add a cost for data larger than LMUL8
1557 InstructionCost SplitCost =
1558 (LT.first > 1) ? (LT.first - 1) *
1559 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1560 : 0;
1561 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1562}
1563
InstructionCost
RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                         std::optional<FastMathFlags> FMF,
                                         TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1569 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1570
1571 // Skip if scalar size of Ty is bigger than ELEN.
1572 if (Ty->getScalarSizeInBits() > ST->getELen())
1573 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1574
1575 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1576 assert(ISD && "Invalid opcode");
1577
1578 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1579 ISD != ISD::FADD)
1580 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1581
1582 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1583 Type *ElementTy = Ty->getElementType();
1584 if (ElementTy->isIntegerTy(1)) {
1585 // Example sequences:
1586 // vfirst.m a0, v0
1587 // seqz a0, a0
1588 if (LT.second == MVT::v1i1)
1589 return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
             getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
                                CmpInst::ICMP_EQ, CostKind);
1592
1593 if (ISD == ISD::AND) {
1594 // Example sequences:
1595 // vmand.mm v8, v9, v8 ; needed every time type is split
1596 // vmnot.m v8, v0 ; alias for vmnand
1597 // vcpop.m a0, v8
1598 // seqz a0, a0
1599
1600 // See the discussion: https://github.com/llvm/llvm-project/pull/119160
1601 // For LMUL <= 8, there is no splitting,
1602 // the sequences are vmnot, vcpop and seqz.
1603 // When LMUL > 8 and split = 1,
1604 // the sequences are vmnand, vcpop and seqz.
1605 // When LMUL > 8 and split > 1,
1606 // the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
1607 return ((LT.first > 2) ? (LT.first - 2) : 0) *
1608 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
1609 getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
1610 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
             getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
                                CmpInst::ICMP_EQ, CostKind);
1613 } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
1614 // Example sequences:
1615 // vsetvli a0, zero, e8, mf8, ta, ma
1616 // vmxor.mm v8, v0, v8 ; needed every time type is split
1617 // vcpop.m a0, v8
1618 // andi a0, a0, 1
1619 return (LT.first - 1) *
1620 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
1621 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
1622 } else {
1623 assert(ISD == ISD::OR);
1624 // Example sequences:
1625 // vsetvli a0, zero, e8, mf8, ta, ma
1626 // vmor.mm v8, v9, v8 ; needed every time type is split
1627 // vcpop.m a0, v0
1628 // snez a0, a0
1629 return (LT.first - 1) *
1630 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
1631 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
             getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
                                CmpInst::ICMP_NE, CostKind);
1634 }
1635 }
1636
  // An IR reduction of or/and is composed of one vmv and one RVV reduction
  // instruction; the others are composed of two vmv and one RVV reduction
  // instruction.
  unsigned SplitOp;
  SmallVector<unsigned, 3> Opcodes;
  switch (ISD) {
1643 case ISD::ADD:
1644 SplitOp = RISCV::VADD_VV;
1645 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
1646 break;
1647 case ISD::OR:
1648 SplitOp = RISCV::VOR_VV;
1649 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
1650 break;
1651 case ISD::XOR:
1652 SplitOp = RISCV::VXOR_VV;
1653 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
1654 break;
1655 case ISD::AND:
1656 SplitOp = RISCV::VAND_VV;
1657 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
1658 break;
1659 case ISD::FADD:
1660 // We can't promote f16/bf16 fadd reductions.
1661 if ((LT.second.getVectorElementType() == MVT::f16 &&
1662 !ST->hasVInstructionsF16()) ||
1663 LT.second.getVectorElementType() == MVT::bf16)
1664 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1665     if (TTI::requiresOrderedReduction(FMF)) {
1666       Opcodes.push_back(RISCV::VFMV_S_F);
1667 for (unsigned i = 0; i < LT.first.getValue(); i++)
1668 Opcodes.push_back(RISCV::VFREDOSUM_VS);
1669 Opcodes.push_back(RISCV::VFMV_F_S);
1670 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1671 }
1672 SplitOp = RISCV::VFADD_VV;
1673 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
1674 break;
1675 }
1676 // Add a cost for data larger than LMUL8
1677 InstructionCost SplitCost =
1678 (LT.first > 1) ? (LT.first - 1) *
1679 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1680 : 0;
1681 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1682}
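// For illustration (assuming LT.first == 2): an ordered, non-reassociable
// FADD reduction is costed by the loop above as VFMV_S_F + 2 x VFREDOSUM_VS
// + VFMV_F_S with no SplitOp term, whereas the unordered form is costed as
// one VFADD_VV split step plus {VFMV_S_F, VFREDUSUM_VS, VFMV_F_S}.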
1683
1684 InstructionCost RISCVTTIImpl::getExtendedReductionCost(
1685     unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1686     FastMathFlags FMF, TTI::TargetCostKind CostKind) {
1687 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1688 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1689 FMF, CostKind);
1690
1691 // Skip if scalar size of ResTy is bigger than ELEN.
1692 if (ResTy->getScalarSizeInBits() > ST->getELen())
1693 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1694 FMF, CostKind);
1695
1696 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1697 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1698 FMF, CostKind);
1699
1700 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1701
1702 if (IsUnsigned && Opcode == Instruction::Add &&
1703 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
1704 // Represent vector_reduce_add(ZExt(<n x i1>)) as
1705     // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
1706 return LT.first *
1707 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
1708 }
1709
1710 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1711 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1712 FMF, CostKind);
1713
1714 return (LT.first - 1) +
1715 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1716}
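// Example of the i1 special case above (hypothetical IR, fixed-length only):
//   %e = zext <32 x i1> %m to <32 x i32>
//   %r = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %e)
// is costed as LT.first * VCPOP_M, matching the ctpop(bitcast) lowering noted
// in the comment; other extending add/fadd reductions pay (LT.first - 1) plus
// the plain arithmetic reduction cost when the result is exactly twice as wide.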
1717
1718 InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
1719                                               TTI::OperandValueInfo OpInfo,
1720                                               TTI::TargetCostKind CostKind) {
1721 assert(OpInfo.isConstant() && "non constant operand?");
1722 if (!isa<VectorType>(Ty))
1723 // FIXME: We need to account for immediate materialization here, but doing
1724 // a decent job requires more knowledge about the immediate than we
1725 // currently have here.
1726 return 0;
1727
1728 if (OpInfo.isUniform())
1729 // vmv.v.i, vmv.v.x, or vfmv.v.f
1730 // We ignore the cost of the scalar constant materialization to be consistent
1731 // with how we treat scalar constants themselves just above.
1732 return 1;
1733
1734 return getConstantPoolLoadCost(Ty, CostKind);
1735}
1736
1737
1738 InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1739                                               MaybeAlign Alignment,
1740                                               unsigned AddressSpace,
1741                                               TTI::TargetCostKind CostKind,
1742                                               TTI::OperandValueInfo OpInfo,
1743                                               const Instruction *I) {
1744 EVT VT = TLI->getValueType(DL, Src, true);
1745 // Type legalization can't handle structs
1746 if (VT == MVT::Other)
1747 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1748 CostKind, OpInfo, I);
1749
1750   InstructionCost Cost = 0;
1751   if (Opcode == Instruction::Store && OpInfo.isConstant())
1752 Cost += getStoreImmCost(Src, OpInfo, CostKind);
1753
1754 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1755
1756 InstructionCost BaseCost = [&]() {
1757 InstructionCost Cost = LT.first;
1758     if (CostKind != TTI::TCK_RecipThroughput)
1759       return Cost;
1760
1761 // Our actual lowering for the case where a wider legal type is available
1762     // uses a VL-predicated load on the wider type. This is reflected in
1763 // the result of getTypeLegalizationCost, but BasicTTI assumes the
1764 // widened cases are scalarized.
1765 const DataLayout &DL = this->getDataLayout();
1766 if (Src->isVectorTy() && LT.second.isVector() &&
1767         TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
1768                             LT.second.getSizeInBits()))
1769 return Cost;
1770
1771 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1772 CostKind, OpInfo, I);
1773 }();
1774
1775 // Assume memory ops cost scale with the number of vector registers
1776 // possible accessed by the instruction. Note that BasicTTI already
1777 // handles the LT.first term for us.
1778 if (LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
1779 BaseCost *= TLI->getLMULCost(LT.second);
1780 return Cost + BaseCost;
1781
1782}
1783
1784 InstructionCost RISCVTTIImpl::getCmpSelInstrCost(
1785     unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
1786     TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
1787     TTI::OperandValueInfo Op2Info, const Instruction *I) {
1788   if (CostKind != TTI::TCK_RecipThroughput)
1789     return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1790 Op1Info, Op2Info, I);
1791
1792 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1793 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1794 Op1Info, Op2Info, I);
1795
1796 // Skip if scalar size of ValTy is bigger than ELEN.
1797 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
1798 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1799 Op1Info, Op2Info, I);
1800
1801 auto GetConstantMatCost =
1802 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
1803 if (OpInfo.isUniform())
1804       // We return 0 because we currently ignore the cost of materializing scalar
1805 // constants in GPRs.
1806 return 0;
1807
1808 return getConstantPoolLoadCost(ValTy, CostKind);
1809 };
1810
1811 InstructionCost ConstantMatCost;
1812 if (Op1Info.isConstant())
1813 ConstantMatCost += GetConstantMatCost(Op1Info);
1814 if (Op2Info.isConstant())
1815 ConstantMatCost += GetConstantMatCost(Op2Info);
1816
1817 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1818 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1819 if (CondTy->isVectorTy()) {
1820 if (ValTy->getScalarSizeInBits() == 1) {
1821 // vmandn.mm v8, v8, v9
1822 // vmand.mm v9, v0, v9
1823 // vmor.mm v0, v9, v8
1824 return ConstantMatCost +
1825 LT.first *
1826 getRISCVInstructionCost(
1827 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1828 LT.second, CostKind);
1829 }
1830 // vselect and max/min are supported natively.
1831 return ConstantMatCost +
1832 LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
1833 CostKind);
1834 }
1835
1836 if (ValTy->getScalarSizeInBits() == 1) {
1837 // vmv.v.x v9, a0
1838 // vmsne.vi v9, v9, 0
1839 // vmandn.mm v8, v8, v9
1840 // vmand.mm v9, v0, v9
1841 // vmor.mm v0, v9, v8
1842 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
1843 return ConstantMatCost +
1844 LT.first *
1845 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
1846 InterimVT, CostKind) +
1847 LT.first * getRISCVInstructionCost(
1848 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1849 LT.second, CostKind);
1850 }
1851
1852 // vmv.v.x v10, a0
1853 // vmsne.vi v0, v10, 0
1854 // vmerge.vvm v8, v9, v8, v0
1855 return ConstantMatCost +
1856 LT.first * getRISCVInstructionCost(
1857 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
1858 LT.second, CostKind);
1859 }
1860
1861 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
1862 CmpInst::isIntPredicate(VecPred)) {
1863 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
1864 // provided they incur the same cost across all implementations
1865 return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
1866 LT.second,
1867 CostKind);
1868 }
1869
1870 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
1871 CmpInst::isFPPredicate(VecPred)) {
1872
1873 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
1874 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
1875 return ConstantMatCost +
1876 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
1877
1878 // If we do not support the input floating point vector type, use the base
1879 // one which will calculate as:
1880 // ScalarizeCost + Num * Cost for fixed vector,
1881 // InvalidCost for scalable vector.
1882 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
1883 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
1884 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
1885 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1886 Op1Info, Op2Info, I);
1887
1888 // Assuming vector fp compare and mask instructions are all the same cost
1889 // until a need arises to differentiate them.
1890 switch (VecPred) {
1891 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
1892 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
1893 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
1894 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
1895 return ConstantMatCost +
1896 LT.first * getRISCVInstructionCost(
1897 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
1898 LT.second, CostKind);
1899
1900 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
1901 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
1902 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
1903 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
1904 return ConstantMatCost +
1905 LT.first *
1906 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
1907 LT.second, CostKind);
1908
1909 case CmpInst::FCMP_OEQ: // vmfeq.vv
1910 case CmpInst::FCMP_OGT: // vmflt.vv
1911 case CmpInst::FCMP_OGE: // vmfle.vv
1912 case CmpInst::FCMP_OLT: // vmflt.vv
1913 case CmpInst::FCMP_OLE: // vmfle.vv
1914 case CmpInst::FCMP_UNE: // vmfne.vv
1915 return ConstantMatCost +
1916 LT.first *
1917 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
1918 default:
1919 break;
1920 }
1921 }
1922
1923 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
1924   // instructions will lower to SELECT_CC and then to PseudoCCMOVGPR, which will
1925 // generate a conditional branch + mv. The cost of scalar (icmp + select) will
1926 // be (0 + select instr cost).
1927 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
1928 ValTy->isIntegerTy() && !I->user_empty()) {
1929 if (all_of(I->users(), [&](const User *U) {
1930 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
1931 U->getType()->isIntegerTy() &&
1932 !isa<ConstantData>(U->getOperand(1)) &&
1933 !isa<ConstantData>(U->getOperand(2));
1934 }))
1935 return 0;
1936 }
1937
1938 // TODO: Add cost for scalar type.
1939
1940 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1941 Op1Info, Op2Info, I);
1942}
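// Worked example for the FP-compare table above (assuming a single register
// group, LT.first == 1): a compare such as
//   %c = fcmp one <vscale x 4 x float> %a, %b
// is modeled as three mask operations (two vmflt.vv plus a vmor.mm), while an
// FCMP_OEQ is a single vmfeq.vv and is charged one VMFLT_VV-equivalent.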
1943
1944 InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
1945                                              TTI::TargetCostKind CostKind,
1946                                              const Instruction *I) {
1947   if (CostKind != TTI::TCK_RecipThroughput)
1948     return Opcode == Instruction::PHI ? 0 : 1;
1949 // Branches are assumed to be predicted.
1950 return 0;
1951}
1952
1953 InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1954                                                  TTI::TargetCostKind CostKind,
1955                                                  unsigned Index, Value *Op0,
1956 Value *Op1) {
1957 assert(Val->isVectorTy() && "This must be a vector type");
1958
1959 if (Opcode != Instruction::ExtractElement &&
1960 Opcode != Instruction::InsertElement)
1961 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1962
1963 // Legalize the type.
1964 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
1965
1966 // This type is legalized to a scalar type.
1967 if (!LT.second.isVector()) {
1968 auto *FixedVecTy = cast<FixedVectorType>(Val);
1969 // If Index is a known constant, cost is zero.
1970 if (Index != -1U)
1971 return 0;
1972 // Extract/InsertElement with non-constant index is very costly when
1973 // scalarized; estimate cost of loads/stores sequence via the stack:
1974 // ExtractElement cost: store vector to stack, load scalar;
1975 // InsertElement cost: store vector to stack, store scalar, load vector.
1976 Type *ElemTy = FixedVecTy->getElementType();
1977 auto NumElems = FixedVecTy->getNumElements();
1978 auto Align = DL.getPrefTypeAlign(ElemTy);
1979 InstructionCost LoadCost =
1980 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
1981 InstructionCost StoreCost =
1982 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
1983 return Opcode == Instruction::ExtractElement
1984 ? StoreCost * NumElems + LoadCost
1985 : (StoreCost + LoadCost) * NumElems + StoreCost;
1986 }
1987
1988 // For unsupported scalable vector.
1989 if (LT.second.isScalableVector() && !LT.first.isValid())
1990 return LT.first;
1991
1992 // Mask vector extract/insert is expanded via e8.
1993 if (Val->getScalarSizeInBits() == 1) {
1994 VectorType *WideTy =
1995         VectorType::get(IntegerType::get(Val->getContext(), 8),
1996                         cast<VectorType>(Val)->getElementCount());
1997 if (Opcode == Instruction::ExtractElement) {
1998 InstructionCost ExtendCost
1999 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2000                            TTI::CastContextHint::None, CostKind);
2001       InstructionCost ExtractCost
2002 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2003 return ExtendCost + ExtractCost;
2004 }
2005 InstructionCost ExtendCost
2006 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2007                          TTI::CastContextHint::None, CostKind);
2008     InstructionCost InsertCost
2009 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2010 InstructionCost TruncCost
2011 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
2012                          TTI::CastContextHint::None, CostKind);
2013     return ExtendCost + InsertCost + TruncCost;
2014 }
2015
2016
2017 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
2018 // and vslideup + vmv.s.x to insert element to vector.
2019 unsigned BaseCost = 1;
2020   // For insertelement, we need to add 1 to the index to form the input of vslideup.
2021 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
2022
2023 if (Index != -1U) {
2024 // The type may be split. For fixed-width vectors we can normalize the
2025 // index to the new type.
2026 if (LT.second.isFixedLengthVector()) {
2027 unsigned Width = LT.second.getVectorNumElements();
2028 Index = Index % Width;
2029 }
2030
2031 // If exact VLEN is known, we will insert/extract into the appropriate
2032 // subvector with no additional subvector insert/extract cost.
2033 if (auto VLEN = ST->getRealVLen()) {
2034 unsigned EltSize = LT.second.getScalarSizeInBits();
2035 unsigned M1Max = *VLEN / EltSize;
2036 Index = Index % M1Max;
2037 }
2038
2039 // We could extract/insert the first element without vslidedown/vslideup.
2040 if (Index == 0)
2041 SlideCost = 0;
2042 else if (Opcode == Instruction::InsertElement)
2043 SlideCost = 1; // With a constant index, we do not need to use addi.
2044 }
2045
2046   // When the vector needs to be split into multiple register groups and the
2047   // index exceeds a single vector register group, we need to insert/extract
2048   // the element via the stack.
2049 if (LT.first > 1 &&
2050 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
2051 LT.second.isScalableVector()))) {
2052 Type *ScalarType = Val->getScalarType();
2053 Align VecAlign = DL.getPrefTypeAlign(Val);
2054 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
2055 // Extra addi for unknown index.
2056 InstructionCost IdxCost = Index == -1U ? 1 : 0;
2057
2058 // Store all split vectors into stack and load the target element.
2059 if (Opcode == Instruction::ExtractElement)
2060 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2061 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
2062 CostKind) +
2063 IdxCost;
2064
2065 // Store all split vectors into stack and store the target element and load
2066 // vectors back.
2067 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2068 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2069 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2070 CostKind) +
2071 IdxCost;
2072 }
2073
2074   // Extracting an i64 element on a target with XLEN=32 needs more instructions.
2075 if (Val->getScalarType()->isIntegerTy() &&
2076 ST->getXLen() < Val->getScalarSizeInBits()) {
2077 // For extractelement, we need the following instructions:
2078 // vsetivli zero, 1, e64, m1, ta, mu (not count)
2079 // vslidedown.vx v8, v8, a0
2080 // vmv.x.s a0, v8
2081 // li a1, 32
2082 // vsrl.vx v8, v8, a1
2083 // vmv.x.s a1, v8
2084
2085 // For insertelement, we need the following instructions:
2086 // vsetivli zero, 2, e32, m4, ta, mu (not count)
2087 // vmv.v.i v12, 0
2088 // vslide1up.vx v16, v12, a1
2089 // vslide1up.vx v12, v16, a0
2090 // addi a0, a2, 1
2091 // vsetvli zero, a0, e64, m4, tu, mu (not count)
2092 // vslideup.vx v8, v12, a2
2093
2094 // TODO: should we count these special vsetvlis?
2095 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2096 }
2097 return BaseCost + SlideCost;
2098}
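// Illustrative costs from the logic above (assuming the element fits in XLEN
// and the index stays within one register group): extracting lane 0 is just
// the vmv.x.s (cost 1); extracting a non-zero constant lane is vslidedown.vi
// + vmv.x.s (cost 2); inserting at a non-zero constant lane is vslideup.vi +
// vmv.s.x (cost 2); inserting at an unknown index also pays the addi for
// index + 1 (cost 3).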
2099
2100 InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
2101     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2102     TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2103     ArrayRef<const Value *> Args, const Instruction *CxtI) {
2104
2105 // TODO: Handle more cost kinds.
2106   if (CostKind != TTI::TCK_RecipThroughput)
2107     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2108 Args, CxtI);
2109
2110 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2111 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2112 Args, CxtI);
2113
2114 // Skip if scalar size of Ty is bigger than ELEN.
2115 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
2116 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2117 Args, CxtI);
2118
2119 // Legalize the type.
2120 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2121
2122 // TODO: Handle scalar type.
2123 if (!LT.second.isVector())
2124 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2125 Args, CxtI);
2126
2127 // f16 with zvfhmin and bf16 will be promoted to f32.
2128 // FIXME: nxv32[b]f16 will be custom lowered and split.
2129 unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
2130 InstructionCost CastCost = 0;
2131 if ((LT.second.getVectorElementType() == MVT::f16 ||
2132 LT.second.getVectorElementType() == MVT::bf16) &&
2133 TLI->getOperationAction(ISDOpcode, LT.second) ==
2134           TargetLoweringBase::LegalizeAction::Promote) {
2135     MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
2136 Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
2137 Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2138 // Add cost of extending arguments
2139 CastCost += LT.first * Args.size() *
2140 getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
2141                                  TTI::CastContextHint::None, CostKind);
2142     // Add cost of truncating result
2143 CastCost +=
2144 LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
2145                                     TTI::CastContextHint::None, CostKind);
2146     // Compute cost of op in promoted type
2147 LT.second = PromotedVT;
2148 }
2149
2150 auto getConstantMatCost =
2151 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
2152 if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
2153 // Two sub-cases:
2154 // * Has a 5 bit immediate operand which can be splatted.
2155 // * Has a larger immediate which must be materialized in scalar register
2156 // We return 0 for both as we currently ignore the cost of materializing
2157 // scalar constants in GPRs.
2158 return 0;
2159
2160 return getConstantPoolLoadCost(Ty, CostKind);
2161 };
2162
2163 // Add the cost of materializing any constant vectors required.
2164 InstructionCost ConstantMatCost = 0;
2165 if (Op1Info.isConstant())
2166 ConstantMatCost += getConstantMatCost(0, Op1Info);
2167 if (Op2Info.isConstant())
2168 ConstantMatCost += getConstantMatCost(1, Op2Info);
2169
2170 unsigned Op;
2171 switch (ISDOpcode) {
2172 case ISD::ADD:
2173 case ISD::SUB:
2174 Op = RISCV::VADD_VV;
2175 break;
2176 case ISD::SHL:
2177 case ISD::SRL:
2178 case ISD::SRA:
2179 Op = RISCV::VSLL_VV;
2180 break;
2181 case ISD::AND:
2182 case ISD::OR:
2183 case ISD::XOR:
2184 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
2185 break;
2186 case ISD::MUL:
2187 case ISD::MULHS:
2188 case ISD::MULHU:
2189 Op = RISCV::VMUL_VV;
2190 break;
2191 case ISD::SDIV:
2192 case ISD::UDIV:
2193 Op = RISCV::VDIV_VV;
2194 break;
2195 case ISD::SREM:
2196 case ISD::UREM:
2197 Op = RISCV::VREM_VV;
2198 break;
2199 case ISD::FADD:
2200 case ISD::FSUB:
2201 Op = RISCV::VFADD_VV;
2202 break;
2203 case ISD::FMUL:
2204 Op = RISCV::VFMUL_VV;
2205 break;
2206 case ISD::FDIV:
2207 Op = RISCV::VFDIV_VV;
2208 break;
2209 case ISD::FNEG:
2210 Op = RISCV::VFSGNJN_VV;
2211 break;
2212 default:
2213 // Assuming all other instructions have the same cost until a need arises to
2214 // differentiate them.
2215 return CastCost + ConstantMatCost +
2216 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2217 Args, CxtI);
2218 }
2219
2220 InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
2221 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
2222 // ops are twice as expensive as integer ops. Do the same for vectors so
2223 // scalar floating point ops aren't cheaper than their vector equivalents.
2224 if (Ty->isFPOrFPVectorTy())
2225 InstrCost *= 2;
2226 return CastCost + ConstantMatCost + LT.first * InstrCost;
2227}
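// Example of the promotion path above (assuming f16 vectors with Zvfhmin but
// no Zvfh): an fadd is costed as FPExt of both operands + VFADD_VV on the
// promoted f32 type + FPTrunc of the result, and, like every FP op here, the
// vector op cost is doubled so it stays comparable to BasicTTIImpl's scalar
// FP costs.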
2228
2229// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
2230 InstructionCost RISCVTTIImpl::getPointersChainCost(
2231     ArrayRef<const Value *> Ptrs, const Value *Base,
2232     const TTI::PointersChainInfo &Info, Type *AccessTy,
2233     TTI::TargetCostKind CostKind) {
2234   InstructionCost Cost = 0;
2235 // In the basic model we take into account GEP instructions only
2236   // (although an alloca instruction, a plain value, constants and/or constant
2237   // expressions, PHIs, bitcasts, or whatever else is allowed to be used as a
2238   // pointer may appear here). Typically, if Base is not a GEP instruction and all the
2239 // pointers are relative to the same base address, all the rest are
2240   // either GEP instructions, PHIs, bitcasts or constants. When we have the same
2241   // base, we just calculate the cost of each non-Base GEP as an ADD operation
2242   // if any of its indices is non-constant.
2243   // If there are no known dependencies between the pointers, the cost is
2244   // the sum of the costs of the GEP instructions.
2245 for (auto [I, V] : enumerate(Ptrs)) {
2246 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
2247 if (!GEP)
2248 continue;
2249 if (Info.isSameBase() && V != Base) {
2250 if (GEP->hasAllConstantIndices())
2251 continue;
2252 // If the chain is unit-stride and BaseReg + stride*i is a legal
2253 // addressing mode, then presume the base GEP is sitting around in a
2254 // register somewhere and check if we can fold the offset relative to
2255 // it.
2256 unsigned Stride = DL.getTypeStoreSize(AccessTy);
2257 if (Info.isUnitStride() &&
2258 isLegalAddressingMode(AccessTy,
2259 /* BaseGV */ nullptr,
2260 /* BaseOffset */ Stride * I,
2261 /* HasBaseReg */ true,
2262 /* Scale */ 0,
2263 GEP->getType()->getPointerAddressSpace()))
2264 continue;
2265 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
2266 {TTI::OK_AnyValue, TTI::OP_None},
2267 {TTI::OK_AnyValue, TTI::OP_None}, {});
2268 } else {
2269 SmallVector<const Value *> Indices(GEP->indices());
2270 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
2271 Indices, AccessTy, CostKind);
2272 }
2273 }
2274 return Cost;
2275}
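// Sketch of the same-base case above (hypothetical IR):
//   %p1 = getelementptr i32, ptr %base, i64 1   ; all-constant indices: free
//   %p2 = getelementptr i32, ptr %base, i64 %i  ; non-constant index
// %p2 is still treated as free when the chain is unit-stride and the
// corresponding constant offset from the base register is a legal addressing
// mode; otherwise it is charged as one scalar Add.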
2276
2277 void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2278                                            TTI::UnrollingPreferences &UP,
2279                                            OptimizationRemarkEmitter *ORE) {
2280   // TODO: More tuning on benchmarks and metrics with changes as needed
2281 // would apply to all settings below to enable performance.
2282
2283
2284 if (ST->enableDefaultUnroll())
2285 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2286
2287   // Enable upper-bound unrolling universally; it is not dependent upon the conditions
2288 // below.
2289 UP.UpperBound = true;
2290
2291 // Disable loop unrolling for Oz and Os.
2292 UP.OptSizeThreshold = 0;
2293   UP.PartialOptSizeThreshold = 0;
2294   if (L->getHeader()->getParent()->hasOptSize())
2295 return;
2296
2297 SmallVector<BasicBlock *, 4> ExitingBlocks;
2298 L->getExitingBlocks(ExitingBlocks);
2299 LLVM_DEBUG(dbgs() << "Loop has:\n"
2300 << "Blocks: " << L->getNumBlocks() << "\n"
2301 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2302
2303   // Only allow one exit other than the latch. This acts as an early exit
2304 // as it mirrors the profitability calculation of the runtime unroller.
2305 if (ExitingBlocks.size() > 2)
2306 return;
2307
2308 // Limit the CFG of the loop body for targets with a branch predictor.
2309 // Allowing 4 blocks permits if-then-else diamonds in the body.
2310 if (L->getNumBlocks() > 4)
2311 return;
2312
2313 // Don't unroll vectorized loops, including the remainder loop
2314 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2315 return;
2316
2317 // Scan the loop: don't unroll loops with calls as this could prevent
2318 // inlining.
2319   InstructionCost Cost = 0;
2320   for (auto *BB : L->getBlocks()) {
2321 for (auto &I : *BB) {
2322 // Initial setting - Don't unroll loops containing vectorized
2323 // instructions.
2324 if (I.getType()->isVectorTy())
2325 return;
2326
2327 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2328 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2329 if (!isLoweredToCall(F))
2330 continue;
2331 }
2332 return;
2333 }
2334
2335 SmallVector<const Value *> Operands(I.operand_values());
2336       Cost += getInstructionCost(&I, Operands,
2337                                  TargetTransformInfo::TCK_SizeAndLatency);
2338     }
2339 }
2340
2341 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2342
2343 UP.Partial = true;
2344 UP.Runtime = true;
2345 UP.UnrollRemainder = true;
2346 UP.UnrollAndJam = true;
2347
2348   // Forcing unrolling of small loops can be very useful because of the branch
2349 // taken cost of the backedge.
2350 if (Cost < 12)
2351 UP.Force = true;
2352}
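// Net effect of the heuristics above (illustrative): a small one- or
// two-block loop with no calls and no vector code gets UpperBound, Partial,
// Runtime, UnrollRemainder and UnrollAndJam enabled, and if its summed
// size-and-latency cost stays under 12 it is also force-unrolled to avoid the
// taken-branch cost of the backedge.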
2353
2354 void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2355                                          TTI::PeelingPreferences &PP) {
2356   BaseT::getPeelingPreferences(L, SE, PP);
2357 }
2358
2359 unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
2360   if (Ty->isVectorTy()) {
2361 // f16 with only zvfhmin and bf16 will be promoted to f32
2362 Type *EltTy = cast<VectorType>(Ty)->getElementType();
2363 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
2364 EltTy->isBFloatTy())
2365       Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
2366                            cast<VectorType>(Ty));
2367
2368     TypeSize Size = DL.getTypeSizeInBits(Ty);
2369     if (Size.isScalable() && ST->hasVInstructions())
2370 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
2371
2372     if (ST->useRVVForFixedLengthVectors())
2373       return divideCeil(Size, ST->getRealMinVLen());
2374 }
2375
2376 return BaseT::getRegUsageForType(Ty);
2377}
2378
2379unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2380 if (SLPMaxVF.getNumOccurrences())
2381 return SLPMaxVF;
2382
2383   // Return how many elements can fit in getRegisterBitWidth. This is the
2384   // same routine as used in the LoopVectorizer. We should probably be
2385 // accounting for whether we actually have instructions with the right
2386 // lane type, but we don't have enough information to do that without
2387 // some additional plumbing which hasn't been justified yet.
2388 TypeSize RegWidth =
2389       getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
2390   // If no vector registers, or absurd element widths, disable
2391 // vectorization by returning 1.
2392 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
2393}
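// Worked example (assumed numbers): if getRegisterBitWidth reports 256 bits
// for fixed-width vectors, 32-bit elements give a maximum SLP VF of
// 256 / 32 = 8; the SLPMaxVF command-line override above takes precedence
// over this computation.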
2394
2395 TTI::AddressingModeKind
2396 RISCVTTIImpl::getPreferredAddressingMode(const Loop *L,
2397                                          ScalarEvolution *SE) const {
2398 if (ST->hasVendorXCVmem() && !ST->is64Bit())
2399 return TTI::AMK_PostIndexed;
2400
2401   return BaseT::getPreferredAddressingMode(L, SE);
2402 }
2403
2404 bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
2405                                  const TargetTransformInfo::LSRCost &C2) {
2406   // RISC-V specific: "instruction number" has 1st priority here.
2407 // If we need to emit adds inside the loop to add up base registers, then
2408 // we need at least one extra temporary register.
2409 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
2410 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
2411 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
2412 C1.NumIVMuls, C1.NumBaseAdds,
2413 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
2414 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
2415 C2.NumIVMuls, C2.NumBaseAdds,
2416 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
2417}
2418
2419 bool RISCVTTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) {
2420   auto *VTy = dyn_cast<VectorType>(DataTy);
2421 if (!VTy || VTy->isScalableTy())
2422 return false;
2423
2424 if (!isLegalMaskedLoadStore(DataTy, Alignment))
2425 return false;
2426
2427 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
2428 // scalarize these types with LMUL >= maximum fixed-length LMUL.
2429 if (VTy->getElementType()->isIntegerTy(8))
2430 if (VTy->getElementCount().getFixedValue() > 256)
2431 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
2433 return true;
2434}
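// Worked example for the i8 guard above (assuming a minimum VLEN of 128 and a
// maximum fixed-length LMUL of 8): a <512 x i8> vector is 4096 bits, and
// 4096 / 128 = 32 is not < 8, so expand-load is rejected for it, while i8
// vectors with 256 or fewer elements skip the check entirely.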
2435
2436 bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
2437   auto *VTy = dyn_cast<VectorType>(DataTy);
2438 if (!VTy || VTy->isScalableTy())
2439 return false;
2440
2441 if (!isLegalMaskedLoadStore(DataTy, Alignment))
2442 return false;
2443 return true;
2444}
2445
2446/// See if \p I should be considered for address type promotion. We check if \p
2447 /// I is a sext with the right type and used in memory accesses. If it is used in a
2448/// "complex" getelementptr, we allow it to be promoted without finding other
2449/// sext instructions that sign extended the same initial value. A getelementptr
2450/// is considered as "complex" if it has more than 2 operands.
2451 bool RISCVTTIImpl::shouldConsiderAddressTypePromotion(
2452     const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
2453 bool Considerable = false;
2454 AllowPromotionWithoutCommonHeader = false;
2455 if (!isa<SExtInst>(&I))
2456 return false;
2457 Type *ConsideredSExtType =
2458 Type::getInt64Ty(I.getParent()->getParent()->getContext());
2459 if (I.getType() != ConsideredSExtType)
2460 return false;
2461 // See if the sext is the one with the right type and used in at least one
2462 // GetElementPtrInst.
2463 for (const User *U : I.users()) {
2464 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
2465 Considerable = true;
2466 // A getelementptr is considered as "complex" if it has more than 2
2467 // operands. We will promote a SExt used in such complex GEP as we
2468 // expect some computation to be merged if they are done on 64 bits.
2469 if (GEPInst->getNumOperands() > 2) {
2470 AllowPromotionWithoutCommonHeader = true;
2471 break;
2472 }
2473 }
2474 }
2475 return Considerable;
2476}
2477
2478bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
2479 switch (Opcode) {
2480 case Instruction::Add:
2481 case Instruction::Sub:
2482 case Instruction::Mul:
2483 case Instruction::And:
2484 case Instruction::Or:
2485 case Instruction::Xor:
2486 case Instruction::FAdd:
2487 case Instruction::FSub:
2488 case Instruction::FMul:
2489 case Instruction::FDiv:
2490 case Instruction::ICmp:
2491 case Instruction::FCmp:
2492 return true;
2493 case Instruction::Shl:
2494 case Instruction::LShr:
2495 case Instruction::AShr:
2496 case Instruction::UDiv:
2497 case Instruction::SDiv:
2498 case Instruction::URem:
2499 case Instruction::SRem:
2500 case Instruction::Select:
2501 return Operand == 1;
2502 default:
2503 return false;
2504 }
2505}
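// Intuition for the operand restrictions above (illustrative): commutative
// ops like add have scalar forms for either side (vadd.vx), whereas for
// shifts, integer division/remainder and selects only the second operand has
// a scalar form (e.g. vsll.vx takes the shift amount as the scalar), hence
// the "Operand == 1" cases.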
2506
2507 bool RISCVTTIImpl::canSplatOperand(Instruction *I, int Operand) const {
2508   if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
2509 return false;
2510
2511 if (canSplatOperand(I->getOpcode(), Operand))
2512 return true;
2513
2514 auto *II = dyn_cast<IntrinsicInst>(I);
2515 if (!II)
2516 return false;
2517
2518 switch (II->getIntrinsicID()) {
2519 case Intrinsic::fma:
2520 case Intrinsic::vp_fma:
2521 case Intrinsic::fmuladd:
2522 case Intrinsic::vp_fmuladd:
2523 return Operand == 0 || Operand == 1;
2524 case Intrinsic::vp_shl:
2525 case Intrinsic::vp_lshr:
2526 case Intrinsic::vp_ashr:
2527 case Intrinsic::vp_udiv:
2528 case Intrinsic::vp_sdiv:
2529 case Intrinsic::vp_urem:
2530 case Intrinsic::vp_srem:
2531 case Intrinsic::ssub_sat:
2532 case Intrinsic::vp_ssub_sat:
2533 case Intrinsic::usub_sat:
2534 case Intrinsic::vp_usub_sat:
2535 case Intrinsic::vp_select:
2536 return Operand == 1;
2537 // These intrinsics are commutative.
2538 case Intrinsic::vp_add:
2539 case Intrinsic::vp_mul:
2540 case Intrinsic::vp_and:
2541 case Intrinsic::vp_or:
2542 case Intrinsic::vp_xor:
2543 case Intrinsic::vp_fadd:
2544 case Intrinsic::vp_fmul:
2545 case Intrinsic::vp_icmp:
2546 case Intrinsic::vp_fcmp:
2547 case Intrinsic::smin:
2548 case Intrinsic::vp_smin:
2549 case Intrinsic::umin:
2550 case Intrinsic::vp_umin:
2551 case Intrinsic::smax:
2552 case Intrinsic::vp_smax:
2553 case Intrinsic::umax:
2554 case Intrinsic::vp_umax:
2555 case Intrinsic::sadd_sat:
2556 case Intrinsic::vp_sadd_sat:
2557 case Intrinsic::uadd_sat:
2558 case Intrinsic::vp_uadd_sat:
2559 // These intrinsics have 'vr' versions.
2560 case Intrinsic::vp_sub:
2561 case Intrinsic::vp_fsub:
2562 case Intrinsic::vp_fdiv:
2563 return Operand == 0 || Operand == 1;
2564 default:
2565 return false;
2566 }
2567}
2568
2569/// Check if sinking \p I's operands to I's basic block is profitable, because
2570/// the operands can be folded into a target instruction, e.g.
2571/// splats of scalars can fold into vector instructions.
2572 bool RISCVTTIImpl::isProfitableToSinkOperands(
2573     Instruction *I, SmallVectorImpl<Use *> &Ops) const {
2574 using namespace llvm::PatternMatch;
2575
2576 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
2577 return false;
2578
2579   // Don't sink splat operands if the target prefers it. Some targets require
2580 // S2V transfer buffers and we can run out of them copying the same value
2581 // repeatedly.
2582 // FIXME: It could still be worth doing if it would improve vector register
2583 // pressure and prevent a vector spill.
2584 if (!ST->sinkSplatOperands())
2585 return false;
2586
2587 for (auto OpIdx : enumerate(I->operands())) {
2588 if (!canSplatOperand(I, OpIdx.index()))
2589 continue;
2590
2591 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
2592 // Make sure we are not already sinking this operand
2593 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
2594 continue;
2595
2596 // We are looking for a splat that can be sunk.
2597     if (!match(Op, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
2598                              m_Undef(), m_ZeroMask())))
2599 continue;
2600
2601 // Don't sink i1 splats.
2602 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
2603 continue;
2604
2605 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
2606 // and vector registers
2607 for (Use &U : Op->uses()) {
2608 Instruction *Insn = cast<Instruction>(U.getUser());
2609 if (!canSplatOperand(Insn, U.getOperandNo()))
2610 return false;
2611 }
2612
2613 Ops.push_back(&Op->getOperandUse(0));
2614 Ops.push_back(&OpIdx.value());
2615 }
2616 return true;
2617}
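// The kind of splat this looks for (hypothetical IR):
//   %head  = insertelement <4 x i32> poison, i32 %x, i32 0
//   %splat = shufflevector <4 x i32> %head, <4 x i32> poison, <4 x i32> zeroinitializer
//   %add   = add <4 x i32> %v, %splat
// Sinking %head/%splat next to %add lets instruction selection fold the
// scalar into a vadd.vx instead of keeping a splatted vector register live.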
2618
2619 TTI::MemCmpExpansionOptions
2620 RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
2621   TTI::MemCmpExpansionOptions Options;
2622   // TODO: Enable expansion when unaligned access is not supported after we fix
2623 // issues in ExpandMemcmp.
2624 if (!ST->enableUnalignedScalarMem())
2625 return Options;
2626
2627 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
2628 return Options;
2629
2630 Options.AllowOverlappingLoads = true;
2631 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
2632 Options.NumLoadsPerBlock = Options.MaxNumLoads;
2633 if (ST->is64Bit()) {
2634 Options.LoadSizes = {8, 4, 2, 1};
2635 Options.AllowedTailExpansions = {3, 5, 6};
2636 } else {
2637 Options.LoadSizes = {4, 2, 1};
2638 Options.AllowedTailExpansions = {3};
2639 }
2640 return Options;
2641}
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Size
Hexagon Common GEP
static cl::opt< int > InstrCost("inline-instr-cost", cl::Hidden, cl::init(5), cl::desc("Cost of a single instruction when inlining"))
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V)
uint64_t IntrinsicInst * II
#define P(N)
if(PassOpts->AAPipeline)
static bool isRepeatedConcatMask(ArrayRef< int > Mask, int &SubVectorSize)
static unsigned isM1OrSmaller(MVT VT)
static cl::opt< unsigned > SLPMaxVF("riscv-v-slp-max-vf", cl::desc("Overrides result used for getMaximumVF query which is used " "exclusively by SLP vectorizer."), cl::Hidden)
static cl::opt< unsigned > RVVRegisterWidthLMUL("riscv-v-register-bit-width-lmul", cl::desc("The LMUL to use for getRegisterBitWidth queries. Affects LMUL used " "by autovectorized code. Fractional LMULs are not supported."), cl::init(2), cl::Hidden)
static InstructionCost getIntImmCostImpl(const DataLayout &DL, const RISCVSubtarget *ST, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, bool FreeZeroes)
static VectorType * getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST, LLVMContext &C)
static const CostTblEntry VectorIntrinsicCostTable[]
static bool canUseShiftPair(Instruction *Inst, const APInt &Imm)
static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID)
This file defines a TargetTransformInfo::Concept conforming object specific to the RISC-V target mach...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
This file contains some templates that are useful if you are working with the STL at all.
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition: APInt.h:78
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:623
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
std::optional< unsigned > getVScaleForTuning() const
Definition: BasicTTIImpl.h:801
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
std::optional< unsigned > getMaxVScale() const
Definition: BasicTTIImpl.h:800
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:479
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:695
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:923
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:807
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:959
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:380
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:676
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
Definition: InstrTypes.h:690
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:679
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition: InstrTypes.h:688
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:677
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:678
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition: InstrTypes.h:687
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:681
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition: InstrTypes.h:684
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition: InstrTypes.h:685
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:680
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:682
@ ICMP_EQ
equal
Definition: InstrTypes.h:694
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:689
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition: InstrTypes.h:686
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition: InstrTypes.h:675
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:683
bool isFPPredicate() const
Definition: InstrTypes.h:780
bool isIntPredicate() const
Definition: InstrTypes.h:781
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:434
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:843
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:617
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:421
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:847
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
bool noNaNs() const
Definition: FMF.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
Definition: DerivedTypes.h:598
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:933
static InstructionCost getInvalid(CostType Val=0)
bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool isFixedLengthVector() const
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition: Operator.h:42
The optimization diagnostic interface.
unsigned getMaxLMULForFixedLengthVectors() const
bool hasVInstructionsF64() const
unsigned getRealMinVLen() const
bool useRVVForFixedLengthVectors() const
unsigned getXLen() const
bool hasConditionalMoveFusion() const
bool hasVInstructionsF16() const
bool hasVInstructions() const
std::optional< unsigned > getRealVLen() const
bool hasOptimizedSegmentLoadStore(unsigned NF) const
unsigned getRealMaxVLen() const
bool hasVInstructionsF32() const
unsigned getELen() const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool isLegalMaskedLoadStore(Type *DataType, Align Alignment)
bool canSplatOperand(Instruction *I, int Operand) const
Return true if the (vector) instruction I will be lowered to an instruction with a scalar splat opera...
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader)
See if I should be considered for address type promotion.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
std::optional< unsigned > getVScaleForTuning() const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, FastMathFlags FMF, TTI::TargetCostKind CostKind)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const
std::optional< unsigned > getMaxVScale() const
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind)
TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
bool shouldExpandReduction(const IntrinsicInst *II) const
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys)
InstructionCost getStoreImmCost(Type *VecTy, TTI::OperandValueInfo OpInfo, TTI::TargetCostKind CostKind)
Return the cost of materializing an immediate for a value operand of a store instruction.
bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment)
bool isLegalStridedLoadStore(Type *DataType, Align Alignment)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getRegUsageForType(Type *Ty)
bool isLegalMaskedGather(Type *DataType, Align Alignment)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpdInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isLegalMaskedScatter(Type *DataType, Align Alignment)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
bool hasActiveVectorLength(unsigned Opcode, Type *DataType, Align Alignment) const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getVRGatherVVCost(MVT VT) const
Return the cost of a vrgather.vv instruction for the type VT.
InstructionCost getVRGatherVICost(MVT VT) const
Return the cost of a vrgather.vi (or vx) instruction for the type VT.
static unsigned computeVLMAX(unsigned VectorBits, unsigned EltSize, unsigned MinSize)
bool shouldExpandCttzElements(EVT VT) const override
Return true if the @llvm.experimental.cttz.elts intrinsic should be expanded using generic code in Se...
InstructionCost getLMULCost(MVT VT) const
Return the cost of LMUL for linear operations.
InstructionCost getVSlideVICost(MVT VT) const
Return the cost of a vslidedown.vi or vslideup.vi instruction for the type VT.
MVT getContainerForFixedLengthVector(MVT VT) const
InstructionCost getVSlideVXCost(MVT VT) const
Return the cost of a vslidedown.vx or vslideup.vx instruction for the type VT.
static RISCVII::VLMUL getLMUL(MVT VT)
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace, const DataLayout &) const
Returns whether or not generating a interleaved load/store intrinsic for this type will be legal.
The main scalar evolution driver.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned getMaxExpandSizeMemcmp(bool OptSize) const
Get maximum # of load operations permitted for memcmp.
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
MVT getTypeToPromoteTo(unsigned Op, MVT VT) const
If the action for this operation is to promote, this method returns the ValueType to promote to.
const DataLayout & getDataLayout() const
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
bool isLoweredToCall(const Function *F) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind)
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:348
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
static IntegerType * getInt1Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old number of lanes.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
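A hedged sketch combining a few of the Type queries listed above; the helper function is illustrative:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
#include <cassert>
using namespace llvm;

// Illustrative only: for an integer (or vector-of-integer) type, build the
// corresponding type with double-width lanes, e.g. <4 x i16> -> <4 x i32>.
static Type *getDoubleWidthIntTy(Type *Ty) {
  assert(Ty->getScalarType()->isIntegerTy() && "expects an integer type");
  return Ty->getWithNewBitWidth(2 * Ty->getScalarSizeInBits());
}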
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:228
static std::optional< unsigned > getFunctionalOpcodeForVP(Intrinsic::ID ID)
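A hedged sketch, assuming the declaration above is VPIntrinsic::getFunctionalOpcodeForVP from llvm/IR/IntrinsicInst.h:

#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include <optional>
using namespace llvm;

// vp.add maps to the plain Instruction::Add opcode; VP intrinsics without a
// non-predicated IR counterpart yield std::nullopt.
static bool vpAddMapsToAdd() {
  std::optional<unsigned> Opc =
      VPIntrinsic::getFunctionalOpcodeForVP(Intrinsic::vp_add);
  return Opc && *Opc == Instruction::Add;
}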
LLVM Value Representation.
Definition: Value.h:74
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
Definition: DerivedTypes.h:665
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:460
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:232
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:218
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
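A hedged sketch of the fixed versus scalable size queries above; the values and the wrapper function are illustrative:

#include "llvm/Support/TypeSize.h"
#include <cstdint>
using namespace llvm;

// A fixed 128-bit quantity versus a scalable vscale x 128-bit quantity.
static void sizeQueries() {
  TypeSize Fixed = TypeSize::getFixed(128);
  TypeSize Scalable = TypeSize::getScalable(128);
  uint64_t MinBits = Scalable.getKnownMinValue(); // 128, the value at vscale == 1
  // A fixed/scalable ordering is "known" only if it holds for every vscale:
  bool KnownLE = TypeSize::isKnownLE(Fixed, Scalable); // true, since vscale >= 1
  bool KnownLT = TypeSize::isKnownLT(Scalable, Fixed); // false
  (void)MinBits; (void)KnownLE; (void)KnownLT;
}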
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
Definition: ISDOpcodes.h:841
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of type iN containing the high bits of the result.
Definition: ISDOpcodes.h:674
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
Definition: ISDOpcodes.h:920
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:599
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
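A hedged sketch using the matchers listed above to recognise the canonical insertelement-plus-shuffle broadcast idiom; the wrapper function is illustrative, and the shuffle mask itself is not inspected:

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Matches "shufflevector (insertelement undef/poison, %x, 0), undef/poison"
// and captures %x in Scalar.
static bool looksLikeSplatOf(Value *V, Value *&Scalar) {
  return match(V, m_Shuffle(m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt()),
                            m_Undef()));
}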
int getIntMatCost(const APInt &Val, unsigned Size, const MCSubtargetInfo &STI, bool CompressionCost, bool FreeZeroes)
static constexpr unsigned RVVBitsPerBlock
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition: MathExtras.h:353
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
Definition: LoopInfo.cpp:1097
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit version).
Definition: MathExtras.h:285
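Worked values for a few of the integer helpers above, written as simple assertions:

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
using namespace llvm;

static void mathHelperExamples() {
  assert(Log2_32_Ceil(7) == 3 && Log2_32_Ceil(8) == 3 && Log2_32_Ceil(9) == 4);
  assert(PowerOf2Ceil(33) == 64);
  assert(countr_zero(0x50u) == 4); // 0b1010000 has four trailing zeros
  assert(isShiftedMask_64(0x00F0) && !isShiftedMask_64(0x0101));
}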
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1952
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
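Hedged examples of the masks these two helpers produce (assuming both come from llvm/Analysis/VectorUtils.h); the expected contents follow directly from their descriptions:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;

static void maskExamples() {
  // Interleave two 4-element vectors: {0, 4, 1, 5, 2, 6, 3, 7}.
  SmallVector<int, 16> Interleave = createInterleaveMask(/*VF=*/4, /*NumVecs=*/2);
  // Every second element starting at index 1, four results: {1, 3, 5, 7}.
  SmallVector<int, 16> Stride = createStrideMask(/*Start=*/1, /*Stride=*/2, /*VF=*/4);
  (void)Interleave; (void)Stride;
}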
DWARFExpression::Operation Op
InstructionCost Cost
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition: STLExtras.h:2067
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:35
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Describe known properties for a set of pointers.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number of loop iterations is not known at compile time).
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
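A hedged sketch of how a target hook might fill in some of these fields; the values are illustrative, not what this file actually chooses:

#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

// Illustrative only: enable partial/runtime unrolling and forbid unrolling
// when optimizing for size; real targets tune these per subtarget.
static void tuneUnrolling(TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Partial = true;
  UP.Runtime = true;
  UP.UnrollRemainder = true;
  UP.UpperBound = true;
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
}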