1//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "ARMSubtarget.h"
12#include "llvm/ADT/APInt.h"
18#include "llvm/IR/BasicBlock.h"
19#include "llvm/IR/DataLayout.h"
21#include "llvm/IR/Instruction.h"
24#include "llvm/IR/Intrinsics.h"
25#include "llvm/IR/IntrinsicsARM.h"
27#include "llvm/IR/Type.h"
37#include <algorithm>
38#include <cassert>
39#include <cstdint>
40#include <optional>
41#include <utility>
42
43using namespace llvm;
44
45#define DEBUG_TYPE "armtti"
46
47static cl::opt<bool> EnableMaskedLoadStores(
48 "enable-arm-maskedldst", cl::Hidden, cl::init(true),
49 cl::desc("Enable the generation of masked loads and stores"));
50
51static cl::opt<bool> DisableLowOverheadLoops(
52 "disable-arm-loloops", cl::Hidden, cl::init(false),
53 cl::desc("Disable the generation of low-overhead loops"));
54
55static cl::opt<bool>
56 AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
57 cl::desc("Enable the generation of WLS loops"));
58
59extern cl::opt<TailPredication::Mode> EnableTailPredication;
60
61extern cl::opt<bool> EnableMaskedGatherScatters;
62
63extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
64
65/// Convert a vector load intrinsic into a simple llvm load instruction.
66/// This is beneficial when the underlying object being addressed comes
67/// from a constant, since we get constant-folding for free.
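/// As a rough illustration (the IR names here are purely for exposition), a
/// sufficiently aligned call such as
///   %v = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr %p, i32 16)
/// can be rewritten as an ordinary load
///   %v = load <4 x i32>, ptr %p, align 16
/// which the rest of InstCombine can then constant-fold through.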
68static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
69 InstCombiner::BuilderTy &Builder) {
70 auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
71
72 if (!IntrAlign)
73 return nullptr;
74
75 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
76 ? MemAlign
77 : IntrAlign->getLimitedValue();
78
79 if (!isPowerOf2_32(Alignment))
80 return nullptr;
81
82 auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
83 PointerType::get(II.getType(), 0));
84 return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
85}
86
87bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
88 const Function *Callee) const {
89 const TargetMachine &TM = getTLI()->getTargetMachine();
90 const FeatureBitset &CallerBits =
91 TM.getSubtargetImpl(*Caller)->getFeatureBits();
92 const FeatureBitset &CalleeBits =
93 TM.getSubtargetImpl(*Callee)->getFeatureBits();
94
95 // To inline a callee, all features not in the allowed list must match exactly.
96 bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
97 (CalleeBits & ~InlineFeaturesAllowed);
98 // For features in the allowed list, the callee's features must be a subset of
99 // the callers'.
100 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
101 (CalleeBits & InlineFeaturesAllowed);
102 return MatchExact && MatchSubset;
103}
104
105TTI::AddressingModeKind
106ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
107 ScalarEvolution *SE) const {
108 if (ST->hasMVEIntegerOps())
109 return TTI::AMK_PostIndexed;
110
111 if (L->getHeader()->getParent()->hasOptSize())
112 return TTI::AMK_None;
113
114 if (ST->isMClass() && ST->isThumb2() &&
115 L->getNumBlocks() == 1)
116 return TTI::AMK_PreIndexed;
117
118 return TTI::AMK_None;
119}
120
121std::optional<Instruction *>
122ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
123 using namespace PatternMatch;
124 Intrinsic::ID IID = II.getIntrinsicID();
125 switch (IID) {
126 default:
127 break;
128 case Intrinsic::arm_neon_vld1: {
129 Align MemAlign =
130 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
131 &IC.getAssumptionCache(), &IC.getDominatorTree());
132 if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
133 return IC.replaceInstUsesWith(II, V);
134 }
135 break;
136 }
137
138 case Intrinsic::arm_neon_vld2:
139 case Intrinsic::arm_neon_vld3:
140 case Intrinsic::arm_neon_vld4:
141 case Intrinsic::arm_neon_vld2lane:
142 case Intrinsic::arm_neon_vld3lane:
143 case Intrinsic::arm_neon_vld4lane:
144 case Intrinsic::arm_neon_vst1:
145 case Intrinsic::arm_neon_vst2:
146 case Intrinsic::arm_neon_vst3:
147 case Intrinsic::arm_neon_vst4:
148 case Intrinsic::arm_neon_vst2lane:
149 case Intrinsic::arm_neon_vst3lane:
150 case Intrinsic::arm_neon_vst4lane: {
151 Align MemAlign =
152 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
153 &IC.getAssumptionCache(), &IC.getDominatorTree());
154 unsigned AlignArg = II.arg_size() - 1;
155 Value *AlignArgOp = II.getArgOperand(AlignArg);
156 MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
157 if (Align && *Align < MemAlign) {
158 return IC.replaceOperand(
159 II, AlignArg,
160 ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
161 false));
162 }
163 break;
164 }
165
166 case Intrinsic::arm_mve_pred_i2v: {
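    // A predicate that makes a round trip through the v2i/i2v intrinsics is a
    // no-op. Illustrative IR (assuming matching <4 x i1> types):
    //   %i = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %p)
    //   %v = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %i)
    // Here %v simplifies to %p, which is the first fold below.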
167 Value *Arg = II.getArgOperand(0);
168 Value *ArgArg;
169 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
170 PatternMatch::m_Value(ArgArg))) &&
171 II.getType() == ArgArg->getType()) {
172 return IC.replaceInstUsesWith(II, ArgArg);
173 }
174 Constant *XorMask;
175 if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
176 PatternMatch::m_Value(ArgArg)),
177 PatternMatch::m_Constant(XorMask))) &&
178 II.getType() == ArgArg->getType()) {
179 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
180 if (CI->getValue().trunc(16).isAllOnes()) {
181 auto TrueVector = IC.Builder.CreateVectorSplat(
182 cast<FixedVectorType>(II.getType())->getNumElements(),
183 IC.Builder.getTrue());
184 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
185 }
186 }
187 }
188 KnownBits ScalarKnown(32);
189 if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
190 ScalarKnown, 0)) {
191 return &II;
192 }
193 break;
194 }
195 case Intrinsic::arm_mve_pred_v2i: {
196 Value *Arg = II.getArgOperand(0);
197 Value *ArgArg;
198 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
199 PatternMatch::m_Value(ArgArg)))) {
200 return IC.replaceInstUsesWith(II, ArgArg);
201 }
202 if (!II.getMetadata(LLVMContext::MD_range)) {
203 Type *IntTy32 = Type::getInt32Ty(II.getContext());
204 Metadata *M[] = {
205 ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
206 ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
207 II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
208 return &II;
209 }
210 break;
211 }
212 case Intrinsic::arm_mve_vadc:
213 case Intrinsic::arm_mve_vadc_predicated: {
214 unsigned CarryOp =
215 (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
216 assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
217 "Bad type for intrinsic!");
218
219 KnownBits CarryKnown(32);
220 if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
221 CarryKnown)) {
222 return &II;
223 }
224 break;
225 }
226 case Intrinsic::arm_mve_vmldava: {
227 Instruction *I = cast<Instruction>(&II);
228 if (I->hasOneUse()) {
229 auto *User = cast<Instruction>(*I->user_begin());
230 Value *OpZ;
231 if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
232 match(I->getOperand(3), m_Zero())) {
233 Value *OpX = I->getOperand(4);
234 Value *OpY = I->getOperand(5);
235 Type *OpTy = OpX->getType();
236
237 IC.Builder.SetInsertPoint(User);
238 Value *V =
239 IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
240 {I->getOperand(0), I->getOperand(1),
241 I->getOperand(2), OpZ, OpX, OpY});
242
243 IC.replaceInstUsesWith(*User, V);
244 return IC.eraseInstFromFunction(*User);
245 }
246 }
247 return std::nullopt;
248 }
249 }
250 return std::nullopt;
251}
252
253std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
254 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
255 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
256 std::function<void(Instruction *, unsigned, APInt, APInt &)>
257 SimplifyAndSetOp) const {
258
259 // Compute the demanded bits for a narrowing MVE intrinsic. The TopOpc is the
260 // opcode specifying a Top/Bottom instruction, which can change between
261 // instructions.
262 auto SimplifyNarrowInstrTopBottom =[&](unsigned TopOpc) {
263 unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
264 unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
265
266 // Only the odd or even lanes of operand 0 will be demanded, depending
267 // on whether this is a top or bottom instruction.
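    // For example, with NumElts == 8 and IsTop == 1 the splat below produces
    // the mask 0b01010101: only the even (pass-through) lanes of operand 0
    // remain demanded, since a top instruction rewrites the odd lanes.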
268 APInt DemandedElts =
269 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
270 : APInt::getHighBitsSet(2, 1));
271 SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
272 // The other lanes will be defined from the inserted elements.
273 UndefElts &= APInt::getSplat(NumElts, !IsTop ? APInt::getLowBitsSet(2, 1)
274 : APInt::getHighBitsSet(2, 1));
275 return std::nullopt;
276 };
277
278 switch (II.getIntrinsicID()) {
279 default:
280 break;
281 case Intrinsic::arm_mve_vcvt_narrow:
282 SimplifyNarrowInstrTopBottom(2);
283 break;
284 case Intrinsic::arm_mve_vqmovn:
285 SimplifyNarrowInstrTopBottom(4);
286 break;
287 case Intrinsic::arm_mve_vshrn:
288 SimplifyNarrowInstrTopBottom(7);
289 break;
290 }
291
292 return std::nullopt;
293}
294
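// A rough guide to the ARM/Thumb2 buckets below: a non-negative 16-bit value
// or anything encodable as a (possibly inverted) modified immediate costs a
// single instruction, while a constant such as 0x12345678 needs a movw/movt
// pair on v6T2-capable cores (cost 2) or a constant-pool load otherwise
// (cost 3).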
295InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
296 TTI::TargetCostKind CostKind) {
297 assert(Ty->isIntegerTy());
298
299 unsigned Bits = Ty->getPrimitiveSizeInBits();
300 if (Bits == 0 || Imm.getActiveBits() >= 64)
301 return 4;
302
303 int64_t SImmVal = Imm.getSExtValue();
304 uint64_t ZImmVal = Imm.getZExtValue();
305 if (!ST->isThumb()) {
306 if ((SImmVal >= 0 && SImmVal < 65536) ||
307 (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
308 (ARM_AM::getSOImmVal(~ZImmVal) != -1))
309 return 1;
310 return ST->hasV6T2Ops() ? 2 : 3;
311 }
312 if (ST->isThumb2()) {
313 if ((SImmVal >= 0 && SImmVal < 65536) ||
314 (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
315 (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
316 return 1;
317 return ST->hasV6T2Ops() ? 2 : 3;
318 }
319 // Thumb1: any i8 immediate costs 1.
320 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
321 return 1;
322 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
323 return 2;
324 // Load from constantpool.
325 return 3;
326}
327
328// Constants smaller than 256 fit in the immediate field of
329// Thumb1 instructions, so they cost 0; everything else costs 1.
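// For instance, an immediate of 255 is free here, whereas 256 (or any
// negative value) is charged one instruction.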
330InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
331 const APInt &Imm, Type *Ty) {
332 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
333 return 0;
334
335 return 1;
336}
337
338// Checks whether Inst is part of a min(max()) or max(min()) pattern
339// that will match to an SSAT instruction. Returns the instruction being
340// saturated, or null if no saturation pattern was found.
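// As an illustrative select-based example, a clamp of %x to [-128, 127],
// i.e. smax(smin(%x, 127), -128), queried with Imm == -128 matches this
// pattern (an 8-bit ssat), and %x is returned as the value being saturated.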
341static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
342 Value *LHS, *RHS;
343 ConstantInt *C;
344 SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
345
346 if (InstSPF == SPF_SMAX &&
347 PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
348 C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
349
350 auto isSSatMin = [&](Value *MinInst) {
351 if (isa<SelectInst>(MinInst)) {
352 Value *MinLHS, *MinRHS;
353 ConstantInt *MinC;
354 SelectPatternFlavor MinSPF =
355 matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
356 if (MinSPF == SPF_SMIN &&
357 PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
358 MinC->getValue() == ((-Imm) - 1))
359 return true;
360 }
361 return false;
362 };
363
364 if (isSSatMin(Inst->getOperand(1)))
365 return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
366 if (Inst->hasNUses(2) &&
367 (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
368 return Inst->getOperand(1);
369 }
370 return nullptr;
371}
372
373// Look for an FP saturation pattern, where the instruction can be simplified
374// to a fptosi.sat: max(min(fptosi)). The constant in this case is always free.
375static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
376 if (Imm.getBitWidth() != 64 ||
377 Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
378 return false;
379 Value *FP = isSSATMinMaxPattern(Inst, Imm);
380 if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
381 FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
382 if (!FP)
383 return false;
384 return isa<FPToSIInst>(FP);
385}
386
387InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
388 const APInt &Imm, Type *Ty,
389 TTI::TargetCostKind CostKind,
390 Instruction *Inst) {
391 // Division by a constant can be turned into multiplication, but only if we
392 // know it's constant. So it's not so much that the immediate is cheap (it's
393 // not), but that the alternative is worse.
394 // FIXME: this is probably unneeded with GlobalISel.
395 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
396 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
397 Idx == 1)
398 return 0;
399
400 // Leave any gep offsets for the CodeGenPrepare, which will do a better job at
401 // splitting any large offsets.
402 if (Opcode == Instruction::GetElementPtr && Idx != 0)
403 return 0;
404
405 if (Opcode == Instruction::And) {
406 // UXTB/UXTH
407 if (Imm == 255 || Imm == 65535)
408 return 0;
409 // Conversion to BIC is free, and means we can use ~Imm instead.
410 return std::min(getIntImmCost(Imm, Ty, CostKind),
411 getIntImmCost(~Imm, Ty, CostKind));
412 }
413
414 if (Opcode == Instruction::Add)
415 // Conversion to SUB is free, and means we can use -Imm instead.
416 return std::min(getIntImmCost(Imm, Ty, CostKind),
417 getIntImmCost(-Imm, Ty, CostKind));
418
419 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
420 Ty->getIntegerBitWidth() == 32) {
421 int64_t NegImm = -Imm.getSExtValue();
422 if (ST->isThumb2() && NegImm < 1<<12)
423 // icmp X, #-C -> cmn X, #C
424 return 0;
425 if (ST->isThumb() && NegImm < 1<<8)
426 // icmp X, #-C -> adds X, #C
427 return 0;
428 }
429
430 // xor a, -1 can always be folded to MVN
431 if (Opcode == Instruction::Xor && Imm.isAllOnes())
432 return 0;
433
434 // Ensures negative constant of min(max()) or max(min()) patterns that
435 // match to SSAT instructions don't get hoisted
436 if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
437 Ty->getIntegerBitWidth() <= 32) {
438 if (isSSATMinMaxPattern(Inst, Imm) ||
439 (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
440 isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
441 return 0;
442 }
443
444 if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
445 return 0;
446
447 // We can convert <= -1 to < 0, which is generally quite cheap.
448 if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
449 ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
450 if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
451 return std::min(getIntImmCost(Imm, Ty, CostKind),
452 getIntImmCost(Imm + 1, Ty, CostKind));
453 }
454
455 return getIntImmCost(Imm, Ty, CostKind);
456}
457
458InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
459 TTI::TargetCostKind CostKind,
460 const Instruction *I) {
461 if (CostKind == TTI::TCK_RecipThroughput &&
462 (ST->hasNEON() || ST->hasMVEIntegerOps())) {
463 // FIXME: The vectorizer is highly sensitive to the cost of these
464 // instructions, which suggests that it may be using the costs incorrectly.
465 // But, for now, just make them free to avoid performance regressions for
466 // vector targets.
467 return 0;
468 }
469 return BaseT::getCFInstrCost(Opcode, CostKind, I);
470}
471
472InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
473 Type *Src,
474 TTI::CastContextHint CCH,
475 TTI::TargetCostKind CostKind,
476 const Instruction *I) {
477 int ISD = TLI->InstructionOpcodeToISD(Opcode);
478 assert(ISD && "Invalid opcode");
479
480 // TODO: Allow non-throughput costs that aren't binary.
481 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
482 if (CostKind != TTI::TCK_RecipThroughput)
483 return Cost == 0 ? 0 : 1;
484 return Cost;
485 };
486 auto IsLegalFPType = [this](EVT VT) {
487 EVT EltVT = VT.getScalarType();
488 return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
489 (EltVT == MVT::f64 && ST->hasFP64()) ||
490 (EltVT == MVT::f16 && ST->hasFullFP16());
491 };
492
493 EVT SrcTy = TLI->getValueType(DL, Src);
494 EVT DstTy = TLI->getValueType(DL, Dst);
495
496 if (!SrcTy.isSimple() || !DstTy.isSimple())
497 return AdjustCost(
498 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
499
500 // Extending masked loads/truncating masked stores are expensive because we
501 // currently don't split them. This means that we'll likely end up
502 // loading/storing each element individually (hence the high cost).
503 if ((ST->hasMVEIntegerOps() &&
504 (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
505 Opcode == Instruction::SExt)) ||
506 (ST->hasMVEFloatOps() &&
507 (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
508 IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
509 if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
510 return 2 * DstTy.getVectorNumElements() *
511 ST->getMVEVectorCostFactor(CostKind);
512
513 // The extend of other kinds of load is free
514 if (CCH == TTI::CastContextHint::Normal ||
515 CCH == TTI::CastContextHint::Masked) {
516 static const TypeConversionCostTblEntry LoadConversionTbl[] = {
529 };
530 if (const auto *Entry = ConvertCostTableLookup(
531 LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
532 return AdjustCost(Entry->Cost);
533
534 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
541 // The following extend from a legal type to an illegal type, so we need to
542 // split the load. This introduces an extra load operation, but the
543 // extend is still "free".
550 };
551 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
552 if (const auto *Entry =
553 ConvertCostTableLookup(MVELoadConversionTbl, ISD,
554 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
555 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
556 }
557
558 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
559 // FPExtends are similar but also require the VCVT instructions.
562 };
563 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
564 if (const auto *Entry =
565 ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
566 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
567 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
568 }
569
570 // The truncate of a store is free. This is the mirror of extends above.
571 static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
579 };
580 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
581 if (const auto *Entry =
582 ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
583 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
584 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
585 }
586
587 static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
590 };
591 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
592 if (const auto *Entry =
593 ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
594 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
595 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
596 }
597 }
598
599 // NEON vector operations that can extend their inputs.
600 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
601 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
602 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
603 // vaddl
606 // vsubl
609 // vmull
612 // vshll
615 };
616
617 auto *User = cast<Instruction>(*I->user_begin());
618 int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
619 if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
620 DstTy.getSimpleVT(),
621 SrcTy.getSimpleVT())) {
622 return AdjustCost(Entry->Cost);
623 }
624 }
625
626 // Single to/from double precision conversions.
627 if (Src->isVectorTy() && ST->hasNEON() &&
628 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
629 DstTy.getScalarType() == MVT::f32) ||
630 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
631 DstTy.getScalarType() == MVT::f64))) {
632 static const CostTblEntry NEONFltDblTbl[] = {
633 // Vector fptrunc/fpext conversions.
637
638 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
639 if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
640 return AdjustCost(LT.first * Entry->Cost);
641 }
642
643 // Some arithmetic, load and store operations have specific instructions
644 // to cast up/down their types automatically at no extra cost.
645 // TODO: Get these tables to know at least what the related operations are.
646 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
653
654 // The number of vmovl instructions for the extension.
673
674 // Operations that we legalize using splitting.
677
678 // Vector float <-> i32 conversions.
681
702
709
710 // Vector double <-> i32 conversions.
713
720
727 };
728
729 if (SrcTy.isVector() && ST->hasNEON()) {
730 if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
731 DstTy.getSimpleVT(),
732 SrcTy.getSimpleVT()))
733 return AdjustCost(Entry->Cost);
734 }
735
736 // Scalar float to integer conversions.
737 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
758 };
759 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
760 if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
761 DstTy.getSimpleVT(),
762 SrcTy.getSimpleVT()))
763 return AdjustCost(Entry->Cost);
764 }
765
766 // Scalar integer to float conversions.
767 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
788 };
789
790 if (SrcTy.isInteger() && ST->hasNEON()) {
791 if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
792 ISD, DstTy.getSimpleVT(),
793 SrcTy.getSimpleVT()))
794 return AdjustCost(Entry->Cost);
795 }
796
797 // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
798 // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
799 // are linearised so take more.
800 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
813 };
814
815 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
816 if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
817 ISD, DstTy.getSimpleVT(),
818 SrcTy.getSimpleVT()))
819 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
820 }
821
822 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
823 // As a general rule, fp converts that were not matched above are scalarized
824 // and cost 1 vcvt for each lane, so long as the instruction is available.
825 // If not it will become a series of function calls.
826 const InstructionCost CallCost =
827 getCallInstrCost(nullptr, Dst, {Src}, CostKind);
828 int Lanes = 1;
829 if (SrcTy.isFixedLengthVector())
830 Lanes = SrcTy.getVectorNumElements();
831
832 if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
833 return Lanes;
834 else
835 return Lanes * CallCost;
836 }
837
838 if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
839 SrcTy.isFixedLengthVector()) {
840 // Treat a truncate with a larger than legal source (128 bits for MVE) as
841 // expensive, 2 instructions per lane.
842 if ((SrcTy.getScalarType() == MVT::i8 ||
843 SrcTy.getScalarType() == MVT::i16 ||
844 SrcTy.getScalarType() == MVT::i32) &&
845 SrcTy.getSizeInBits() > 128 &&
846 SrcTy.getSizeInBits() > DstTy.getSizeInBits())
847 return SrcTy.getVectorNumElements() * 2;
848 }
849
850 // Scalar integer conversion costs.
851 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
852 // i16 -> i64 requires two dependent operations.
854
855 // Truncates on i64 are assumed to be free.
860 };
861
862 if (SrcTy.isInteger()) {
863 if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
864 DstTy.getSimpleVT(),
865 SrcTy.getSimpleVT()))
866 return AdjustCost(Entry->Cost);
867 }
868
869 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
870 ? ST->getMVEVectorCostFactor(CostKind)
871 : 1;
872 return AdjustCost(
873 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
874}
875
876InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
877 TTI::TargetCostKind CostKind,
878 unsigned Index, Value *Op0,
879 Value *Op1) {
880 // Penalize inserting into a D-subregister. We end up with a three times
881 // lower estimated throughput on Swift.
882 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
883 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
884 return 3;
885
886 if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
887 Opcode == Instruction::ExtractElement)) {
888 // Cross-class copies are expensive on many microarchitectures,
889 // so assume they are expensive by default.
890 if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
891 return 3;
892
893 // Even if it's not a cross class copy, this likely leads to mixing
894 // of NEON and VFP code and should be therefore penalized.
895 if (ValTy->isVectorTy() &&
896 ValTy->getScalarSizeInBits() <= 32)
897 return std::max<InstructionCost>(
898 BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1),
899 2U);
900 }
901
902 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
903 Opcode == Instruction::ExtractElement)) {
904 // Integer cross-lane moves are more expensive than float, which can
905 // sometimes just be vmovs. Integer ones involve being passed to GPR
906 // registers, causing more of a delay.
907 std::pair<InstructionCost, MVT> LT =
908 getTypeLegalizationCost(ValTy->getScalarType());
909 return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
910 }
911
912 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
913}
914
915InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
916 Type *CondTy,
917 CmpInst::Predicate VecPred,
918 TTI::TargetCostKind CostKind,
919 const Instruction *I) {
920 int ISD = TLI->InstructionOpcodeToISD(Opcode);
921
922 // Thumb scalar code size cost for select.
923 if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
924 ST->isThumb() && !ValTy->isVectorTy()) {
925 // Assume expensive structs.
926 if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
927 return TTI::TCC_Expensive;
928
929 // Select costs can vary because they:
930 // - may require one or more conditional mov (including an IT),
931 // - can't operate directly on immediates,
932 // - require live flags, which we can't copy around easily.
933 InstructionCost Cost = TTI::TCC_Basic;
934
935 // Possible IT instruction for Thumb2, or more for Thumb1.
936 ++Cost;
937
938 // i1 values may need rematerialising by using mov immediates and/or
939 // flag setting instructions.
940 if (ValTy->isIntegerTy(1))
941 ++Cost;
942
943 return Cost;
944 }
945
946 // If this is a vector min/max/abs, use the cost of that intrinsic directly
947 // instead. Hopefully when min/max intrinsics are more prevalent this code
948 // will not be needed.
949 const Instruction *Sel = I;
950 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
951 Sel->hasOneUse())
952 Sel = cast<Instruction>(Sel->user_back());
953 if (Sel && ValTy->isVectorTy() &&
954 (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
955 const Value *LHS, *RHS;
956 SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
957 unsigned IID = 0;
958 switch (SPF) {
959 case SPF_ABS:
960 IID = Intrinsic::abs;
961 break;
962 case SPF_SMIN:
963 IID = Intrinsic::smin;
964 break;
965 case SPF_SMAX:
966 IID = Intrinsic::smax;
967 break;
968 case SPF_UMIN:
969 IID = Intrinsic::umin;
970 break;
971 case SPF_UMAX:
972 IID = Intrinsic::umax;
973 break;
974 case SPF_FMINNUM:
975 IID = Intrinsic::minnum;
976 break;
977 case SPF_FMAXNUM:
978 IID = Intrinsic::maxnum;
979 break;
980 default:
981 break;
982 }
983 if (IID) {
984 // The ICmp is free, the select gets the cost of the min/max/etc
985 if (Sel != I)
986 return 0;
987 IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
988 return getIntrinsicInstrCost(CostAttrs, CostKind);
989 }
990 }
991
992 // On NEON a vector select gets lowered to vbsl.
993 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
994 // Lowering of some vector selects is currently far from perfect.
995 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
996 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
999 };
1000
1001 EVT SelCondTy = TLI->getValueType(DL, CondTy);
1002 EVT SelValTy = TLI->getValueType(DL, ValTy);
1003 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1004 if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
1005 SelCondTy.getSimpleVT(),
1006 SelValTy.getSimpleVT()))
1007 return Entry->Cost;
1008 }
1009
1010 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1011 return LT.first;
1012 }
1013
1014 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1015 (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1016 cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
1017 FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
1018 FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
1019 if (!VecCondTy)
1020 VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
1021
1022 // If we don't have mve.fp, any fp operations will need to be scalarized.
1023 if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1024 // One scalarization insert, one scalarization extract and the cost of
1025 // the fcmps.
1026 return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
1027 /*Extract*/ true, CostKind) +
1028 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1029 /*Extract*/ false, CostKind) +
1030 VecValTy->getNumElements() *
1031 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1032 VecCondTy->getScalarType(), VecPred,
1033 CostKind, I);
1034 }
1035
1036 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1037 int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1038 // There are two types - the input that specifies the type of the compare
1039 // and the output vXi1 type. Because we don't know how the output will be
1040 // split, we may need an expensive shuffle to get two in sync. This has the
1041 // effect of making larger than legal compares (v8i32 for example)
1042 // expensive.
1043 if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1044 if (LT.first > 1)
1045 return LT.first * BaseCost +
1046 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1047 /*Extract*/ false, CostKind);
1048 return BaseCost;
1049 }
1050 }
1051
1052 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1053 // for "multiple beats" potentially needed by MVE instructions.
1054 int BaseCost = 1;
1055 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1056 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1057
1058 return BaseCost *
1059 BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1060}
1061
1063 ScalarEvolution *SE,
1064 const SCEV *Ptr) {
1065 // Address computations in vectorized code with non-consecutive addresses will
1066 // likely result in more instructions compared to scalar code where the
1067 // computation can more often be merged into the index mode. The resulting
1068 // extra micro-ops can significantly decrease throughput.
1069 unsigned NumVectorInstToHideOverhead = 10;
1070 int MaxMergeDistance = 64;
1071
1072 if (ST->hasNEON()) {
1073 if (Ty->isVectorTy() && SE &&
1074 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1075 return NumVectorInstToHideOverhead;
1076
1077 // In many cases the address computation is not merged into the instruction
1078 // addressing mode.
1079 return 1;
1080 }
1081 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1082}
1083
1085 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1086 // If a VCTP is part of a chain, it's already profitable and shouldn't be
1087 // optimized, else LSR may block tail-predication.
1088 switch (II->getIntrinsicID()) {
1089 case Intrinsic::arm_mve_vctp8:
1090 case Intrinsic::arm_mve_vctp16:
1091 case Intrinsic::arm_mve_vctp32:
1092 case Intrinsic::arm_mve_vctp64:
1093 return true;
1094 default:
1095 break;
1096 }
1097 }
1098 return false;
1099}
1100
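// For example, a <4 x i32> access with 4-byte alignment or any <16 x i8>
// access is treated as legal here, while an <8 x i16> access with alignment 1,
// or any 2-element vector (which would need a v2i1 predicate), is not.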
1101bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
1102 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1103 return false;
1104
1105 if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1106 // Don't support v2i1 yet.
1107 if (VecTy->getNumElements() == 2)
1108 return false;
1109
1110 // We don't support extending fp types.
1111 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1112 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1113 return false;
1114 }
1115
1116 unsigned EltWidth = DataTy->getScalarSizeInBits();
1117 return (EltWidth == 32 && Alignment >= 4) ||
1118 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1119}
1120
1122 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1123 return false;
1124
1125 unsigned EltWidth = Ty->getScalarSizeInBits();
1126 return ((EltWidth == 32 && Alignment >= 4) ||
1127 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1128}
1129
1130/// Given a memcpy/memset/memmove instruction, return the number of memory
1131/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1132/// call is used.
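/// For example, a small word-aligned memcpy might be lowered to a handful of
/// i32 loads and stores; each type chosen by findOptimalMemOpLowering is then
/// counted twice (one load plus one store), except for memset where the
/// factor is 1.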
1133int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1134 MemOp MOp;
1135 unsigned DstAddrSpace = ~0u;
1136 unsigned SrcAddrSpace = ~0u;
1137 const Function *F = I->getParent()->getParent();
1138
1139 if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1140 ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1141 // If 'size' is not a constant, a library call will be generated.
1142 if (!C)
1143 return -1;
1144
1145 const unsigned Size = C->getValue().getZExtValue();
1146 const Align DstAlign = *MC->getDestAlign();
1147 const Align SrcAlign = *MC->getSourceAlign();
1148
1149 MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1150 /*IsVolatile*/ false);
1151 DstAddrSpace = MC->getDestAddressSpace();
1152 SrcAddrSpace = MC->getSourceAddressSpace();
1153 }
1154 else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1155 ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1156 // If 'size' is not a constant, a library call will be generated.
1157 if (!C)
1158 return -1;
1159
1160 const unsigned Size = C->getValue().getZExtValue();
1161 const Align DstAlign = *MS->getDestAlign();
1162
1163 MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1164 /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1165 DstAddrSpace = MS->getDestAddressSpace();
1166 }
1167 else
1168 llvm_unreachable("Expected a memcpy/move or memset!");
1169
1170 unsigned Limit, Factor = 2;
1171 switch(I->getIntrinsicID()) {
1172 case Intrinsic::memcpy:
1173 Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1174 break;
1175 case Intrinsic::memmove:
1176 Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1177 break;
1178 case Intrinsic::memset:
1179 Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1180 Factor = 1;
1181 break;
1182 default:
1183 llvm_unreachable("Expected a memcpy/move or memset!");
1184 }
1185
1186 // MemOps will be populated with a list of data types that need to be
1187 // loaded and stored. That's why we multiply the number of elements by 2 to
1188 // get the cost for this memcpy.
1189 std::vector<EVT> MemOps;
1190 if (getTLI()->findOptimalMemOpLowering(
1191 MemOps, Limit, MOp, DstAddrSpace,
1192 SrcAddrSpace, F->getAttributes()))
1193 return MemOps.size() * Factor;
1194
1195 // If we can't find an optimal memop lowering, return the default cost
1196 return -1;
1197}
1198
1199InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1200 int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1201
1202 // To model the cost of a library call, we assume 1 for the call, and
1203 // 3 for the argument setup.
1204 if (NumOps == -1)
1205 return 4;
1206 return NumOps;
1207}
1208
1209InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1210 VectorType *Tp, ArrayRef<int> Mask,
1211 TTI::TargetCostKind CostKind,
1212 int Index, VectorType *SubTp,
1213 ArrayRef<const Value *> Args) {
1214 Kind = improveShuffleKindFromMask(Kind, Mask);
1215 if (ST->hasNEON()) {
1216 if (Kind == TTI::SK_Broadcast) {
1217 static const CostTblEntry NEONDupTbl[] = {
1218 // VDUP handles these cases.
1225
1230
1231 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1232 if (const auto *Entry =
1233 CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1234 return LT.first * Entry->Cost;
1235 }
1236 if (Kind == TTI::SK_Reverse) {
1237 static const CostTblEntry NEONShuffleTbl[] = {
1238 // Reverse shuffle costs one instruction if we are shuffling within a
1239 // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1246
1251
1252 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1253 if (const auto *Entry =
1254 CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1255 return LT.first * Entry->Cost;
1256 }
1257 if (Kind == TTI::SK_Select) {
1258 static const CostTblEntry NEONSelShuffleTbl[] = {
1259 // Select shuffle cost table for ARM. Cost is the number of
1260 // instructions required to create the shuffled vector.
1261
1262
1267
1271
1273
1275
1276 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1277 if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1278 ISD::VECTOR_SHUFFLE, LT.second))
1279 return LT.first * Entry->Cost;
1280 }
1281 }
1282 if (ST->hasMVEIntegerOps()) {
1283 if (Kind == TTI::SK_Broadcast) {
1284 static const CostTblEntry MVEDupTbl[] = {
1285 // VDUP handles these cases.
1291
1292 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1293 if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1294 LT.second))
1295 return LT.first * Entry->Cost *
1296 ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
1297 }
1298
1299 if (!Mask.empty()) {
1300 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1301 if (LT.second.isVector() &&
1302 Mask.size() <= LT.second.getVectorNumElements() &&
1303 (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1304 isVREVMask(Mask, LT.second, 64)))
1305 return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
1306 }
1307 }
1308
1309 int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1310 ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
1311 : 1;
1312 return BaseCost *
1313 BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
1314}
1315
1316InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1317 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1318 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1319 ArrayRef<const Value *> Args,
1320 const Instruction *CxtI) {
1321 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1322 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1323 // Make operations on i1 relatively expensive as this often involves
1324 // combining predicates. AND and XOR should be easier to handle with IT
1325 // blocks.
1326 switch (ISDOpcode) {
1327 default:
1328 break;
1329 case ISD::AND:
1330 case ISD::XOR:
1331 return 2;
1332 case ISD::OR:
1333 return 3;
1334 }
1335 }
1336
1337 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1338
1339 if (ST->hasNEON()) {
1340 const unsigned FunctionCallDivCost = 20;
1341 const unsigned ReciprocalDivCost = 10;
1342 static const CostTblEntry CostTbl[] = {
1343 // Division.
1344 // These costs are somewhat random. Choose a cost of 20 to indicate that
1345 // vectorizing division (added function call) is going to be very expensive.
1346 // Double registers types.
1347 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1348 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1349 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1350 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1351 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1352 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1353 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1354 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1355 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1356 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1357 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1358 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1359 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1360 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1361 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1362 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1363 // Quad register types.
1364 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1365 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1366 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1367 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1368 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1369 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1370 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1371 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1372 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1373 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1374 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1375 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1376 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1377 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1378 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1379 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1380 // Multiplication.
1381 };
1382
1383 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1384 return LT.first * Entry->Cost;
1385
1386 InstructionCost Cost = BaseT::getArithmeticInstrCost(
1387 Opcode, Ty, CostKind, Op1Info, Op2Info);
1388
1389 // This is somewhat of a hack. The problem that we are facing is that SROA
1390 // creates a sequence of shift, and, or instructions to construct values.
1391 // These sequences are recognized by ISel and have zero cost. Not so for
1392 // the vectorized code. Because we have support for v2i64 but not i64 those
1393 // sequences look particularly beneficial to vectorize.
1394 // To work around this we increase the cost of v2i64 operations to make them
1395 // seem less beneficial.
1396 if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
1397 Cost += 4;
1398
1399 return Cost;
1400 }
1401
1402 // If this operation is a shift on arm/thumb2, it might well be folded into
1403 // the following instruction, hence having a cost of 0.
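  // For example, "add r0, r1, r2, lsl #2" folds the shift into the add, so a
  // single-use shl feeding one of the instructions listed below is modelled
  // as free.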
1404 auto LooksLikeAFreeShift = [&]() {
1405 if (ST->isThumb1Only() || Ty->isVectorTy())
1406 return false;
1407
1408 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1409 return false;
1410 if (!Op2Info.isUniform() || !Op2Info.isConstant())
1411 return false;
1412
1413 // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1414 switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1415 case Instruction::Add:
1416 case Instruction::Sub:
1417 case Instruction::And:
1418 case Instruction::Xor:
1419 case Instruction::Or:
1420 case Instruction::ICmp:
1421 return true;
1422 default:
1423 return false;
1424 }
1425 };
1426 if (LooksLikeAFreeShift())
1427 return 0;
1428
1429 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1430 // for "multiple beats" potentially needed by MVE instructions.
1431 int BaseCost = 1;
1432 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1433 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1434
1435 // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1436 // without treating floats as more expensive than scalars or increasing the
1437 // costs for custom operations. The result is also multiplied by the
1438 // MVEVectorCostFactor where appropriate.
1439 if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1440 return LT.first * BaseCost;
1441
1442 // Else this is expand, assume that we need to scalarize this op.
1443 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1444 unsigned Num = VTy->getNumElements();
1445 InstructionCost Cost =
1446 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1447 // Return the cost of multiple scalar invocations plus the cost of
1448 // inserting and extracting the values.
1449 SmallVector<Type *> Tys(Args.size(), Ty);
1450 return BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind) +
1451 Num * Cost;
1452 }
1453
1454 return BaseCost;
1455}
1456
1457InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1458 MaybeAlign Alignment,
1459 unsigned AddressSpace,
1460 TTI::TargetCostKind CostKind,
1461 TTI::OperandValueInfo OpInfo,
1462 const Instruction *I) {
1463 // TODO: Handle other cost kinds.
1464 if (CostKind != TTI::TCK_RecipThroughput)
1465 return 1;
1466
1467 // Type legalization can't handle structs
1468 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1469 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1470 CostKind);
1471
1472 if (ST->hasNEON() && Src->isVectorTy() &&
1473 (Alignment && *Alignment != Align(16)) &&
1474 cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1475 // Unaligned loads/stores are extremely inefficient.
1476 // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
1477 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1478 return LT.first * 4;
1479 }
1480
1481 // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1482 // Same for stores.
1483 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1484 ((Opcode == Instruction::Load && I->hasOneUse() &&
1485 isa<FPExtInst>(*I->user_begin())) ||
1486 (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1487 FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1488 Type *DstTy =
1489 Opcode == Instruction::Load
1490 ? (*I->user_begin())->getType()
1491 : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1492 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1493 DstTy->getScalarType()->isFloatTy())
1494 return ST->getMVEVectorCostFactor(CostKind);
1495 }
1496
1497 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1498 ? ST->getMVEVectorCostFactor(CostKind)
1499 : 1;
1500 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1501 CostKind, OpInfo, I);
1502}
1503
1504InstructionCost
1505ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1506 unsigned AddressSpace,
1507 TTI::TargetCostKind CostKind) {
1508 if (ST->hasMVEIntegerOps()) {
1509 if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1510 return ST->getMVEVectorCostFactor(CostKind);
1511 if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1512 return ST->getMVEVectorCostFactor(CostKind);
1513 }
1514 if (!isa<FixedVectorType>(Src))
1515 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1516 CostKind);
1517 // Scalar cost, which is currently very high due to the inefficiency of the
1518 // generated code.
1519 return cast<FixedVectorType>(Src)->getNumElements() * 8;
1520}
1521
1522InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1523 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1524 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1525 bool UseMaskForCond, bool UseMaskForGaps) {
1526 assert(Factor >= 2 && "Invalid interleave factor");
1527 assert(isa<VectorType>(VecTy) && "Expect a vector type");
1528
1529 // vldN/vstN doesn't support vector types of i64/f64 element.
1530 bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1531
1532 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1533 !UseMaskForCond && !UseMaskForGaps) {
1534 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1535 auto *SubVecTy =
1536 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1537
1538 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1539 // Accesses having vector types that are a multiple of 128 bits can be
1540 // matched to more than one vldN/vstN instruction.
1541 int BaseCost =
1542 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1543 if (NumElts % Factor == 0 &&
1544 TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1545 return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1546
1547 // Some smaller than legal interleaved patterns are cheap as we can make
1548 // use of the vmovn or vrev patterns to interleave a standard load. This is
1549 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1550 // promoted differently). The cost of 2 here is then a load and vrev or
1551 // vmovn.
1552 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1553 VecTy->isIntOrIntVectorTy() &&
1554 DL.getTypeSizeInBits(SubVecTy).getFixedValue() <= 64)
1555 return 2 * BaseCost;
1556 }
1557
1558 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1559 Alignment, AddressSpace, CostKind,
1560 UseMaskForCond, UseMaskForGaps);
1561}
1562
1563InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1564 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1565 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1566 using namespace PatternMatch;
1567 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1568 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1569 Alignment, CostKind, I);
1570
1571 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1572 auto *VTy = cast<FixedVectorType>(DataTy);
1573
1574 // TODO: Splitting, once we do that.
1575
1576 unsigned NumElems = VTy->getNumElements();
1577 unsigned EltSize = VTy->getScalarSizeInBits();
1578 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(DataTy);
1579
1580 // For now, it is assumed that for the MVE gather instructions the loads are
1581 // all effectively serialised. This means the cost is the scalar cost
1582 // multiplied by the number of elements being loaded. This is possibly very
1583 // conservative, but even so we still end up vectorising loops because the
1584 // cost per iteration for many loops is lower than for scalar loops.
1585 InstructionCost VectorCost =
1586 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1587 // The scalarization cost should be a lot higher. We use the number of vector
1588 // elements plus the scalarization overhead.
1589 InstructionCost ScalarCost =
1590 NumElems * LT.first +
1591 BaseT::getScalarizationOverhead(VTy, /*Insert*/ true, /*Extract*/ false,
1592 CostKind) +
1593 BaseT::getScalarizationOverhead(VTy, /*Insert*/ false, /*Extract*/ true,
1594 CostKind);
1595
1596 if (EltSize < 8 || Alignment < EltSize / 8)
1597 return ScalarCost;
1598
1599 unsigned ExtSize = EltSize;
1600 // Check whether there's a single user that asks for an extended type
1601 if (I != nullptr) {
1602 // Depending on the caller of this function, a gather instruction will
1603 // either have opcode Instruction::Load or be a call to the masked_gather
1604 // intrinsic.
1605 if ((I->getOpcode() == Instruction::Load ||
1606 match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1607 I->hasOneUse()) {
1608 const User *Us = *I->users().begin();
1609 if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1610 // only allow valid type combinations
1611 unsigned TypeSize =
1612 cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1613 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1614 (TypeSize == 16 && EltSize == 8)) &&
1615 TypeSize * NumElems == 128) {
1616 ExtSize = TypeSize;
1617 }
1618 }
1619 }
1620 // Check whether the input data needs to be truncated
1621 TruncInst *T;
1622 if ((I->getOpcode() == Instruction::Store ||
1623 match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1624 (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1625 // Only allow valid type combinations
1626 unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1627 if (((EltSize == 16 && TypeSize == 32) ||
1628 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1629 TypeSize * NumElems == 128)
1630 ExtSize = TypeSize;
1631 }
1632 }
1633
1634 if (ExtSize * NumElems != 128 || NumElems < 4)
1635 return ScalarCost;
1636
1637 // Any (aligned) i32 gather will not need to be scalarised.
1638 if (ExtSize == 32)
1639 return VectorCost;
1640 // For smaller types, we need to ensure that the gep's inputs are correctly
1641 // extended from a small enough value. Other sizes (including i64) are
1642 // scalarized for now.
1643 if (ExtSize != 8 && ExtSize != 16)
1644 return ScalarCost;
1645
1646 if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1647 Ptr = BC->getOperand(0);
1648 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1649 if (GEP->getNumOperands() != 2)
1650 return ScalarCost;
1651 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1652 // Scale needs to be correct (which is only relevant for i16s).
1653 if (Scale != 1 && Scale * 8 != ExtSize)
1654 return ScalarCost;
1655 // And we need to zext (not sext) the indexes from a small enough type.
1656 if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1657 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1658 return VectorCost;
1659 }
1660 return ScalarCost;
1661 }
1662 return ScalarCost;
1663}
1664
1665InstructionCost
1666ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1667 std::optional<FastMathFlags> FMF,
1668 TTI::TargetCostKind CostKind) {
1669 if (TTI::requiresOrderedReduction(FMF))
1670 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1671
1672 EVT ValVT = TLI->getValueType(DL, ValTy);
1673 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1674 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
1675 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1676
1677 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1678
1679 static const CostTblEntry CostTblAdd[]{
1680 {ISD::ADD, MVT::v16i8, 1},
1681 {ISD::ADD, MVT::v8i16, 1},
1682 {ISD::ADD, MVT::v4i32, 1},
1683 };
1684 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1685 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1686
1687 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1688}
1689
1690InstructionCost ARMTTIImpl::getExtendedReductionCost(
1691 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1692 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) {
1693 EVT ValVT = TLI->getValueType(DL, ValTy);
1694 EVT ResVT = TLI->getValueType(DL, ResTy);
1695
1696 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1697
1698 switch (ISD) {
1699 case ISD::ADD:
1700 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1701 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1702
1703 // The legal cases are:
1704 // VADDV u/s 8/16/32
1705 // VADDLV u/s 32
1706 // Codegen currently cannot always handle larger than legal vectors very
1707 // well, especially for predicated reductions where the mask needs to be
1708 // split, so restrict to 128bit or smaller input types.
1709 unsigned RevVTSize = ResVT.getSizeInBits();
1710 if (ValVT.getSizeInBits() <= 128 &&
1711 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1712 (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1713 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1714 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1715 }
1716 break;
1717 default:
1718 break;
1719 }
1720 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, FMF,
1721 CostKind);
1722}
1723
1724InstructionCost
1725ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
1726 VectorType *ValTy,
1727 TTI::TargetCostKind CostKind) {
1728 EVT ValVT = TLI->getValueType(DL, ValTy);
1729 EVT ResVT = TLI->getValueType(DL, ResTy);
1730
1731 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1732 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1733
1734 // The legal cases are:
1735 // VMLAV u/s 8/16/32
1736 // VMLALV u/s 16/32
1737 // Codegen currently cannot always handle larger than legal vectors very
1738 // well, especially for predicated reductions where the mask needs to be
1739 // split, so restrict to 128bit or smaller input types.
1740 unsigned RevVTSize = ResVT.getSizeInBits();
1741 if (ValVT.getSizeInBits() <= 128 &&
1742 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1743 (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
1744 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1745 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1746 }
1747
1748 return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, ValTy, CostKind);
1749}
1750
1751InstructionCost
1752ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1753 TTI::TargetCostKind CostKind) {
1754 switch (ICA.getID()) {
1755 case Intrinsic::get_active_lane_mask:
1756 // Currently we make a somewhat optimistic assumption that
1757 // active_lane_mask's are always free. In reality it may be freely folded
1758 // into a tail predicated loop, expanded into a VCPT or expanded into a lot
1759 // of add/icmp code. We may need to improve this in the future, but being
1760 // able to detect if it is free or not involves looking at a lot of other
1761 // code. We currently assume that the vectorizer inserted these, and knew
1762 // what it was doing in adding one.
1763 if (ST->hasMVEIntegerOps())
1764 return 0;
1765 break;
1766 case Intrinsic::sadd_sat:
1767 case Intrinsic::ssub_sat:
1768 case Intrinsic::uadd_sat:
1769 case Intrinsic::usub_sat: {
1770 if (!ST->hasMVEIntegerOps())
1771 break;
1772 Type *VT = ICA.getReturnType();
1773
1774 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1775 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1776 LT.second == MVT::v16i8) {
1777 // This is a base cost of 1 for the vqadd, plus 3 extra shifts if we
1778 // need to extend the type, as it uses shr(qadd(shl, shl)).
1779 unsigned Instrs =
1780 LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1781 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1782 }
1783 break;
1784 }
1785 case Intrinsic::abs:
1786 case Intrinsic::smin:
1787 case Intrinsic::smax:
1788 case Intrinsic::umin:
1789 case Intrinsic::umax: {
1790 if (!ST->hasMVEIntegerOps())
1791 break;
1792 Type *VT = ICA.getReturnType();
1793
1794 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1795 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1796 LT.second == MVT::v16i8)
1797 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1798 break;
1799 }
1800 case Intrinsic::minnum:
1801 case Intrinsic::maxnum: {
1802 if (!ST->hasMVEFloatOps())
1803 break;
1804 Type *VT = ICA.getReturnType();
1805 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1806 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1807 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1808 break;
1809 }
1810 case Intrinsic::fptosi_sat:
1811 case Intrinsic::fptoui_sat: {
1812 if (ICA.getArgTypes().empty())
1813 break;
1814 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1815 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1816 EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
1817 // Check for the legal types, with the correct subtarget features.
1818 if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
1819 (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
1820 (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
1821 return LT.first;
1822
1823 // Equally for MVE vector types
1824 if (ST->hasMVEFloatOps() &&
1825 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
1826 LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
1827 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1828
1829 // Otherwise we use a legal convert followed by a min+max
1830 if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
1831 (ST->hasFP64() && LT.second == MVT::f64) ||
1832 (ST->hasFullFP16() && LT.second == MVT::f16) ||
1833 (ST->hasMVEFloatOps() &&
1834 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
1835 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
1836 Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
1837                                       LT.second.getScalarSizeInBits());
1838       InstructionCost Cost =
1839           LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1840       IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
1841                                               : Intrinsic::umin,
1842                                      LegalTy, {LegalTy, LegalTy});
1843       Cost += getIntrinsicInstrCost(Attrs1, CostKind);
1844       IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
1845                                               : Intrinsic::umax,
1846                                      LegalTy, {LegalTy, LegalTy});
1847       Cost += getIntrinsicInstrCost(Attrs2, CostKind);
1848       return LT.first * Cost;
1849 }
1850 break;
1851 }
1852 }
1853 
1854   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1855 }
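// Worked example for the saturating-add case above (illustrative): a v8i16
// @llvm.uadd.sat is a single MVE VQADD.U16, so it costs one MVE operation,
// while a v16i16 input first legalizes into two v8i16 halves and LT.first
// doubles the cost. A type that needs widening instead models the
// shr(qadd(shl, shl)) sequence as four instructions.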
1856 
1857 bool ARMTTIImpl::isLoweredToCall(const Function *F) {
1858   if (!F->isIntrinsic())
1859 return BaseT::isLoweredToCall(F);
1860
1861 // Assume all Arm-specific intrinsics map to an instruction.
1862 if (F->getName().startswith("llvm.arm"))
1863 return false;
1864
1865 switch (F->getIntrinsicID()) {
1866 default: break;
1867 case Intrinsic::powi:
1868 case Intrinsic::sin:
1869 case Intrinsic::cos:
1870 case Intrinsic::pow:
1871 case Intrinsic::log:
1872 case Intrinsic::log10:
1873 case Intrinsic::log2:
1874 case Intrinsic::exp:
1875 case Intrinsic::exp2:
1876 return true;
1877 case Intrinsic::sqrt:
1878 case Intrinsic::fabs:
1879 case Intrinsic::copysign:
1880 case Intrinsic::floor:
1881 case Intrinsic::ceil:
1882 case Intrinsic::trunc:
1883 case Intrinsic::rint:
1884 case Intrinsic::nearbyint:
1885 case Intrinsic::round:
1886 case Intrinsic::canonicalize:
1887 case Intrinsic::lround:
1888 case Intrinsic::llround:
1889 case Intrinsic::lrint:
1890 case Intrinsic::llrint:
1891 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
1892 return true;
1893 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
1894 return true;
1895 // Some operations can be handled by vector instructions and assume
1896 // unsupported vectors will be expanded into supported scalar ones.
1897 // TODO Handle scalar operations properly.
1898 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
1899 case Intrinsic::masked_store:
1900 case Intrinsic::masked_load:
1901 case Intrinsic::masked_gather:
1902 case Intrinsic::masked_scatter:
1903 return !ST->hasMVEIntegerOps();
1904 case Intrinsic::sadd_with_overflow:
1905 case Intrinsic::uadd_with_overflow:
1906 case Intrinsic::ssub_with_overflow:
1907 case Intrinsic::usub_with_overflow:
1908 case Intrinsic::sadd_sat:
1909 case Intrinsic::uadd_sat:
1910 case Intrinsic::ssub_sat:
1911 case Intrinsic::usub_sat:
1912 return false;
1913 }
1914
1915 return BaseT::isLoweredToCall(F);
1916}
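// For illustration: llvm.sin.* is always treated as a call (it becomes a
// sin/sinf libcall), whereas llvm.sqrt.f32 on a target with a VFP base is
// expected to select to VSQRT.F32 and so is not counted as a call. The
// double- and half-precision checks above cover FPUs that only implement
// single precision.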
1917 
1918 bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
1919   unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
1920 EVT VT = TLI->getValueType(DL, I.getType(), true);
1921 if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
1922 return true;
1923
1924 // Check if an intrinsic will be lowered to a call and assume that any
1925 // other CallInst will generate a bl.
1926 if (auto *Call = dyn_cast<CallInst>(&I)) {
1927 if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
1928 switch(II->getIntrinsicID()) {
1929 case Intrinsic::memcpy:
1930 case Intrinsic::memset:
1931 case Intrinsic::memmove:
1932 return getNumMemOps(II) == -1;
1933 default:
1934 if (const Function *F = Call->getCalledFunction())
1935 return isLoweredToCall(F);
1936 }
1937 }
1938 return true;
1939 }
1940
1941 // FPv5 provides conversions between integer, double-precision,
1942 // single-precision, and half-precision formats.
1943 switch (I.getOpcode()) {
1944 default:
1945 break;
1946 case Instruction::FPToSI:
1947 case Instruction::FPToUI:
1948 case Instruction::SIToFP:
1949 case Instruction::UIToFP:
1950 case Instruction::FPTrunc:
1951 case Instruction::FPExt:
1952 return !ST->hasFPARMv8Base();
1953 }
1954
1955 // FIXME: Unfortunately the approach of checking the Operation Action does
1956 // not catch all cases of Legalization that use library calls. Our
1957 // Legalization step categorizes some transformations into library calls as
1958 // Custom, Expand or even Legal when doing type legalization. So for now
1959 // we have to special case for instance the SDIV of 64bit integers and the
1960 // use of floating point emulation.
1961 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
1962 switch (ISD) {
1963 default:
1964 break;
1965 case ISD::SDIV:
1966 case ISD::UDIV:
1967 case ISD::SREM:
1968 case ISD::UREM:
1969 case ISD::SDIVREM:
1970 case ISD::UDIVREM:
1971 return true;
1972 }
1973 }
1974
1975 // Assume all other non-float operations are supported.
1976 if (!VT.isFloatingPoint())
1977 return false;
1978
1979 // We'll need a library call to handle most floats when using soft.
1980 if (TLI->useSoftFloat()) {
1981 switch (I.getOpcode()) {
1982 default:
1983 return true;
1984 case Instruction::Alloca:
1985 case Instruction::Load:
1986 case Instruction::Store:
1987 case Instruction::Select:
1988 case Instruction::PHI:
1989 return false;
1990 }
1991 }
1992
1993 // We'll need a libcall to perform double precision operations on a single
1994 // precision only FPU.
1995 if (I.getType()->isDoubleTy() && !ST->hasFP64())
1996 return true;
1997
1998 // Likewise for half precision arithmetic.
1999 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
2000 return true;
2001
2002 return false;
2003}
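// For example, a 64-bit sdiv/udiv has no ARM instruction and is lowered to
// the __aeabi_ldivmod/__aeabi_uldivmod runtime routines, which is why the
// 64-bit integer division cases above are treated as calls even though the
// generic operation-action check does not report a LibCall for them.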
2004 
2005 bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
2006                                           AssumptionCache &AC,
2007 TargetLibraryInfo *LibInfo,
2008 HardwareLoopInfo &HWLoopInfo) {
2009 // Low-overhead branches are only supported in the 'low-overhead branch'
2010 // extension of v8.1-m.
2011 if (!ST->hasLOB() || DisableLowOverheadLoops) {
2012 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2013 return false;
2014 }
2015 
2016   if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
2017     LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2018 return false;
2019 }
2020
2021 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2022 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
2023 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2024 return false;
2025 }
2026
2027 const SCEV *TripCountSCEV =
2028 SE.getAddExpr(BackedgeTakenCount,
2029 SE.getOne(BackedgeTakenCount->getType()));
2030
2031 // We need to store the trip count in LR, a 32-bit register.
2032 if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
2033 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2034 return false;
2035 }
2036
2037 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2038 // point in generating a hardware loop if that's going to happen.
2039
2040 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2041 if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
2042 switch (Call->getIntrinsicID()) {
2043 default:
2044 break;
2045 case Intrinsic::start_loop_iterations:
2046 case Intrinsic::test_start_loop_iterations:
2047 case Intrinsic::loop_decrement:
2048 case Intrinsic::loop_decrement_reg:
2049 return true;
2050 }
2051 }
2052 return false;
2053 };
2054
2055 // Scan the instructions to see if there's any that we know will turn into a
2056 // call or if this loop is already a low-overhead loop or will become a tail
2057 // predicated loop.
2058 bool IsTailPredLoop = false;
2059 auto ScanLoop = [&](Loop *L) {
2060 for (auto *BB : L->getBlocks()) {
2061 for (auto &I : *BB) {
2062 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2063 isa<InlineAsm>(I)) {
2064 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2065 return false;
2066 }
2067 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2068 IsTailPredLoop |=
2069 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2070 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2071 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2072 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2073 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2074 }
2075 }
2076 return true;
2077 };
2078
2079 // Visit inner loops.
2080 for (auto *Inner : *L)
2081 if (!ScanLoop(Inner))
2082 return false;
2083
2084 if (!ScanLoop(L))
2085 return false;
2086
2087 // TODO: Check whether the trip count calculation is expensive. If L is the
2088 // inner loop but we know it has a low trip count, calculating that trip
2089 // count (in the parent loop) may be detrimental.
2090
2091 LLVMContext &C = L->getHeader()->getContext();
2092 HWLoopInfo.CounterInReg = true;
2093 HWLoopInfo.IsNestingLegal = false;
2094 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2095 HWLoopInfo.CountType = Type::getInt32Ty(C);
2096 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2097 return true;
2098}
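// When this returns true, the HardwareLoops pass rewrites the loop to use
// llvm.start.loop.iterations (or llvm.test.start.loop.iterations for a
// while-style entry test) and llvm.loop.decrement, which codegen then turns
// into the v8.1-M DLS/WLS ... LE low-overhead loop instructions.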
2099
2100static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2101 // We don't allow icmp's, and because we only look at single block loops,
2102 // we simply count the icmps, i.e. there should only be 1 for the backedge.
2103 if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2104 return false;
2105 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2106 // not currently canonical, but soon will be. Code without them uses icmp, and
2107 // so is not tail predicated as per the condition above. In order to get the
2108 // same performance we treat min and max the same as an icmp for tailpred
2109 // purposes for the moment (we often rely on non-tailpred and higher VF's to
2110 // pick more optimal instructions like VQDMULH. They need to be recognized
2111 // directly by the vectorizer).
2112 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2113 if ((II->getIntrinsicID() == Intrinsic::smin ||
2114 II->getIntrinsicID() == Intrinsic::smax ||
2115 II->getIntrinsicID() == Intrinsic::umin ||
2116 II->getIntrinsicID() == Intrinsic::umax) &&
2117 ++ICmpCount > 1)
2118 return false;
2119
2120 if (isa<FCmpInst>(&I))
2121 return false;
2122
2123 // We could allow extending/narrowing FP loads/stores, but codegen is
2124 // too inefficient so reject this for now.
2125 if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2126 return false;
2127
2128 // Extends have to be extending-loads
2129 if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
2130 if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2131 return false;
2132
2133 // Truncs have to be narrowing-stores
2134 if (isa<TruncInst>(&I) )
2135 if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2136 return false;
2137
2138 return true;
2139}
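// For instance, "%w = sext i16 %a to i32" where %a is produced by in-loop
// arithmetic mixes two element widths in one vector body, so the element
// count per vector becomes ambiguous and the instruction is rejected; the
// same extend folded into an extending load (e.g. an MVE VLDRH.S32) is
// accepted by the load check above.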
2140
2141// To set up a tail-predicated loop, we need to know the total number of
2142// elements processed by that loop. Thus, we need to determine the element
2143// size and:
2144// 1) it should be uniform for all operations in the vector loop, so we
2145// e.g. don't want any widening/narrowing operations.
2146// 2) it should be smaller than i64s because we don't have vector operations
2147// that work on i64s.
2148// 3) we don't want elements to be reversed or shuffled, to make sure the
2149// tail-predication masks/predicates the right lanes.
2150 //
2151 static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2152                                  const DataLayout &DL,
2153 const LoopAccessInfo *LAI) {
2154 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2155
2156 // If there are live-out values, it is probably a reduction. We can predicate
2157 // most reduction operations freely under MVE using a combination of
2158 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2159 // floating point and integer reductions, but don't check for operators
2160 // specifically here. If the value ends up not being a reduction (and so the
2161 // vectorizer cannot tailfold the loop), we should fall back to standard
2162   // vectorization automatically.
2163   SmallVector<Instruction *, 8> LiveOuts;
2164   LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2165   bool ReductionsDisabled =
2166       EnableTailPredication == TailPredication::EnabledNoReductions ||
2167       EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2168 
2169 for (auto *I : LiveOuts) {
2170 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2171 !I->getType()->isHalfTy()) {
2172 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2173 "live-out value\n");
2174 return false;
2175 }
2176 if (ReductionsDisabled) {
2177 LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2178 return false;
2179 }
2180 }
2181
2182 // Next, check that all instructions can be tail-predicated.
2183 PredicatedScalarEvolution PSE = LAI->getPSE();
2185 int ICmpCount = 0;
2186
2187 for (BasicBlock *BB : L->blocks()) {
2188 for (Instruction &I : BB->instructionsWithoutDebug()) {
2189 if (isa<PHINode>(&I))
2190 continue;
2191 if (!canTailPredicateInstruction(I, ICmpCount)) {
2192 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2193 return false;
2194 }
2195
2196 Type *T = I.getType();
2197 if (T->getScalarSizeInBits() > 32) {
2198 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2199 return false;
2200 }
2201       if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2202         Value *Ptr = getLoadStorePointerOperand(&I);
2203         Type *AccessTy = getLoadStoreType(&I);
2204 int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0);
2205 if (NextStride == 1) {
2206 // TODO: for now only allow consecutive strides of 1. We could support
2207 // other strides as long as it is uniform, but let's keep it simple
2208 // for now.
2209 continue;
2210 } else if (NextStride == -1 ||
2211 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2212                  (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2213         LLVM_DEBUG(dbgs()
2214                    << "Consecutive strides of 2 found, vld2/vstr2 can't "
2215 "be tail-predicated\n.");
2216 return false;
2217 // TODO: don't tail predicate if there is a reversed load?
2218 } else if (EnableMaskedGatherScatters) {
2219 // Gather/scatters do allow loading from arbitrary strides, at
2220 // least if they are loop invariant.
2221 // TODO: Loop variant strides should in theory work, too, but
2222 // this requires further testing.
2223 const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2224 if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2225 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2226 if (PSE.getSE()->isLoopInvariant(Step, L))
2227 continue;
2228 }
2229 }
2230 LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2231 "tail-predicate\n.");
2232 return false;
2233 }
2234 }
2235 }
2236
2237 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2238 return true;
2239}
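// As an example of the stride handling above: a stride-2 access pattern
// would be vectorized with VLD2/VST2 interleaving, and those interleaved
// accesses cannot execute under a tail predicate, so such loops are not
// tail-predicated; a gather/scatter with a loop-invariant stride can be,
// when masked gather/scatter generation is enabled.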
2240 
2241 bool ARMTTIImpl::preferPredicateOverEpilogue(
2242     Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
2243     TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
2244     InterleavedAccessInfo *IAI) {
2245 if (!EnableTailPredication) {
2246 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2247 return false;
2248 }
2249
2250 // Creating a predicated vector loop is the first step for generating a
2251 // tail-predicated hardware loop, for which we need the MVE masked
2252 // load/stores instructions:
2253 if (!ST->hasMVEIntegerOps())
2254 return false;
2255
2256 // For now, restrict this to single block loops.
2257 if (L->getNumBlocks() > 1) {
2258 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2259 "loop.\n");
2260 return false;
2261 }
2262
2263 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2264
2265 HardwareLoopInfo HWLoopInfo(L);
2266 if (!HWLoopInfo.canAnalyze(*LI)) {
2267 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2268 "analyzable.\n");
2269 return false;
2270 }
2271
2272 // This checks if we have the low-overhead branch architecture
2273 // extension, and if we will create a hardware-loop:
2274 if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
2275 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2276 "profitable.\n");
2277 return false;
2278 }
2279
2280 if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
2281 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2282 "a candidate.\n");
2283 return false;
2284 }
2285
2286 return canTailPredicateLoop(L, LI, SE, DL, LVL->getLAI());
2287}
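// When this returns true the vectorizer folds the tail into the loop body,
// roughly producing per-iteration IR of the form
//   %active = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %i, i32 %n)
//   %v = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %p, i32 4,
//                                                  <4 x i1> %active, <4 x i32> poison)
// which the MVETailPredication/LowOverheadLoops passes can later convert
// into a VCTP-predicated DLSTP/LETP loop.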
2288 
2289 TailFoldingStyle
2290 ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const {
2291   if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2292     return TailFoldingStyle::DataWithoutLaneMask;
2293 
2294   // Intrinsic @llvm.get.active.lane.mask is supported.
2295   // It is used in the MVETailPredication pass, which requires the number of
2296   // elements processed by this vector loop to setup the tail-predicated
2297   // loop.
2298   return TailFoldingStyle::Data;
2299 }
2300 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2301                                          TTI::UnrollingPreferences &UP,
2302                                          OptimizationRemarkEmitter *ORE) {
2303   // Enable Upper bound unrolling universally, not dependent upon the conditions
2304 // below.
2305 UP.UpperBound = true;
2306
2307 // Only currently enable these preferences for M-Class cores.
2308 if (!ST->isMClass())
2309 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2310
2311 // Disable loop unrolling for Oz and Os.
2312   UP.OptSizeThreshold = 0;
2313   UP.PartialOptSizeThreshold = 0;
2314   if (L->getHeader()->getParent()->hasOptSize())
2315 return;
2316
2317 SmallVector<BasicBlock*, 4> ExitingBlocks;
2318 L->getExitingBlocks(ExitingBlocks);
2319 LLVM_DEBUG(dbgs() << "Loop has:\n"
2320 << "Blocks: " << L->getNumBlocks() << "\n"
2321 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2322
2323 // Only allow another exit other than the latch. This acts as an early exit
2324 // as it mirrors the profitability calculation of the runtime unroller.
2325 if (ExitingBlocks.size() > 2)
2326 return;
2327
2328 // Limit the CFG of the loop body for targets with a branch predictor.
2329 // Allowing 4 blocks permits if-then-else diamonds in the body.
2330 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2331 return;
2332
2333 // Don't unroll vectorized loops, including the remainder loop
2334 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2335 return;
2336
2337 // Scan the loop: don't unroll loops with calls as this could prevent
2338   // inlining.
2339   InstructionCost Cost = 0;
2340   for (auto *BB : L->getBlocks()) {
2341 for (auto &I : *BB) {
2342 // Don't unroll vectorised loop. MVE does not benefit from it as much as
2343 // scalar code.
2344 if (I.getType()->isVectorTy())
2345 return;
2346
2347 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2348 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2349 if (!isLoweredToCall(F))
2350 continue;
2351 }
2352 return;
2353 }
2354
2355       SmallVector<const Value*, 4> Operands(I.operand_values());
2356       Cost += getInstructionCost(&I, Operands,
2357                                  TargetTransformInfo::TCK_SizeAndLatency);
2358     }
2359 }
2360
2361 // On v6m cores, there are very few registers available. We can easily end up
2362 // spilling and reloading more registers in an unrolled loop. Look at the
2363 // number of LCSSA phis as a rough measure of how many registers will need to
2364 // be live out of the loop, reducing the default unroll count if more than 1
2365 // value is needed. In the long run, all of this should be being learnt by a
2366 // machine.
2367 unsigned UnrollCount = 4;
2368 if (ST->isThumb1Only()) {
2369     unsigned ExitingValues = 0;
2370     SmallVector<BasicBlock *, 4> ExitBlocks;
2371     L->getExitBlocks(ExitBlocks);
2372 for (auto *Exit : ExitBlocks) {
2373 // Count the number of LCSSA phis. Exclude values coming from GEP's as
2374 // only the last is expected to be needed for address operands.
2375 unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2376 return PH.getNumOperands() != 1 ||
2377 !isa<GetElementPtrInst>(PH.getOperand(0));
2378 });
2379 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2380 }
2381 if (ExitingValues)
2382 UnrollCount /= ExitingValues;
2383 if (UnrollCount <= 1)
2384 return;
2385 }
2386
2387 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2388 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2389
2390 UP.Partial = true;
2391 UP.Runtime = true;
2392   UP.UnrollRemainder = true;
2393   UP.DefaultUnrollRuntimeCount = UnrollCount;
2394   UP.UnrollAndJam = true;
2395   UP.UnrollAndJamInnerLoopThreshold = 60;
2396 
2397   // Forcing the unrolling of small loops can be very useful because of the
2398   // branch-taken cost of the backedge.
2399 if (Cost < 12)
2400 UP.Force = true;
2401}
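// Worked example of the Thumb1 heuristic above: with a default count of 4,
// two LCSSA live-out values reduce the runtime unroll count to 4 / 2 = 2,
// and four or more live-outs drop it to <= 1, which disables runtime
// unrolling for that loop entirely.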
2402 
2403 void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2404                                        TTI::PeelingPreferences &PP) {
2405   BaseT::getPeelingPreferences(L, SE, PP);
2406 }
2407
2408bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2409 TTI::ReductionFlags Flags) const {
2410 if (!ST->hasMVEIntegerOps())
2411 return false;
2412
2413 unsigned ScalarBits = Ty->getScalarSizeInBits();
2414 switch (Opcode) {
2415 case Instruction::Add:
2416 return ScalarBits <= 64;
2417 default:
2418 return false;
2419 }
2420}
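// In practice this means integer add reductions of up to 64-bit scalars are
// formed as in-loop reductions, where MVE can use its accumulating
// across-vector adds (e.g. VADDVA/VADDLVA) instead of a wide out-of-loop
// reduction tree.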
2421 
2422 bool ARMTTIImpl::preferPredicatedReductionSelect(
2423     unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2424 if (!ST->hasMVEIntegerOps())
2425 return false;
2426 return true;
2427}
2428 
2429 InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
2430                                                  int64_t BaseOffset,
2431 bool HasBaseReg, int64_t Scale,
2432                                                  unsigned AddrSpace) const {
2433   TargetLowering::AddrMode AM;
2434   AM.BaseGV = BaseGV;
2435 AM.BaseOffs = BaseOffset;
2436 AM.HasBaseReg = HasBaseReg;
2437 AM.Scale = Scale;
2438 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) {
2439 if (ST->hasFPAO())
2440 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2441 return 0;
2442 }
2443 return -1;
2444}
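// For illustration: on subtargets with the FPAO feature a negatively scaled
// address such as [r0, -r1, lsl #2] is given a small extra cost (1), while
// the positive form [r0, r1, lsl #2] stays free, matching the note above
// that positive offsets execute faster.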
2445
2446bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
2447 if (Thumb) {
2448 // B.W is available in any Thumb2-supporting target, and also in every
2449 // version of Armv8-M, even Baseline which does not include the rest of
2450 // Thumb2.
2451 return ST->isThumb2() || ST->hasV8MBaselineOps();
2452 } else {
2453 // B is available in all versions of the Arm ISA, so the only question is
2454 // whether that ISA is available at all.
2455 return ST->hasARMOps();
2456 }
2457}