1//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "ARMSubtarget.h"
12#include "llvm/ADT/APInt.h"
19#include "llvm/IR/BasicBlock.h"
20#include "llvm/IR/DataLayout.h"
22#include "llvm/IR/Instruction.h"
25#include "llvm/IR/Intrinsics.h"
26#include "llvm/IR/IntrinsicsARM.h"
28#include "llvm/IR/Type.h"
37#include <algorithm>
38#include <cassert>
39#include <cstdint>
40#include <optional>
41#include <utility>
42
43using namespace llvm;
44
45#define DEBUG_TYPE "armtti"
46
47static cl::opt<bool> EnableMaskedLoadStores(
48 "enable-arm-maskedldst", cl::Hidden, cl::init(true),
49 cl::desc("Enable the generation of masked loads and stores"));
50
51static cl::opt<bool> DisableLowOverheadLoops(
52 "disable-arm-loloops", cl::Hidden, cl::init(false),
53 cl::desc("Disable the generation of low-overhead loops"));
54
55static cl::opt<bool>
56 AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
57 cl::desc("Enable the generation of WLS loops"));
58
60
61extern cl::opt<bool> EnableMaskedGatherScatters;
62
64
65/// Convert a vector load intrinsic into a simple llvm load instruction.
66/// This is beneficial when the underlying object being addressed comes
67/// from a constant, since we get constant-folding for free.
68static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
69 InstCombiner::BuilderTy &Builder) {
70 auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
71
72 if (!IntrAlign)
73 return nullptr;
74
75 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
76 ? MemAlign
77 : IntrAlign->getLimitedValue();
78
79 if (!isPowerOf2_32(Alignment))
80 return nullptr;
81
82 auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
83 PointerType::get(II.getType(), 0));
84 return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
85}
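// Editorial illustration, not part of the original source: sketched in IR
// form (intrinsic mangling is approximate), the fold above rewrites
//   %r = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr %p, i32 8)
// into a plain load whose alignment is the larger of the intrinsic's
// alignment argument and the alignment known from context:
//   %r = load <4 x i32>, ptr %p, align 8
// which later passes can constant-fold when %p addresses a constant.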
86
87bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
88 const Function *Callee) const {
89 const TargetMachine &TM = getTLI()->getTargetMachine();
90 const FeatureBitset &CallerBits =
91 TM.getSubtargetImpl(*Caller)->getFeatureBits();
92 const FeatureBitset &CalleeBits =
93 TM.getSubtargetImpl(*Callee)->getFeatureBits();
94
95 // To inline a callee, all features not in the allowed list must match exactly.
96 bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
97 (CalleeBits & ~InlineFeaturesAllowed);
98 // For features in the allowed list, the callee's features must be a subset of
99 // the callers'.
100 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
101 (CalleeBits & InlineFeaturesAllowed);
102 return MatchExact && MatchSubset;
103}
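// Editorial sketch, not part of the original source: the same exact-match /
// subset test written against plain std::bitset for readability. All names
// here are illustrative, not LLVM API.
//
//   #include <bitset>
//   using Bits = std::bitset<64>;
//   bool inlineCompatible(Bits Caller, Bits Callee, Bits Allowed) {
//     // Outside the allowed set, caller and callee must agree exactly.
//     bool MatchExact = (Caller & ~Allowed) == (Callee & ~Allowed);
//     // Inside the allowed set, the callee may only use what the caller has.
//     bool MatchSubset = ((Caller & Callee) & Allowed) == (Callee & Allowed);
//     return MatchExact && MatchSubset;
//   }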
104
105TTI::AddressingModeKind
106ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
107 ScalarEvolution *SE) const {
108 if (ST->hasMVEIntegerOps())
109 return TTI::AMK_PostIndexed;
110
111 if (L->getHeader()->getParent()->hasOptSize())
112 return TTI::AMK_None;
113
114 if (ST->isMClass() && ST->isThumb2() &&
115 L->getNumBlocks() == 1)
116 return TTI::AMK_PreIndexed;
117
118 return TTI::AMK_None;
119}
120
121std::optional<Instruction *>
122ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
123 using namespace PatternMatch;
124 Intrinsic::ID IID = II.getIntrinsicID();
125 switch (IID) {
126 default:
127 break;
128 case Intrinsic::arm_neon_vld1: {
129 Align MemAlign =
130 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
131 &IC.getAssumptionCache(), &IC.getDominatorTree());
132 if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
133 return IC.replaceInstUsesWith(II, V);
134 }
135 break;
136 }
137
138 case Intrinsic::arm_neon_vld2:
139 case Intrinsic::arm_neon_vld3:
140 case Intrinsic::arm_neon_vld4:
141 case Intrinsic::arm_neon_vld2lane:
142 case Intrinsic::arm_neon_vld3lane:
143 case Intrinsic::arm_neon_vld4lane:
144 case Intrinsic::arm_neon_vst1:
145 case Intrinsic::arm_neon_vst2:
146 case Intrinsic::arm_neon_vst3:
147 case Intrinsic::arm_neon_vst4:
148 case Intrinsic::arm_neon_vst2lane:
149 case Intrinsic::arm_neon_vst3lane:
150 case Intrinsic::arm_neon_vst4lane: {
151 Align MemAlign =
152 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
153 &IC.getAssumptionCache(), &IC.getDominatorTree());
154 unsigned AlignArg = II.arg_size() - 1;
155 Value *AlignArgOp = II.getArgOperand(AlignArg);
156 MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
157 if (Align && *Align < MemAlign) {
158 return IC.replaceOperand(
159 II, AlignArg,
160 ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
161 false));
162 }
163 break;
164 }
165
166 case Intrinsic::arm_mve_pred_i2v: {
167 Value *Arg = II.getArgOperand(0);
168 Value *ArgArg;
169 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
170 PatternMatch::m_Value(ArgArg))) &&
171 II.getType() == ArgArg->getType()) {
172 return IC.replaceInstUsesWith(II, ArgArg);
173 }
174 Constant *XorMask;
175 if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
176 PatternMatch::m_Value(ArgArg)),
177 PatternMatch::m_Constant(XorMask))) &&
178 II.getType() == ArgArg->getType()) {
179 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
180 if (CI->getValue().trunc(16).isAllOnes()) {
181 auto TrueVector = IC.Builder.CreateVectorSplat(
182 cast<FixedVectorType>(II.getType())->getNumElements(),
183 IC.Builder.getTrue());
184 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
185 }
186 }
187 }
188 KnownBits ScalarKnown(32);
189 if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
190 ScalarKnown)) {
191 return &II;
192 }
193 break;
194 }
195 case Intrinsic::arm_mve_pred_v2i: {
196 Value *Arg = II.getArgOperand(0);
197 Value *ArgArg;
198 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
199 PatternMatch::m_Value(ArgArg)))) {
200 return IC.replaceInstUsesWith(II, ArgArg);
201 }
202
203 if (II.getMetadata(LLVMContext::MD_range))
204 break;
205
206 ConstantRange Range(APInt(32, 0), APInt(32, 0x10000));
207
208 if (auto CurrentRange = II.getRange()) {
209 Range = Range.intersectWith(*CurrentRange);
210 if (Range == CurrentRange)
211 break;
212 }
213
214 II.addRangeRetAttr(Range);
215 II.addRetAttr(Attribute::NoUndef);
216 return &II;
217 }
218 case Intrinsic::arm_mve_vadc:
219 case Intrinsic::arm_mve_vadc_predicated: {
220 unsigned CarryOp =
221 (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
222 assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
223 "Bad type for intrinsic!");
224
225 KnownBits CarryKnown(32);
226 if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
227 CarryKnown)) {
228 return &II;
229 }
230 break;
231 }
232 case Intrinsic::arm_mve_vmldava: {
233 Instruction *I = cast<Instruction>(&II);
234 if (I->hasOneUse()) {
235 auto *User = cast<Instruction>(*I->user_begin());
236 Value *OpZ;
237 if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
238 match(I->getOperand(3), m_Zero())) {
239 Value *OpX = I->getOperand(4);
240 Value *OpY = I->getOperand(5);
241 Type *OpTy = OpX->getType();
242
243 IC.Builder.SetInsertPoint(User);
244 Value *V =
245 IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
246 {I->getOperand(0), I->getOperand(1),
247 I->getOperand(2), OpZ, OpX, OpY});
248
249 IC.replaceInstUsesWith(*User, V);
250 return IC.eraseInstFromFunction(*User);
251 }
252 }
253 return std::nullopt;
254 }
255 }
256 return std::nullopt;
257}
258
259std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
260 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
261 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
262 std::function<void(Instruction *, unsigned, APInt, APInt &)>
263 SimplifyAndSetOp) const {
264
265 // Compute the demanded bits for a narrowing MVE intrinsic. The TopOpc is the
266 // opcode specifying a Top/Bottom instruction, which can change between
267 // instructions.
268 auto SimplifyNarrowInstrTopBottom =[&](unsigned TopOpc) {
269 unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
270 unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
271
272 // Only the odd or even lanes of operand 0 will be demanded, depending
273 // on whether this is a top or bottom instruction.
274 APInt DemandedElts =
275 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
276 : APInt::getHighBitsSet(2, 1));
277 SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
278 // The other lanes will be defined from the inserted elements.
279 UndefElts &= APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
280 : APInt::getHighBitsSet(2, 1));
281 return std::nullopt;
282 };
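// Editorial note, not part of the original source: a concrete instance of the
// mask arithmetic above. With NumElts == 8, APInt::getLowBitsSet(2, 1) is
// 0b01 and APInt::getHighBitsSet(2, 1) is 0b10, so the splats come out as
//   IsTop != 0:  0b01010101  -> lanes 0, 2, 4, 6 of operand 0 demanded
//   IsTop == 0:  0b10101010  -> lanes 1, 3, 5, 7 of operand 0 demanded
// i.e. only every other lane of operand 0 is demanded; the remaining lanes
// are overwritten by the narrowed values.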
283
284 switch (II.getIntrinsicID()) {
285 default:
286 break;
287 case Intrinsic::arm_mve_vcvt_narrow:
288 SimplifyNarrowInstrTopBottom(2);
289 break;
290 case Intrinsic::arm_mve_vqmovn:
291 SimplifyNarrowInstrTopBottom(4);
292 break;
293 case Intrinsic::arm_mve_vshrn:
294 SimplifyNarrowInstrTopBottom(7);
295 break;
296 }
297
298 return std::nullopt;
299}
300
301InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
302 TTI::TargetCostKind CostKind) {
303 assert(Ty->isIntegerTy());
304
305 unsigned Bits = Ty->getPrimitiveSizeInBits();
306 if (Bits == 0 || Imm.getActiveBits() >= 64)
307 return 4;
308
309 int64_t SImmVal = Imm.getSExtValue();
310 uint64_t ZImmVal = Imm.getZExtValue();
311 if (!ST->isThumb()) {
312 if ((SImmVal >= 0 && SImmVal < 65536) ||
313 (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
314 (ARM_AM::getSOImmVal(~ZImmVal) != -1))
315 return 1;
316 return ST->hasV6T2Ops() ? 2 : 3;
317 }
318 if (ST->isThumb2()) {
319 if ((SImmVal >= 0 && SImmVal < 65536) ||
320 (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
321 (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
322 return 1;
323 return ST->hasV6T2Ops() ? 2 : 3;
324 }
325 // Thumb1, any i8 imm cost 1.
326 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
327 return 1;
328 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
329 return 2;
330 // Load from constantpool.
331 return 3;
332}
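// Editorial illustration, not part of the original source: roughly what the
// logic above returns for a few sample 32-bit immediates:
//   ARM/Thumb2  #255      -> 1  (single mov with an encodable immediate)
//   ARM/Thumb2  #0x12345  -> 2 with v6t2 (movw + movt), 3 otherwise
//   Thumb1      #200      -> 1  (plain 8-bit immediate)
//   Thumb1      #0x500    -> 2  (8-bit immediate shifted into place)
//   Thumb1      #0x12345  -> 3  (materialised via a constant-pool load)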
333
334// Constants smaller than 256 fit in the immediate field of
335// Thumb1 instructions so we return a zero cost and 1 otherwise.
336InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
337 const APInt &Imm, Type *Ty) {
338 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
339 return 0;
340
341 return 1;
342}
343
344// Checks whether Inst is part of a min(max()) or max(min()) pattern
345// that will match to an SSAT instruction. Returns the instruction being
346// saturated, or null if no saturation pattern was found.
347static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
348 Value *LHS, *RHS;
349 ConstantInt *C;
350 SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
351
352 if (InstSPF == SPF_SMAX &&
353 PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
354 C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
355
356 auto isSSatMin = [&](Value *MinInst) {
357 if (isa<SelectInst>(MinInst)) {
358 Value *MinLHS, *MinRHS;
359 ConstantInt *MinC;
360 SelectPatternFlavor MinSPF =
361 matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
362 if (MinSPF == SPF_SMIN &&
363 PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
364 MinC->getValue() == ((-Imm) - 1))
365 return true;
366 }
367 return false;
368 };
369
370 if (isSSatMin(Inst->getOperand(1)))
371 return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
372 if (Inst->hasNUses(2) &&
373 (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
374 return Inst->getOperand(1);
375 }
376 return nullptr;
377}
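// Editorial illustration, not part of the original source: the kind of
// source-level clamp that produces the smax/smin select pattern recognised
// above, here saturating to the signed 16-bit range:
//
//   int32_t clamp_s16(int32_t x) {
//     if (x < -32768) x = -32768;   // smax(x, -32768): Imm == -2^15
//     if (x > 32767)  x = 32767;    // smin(.., 32767):  MinC == (-Imm) - 1
//     return x;
//   }
//
// On Arm (v6 and later) or Thumb-2 this can be selected as a single SSAT #16,
// which is why getIntImmCostInst below treats the constant as free.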
378
379// Look for a FP saturation pattern, where the instruction can be simplified
380// to a fptosi.sat (i.e. max(min(fptosi))). The constant in this case is always free.
381static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
382 if (Imm.getBitWidth() != 64 ||
383 Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
384 return false;
385 Value *FP = isSSATMinMaxPattern(Inst, Imm);
386 if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
387 FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
388 if (!FP)
389 return false;
390 return isa<FPToSIInst>(FP);
391}
392
393InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
394 const APInt &Imm, Type *Ty,
395 TTI::TargetCostKind CostKind,
396 Instruction *Inst) {
397 // Division by a constant can be turned into multiplication, but only if we
398 // know it's constant. So it's not so much that the immediate is cheap (it's
399 // not), but that the alternative is worse.
400 // FIXME: this is probably unneeded with GlobalISel.
401 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
402 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
403 Idx == 1)
404 return 0;
405
406 // Leave any gep offsets for the CodeGenPrepare, which will do a better job at
407 // splitting any large offsets.
408 if (Opcode == Instruction::GetElementPtr && Idx != 0)
409 return 0;
410
411 if (Opcode == Instruction::And) {
412 // UXTB/UXTH
413 if (Imm == 255 || Imm == 65535)
414 return 0;
415 // Conversion to BIC is free, and means we can use ~Imm instead.
416 return std::min(getIntImmCost(Imm, Ty, CostKind),
417 getIntImmCost(~Imm, Ty, CostKind));
418 }
419
420 if (Opcode == Instruction::Add)
421 // Conversion to SUB is free, and means we can use -Imm instead.
422 return std::min(getIntImmCost(Imm, Ty, CostKind),
423 getIntImmCost(-Imm, Ty, CostKind));
424
425 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
426 Ty->getIntegerBitWidth() == 32) {
427 int64_t NegImm = -Imm.getSExtValue();
428 if (ST->isThumb2() && NegImm < 1<<12)
429 // icmp X, #-C -> cmn X, #C
430 return 0;
431 if (ST->isThumb() && NegImm < 1<<8)
432 // icmp X, #-C -> adds X, #C
433 return 0;
434 }
435
436 // xor a, -1 can always be folded to MVN
437 if (Opcode == Instruction::Xor && Imm.isAllOnes())
438 return 0;
439
440 // Ensures negative constant of min(max()) or max(min()) patterns that
441 // match to SSAT instructions don't get hoisted
442 if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
443 Ty->getIntegerBitWidth() <= 32) {
444 if (isSSATMinMaxPattern(Inst, Imm) ||
445 (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
446 isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
447 return 0;
448 }
449
450 if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
451 return 0;
452
453 // We can convert <= -1 to < 0, which is generally quite cheap.
454 if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
455 ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
456 if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
457 return std::min(getIntImmCost(Imm, Ty, CostKind),
458 getIntImmCost(Imm + 1, Ty, CostKind));
459 }
460
461 return getIntImmCost(Imm, Ty, CostKind);
462}
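// Editorial illustration, not part of the original source: a few immediates
// the function above reports as free, with the folding that justifies it:
//   a & 255            -> free (UXTB)
//   a & 65535          -> free (UXTH)
//   a ^ -1             -> free (folds to MVN)
//   a < -3  (Thumb2)   -> free (negative compare folds to CMN a, #3)
//   a + (-8)           -> costed as the cheaper of the ADD and SUB encodings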
463
464InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
465 TTI::TargetCostKind CostKind,
466 const Instruction *I) {
467 if (CostKind == TTI::TCK_RecipThroughput &&
468 (ST->hasNEON() || ST->hasMVEIntegerOps())) {
469 // FIXME: The vectorizer is highly sensitive to the cost of these
470 // instructions, which suggests that it may be using the costs incorrectly.
471 // But, for now, just make them free to avoid performance regressions for
472 // vector targets.
473 return 0;
474 }
475 return BaseT::getCFInstrCost(Opcode, CostKind, I);
476}
477
478InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
479 Type *Src,
480 TTI::CastContextHint CCH,
481 TTI::TargetCostKind CostKind,
482 const Instruction *I) {
483 int ISD = TLI->InstructionOpcodeToISD(Opcode);
484 assert(ISD && "Invalid opcode");
485
486 // TODO: Allow non-throughput costs that aren't binary.
487 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
488 if (CostKind != TTI::TCK_RecipThroughput)
489 return Cost == 0 ? 0 : 1;
490 return Cost;
491 };
492 auto IsLegalFPType = [this](EVT VT) {
493 EVT EltVT = VT.getScalarType();
494 return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
495 (EltVT == MVT::f64 && ST->hasFP64()) ||
496 (EltVT == MVT::f16 && ST->hasFullFP16());
497 };
498
499 EVT SrcTy = TLI->getValueType(DL, Src);
500 EVT DstTy = TLI->getValueType(DL, Dst);
501
502 if (!SrcTy.isSimple() || !DstTy.isSimple())
503 return AdjustCost(
504 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
505
506 // Extending masked loads / truncating masked stores are expensive because
507 // we currently don't split them. This means that we'll likely end up
508 // loading/storing each element individually (hence the high cost).
509 if ((ST->hasMVEIntegerOps() &&
510 (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
511 Opcode == Instruction::SExt)) ||
512 (ST->hasMVEFloatOps() &&
513 (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
514 IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
515 if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
516 return 2 * DstTy.getVectorNumElements() *
517 ST->getMVEVectorCostFactor(CostKind);
518
519 // The extend of other kinds of load is free
520 if (CCH == TTI::CastContextHint::Normal ||
521 CCH == TTI::CastContextHint::Masked) {
522 static const TypeConversionCostTblEntry LoadConversionTbl[] = {
523 {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
524 {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
525 {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
526 {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
527 {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
528 {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
529 {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
530 {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
531 {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
532 {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
533 {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
534 {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
535 };
536 if (const auto *Entry = ConvertCostTableLookup(
537 LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
538 return AdjustCost(Entry->Cost);
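// Editorial illustration, not part of the original source: the zero-cost
// entries above correspond to loads that extend for free in a single
// instruction, e.g. a sign-extending halfword load:
//   int32_t f(const int16_t *p) { return *p; }   // ldrsh: the sext i16->i32
//                                                // is folded into the load.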
539
540 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
541 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
542 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
543 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
544 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
545 {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
546 {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
547 // The following entries extend from a legal type to an illegal type, so
548 // the load needs to be split. This introduces an extra load operation, but the
549 // extend is still "free".
550 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
551 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
552 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
553 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
554 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
555 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
556 };
557 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
558 if (const auto *Entry =
559 ConvertCostTableLookup(MVELoadConversionTbl, ISD,
560 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
561 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
562 }
563
564 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
565 // FPExtends are similar but also require the VCVT instructions.
566 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
567 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
568 };
569 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
570 if (const auto *Entry =
571 ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
572 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
573 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
574 }
575
576 // The truncate of a store is free. This is the mirror of extends above.
577 static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
578 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
579 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
580 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
581 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
582 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
583 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
584 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
585 };
586 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
587 if (const auto *Entry =
588 ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
589 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
590 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
591 }
592
593 static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
594 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
595 {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
596 };
597 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
598 if (const auto *Entry =
599 ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
600 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
601 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
602 }
603 }
604
605 // NEON vector operations that can extend their inputs.
606 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
607 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
608 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
609 // vaddl
610 { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
611 { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
612 // vsubl
613 { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
614 { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
615 // vmull
616 { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
617 { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
618 // vshll
619 { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
620 { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
621 };
622
623 auto *User = cast<Instruction>(*I->user_begin());
624 int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
625 if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
626 DstTy.getSimpleVT(),
627 SrcTy.getSimpleVT())) {
628 return AdjustCost(Entry->Cost);
629 }
630 }
631
632 // Single to/from double precision conversions.
633 if (Src->isVectorTy() && ST->hasNEON() &&
634 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
635 DstTy.getScalarType() == MVT::f32) ||
636 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
637 DstTy.getScalarType() == MVT::f64))) {
638 static const CostTblEntry NEONFltDblTbl[] = {
639 // Vector fptrunc/fpext conversions.
640 {ISD::FP_ROUND, MVT::v2f64, 2},
641 {ISD::FP_EXTEND, MVT::v2f32, 2},
642 {ISD::FP_EXTEND, MVT::v4f32, 4}};
643
644 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
645 if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
646 return AdjustCost(LT.first * Entry->Cost);
647 }
648
649 // Some arithmetic, load and store operations have specific instructions
650 // to cast up/down their types automatically at no extra cost.
651 // TODO: Get these tables to know at least what the related operations are.
652 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
653 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
654 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
655 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
656 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
657 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
658 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
659
660 // The number of vmovl instructions for the extension.
661 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
662 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
663 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
664 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
665 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
666 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
667 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
668 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
669 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
670 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
671 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
672 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
673 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
674 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
675 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
676 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
677 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
678 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
679
680 // Operations that we legalize using splitting.
681 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
682 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
683
684 // Vector float <-> i32 conversions.
685 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
686 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
687
688 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
689 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
690 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
691 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
692 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
693 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
694 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
695 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
696 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
697 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
698 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
699 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
700 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
701 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
702 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
703 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
704 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
705 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
706 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
707 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
708
709 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
710 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
711 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
712 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
713 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
714 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
715
716 // Vector double <-> i32 conversions.
717 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
718 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
719
720 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
721 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
722 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
723 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
724 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
725 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
726
727 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
728 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
729 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
730 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
731 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
732 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
733 };
734
735 if (SrcTy.isVector() && ST->hasNEON()) {
736 if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
737 DstTy.getSimpleVT(),
738 SrcTy.getSimpleVT()))
739 return AdjustCost(Entry->Cost);
740 }
741
742 // Scalar float to integer conversions.
743 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
744 { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
745 { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
746 { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
747 { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
748 { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
749 { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
750 { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
751 { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
752 { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
753 { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
754 { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
755 { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
756 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
757 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
758 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
759 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
760 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
761 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
762 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
763 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
764 };
765 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
766 if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
767 DstTy.getSimpleVT(),
768 SrcTy.getSimpleVT()))
769 return AdjustCost(Entry->Cost);
770 }
771
772 // Scalar integer to float conversions.
773 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
774 { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
775 { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
776 { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
777 { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
778 { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
779 { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
780 { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
781 { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
782 { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
783 { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
784 { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
785 { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
786 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
787 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
788 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
789 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
790 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
791 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
792 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
793 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
794 };
795
796 if (SrcTy.isInteger() && ST->hasNEON()) {
797 if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
798 ISD, DstTy.getSimpleVT(),
799 SrcTy.getSimpleVT()))
800 return AdjustCost(Entry->Cost);
801 }
802
803 // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
804 // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
805 // are linearised so they take more.
806 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
807 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
808 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
809 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
810 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
811 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
812 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
813 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
814 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
815 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
816 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
817 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
818 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
819 };
820
821 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
822 if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
823 ISD, DstTy.getSimpleVT(),
824 SrcTy.getSimpleVT()))
825 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
826 }
827
828 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
829 // As general rule, fp converts that were not matched above are scalarized
830 // and cost 1 vcvt for each lane, so long as the instruction is available.
831 // If not it will become a series of function calls.
832 const InstructionCost CallCost =
833 getCallInstrCost(nullptr, Dst, {Src}, CostKind);
834 int Lanes = 1;
835 if (SrcTy.isFixedLengthVector())
836 Lanes = SrcTy.getVectorNumElements();
837
838 if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
839 return Lanes;
840 else
841 return Lanes * CallCost;
842 }
843
844 if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
845 SrcTy.isFixedLengthVector()) {
846 // Treat a truncate with larger than legal source (128bits for MVE) as
847 // expensive, 2 instructions per lane.
848 if ((SrcTy.getScalarType() == MVT::i8 ||
849 SrcTy.getScalarType() == MVT::i16 ||
850 SrcTy.getScalarType() == MVT::i32) &&
851 SrcTy.getSizeInBits() > 128 &&
852 SrcTy.getSizeInBits() > DstTy.getSizeInBits())
853 return SrcTy.getVectorNumElements() * 2;
854 }
855
856 // Scalar integer conversion costs.
857 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
858 // i16 -> i64 requires two dependent operations.
859 { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
860
861 // Truncates on i64 are assumed to be free.
862 { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
863 { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
864 { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
865 { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
866 };
867
868 if (SrcTy.isInteger()) {
869 if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
870 DstTy.getSimpleVT(),
871 SrcTy.getSimpleVT()))
872 return AdjustCost(Entry->Cost);
873 }
874
875 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
876 ? ST->getMVEVectorCostFactor(CostKind)
877 : 1;
878 return AdjustCost(
879 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
880}
881
882InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
883 TTI::TargetCostKind CostKind,
884 unsigned Index, Value *Op0,
885 Value *Op1) {
886 // Penalize inserting into an D-subregister. We end up with a three times
887 // lower estimated throughput on swift.
888 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
889 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
890 return 3;
891
892 if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
893 Opcode == Instruction::ExtractElement)) {
894 // Cross-class copies are expensive on many microarchitectures,
895 // so assume they are expensive by default.
896 if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
897 return 3;
898
899 // Even if it's not a cross class copy, this likely leads to mixing
900 // of NEON and VFP code and should be therefore penalized.
901 if (ValTy->isVectorTy() &&
902 ValTy->getScalarSizeInBits() <= 32)
903 return std::max<InstructionCost>(
904 BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1),
905 2U);
906 }
907
908 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
909 Opcode == Instruction::ExtractElement)) {
910 // Integer cross-lane moves are more expensive than float, which can
911 // sometimes just be vmovs. Integers involve being passed to GPR registers,
912 // causing more of a delay.
913 std::pair<InstructionCost, MVT> LT =
914 getTypeLegalizationCost(ValTy);
915 return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
916 }
917
918 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
919}
920
921InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
922 Type *CondTy,
923 CmpInst::Predicate VecPred,
924 TTI::TargetCostKind CostKind,
925 const Instruction *I) {
926 int ISD = TLI->InstructionOpcodeToISD(Opcode);
927
928 // Thumb scalar code size cost for select.
929 if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
930 ST->isThumb() && !ValTy->isVectorTy()) {
931 // Assume expensive structs.
932 if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
933 return TTI::TCC_Expensive;
934
935 // Select costs can vary because they:
936 // - may require one or more conditional mov (including an IT),
937 // - can't operate directly on immediates,
938 // - require live flags, which we can't copy around easily.
940
941 // Possible IT instruction for Thumb2, or more for Thumb1.
942 ++Cost;
943
944 // i1 values may need rematerialising by using mov immediates and/or
945 // flag setting instructions.
946 if (ValTy->isIntegerTy(1))
947 ++Cost;
948
949 return Cost;
950 }
951
952 // If this is a vector min/max/abs, use the cost of that intrinsic directly
953 // instead. Hopefully when min/max intrinsics are more prevalent this code
954 // will not be needed.
955 const Instruction *Sel = I;
956 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
957 Sel->hasOneUse())
958 Sel = cast<Instruction>(Sel->user_back());
959 if (Sel && ValTy->isVectorTy() &&
960 (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
961 const Value *LHS, *RHS;
962 SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
963 unsigned IID = 0;
964 switch (SPF) {
965 case SPF_ABS:
966 IID = Intrinsic::abs;
967 break;
968 case SPF_SMIN:
969 IID = Intrinsic::smin;
970 break;
971 case SPF_SMAX:
972 IID = Intrinsic::smax;
973 break;
974 case SPF_UMIN:
975 IID = Intrinsic::umin;
976 break;
977 case SPF_UMAX:
978 IID = Intrinsic::umax;
979 break;
980 case SPF_FMINNUM:
981 IID = Intrinsic::minnum;
982 break;
983 case SPF_FMAXNUM:
984 IID = Intrinsic::maxnum;
985 break;
986 default:
987 break;
988 }
989 if (IID) {
990 // The ICmp is free, the select gets the cost of the min/max/etc
991 if (Sel != I)
992 return 0;
993 IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
994 return getIntrinsicInstrCost(CostAttrs, CostKind);
995 }
996 }
997
998 // On NEON a vector select gets lowered to vbsl.
999 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
1000 // Lowering of some vector selects is currently far from perfect.
1001 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
1002 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
1003 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
1004 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
1005 };
1006
1007 EVT SelCondTy = TLI->getValueType(DL, CondTy);
1008 EVT SelValTy = TLI->getValueType(DL, ValTy);
1009 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1010 if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
1011 SelCondTy.getSimpleVT(),
1012 SelValTy.getSimpleVT()))
1013 return Entry->Cost;
1014 }
1015
1016 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1017 return LT.first;
1018 }
1019
1020 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1021 (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1022 cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
1023 FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
1024 FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
1025 if (!VecCondTy)
1026 VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
1027
1028 // If we don't have mve.fp any fp operations will need to be scalarized.
1029 if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1030 // One scalarization insert, one scalarization extract and the cost of the
1031 // fcmps.
1032 return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
1033 /*Extract*/ true, CostKind) +
1034 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1035 /*Extract*/ false, CostKind) +
1036 VecValTy->getNumElements() *
1037 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1038 VecCondTy->getScalarType(), VecPred,
1039 CostKind, I);
1040 }
1041
1042 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1043 int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1044 // There are two types - the input that specifies the type of the compare
1045 // and the output vXi1 type. Because we don't know how the output will be
1046 // split, we may need an expensive shuffle to get two in sync. This has the
1047 // effect of making larger than legal compares (v8i32 for example)
1048 // expensive.
1049 if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1050 if (LT.first > 1)
1051 return LT.first * BaseCost +
1052 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1053 /*Extract*/ false, CostKind);
1054 return BaseCost;
1055 }
1056 }
1057
1058 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1059 // for "multiple beats" potentially needed by MVE instructions.
1060 int BaseCost = 1;
1061 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1062 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1063
1064 return BaseCost *
1065 BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1066}
1067
1068InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
1069 ScalarEvolution *SE,
1070 const SCEV *Ptr) {
1071 // Address computations in vectorized code with non-consecutive addresses will
1072 // likely result in more instructions compared to scalar code where the
1073 // computation can more often be merged into the index mode. The resulting
1074 // extra micro-ops can significantly decrease throughput.
1075 unsigned NumVectorInstToHideOverhead = 10;
1076 int MaxMergeDistance = 64;
1077
1078 if (ST->hasNEON()) {
1079 if (Ty->isVectorTy() && SE &&
1080 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1081 return NumVectorInstToHideOverhead;
1082
1083 // In many cases the address computation is not merged into the instruction
1084 // addressing mode.
1085 return 1;
1086 }
1087 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1088}
1089
1090bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
1091 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1092 // If a VCTP is part of a chain, it's already profitable and shouldn't be
1093 // optimized, else LSR may block tail-predication.
1094 switch (II->getIntrinsicID()) {
1095 case Intrinsic::arm_mve_vctp8:
1096 case Intrinsic::arm_mve_vctp16:
1097 case Intrinsic::arm_mve_vctp32:
1098 case Intrinsic::arm_mve_vctp64:
1099 return true;
1100 default:
1101 break;
1102 }
1103 }
1104 return false;
1105}
1106
1107bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
1108 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1109 return false;
1110
1111 if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1112 // Don't support v2i1 yet.
1113 if (VecTy->getNumElements() == 2)
1114 return false;
1115
1116 // We don't support extending fp types.
1117 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1118 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1119 return false;
1120 }
1121
1122 unsigned EltWidth = DataTy->getScalarSizeInBits();
1123 return (EltWidth == 32 && Alignment >= 4) ||
1124 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1125}
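// Editorial illustration, not part of the original source: with MVE enabled,
// sample answers from isLegalMaskedLoad for a few types and alignments:
//   <4 x i32>,  align 4 -> legal
//   <8 x i16>,  align 2 -> legal
//   <16 x i8>,  align 1 -> legal      (bytes have no alignment requirement)
//   <4 x i32>,  align 2 -> not legal  (under-aligned for 32-bit elements)
//   <4 x half>, align 2 -> not legal  (64-bit fp vector would need extending)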
1126
1127bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
1128 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1129 return false;
1130
1131 unsigned EltWidth = Ty->getScalarSizeInBits();
1132 return ((EltWidth == 32 && Alignment >= 4) ||
1133 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1134}
1135
1136/// Given a memcpy/memset/memmove instruction, return the number of memory
1137/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1138/// call is used.
1139int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1140 MemOp MOp;
1141 unsigned DstAddrSpace = ~0u;
1142 unsigned SrcAddrSpace = ~0u;
1143 const Function *F = I->getParent()->getParent();
1144
1145 if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1146 ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1147 // If 'size' is not a constant, a library call will be generated.
1148 if (!C)
1149 return -1;
1150
1151 const unsigned Size = C->getValue().getZExtValue();
1152 const Align DstAlign = *MC->getDestAlign();
1153 const Align SrcAlign = *MC->getSourceAlign();
1154
1155 MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1156 /*IsVolatile*/ false);
1157 DstAddrSpace = MC->getDestAddressSpace();
1158 SrcAddrSpace = MC->getSourceAddressSpace();
1159 }
1160 else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1161 ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1162 // If 'size' is not a constant, a library call will be generated.
1163 if (!C)
1164 return -1;
1165
1166 const unsigned Size = C->getValue().getZExtValue();
1167 const Align DstAlign = *MS->getDestAlign();
1168
1169 MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1170 /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1171 DstAddrSpace = MS->getDestAddressSpace();
1172 }
1173 else
1174 llvm_unreachable("Expected a memcpy/move or memset!");
1175
1176 unsigned Limit, Factor = 2;
1177 switch(I->getIntrinsicID()) {
1178 case Intrinsic::memcpy:
1179 Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1180 break;
1181 case Intrinsic::memmove:
1182 Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1183 break;
1184 case Intrinsic::memset:
1185 Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1186 Factor = 1;
1187 break;
1188 default:
1189 llvm_unreachable("Expected a memcpy/move or memset!");
1190 }
1191
1192 // MemOps will be populated with a list of data types that need to be
1193 // loaded and stored. That's why we multiply the number of elements by 2 to
1194 // get the cost for this memcpy.
1195 std::vector<EVT> MemOps;
1196 if (getTLI()->findOptimalMemOpLowering(
1197 MemOps, Limit, MOp, DstAddrSpace,
1198 SrcAddrSpace, F->getAttributes()))
1199 return MemOps.size() * Factor;
1200
1201 // If we can't find an optimal memop lowering, return the default cost
1202 return -1;
1203}
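// Editorial example, not part of the original source: for a memcpy with a
// constant length of 16 bytes and sufficiently aligned operands, if
// findOptimalMemOpLowering selects four i32 chunks then getNumMemOps returns
// 4 * Factor(2) = 8, modelling four loads plus four stores. A memset of the
// same size uses Factor = 1, since only stores are emitted.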
1204
1205InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1206 int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1207
1208 // To model the cost of a library call, we assume 1 for the call, and
1209 // 3 for the argument setup.
1210 if (NumOps == -1)
1211 return 4;
1212 return NumOps;
1213}
1214
1215InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1216 VectorType *Tp, ArrayRef<int> Mask,
1217 TTI::TargetCostKind CostKind,
1218 int Index, VectorType *SubTp,
1219 ArrayRef<const Value *> Args,
1220 const Instruction *CxtI) {
1221 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
1222 // Treat extractsubvector as single op permutation.
1223 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
1224 if (IsExtractSubvector)
1225 Kind = TTI::SK_PermuteSingleSrc;
1226 if (ST->hasNEON()) {
1227 if (Kind == TTI::SK_Broadcast) {
1228 static const CostTblEntry NEONDupTbl[] = {
1229 // VDUP handles these cases.
1230 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1231 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1232 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1233 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1234 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1235 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1236
1237 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1238 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1239 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1240 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
1241
1242 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1243 if (const auto *Entry =
1244 CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1245 return LT.first * Entry->Cost;
1246 }
1247 if (Kind == TTI::SK_Reverse) {
1248 static const CostTblEntry NEONShuffleTbl[] = {
1249 // Reverse shuffle cost one instruction if we are shuffling within a
1250 // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1251 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1252 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1253 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1254 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1255 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1256 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1257
1258 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1259 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1260 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
1261 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
1262
1263 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1264 if (const auto *Entry =
1265 CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1266 return LT.first * Entry->Cost;
1267 }
1268 if (Kind == TTI::SK_Select) {
1269 static const CostTblEntry NEONSelShuffleTbl[] = {
1270 // Select shuffle cost table for ARM. Cost is the number of
1271 // instructions
1272 // required to create the shuffled vector.
1273
1274 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1275 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1276 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1277 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1278
1279 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1280 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1281 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
1282
1283 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
1284
1285 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
1286
1287 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1288 if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1289 ISD::VECTOR_SHUFFLE, LT.second))
1290 return LT.first * Entry->Cost;
1291 }
1292 }
1293 if (ST->hasMVEIntegerOps()) {
1294 if (Kind == TTI::SK_Broadcast) {
1295 static const CostTblEntry MVEDupTbl[] = {
1296 // VDUP handles these cases.
1297 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1298 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1299 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
1300 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1301 {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
1302
1303 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1304 if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1305 LT.second))
1306 return LT.first * Entry->Cost *
1307 ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
1308 }
1309
1310 if (!Mask.empty()) {
1311 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1312 if (LT.second.isVector() &&
1313 Mask.size() <= LT.second.getVectorNumElements() &&
1314 (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1315 isVREVMask(Mask, LT.second, 64)))
1316 return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
1317 }
1318 }
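// Editorial illustration, not part of the original source: an example mask
// accepted by the isVREVMask(Mask, LT.second, 32) check above. For a v8i16
// shuffle, the mask <1, 0, 3, 2, 5, 4, 7, 6> swaps the two 16-bit lanes
// within each 32-bit chunk, which maps to a single VREV32.16, hence the
// single MVEVectorCostFactor returned.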
1319
1320 // Restore optimal kind.
1321 if (IsExtractSubvector)
1322 Kind = TTI::SK_ExtractSubvector;
1323 int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1324 ? ST->getMVEVectorCostFactor(CostKind)
1325 : 1;
1326 return BaseCost *
1327 BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
1328}
1329
1330InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1331 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1332 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1333 ArrayRef<const Value *> Args,
1334 const Instruction *CxtI) {
1335 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1336 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1337 // Make operations on i1 relatively expensive as this often involves
1338 // combining predicates. AND and XOR should be easier to handle with IT
1339 // blocks.
1340 switch (ISDOpcode) {
1341 default:
1342 break;
1343 case ISD::AND:
1344 case ISD::XOR:
1345 return 2;
1346 case ISD::OR:
1347 return 3;
1348 }
1349 }
1350
1351 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1352
1353 if (ST->hasNEON()) {
1354 const unsigned FunctionCallDivCost = 20;
1355 const unsigned ReciprocalDivCost = 10;
1356 static const CostTblEntry CostTbl[] = {
1357 // Division.
1358 // These costs are somewhat random. Choose a cost of 20 to indicate that
1359 // vectorizing division (added function call) is going to be very expensive.
1360 // Double registers types.
1361 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1362 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1363 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1364 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1365 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1366 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1367 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1368 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1369 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1370 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1371 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1372 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1373 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1374 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1375 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1376 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1377 // Quad register types.
1378 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1379 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1380 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1381 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1382 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1383 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1384 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1385 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1386 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1387 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1388 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1389 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1390 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1391 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1392 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1393 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1394 // Multiplication.
1395 };
1396
1397 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1398 return LT.first * Entry->Cost;
1399
1401 Opcode, Ty, CostKind, Op1Info, Op2Info);
1402
1403 // This is somewhat of a hack. The problem that we are facing is that SROA
1404 // creates a sequence of shift, and, or instructions to construct values.
1405 // These sequences are recognized by the ISel and have zero-cost. Not so for
1406 // the vectorized code. Because we have support for v2i64 but not i64 those
1407 // sequences look particularly beneficial to vectorize.
1408 // To work around this we increase the cost of v2i64 operations to make them
1409 // seem less beneficial.
1410 if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
1411 Cost += 4;
1412
1413 return Cost;
1414 }
1415
1416 // If this operation is a shift on arm/thumb2, it might well be folded into
1417 // the following instruction, hence having a cost of 0.
1418 auto LooksLikeAFreeShift = [&]() {
1419 if (ST->isThumb1Only() || Ty->isVectorTy())
1420 return false;
1421
1422 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1423 return false;
1424 if (!Op2Info.isUniform() || !Op2Info.isConstant())
1425 return false;
1426
1427 // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1428 switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1429 case Instruction::Add:
1430 case Instruction::Sub:
1431 case Instruction::And:
1432 case Instruction::Xor:
1433 case Instruction::Or:
1434 case Instruction::ICmp:
1435 return true;
1436 default:
1437 return false;
1438 }
1439 };
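// Editorial illustration, not part of the original source: the kind of code
// the lambda above treats as a free shift. The shift has a single use feeding
// an add, so it folds into the add's shifted-operand form:
//   int f(int x, int y) { return y + (x << 2); }
//   // roughly: add r0, r1, r0, lsl #2   (the shl itself costs 0)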
1440 if (LooksLikeAFreeShift())
1441 return 0;
1442
1443 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1444 // for "multiple beats" potentially needed by MVE instructions.
1445 int BaseCost = 1;
1446 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1447 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1448
1449 // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1450 // without treating floats as more expensive than scalars or increasing the
1451 // costs for custom operations. The result is also multiplied by the
1452 // MVEVectorCostFactor where appropriate.
1453 if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1454 return LT.first * BaseCost;
1455
1456 // Else this is expand, assume that we need to scalarize this op.
1457 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1458 unsigned Num = VTy->getNumElements();
1459 InstructionCost Cost =
1460 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1461 // Return the cost of multiple scalar invocation plus the cost of
1462 // inserting and extracting the values.
1463 SmallVector<Type *> Tys(Args.size(), Ty);
1464 return BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind) +
1465 Num * Cost;
1466 }
1467
1468 return BaseCost;
1469}
1470
1471InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1472 MaybeAlign Alignment,
1473 unsigned AddressSpace,
1474 TTI::TargetCostKind CostKind,
1475 TTI::OperandValueInfo OpInfo,
1476 const Instruction *I) {
1477 // TODO: Handle other cost kinds.
1478 if (CostKind != TTI::TCK_RecipThroughput)
1479 return 1;
1480
1481 // Type legalization can't handle structs
1482 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1483 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1484 CostKind);
1485
1486 if (ST->hasNEON() && Src->isVectorTy() &&
1487 (Alignment && *Alignment != Align(16)) &&
1488 cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1489 // Unaligned loads/stores are extremely inefficient.
1490 // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
1491 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1492 return LT.first * 4;
1493 }
1494
1495 // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1496 // Same for stores.
1497 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1498 ((Opcode == Instruction::Load && I->hasOneUse() &&
1499 isa<FPExtInst>(*I->user_begin())) ||
1500 (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1501 FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1502 Type *DstTy =
1503 Opcode == Instruction::Load
1504 ? (*I->user_begin())->getType()
1505 : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1506 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1507 DstTy->getScalarType()->isFloatTy())
1508 return ST->getMVEVectorCostFactor(CostKind);
1509 }
1510
1511 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1512 ? ST->getMVEVectorCostFactor(CostKind)
1513 : 1;
1514 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1515 CostKind, OpInfo, I);
1516}
1517
1519ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1520 unsigned AddressSpace,
1522 if (ST->hasMVEIntegerOps()) {
1523 if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1524 return ST->getMVEVectorCostFactor(CostKind);
1525 if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1526 return ST->getMVEVectorCostFactor(CostKind);
1527 }
1528 if (!isa<FixedVectorType>(Src))
1529 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1530 CostKind);
1531 // Scalar cost, which is currently very high due to the inefficiency of the
1532 // generated code.
1533 return cast<FixedVectorType>(Src)->getNumElements() * 8;
1534}
1535
1536InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1537 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1538 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1539 bool UseMaskForCond, bool UseMaskForGaps) {
1540 assert(Factor >= 2 && "Invalid interleave factor");
1541 assert(isa<VectorType>(VecTy) && "Expect a vector type");
1542
1543 // vldN/vstN doesn't support vector types of i64/f64 element.
1544 bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1545
1546 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1547 !UseMaskForCond && !UseMaskForGaps) {
1548 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1549 auto *SubVecTy =
1550 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1551
1552 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1553 // Accesses having vector types that are a multiple of 128 bits can be
1554 // matched to more than one vldN/vstN instruction.
1555 int BaseCost =
1556 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1557 if (NumElts % Factor == 0 &&
1558 TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1559 return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1560
1561 // Some smaller than legal interleaved patterns are cheap as we can make
1562 // use of the vmovn or vrev patterns to interleave a standard load. This is
1563 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1564 // promoted differently). The cost of 2 here is then a load and vrev or
1565 // vmovn.
1566 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1567 VecTy->isIntOrIntVectorTy() &&
1568 DL.getTypeSizeInBits(SubVecTy).getFixedValue() <= 64)
1569 return 2 * BaseCost;
1570 }
1571
1572 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1573 Alignment, AddressSpace, CostKind,
1574 UseMaskForCond, UseMaskForGaps);
1575}
1576
1577InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1578 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1579 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1580 using namespace PatternMatch;
1581 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1582 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1583 Alignment, CostKind, I);
1584
1585 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1586 auto *VTy = cast<FixedVectorType>(DataTy);
1587
1588 // TODO: Splitting, once we do that.
1589
1590 unsigned NumElems = VTy->getNumElements();
1591 unsigned EltSize = VTy->getScalarSizeInBits();
1592 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(DataTy);
1593
1594 // For now, it is assumed that for the MVE gather instructions the loads are
1595 // all effectively serialised. This means the cost is the scalar cost
1596 // multiplied by the number of elements being loaded. This is possibly very
1597 // conservative, but even so we still end up vectorising loops because the
1598 // cost per iteration for many loops is lower than for scalar loops.
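 // For example (illustrative): a <4 x i32> gather that legalises in one step
 // is given VectorCost = 4 * 1 * MVEVectorCostFactor, while the ScalarCost
 // below additionally pays insert/extract overhead and, for a variable mask,
 // an extra 4 * 5.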
1599 InstructionCost VectorCost =
1600 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1601 // The scalarization cost should be a lot higher. We use the number of vector
1602 // elements plus the scalarization overhead. If masking is required then a lot
1603 // of little blocks will be needed and potentially a scalarized p0 mask,
1604 // greatly increasing the cost.
1605 InstructionCost ScalarCost =
1606 NumElems * LT.first + (VariableMask ? NumElems * 5 : 0) +
1607 BaseT::getScalarizationOverhead(VTy, /*Insert*/ true, /*Extract*/ false,
1608 CostKind) +
1609 BaseT::getScalarizationOverhead(VTy, /*Insert*/ false, /*Extract*/ true,
1610 CostKind);
1611
1612 if (EltSize < 8 || Alignment < EltSize / 8)
1613 return ScalarCost;
1614
1615 unsigned ExtSize = EltSize;
1616 // Check whether there's a single user that asks for an extended type
1617 if (I != nullptr) {
1618 // Depending on the caller of this function, a gather instruction will
1619 // either have opcode Instruction::Load or be a call to the masked_gather
1620 // intrinsic.
1621 if ((I->getOpcode() == Instruction::Load ||
1622 match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1623 I->hasOneUse()) {
1624 const User *Us = *I->users().begin();
1625 if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1626 // only allow valid type combinations
1627 unsigned TypeSize =
1628 cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1629 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1630 (TypeSize == 16 && EltSize == 8)) &&
1631 TypeSize * NumElems == 128) {
1632 ExtSize = TypeSize;
1633 }
1634 }
1635 }
1636 // Check whether the input data needs to be truncated
1637 TruncInst *T;
1638 if ((I->getOpcode() == Instruction::Store ||
1639 match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1640 (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1641 // Only allow valid type combinations
1642 unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1643 if (((EltSize == 16 && TypeSize == 32) ||
1644 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1645 TypeSize * NumElems == 128)
1646 ExtSize = TypeSize;
1647 }
1648 }
1649
1650 if (ExtSize * NumElems != 128 || NumElems < 4)
1651 return ScalarCost;
1652
1653 // Any (aligned) i32 gather will not need to be scalarised.
1654 if (ExtSize == 32)
1655 return VectorCost;
1656 // For smaller types, we need to ensure that the gep's inputs are correctly
1657 // extended from a small enough value. Other sizes (including i64) are
1658 // scalarized for now.
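 // e.g. (illustrative) a <8 x i16> gather is only kept at VectorCost if its
 // single-index GEP offsets are zero-extended from i16 or narrower; any
 // other addressing pattern falls back to ScalarCost below.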
1659 if (ExtSize != 8 && ExtSize != 16)
1660 return ScalarCost;
1661
1662 if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1663 Ptr = BC->getOperand(0);
1664 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1665 if (GEP->getNumOperands() != 2)
1666 return ScalarCost;
1667 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1668 // Scale needs to be correct (which is only relevant for i16s).
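 // e.g. a GEP over i16 elements has Scale == 2, and 2 * 8 == ExtSize == 16.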
1669 if (Scale != 1 && Scale * 8 != ExtSize)
1670 return ScalarCost;
1671 // And we need to zext (not sext) the indexes from a small enough type.
1672 if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1673 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1674 return VectorCost;
1675 }
1676 return ScalarCost;
1677 }
1678 return ScalarCost;
1679}
1680
1681InstructionCost
1682ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1683 std::optional<FastMathFlags> FMF,
1684 TTI::TargetCostKind CostKind) {
1685
1686 EVT ValVT = TLI->getValueType(DL, ValTy);
1687 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1688 unsigned EltSize = ValVT.getScalarSizeInBits();
1689
1690 // In general floating point reductions are a series of elementwise
1691 // operations, with free extracts on each step. These are either in-order or
1692 // treewise depending on whether that is allowed by the fast math flags.
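 // For example (illustrative): a fast <8 x float> fadd reduction on MVE is
 // modelled as one v4f32 vector fadd (halving 256 bits down to the 128-bit
 // limit) plus the per-lane scalar ops accounted for at the end.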
1693 if ((ISD == ISD::FADD || ISD == ISD::FMUL) &&
1694 ((EltSize == 32 && ST->hasVFP2Base()) ||
1695 (EltSize == 64 && ST->hasFP64()) ||
1696 (EltSize == 16 && ST->hasFullFP16()))) {
1697 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1698 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1699 InstructionCost VecCost = 0;
1700 while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(NumElts) &&
1701 NumElts * EltSize > VecLimit) {
1702 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1703 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1704 NumElts /= 2;
1705 }
1706
1707 // For fp16 we need to extract the upper lane elements. MVE can add a
1708 // VREV+FMIN/MAX to perform another vector step instead.
1709 InstructionCost ExtractCost = 0;
1710 if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() &&
1711 ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) {
1712 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1713 NumElts /= 2;
1714 } else if (ValVT.getVectorElementType() == MVT::f16)
1715 ExtractCost = NumElts / 2;
1716
1717 return VecCost + ExtractCost +
1718 NumElts *
1719 getArithmeticInstrCost(Opcode, ValTy->getElementType(), CostKind);
1720 }
1721
1722 if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
1723 (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
1724 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1725 unsigned VecLimit =
1726 ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1727 InstructionCost VecCost = 0;
1728 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
1729 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1730 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1731 NumElts /= 2;
1732 }
1733 // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
1734 // step.
1735 if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
1736 NumElts * EltSize == 64) {
1737 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts);
1738 VecCost += ST->getMVEVectorCostFactor(CostKind) +
1739 getArithmeticInstrCost(Opcode, VecTy, CostKind);
1740 NumElts /= 2;
1741 }
1742
1743 // From here we extract the elements and perform the and/or/xor.
1744 InstructionCost ExtractCost = NumElts;
1745 return VecCost + ExtractCost +
1746 (NumElts - 1) * getArithmeticInstrCost(
1747 Opcode, ValTy->getElementType(), CostKind);
1748 }
1749
1750 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
1751 TTI::requiresOrderedReduction(FMF))
1752 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1753
1754 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1755
1756 static const CostTblEntry CostTblAdd[]{
1757 {ISD::ADD, MVT::v16i8, 1},
1758 {ISD::ADD, MVT::v8i16, 1},
1759 {ISD::ADD, MVT::v4i32, 1},
1760 };
1761 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1762 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1763
1764 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1765}
1766
1767InstructionCost ARMTTIImpl::getExtendedReductionCost(
1768 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1769 FastMathFlags FMF, TTI::TargetCostKind CostKind) {
1770 EVT ValVT = TLI->getValueType(DL, ValTy);
1771 EVT ResVT = TLI->getValueType(DL, ResTy);
1772
1773 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1774
1775 switch (ISD) {
1776 case ISD::ADD:
1777 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1778 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1779
1780 // The legal cases are:
1781 // VADDV u/s 8/16/32
1782 // VADDLV u/s 32
1783 // Codegen currently cannot always handle larger than legal vectors very
1784 // well, especially for predicated reductions where the mask needs to be
1785 // split, so restrict to 128bit or smaller input types.
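 // e.g. (illustrative) reducing zext(<16 x i8>) into an i32 accumulator
 // matches VADDV.u8 and is costed LT.first * MVEVectorCostFactor.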
1786 unsigned RevVTSize = ResVT.getSizeInBits();
1787 if (ValVT.getSizeInBits() <= 128 &&
1788 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1789 (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1790 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1791 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1792 }
1793 break;
1794 default:
1795 break;
1796 }
1797 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, FMF,
1798 CostKind);
1799}
1800
1801InstructionCost
1802ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
1803 VectorType *ValTy,
1804 TTI::TargetCostKind CostKind) {
1805 EVT ValVT = TLI->getValueType(DL, ValTy);
1806 EVT ResVT = TLI->getValueType(DL, ResTy);
1807
1808 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1809 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1810
1811 // The legal cases are:
1812 // VMLAV u/s 8/16/32
1813 // VMLALV u/s 16/32
1814 // Codegen currently cannot always handle larger than legal vectors very
1815 // well, especially for predicated reductions where the mask needs to be
1816 // split, so restrict to 128bit or smaller input types.
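 // e.g. (illustrative) a multiply-accumulate of two zext(<8 x i16>) inputs
 // into an i64 accumulator matches VMLALV.u16 and gets the MVE vector cost.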
1817 unsigned RevVTSize = ResVT.getSizeInBits();
1818 if (ValVT.getSizeInBits() <= 128 &&
1819 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1820 (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
1821 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1822 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1823 }
1824
1825 return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, ValTy, CostKind);
1826}
1827
1828InstructionCost
1829ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1830 FastMathFlags FMF,
1831 TTI::TargetCostKind CostKind) {
1832 EVT ValVT = TLI->getValueType(DL, Ty);
1833
1834 // In general floating point reductions are a series of elementwise
1835 // operations, with free extracts on each step. These are either in-order or
1836 // treewise depending on whether that is allowed by the fast math flags.
1837 if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
1838 ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
1839 (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
1840 (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
1841 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
1842 unsigned EltSize = ValVT.getScalarSizeInBits();
1843 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1844 InstructionCost VecCost;
1845 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
1846 Type *VecTy = FixedVectorType::get(Ty->getElementType(), NumElts/2);
1847 IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
1848 VecCost += getIntrinsicInstrCost(ICA, CostKind);
1849 NumElts /= 2;
1850 }
1851
1852 // For fp16 we need to extract the upper lane elements. MVE can add a
1853 // VREV+FMIN/MAX to perform another vector step instead.
1854 InstructionCost ExtractCost = 0;
1855 if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
1856 NumElts == 8) {
1857 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1858 NumElts /= 2;
1859 } else if (ValVT.getVectorElementType() == MVT::f16)
1860 ExtractCost = cast<FixedVectorType>(Ty)->getNumElements() / 2;
1861
1863 {Ty->getElementType(), Ty->getElementType()},
1864 FMF);
1865 return VecCost + ExtractCost +
1866 (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
1867 }
1868
1869 if (IID == Intrinsic::smin || IID == Intrinsic::smax ||
1870 IID == Intrinsic::umin || IID == Intrinsic::umax) {
1871 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1872
1873 // All costs are the same for u/s min/max. These lower to vminv, which are
1874 // given a slightly higher cost as they tend to take multiple cycles for
1875 // smaller type sizes.
1876 static const CostTblEntry CostTblAdd[]{
1877 {ISD::SMIN, MVT::v16i8, 4},
1878 {ISD::SMIN, MVT::v8i16, 3},
1879 {ISD::SMIN, MVT::v4i32, 2},
1880 };
1881 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD::SMIN, LT.second))
1882 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1883 }
1884
1885 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1886}
1887
1888InstructionCost
1889ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1890 TTI::TargetCostKind CostKind) {
1891 unsigned Opc = ICA.getID();
1892 switch (Opc) {
1893 case Intrinsic::get_active_lane_mask:
1894 // Currently we make a somewhat optimistic assumption that
1895 // active_lane_masks are always free. In reality it may be freely folded
1896 // into a tail predicated loop, expanded into a VCPT or expanded into a lot
1897 // of add/icmp code. We may need to improve this in the future, but being
1898 // able to detect if it is free or not involves looking at a lot of other
1899 // code. We currently assume that the vectorizer inserted these, and knew
1900 // what it was doing in adding one.
1901 if (ST->hasMVEIntegerOps())
1902 return 0;
1903 break;
1904 case Intrinsic::sadd_sat:
1905 case Intrinsic::ssub_sat:
1906 case Intrinsic::uadd_sat:
1907 case Intrinsic::usub_sat: {
1908 bool IsAdd = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
1909 bool IsSigned = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
1910 Type *RetTy = ICA.getReturnType();
1911
1912 if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
1913 if (IsSigned && ST->hasDSP() && ITy->getBitWidth() == 32)
1914 return 1; // qadd / qsub
1915 if (ST->hasDSP() && (ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16))
1916 return 2; // uqadd16 / qadd16 / uqsub16 / qsub16 + possible extend.
1917 // Otherwise return the cost of expanding the node. Generally an add +
1918 // icmp + sel.
1919 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
1920 Type *CondTy = RetTy->getWithNewBitWidth(1);
1921 return getArithmeticInstrCost(IsAdd ? Instruction::Add : Instruction::Sub,
1922 RetTy, CostKind) +
1923 2 * getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, Pred,
1924 CostKind) +
1925 2 * getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, Pred,
1926 CostKind);
1927 }
1928
1929 if (!ST->hasMVEIntegerOps())
1930 break;
1931
1932 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
1933 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1934 LT.second == MVT::v16i8) {
1935 // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
1936 // need to extend the type, as it uses shr(qadd(shl, shl)).
1937 unsigned Instrs =
1938 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1
1939 : 4;
1940 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1941 }
1942 break;
1943 }
1944 case Intrinsic::abs:
1945 case Intrinsic::smin:
1946 case Intrinsic::smax:
1947 case Intrinsic::umin:
1948 case Intrinsic::umax: {
1949 if (!ST->hasMVEIntegerOps())
1950 break;
1951 Type *VT = ICA.getReturnType();
1952
1953 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1954 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1955 LT.second == MVT::v16i8)
1956 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1957 break;
1958 }
1959 case Intrinsic::minnum:
1960 case Intrinsic::maxnum: {
1961 if (!ST->hasMVEFloatOps())
1962 break;
1963 Type *VT = ICA.getReturnType();
1964 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1965 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1966 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1967 break;
1968 }
1969 case Intrinsic::fptosi_sat:
1970 case Intrinsic::fptoui_sat: {
1971 if (ICA.getArgTypes().empty())
1972 break;
1973 bool IsSigned = Opc == Intrinsic::fptosi_sat;
1974 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1975 EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
1976 // Check for the legal types, with the correct subtarget features.
1977 if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
1978 (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
1979 (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
1980 return LT.first;
1981
1982 // Equally for MVE vector types
1983 if (ST->hasMVEFloatOps() &&
1984 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
1985 LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
1986 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1987
1988 // If we can we use a legal convert followed by a min+max
1989 if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
1990 (ST->hasFP64() && LT.second == MVT::f64) ||
1991 (ST->hasFullFP16() && LT.second == MVT::f16) ||
1992 (ST->hasMVEFloatOps() &&
1993 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
1994 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
1995 Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
1996 LT.second.getScalarSizeInBits());
1997 InstructionCost Cost =
1998 LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1999 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
2000 : Intrinsic::umin,
2001 LegalTy, {LegalTy, LegalTy});
2002 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
2003 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
2004 : Intrinsic::umax,
2005 LegalTy, {LegalTy, LegalTy});
2006 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
2007 return LT.first * Cost;
2008 }
2009 // Otherwise we need to follow the default expansion that clamps the value
2010 // using a float min/max with a fcmp+sel for nan handling when signed.
2011 Type *FPTy = ICA.getArgTypes()[0];
2012 Type *RetTy = ICA.getReturnType();
2013 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
2014 InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
2015 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
2016 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
2017 Cost +=
2018 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
2019 RetTy, FPTy, TTI::CastContextHint::None, CostKind);
2020 if (IsSigned) {
2021 Type *CondTy = RetTy->getWithNewBitWidth(1);
2022 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
2023 CmpInst::FCMP_UNO, CostKind);
2024 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2025 CmpInst::FCMP_UNO, CostKind);
2026 }
2027 return Cost;
2028 }
2029 }
2030
2031 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
2032}
2033
2034bool ARMTTIImpl::isLoweredToCall(const Function *F) {
2035 if (!F->isIntrinsic())
2036 return BaseT::isLoweredToCall(F);
2037
2038 // Assume all Arm-specific intrinsics map to an instruction.
2039 if (F->getName().starts_with("llvm.arm"))
2040 return false;
2041
2042 switch (F->getIntrinsicID()) {
2043 default: break;
2044 case Intrinsic::powi:
2045 case Intrinsic::sin:
2046 case Intrinsic::cos:
2047 case Intrinsic::pow:
2048 case Intrinsic::log:
2049 case Intrinsic::log10:
2050 case Intrinsic::log2:
2051 case Intrinsic::exp:
2052 case Intrinsic::exp2:
2053 return true;
2054 case Intrinsic::sqrt:
2055 case Intrinsic::fabs:
2056 case Intrinsic::copysign:
2057 case Intrinsic::floor:
2058 case Intrinsic::ceil:
2059 case Intrinsic::trunc:
2060 case Intrinsic::rint:
2061 case Intrinsic::nearbyint:
2062 case Intrinsic::round:
2063 case Intrinsic::canonicalize:
2064 case Intrinsic::lround:
2065 case Intrinsic::llround:
2066 case Intrinsic::lrint:
2067 case Intrinsic::llrint:
2068 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
2069 return true;
2070 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
2071 return true;
2072 // Some operations can be handled by vector instructions and assume
2073 // unsupported vectors will be expanded into supported scalar ones.
2074 // TODO Handle scalar operations properly.
2075 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
2076 case Intrinsic::masked_store:
2077 case Intrinsic::masked_load:
2078 case Intrinsic::masked_gather:
2079 case Intrinsic::masked_scatter:
2080 return !ST->hasMVEIntegerOps();
2081 case Intrinsic::sadd_with_overflow:
2082 case Intrinsic::uadd_with_overflow:
2083 case Intrinsic::ssub_with_overflow:
2084 case Intrinsic::usub_with_overflow:
2085 case Intrinsic::sadd_sat:
2086 case Intrinsic::uadd_sat:
2087 case Intrinsic::ssub_sat:
2088 case Intrinsic::usub_sat:
2089 return false;
2090 }
2091
2092 return BaseT::isLoweredToCall(F);
2093}
2094
2095bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
2096 unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
2097 EVT VT = TLI->getValueType(DL, I.getType(), true);
2098 if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
2099 return true;
2100
2101 // Check if an intrinsic will be lowered to a call and assume that any
2102 // other CallInst will generate a bl.
2103 if (auto *Call = dyn_cast<CallInst>(&I)) {
2104 if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
2105 switch(II->getIntrinsicID()) {
2106 case Intrinsic::memcpy:
2107 case Intrinsic::memset:
2108 case Intrinsic::memmove:
2109 return getNumMemOps(II) == -1;
2110 default:
2111 if (const Function *F = Call->getCalledFunction())
2112 return isLoweredToCall(F);
2113 }
2114 }
2115 return true;
2116 }
2117
2118 // FPv5 provides conversions between integer, double-precision,
2119 // single-precision, and half-precision formats.
2120 switch (I.getOpcode()) {
2121 default:
2122 break;
2123 case Instruction::FPToSI:
2124 case Instruction::FPToUI:
2125 case Instruction::SIToFP:
2126 case Instruction::UIToFP:
2127 case Instruction::FPTrunc:
2128 case Instruction::FPExt:
2129 return !ST->hasFPARMv8Base();
2130 }
2131
2132 // FIXME: Unfortunately the approach of checking the Operation Action does
2133 // not catch all cases of Legalization that use library calls. Our
2134 // Legalization step categorizes some transformations into library calls as
2135 // Custom, Expand or even Legal when doing type legalization. So for now
2136 // we have to special case for instance the SDIV of 64bit integers and the
2137 // use of floating point emulation.
2138 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
2139 switch (ISD) {
2140 default:
2141 break;
2142 case ISD::SDIV:
2143 case ISD::UDIV:
2144 case ISD::SREM:
2145 case ISD::UREM:
2146 case ISD::SDIVREM:
2147 case ISD::UDIVREM:
2148 return true;
2149 }
2150 }
2151
2152 // Assume all other non-float operations are supported.
2153 if (!VT.isFloatingPoint())
2154 return false;
2155
2156 // We'll need a library call to handle most floats when using soft float.
2157 if (TLI->useSoftFloat()) {
2158 switch (I.getOpcode()) {
2159 default:
2160 return true;
2161 case Instruction::Alloca:
2162 case Instruction::Load:
2163 case Instruction::Store:
2164 case Instruction::Select:
2165 case Instruction::PHI:
2166 return false;
2167 }
2168 }
2169
2170 // We'll need a libcall to perform double precision operations on a single
2171 // precision only FPU.
2172 if (I.getType()->isDoubleTy() && !ST->hasFP64())
2173 return true;
2174
2175 // Likewise for half precision arithmetic.
2176 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
2177 return true;
2178
2179 return false;
2180}
2181
2182bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
2183 AssumptionCache &AC,
2184 TargetLibraryInfo *LibInfo,
2185 HardwareLoopInfo &HWLoopInfo) {
2186 // Low-overhead branches are only supported in the 'low-overhead branch'
2187 // extension of v8.1-m.
2188 if (!ST->hasLOB() || DisableLowOverheadLoops) {
2189 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2190 return false;
2191 }
2192
2193 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
2194 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2195 return false;
2196 }
2197
2198 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2199 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
2200 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2201 return false;
2202 }
2203
2204 const SCEV *TripCountSCEV =
2205 SE.getAddExpr(BackedgeTakenCount,
2206 SE.getOne(BackedgeTakenCount->getType()));
2207
2208 // We need to store the trip count in LR, a 32-bit register.
2209 if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
2210 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2211 return false;
2212 }
2213
2214 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2215 // point in generating a hardware loop if that's going to happen.
2216
2217 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2218 if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
2219 switch (Call->getIntrinsicID()) {
2220 default:
2221 break;
2222 case Intrinsic::start_loop_iterations:
2223 case Intrinsic::test_start_loop_iterations:
2224 case Intrinsic::loop_decrement:
2225 case Intrinsic::loop_decrement_reg:
2226 return true;
2227 }
2228 }
2229 return false;
2230 };
2231
2232 // Scan the instructions to see if there's any that we know will turn into a
2233 // call or if this loop is already a low-overhead loop or will become a tail
2234 // predicated loop.
2235 bool IsTailPredLoop = false;
2236 auto ScanLoop = [&](Loop *L) {
2237 for (auto *BB : L->getBlocks()) {
2238 for (auto &I : *BB) {
2239 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2240 isa<InlineAsm>(I)) {
2241 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2242 return false;
2243 }
2244 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2245 IsTailPredLoop |=
2246 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2247 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2248 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2249 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2250 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2251 }
2252 }
2253 return true;
2254 };
2255
2256 // Visit inner loops.
2257 for (auto *Inner : *L)
2258 if (!ScanLoop(Inner))
2259 return false;
2260
2261 if (!ScanLoop(L))
2262 return false;
2263
2264 // TODO: Check whether the trip count calculation is expensive. If L is the
2265 // inner loop but we know it has a low trip count, calculating that trip
2266 // count (in the parent loop) may be detrimental.
2267
2268 LLVMContext &C = L->getHeader()->getContext();
2269 HWLoopInfo.CounterInReg = true;
2270 HWLoopInfo.IsNestingLegal = false;
2271 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2272 HWLoopInfo.CountType = Type::getInt32Ty(C);
2273 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2274 return true;
2275}
2276
2277static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2278 // We don't allow icmps, and because we only look at single-block loops,
2279 // we simply count the icmps, i.e. there should only be 1 for the backedge.
2280 if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2281 return false;
2282 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2283 // not currently canonical, but soon will be. Code without them uses icmp, and
2284 // so is not tail predicated as per the condition above. In order to get the
2285 // same performance we treat min and max the same as an icmp for tailpred
2286 // purposes for the moment (we often rely on non-tailpred and higher VF's to
2287 // pick more optimal instructions like VQDMULH. They need to be recognized
2288 // directly by the vectorizer).
2289 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2290 if ((II->getIntrinsicID() == Intrinsic::smin ||
2291 II->getIntrinsicID() == Intrinsic::smax ||
2292 II->getIntrinsicID() == Intrinsic::umin ||
2293 II->getIntrinsicID() == Intrinsic::umax) &&
2294 ++ICmpCount > 1)
2295 return false;
2296
2297 if (isa<FCmpInst>(&I))
2298 return false;
2299
2300 // We could allow extending/narrowing FP loads/stores, but codegen is
2301 // too inefficient so reject this for now.
2302 if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2303 return false;
2304
2305 // Extends have to be extending-loads
2306 if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
2307 if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2308 return false;
2309
2310 // Truncs have to be narrowing-stores
2311 if (isa<TruncInst>(&I) )
2312 if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2313 return false;
2314
2315 return true;
2316}
2317
2318// To set up a tail-predicated loop, we need to know the total number of
2319// elements processed by that loop. Thus, we need to determine the element
2320// size and:
2321// 1) it should be uniform for all operations in the vector loop, so we
2322// e.g. don't want any widening/narrowing operations.
2323// 2) it should be smaller than i64s because we don't have vector operations
2324// that work on i64s.
2325// 3) we don't want elements to be reversed or shuffled, to make sure the
2326// tail-predication masks/predicates the right lanes.
2327//
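// For example (illustrative): a loop that only loads, adds and stores i32
// values with unit-stride accesses passes these checks, whereas one that
// manipulates i64 values or uses reversed (stride -1) accesses is rejected
// by the checks below.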
2328static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2329 const DataLayout &DL,
2330 const LoopAccessInfo *LAI) {
2331 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2332
2333 // If there are live-out values, it is probably a reduction. We can predicate
2334 // most reduction operations freely under MVE using a combination of
2335 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2336 // floating point and integer reductions, but don't check for operators
2337 // specifically here. If the value ends up not being a reduction (and so the
2338 // vectorizer cannot tailfold the loop), we should fall back to standard
2339 // vectorization automatically.
2340 SmallVector<Instruction *, 8> LiveOuts;
2341 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2342 bool ReductionsDisabled =
2343 EnableTailPredication == TailPredication::EnabledNoReductions ||
2344 EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2345
2346 for (auto *I : LiveOuts) {
2347 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2348 !I->getType()->isHalfTy()) {
2349 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2350 "live-out value\n");
2351 return false;
2352 }
2353 if (ReductionsDisabled) {
2354 LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2355 return false;
2356 }
2357 }
2358
2359 // Next, check that all instructions can be tail-predicated.
2360 PredicatedScalarEvolution PSE = LAI->getPSE();
2361 SmallVector<Instruction *, 16> LoadStores;
2362 int ICmpCount = 0;
2363
2364 for (BasicBlock *BB : L->blocks()) {
2365 for (Instruction &I : BB->instructionsWithoutDebug()) {
2366 if (isa<PHINode>(&I))
2367 continue;
2368 if (!canTailPredicateInstruction(I, ICmpCount)) {
2369 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2370 return false;
2371 }
2372
2373 Type *T = I.getType();
2374 if (T->getScalarSizeInBits() > 32) {
2375 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2376 return false;
2377 }
2378 if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2379 Value *Ptr = getLoadStorePointerOperand(&I);
2380 Type *AccessTy = getLoadStoreType(&I);
2381 int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0);
2382 if (NextStride == 1) {
2383 // TODO: for now only allow consecutive strides of 1. We could support
2384 // other strides as long as it is uniform, but let's keep it simple
2385 // for now.
2386 continue;
2387 } else if (NextStride == -1 ||
2388 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2389 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2390 LLVM_DEBUG(dbgs()
2391 << "Consecutive strides of 2 found, vld2/vstr2 can't "
2392 "be tail-predicated\n.");
2393 return false;
2394 // TODO: don't tail predicate if there is a reversed load?
2395 } else if (EnableMaskedGatherScatters) {
2396 // Gather/scatters do allow loading from arbitrary strides, at
2397 // least if they are loop invariant.
2398 // TODO: Loop variant strides should in theory work, too, but
2399 // this requires further testing.
2400 const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2401 if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2402 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2403 if (PSE.getSE()->isLoopInvariant(Step, L))
2404 continue;
2405 }
2406 }
2407 LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2408 "tail-predicate\n.");
2409 return false;
2410 }
2411 }
2412 }
2413
2414 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2415 return true;
2416}
2417
2418bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
2419 if (!EnableTailPredication) {
2420 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2421 return false;
2422 }
2423
2424 // Creating a predicated vector loop is the first step for generating a
2425 // tail-predicated hardware loop, for which we need the MVE masked
2426 // load/stores instructions:
2427 if (!ST->hasMVEIntegerOps())
2428 return false;
2429
2430 LoopVectorizationLegality *LVL = TFI->LVL;
2431 Loop *L = LVL->getLoop();
2432
2433 // For now, restrict this to single block loops.
2434 if (L->getNumBlocks() > 1) {
2435 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2436 "loop.\n");
2437 return false;
2438 }
2439
2440 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2441
2442 LoopInfo *LI = LVL->getLoopInfo();
2443 HardwareLoopInfo HWLoopInfo(L);
2444 if (!HWLoopInfo.canAnalyze(*LI)) {
2445 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2446 "analyzable.\n");
2447 return false;
2448 }
2449
2450 AssumptionCache *AC = LVL->getAssumptionCache();
2451 ScalarEvolution *SE = LVL->getScalarEvolution();
2452
2453 // This checks if we have the low-overhead branch architecture
2454 // extension, and if we will create a hardware-loop:
2455 if (!isHardwareLoopProfitable(L, *SE, *AC, TFI->TLI, HWLoopInfo)) {
2456 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2457 "profitable.\n");
2458 return false;
2459 }
2460
2461 DominatorTree *DT = LVL->getDominatorTree();
2462 if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT)) {
2463 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2464 "a candidate.\n");
2465 return false;
2466 }
2467
2468 return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI());
2469}
2470
2471TailFoldingStyle
2472ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const {
2473 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2474 return TailFoldingStyle::DataWithoutLaneMask;
2475
2476 // Intrinsic @llvm.get.active.lane.mask is supported.
2477 // It is used in the MVETailPredication pass, which requires the number of
2478 // elements processed by this vector loop to setup the tail-predicated
2479 // loop.
2480 return TailFoldingStyle::DataAndControlFlow;
2481}
2482
2483void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2484 TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) {
2485 // Enable Upper bound unrolling universally, provided that we do not see an
2486 // active lane mask, which will be better kept as a loop to become tail
2487 // predicated than to be conditionally unrolled.
2488 UP.UpperBound =
2489 !ST->hasMVEIntegerOps() || !any_of(*L->getHeader(), [](Instruction &I) {
2490 return isa<IntrinsicInst>(I) &&
2491 cast<IntrinsicInst>(I).getIntrinsicID() ==
2492 Intrinsic::get_active_lane_mask;
2493 });
2494
2495 // Only currently enable these preferences for M-Class cores.
2496 if (!ST->isMClass())
2497 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2498
2499 // Disable loop unrolling for Oz and Os.
2500 UP.OptSizeThreshold = 0;
2501 UP.PartialOptSizeThreshold = 0;
2502 if (L->getHeader()->getParent()->hasOptSize())
2503 return;
2504
2505 SmallVector<BasicBlock*, 4> ExitingBlocks;
2506 L->getExitingBlocks(ExitingBlocks);
2507 LLVM_DEBUG(dbgs() << "Loop has:\n"
2508 << "Blocks: " << L->getNumBlocks() << "\n"
2509 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2510
2511 // Only allow one other exit in addition to the latch. This acts as an early
2512 // exit as it mirrors the profitability calculation of the runtime unroller.
2513 if (ExitingBlocks.size() > 2)
2514 return;
2515
2516 // Limit the CFG of the loop body for targets with a branch predictor.
2517 // Allowing 4 blocks permits if-then-else diamonds in the body.
2518 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2519 return;
2520
2521 // Don't unroll vectorized loops, including the remainder loop
2522 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2523 return;
2524
2525 // Scan the loop: don't unroll loops with calls as this could prevent
2526 // inlining.
2527 InstructionCost Cost = 0;
2528 for (auto *BB : L->getBlocks()) {
2529 for (auto &I : *BB) {
2530 // Don't unroll vectorised loops. MVE does not benefit from it as much as
2531 // scalar code does.
2532 if (I.getType()->isVectorTy())
2533 return;
2534
2535 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2536 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2537 if (!isLoweredToCall(F))
2538 continue;
2539 }
2540 return;
2541 }
2542
2543 SmallVector<const Value*, 4> Operands(I.operand_values());
2544 Cost += getInstructionCost(&I, Operands,
2545 TargetTransformInfo::TCK_SizeAndLatency);
2546 }
2547 }
2548
2549 // On v6m cores, there are very few registers available. We can easily end up
2550 // spilling and reloading more registers in an unrolled loop. Look at the
2551 // number of LCSSA phis as a rough measure of how many registers will need to
2552 // be live out of the loop, reducing the default unroll count if more than 1
2553 // value is needed. In the long run, all of this should be being learnt by a
2554 // machine.
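 // For instance (illustrative): with three LCSSA phi live-outs the default
 // count of 4 is divided down to 4 / 3 == 1 and runtime unrolling is skipped.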
2555 unsigned UnrollCount = 4;
2556 if (ST->isThumb1Only()) {
2557 unsigned ExitingValues = 0;
2558 SmallVector<BasicBlock *, 4> ExitBlocks;
2559 L->getExitBlocks(ExitBlocks);
2560 for (auto *Exit : ExitBlocks) {
2561 // Count the number of LCSSA phis. Exclude values coming from GEP's as
2562 // only the last is expected to be needed for address operands.
2563 unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2564 return PH.getNumOperands() != 1 ||
2565 !isa<GetElementPtrInst>(PH.getOperand(0));
2566 });
2567 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2568 }
2569 if (ExitingValues)
2570 UnrollCount /= ExitingValues;
2571 if (UnrollCount <= 1)
2572 return;
2573 }
2574
2575 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2576 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2577
2578 UP.Partial = true;
2579 UP.Runtime = true;
2580 UP.UnrollRemainder = true;
2581 UP.DefaultUnrollRuntimeCount = UnrollCount;
2582 UP.UnrollAndJam = true;
2583 UP.UnrollAndJamInnerLoopThreshold = 60;
2584
2585 // Force-unrolling small loops can be very useful because of the branch-taken
2586 // cost of the backedge.
2587 if (Cost < 12)
2588 UP.Force = true;
2589}
2590
2591void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2592 TTI::PeelingPreferences &PP) {
2593 BaseT::getPeelingPreferences(L, SE, PP);
2594}
2595
2596bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2597 TTI::ReductionFlags Flags) const {
2598 if (!ST->hasMVEIntegerOps())
2599 return false;
2600
2601 unsigned ScalarBits = Ty->getScalarSizeInBits();
2602 switch (Opcode) {
2603 case Instruction::Add:
2604 return ScalarBits <= 64;
2605 default:
2606 return false;
2607 }
2608}
2609
2610bool ARMTTIImpl::preferPredicatedReductionSelect(
2611 unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2612 if (!ST->hasMVEIntegerOps())
2613 return false;
2614 return true;
2615}
2616
2617InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
2618 StackOffset BaseOffset,
2619 bool HasBaseReg, int64_t Scale,
2620 unsigned AddrSpace) const {
2621 TargetLoweringBase::AddrMode AM;
2622 AM.BaseGV = BaseGV;
2623 AM.BaseOffs = BaseOffset.getFixed();
2624 AM.HasBaseReg = HasBaseReg;
2625 AM.Scale = Scale;
2626 AM.ScalableOffset = BaseOffset.getScalable();
2627 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) {
2628 if (ST->hasFPAO())
2629 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2630 return 0;
2631 }
2632 return -1;
2633}
2634
2635bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
2636 if (Thumb) {
2637 // B.W is available in any Thumb2-supporting target, and also in every
2638 // version of Armv8-M, even Baseline which does not include the rest of
2639 // Thumb2.
2640 return ST->isThumb2() || ST->hasV8MBaselineOps();
2641 } else {
2642 // B is available in all versions of the Arm ISA, so the only question is
2643 // whether that ISA is available at all.
2644 return ST->hasARMOps();
2645 }
2646}
This file implements a class to represent arbitrary precision integral constant values and operations...
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), cl::init(2))
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static Value * isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm)
static cl::opt< bool > AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), cl::desc("Enable the generation of WLS loops"))
static Value * simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, InstCombiner::BuilderTy &Builder)
Convert a vector load intrinsic into a simple llvm load instruction.
static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm)
cl::opt< bool > EnableMaskedGatherScatters
static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount)
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor
cl::opt< TailPredication::Mode > EnableTailPredication
static cl::opt< bool > DisableLowOverheadLoops("disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops"))
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, const LoopAccessInfo *LAI)
static cl::opt< bool > EnableMaskedLoadStores("enable-arm-maskedldst", cl::Hidden, cl::init(true), cl::desc("Enable the generation of masked loads and stores"))
This file a TargetTransformInfo::Concept conforming object specific to the ARM target machine.
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Size
Hexagon Common GEP
This file provides the interface for the instcombine pass implementation.
static cl::opt< unsigned > UnrollCount("unroll-count", cl::Hidden, cl::desc("Use this unroll count for all loops including those with " "unroll_count pragma values, for testing purposes"))
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V)
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
if(PassOpts->AAPipeline)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallVector class.
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
Value * RHS
Value * LHS
Class for arbitrary precision integers.
Definition: APInt.h:78
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1446
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:620
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:284
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:217
bool hasARMOps() const
Definition: ARMSubtarget.h:265
bool isThumb1Only() const
Definition: ARMSubtarget.h:364
bool hasFPARMv8Base() const
Definition: ARMSubtarget.h:274
bool isThumb2() const
Definition: ARMSubtarget.h:365
bool hasVFP2Base() const
Definition: ARMSubtarget.h:271
bool isMClass() const
Definition: ARMSubtarget.h:366
unsigned getMVEVectorCostFactor(TargetTransformInfo::TargetCostKind CostKind) const
Definition: ARMSubtarget.h:489
bool preferInLoopReduction(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr)
bool maybeLoweredToCall(Instruction &I)
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
bool isLegalMaskedStore(Type *DataTy, Align Alignment)
bool isLegalMaskedLoad(Type *DataTy, Align Alignment)
InstructionCost getMemcpyCost(const Instruction *I)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
bool isLoweredToCall(const Function *F)
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *ValTy, TTI::TargetCostKind CostKind)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo)
int getNumMemOps(const IntrinsicInst *I) const
Given a memcpy/memset/memmove instruction, return the number of memory operations performed,...
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
getScalingFactorCost - Return the cost of the scaling used in addressing mode represented by AM.
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool hasArmWideBranch(bool Thumb) const
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool isLegalMaskedGather(Type *Ty, Align Alignment)
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
InstructionCost getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
bool isProfitableLSRChainElement(Instruction *I)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
bool useSoftFloat() const override
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:583
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
Definition: BasicTTIImpl.h:970
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:763
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:655
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:892
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:856
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:340
static BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1104
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:787
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:784
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:767
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:146
This class represents a range of values.
Definition: ConstantRange.h:47
ConstantRange intersectWith(const ConstantRange &CR, PreferredRangeType Type=Smallest) const
Return the range that results from the intersection of this range with another range.
This is an important base class in LLVM.
Definition: Constant.h:42
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:461
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:621
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
unsigned getNumElements() const
Definition: DerivedTypes.h:582
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:680
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1824
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Definition: IRBuilder.cpp:1193
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:463
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:933
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2147
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:177
The core instruction combiner logic.
Definition: InstCombiner.h:47
const DataLayout & getDataLayout() const
Definition: InstCombiner.h:341
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
DominatorTree & getDominatorTree() const
Definition: InstCombiner.h:340
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:386
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, unsigned Depth, const SimplifyQuery &Q)=0
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:410
BuilderTy & Builder
Definition: InstCombiner.h:60
AssumptionCache & getAssumptionCache() const
Definition: InstCombiner.h:338
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:169
bool isShift() const
Definition: Instruction.h:281
const SmallVectorImpl< Type * > & getArgTypes() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
Drive the analysis of memory accesses in the loop.
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor.
AssumptionCache * getAssumptionCache() const
const LoopAccessInfo * getLAI() const
ScalarEvolution * getScalarEvolution() const
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
The optimization diagnostic interface.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of existing predicates.
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
This class represents an analyzed expression in the program.
Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCouldNotCompute object.
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
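A minimal sketch (the helper name is assumed) of how the ScalarEvolution entries above combine in trip-count logic such as hardware-loop checks: the trip count is the backedge-taken count plus one.
#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

static const SCEV *computeTripCount(ScalarEvolution &SE, const Loop *L) {
  if (!SE.hasLoopInvariantBackedgeTakenCount(L))
    return nullptr; // the backedge-taken count is not analyzable
  const SCEV *BTC = SE.getBackedgeTakenCount(L);
  // The loop body runs one more time than the backedge is taken.
  return SE.getAddExpr(BTC, SE.getOne(BTC->getType()));
}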
size_t size() const
Definition: SmallVector.h:91
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:43
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:42
Provides information about what library functions are available for the current target.
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
unsigned getMaxStoresPerMemcpy(bool OptSize) const
Get maximum # of store operations permitted for llvm.memcpy.
unsigned getMaxStoresPerMemmove(bool OptSize) const
Get maximum # of store operations permitted for llvm.memmove.
unsigned getMaxStoresPerMemset(bool OptSize) const
Get maximum # of store operations permitted for llvm.memset.
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger size, needs to be expanded to some other code sequence, or the target has a custom expander for it.
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lowering or using promotion.
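A sketch (not this file's exact code; names are assumed) of the cost-model pattern these entries support: map an IR opcode to an ISD node, get the value type, and ask whether the operation is legal or can cheaply be made legal.
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
using namespace llvm;

static bool isOperationCheap(const TargetLoweringBase &TLI,
                             const DataLayout &DL, unsigned Opcode, Type *Ty) {
  int ISDOpc = TLI.InstructionOpcodeToISD(Opcode);
  EVT VT = TLI.getValueType(DL, Ty, /*AllowUnknown=*/true);
  if (!ISDOpc || !VT.isSimple())
    return false; // unknown opcode or extended type: be conservative
  return TLI.isOperationLegalOrCustomOrPromote(ISDOpc, VT);
}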
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
bool isLoweredToCall(const Function *F) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind)
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
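A small sketch (function and variable names assumed) of how a caller picks a cost kind when querying getInstructionCost; the same instruction can cost differently depending on whether throughput or code size is being optimised.
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

static InstructionCost costForSize(const TargetTransformInfo &TTI,
                                   const Instruction *I) {
  SmallVector<const Value *, 4> Ops(I->operand_values());
  // TCK_CodeSize asks "how big is this", TCK_RecipThroughput asks "how many
  // cycles per repetition on average".
  return TTI.getInstructionCost(I, Ops, TargetTransformInfo::TCK_CodeSize);
}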
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of FastMathFlags FMF.
@ TCC_Expensive
The cost of a 'div' instruction on x86.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:261
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:230
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:224
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:212
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:343
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
Type * getElementType() const
Definition: DerivedTypes.h:436
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into a shifter_operand immediate, return the 12-bit encoding for it. If not, return -1.
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting an 8-bit immediate by some amount.
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_operand immediate, return the 12-bit encoding for it. If not, return -1.
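A sketch of the immediate checks above (the helper name and include path are assumed): a 32-bit constant that encodes as an ARM shifter_operand or Thumb-2 modified immediate needs no extra materialisation instruction, which is why such constants are treated as nearly free in the cost model.
#include "MCTargetDesc/ARMAddressingModes.h" // within the ARM backend tree
using namespace llvm;

static bool fitsInOneMovInstruction(unsigned Imm, bool IsThumb2) {
  // Try the value itself and its bitwise complement (the MVN form).
  if (IsThumb2)
    return ARM_AM::getT2SOImmVal(Imm) != -1 ||
           ARM_AM::getT2SOImmVal(~Imm) != -1;
  return ARM_AM::getSOImmVal(Imm) != -1 || ARM_AM::getSOImmVal(~Imm) != -1;
}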
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
Definition: ISDOpcodes.h:840
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:804
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:756
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:734
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:614
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:810
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:696
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:938
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:886
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:708
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
Definition: ISDOpcodes.h:919
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:816
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
Definition: PatternMatch.h:165
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:875
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:612
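A minimal sketch (the predicate names are assumed) of the matchers above: m_c_Add accepts its operands in either order, m_Specific pins one of them to a known value, and m_ConstantInt/m_Zero match literal operands without binding them.
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Matches both "add X, C" and "add C, X" for any ConstantInt C.
static bool isAddOfXAndConstant(Value *V, Value *X) {
  return match(V, m_c_Add(m_Specific(X), m_ConstantInt()));
}

// Matches "xor X, 0" (an xor against a null constant).
static bool isXorWithZero(Value *V, Value *X) {
  return match(V, m_Xor(m_Specific(X), m_Zero()));
}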
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
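A sketch of the cost-table idiom above (the table name and cost numbers are made up for illustration): a static array of {ISD opcode, MVT, cost} entries is scanned with CostTableLookup, with a conservative default on a miss.
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/Support/InstructionCost.h"
using namespace llvm;

static InstructionCost lookupMulCost(MVT Ty) {
  static const CostTblEntry MulCostTbl[] = {
      {ISD::MUL, MVT::v8i16, 1}, // illustrative costs only
      {ISD::MUL, MVT::v4i32, 2},
  };
  if (const auto *Entry = CostTableLookup(MulCostTbl, ISD::MUL, Ty))
    return Entry->Cost;
  return 4; // conservative default when the type is not in the table
}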
bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
Definition: LoopInfo.cpp:1097
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
AddressSpace
Definition: NVPTXBaseInfo.h:21
Align getKnownAlignment(Value *V, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to infer an alignment for the specified pointer.
Definition: Local.h:242
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
SmallVector< Instruction *, 8 > findDefsUsedOutsideOfLoop(Loop *L)
Returns the instructions that use values defined in the loop.
Definition: LoopUtils.cpp:123
SelectPatternFlavor
Specific patterns of select instructions we can match.
@ SPF_ABS
Absolute value.
@ SPF_FMAXNUM
Floating point maxnum.
@ SPF_UMIN
Unsigned minimum.
@ SPF_UMAX
Unsigned maximum.
@ SPF_SMAX
Signed maximum.
@ SPF_FMINNUM
Floating point minnum.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp=nullptr, unsigned Depth=0)
Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind and providing the out parameter results if we successfully match.
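A minimal sketch (the helper name is assumed) of matchSelectPattern: it classifies a select-based idiom and reports the flavor plus the two compared operands.
#include "llvm/Analysis/ValueTracking.h"
using namespace llvm;

static bool isIntegerMinMaxIdiom(Value *V) {
  Value *LHS = nullptr, *RHS = nullptr;
  SelectPatternResult SPR = matchSelectPattern(V, LHS, RHS);
  return SPR.Flavor == SPF_SMIN || SPR.Flavor == SPF_SMAX ||
         SPR.Flavor == SPF_UMIN || SPR.Flavor == SPF_UMAX;
}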
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
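A sketch of the getPtrStride entry above (the surrounding setup is assumed to exist): the vectorizer-style question "does this load walk memory consecutively?" becomes a check for a constant stride of one access-type unit.
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/IR/Instructions.h"
#include <optional>
using namespace llvm;

static bool isConsecutiveLoad(PredicatedScalarEvolution &PSE, LoadInst *LI,
                              const Loop *L) {
  std::optional<int64_t> Stride =
      getPtrStride(PSE, LI->getType(), LI->getPointerOperand(), L);
  // +1 means successive iterations touch adjacent elements; -1 is a reversed
  // but still consecutive walk.
  return Stride && (*Stride == 1 || *Stride == -1);
}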
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
Definition: STLExtras.h:1928
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize.
InstructionCost Cost
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
@ Data
Use predicate only to mask operations on data in the loop.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:371
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:307
bool isFixedLengthVector() const
Definition: ValueTypes.h:178
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:314
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:319
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:327
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
Attributes of a target dependent hardware loop.
bool canAnalyze(LoopInfo &LI)
bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
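A minimal sketch (the wrapper name is assumed) of the HardwareLoopInfo entries above, mirroring the shape of a target's isHardwareLoopProfitable check: first make sure the loop's control flow can be analyzed, then ask whether SCEV can prove a countable, loop-invariant trip count.
#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

static bool mayBeHardwareLoop(Loop *L, ScalarEvolution &SE, LoopInfo &LI,
                              DominatorTree &DT) {
  TargetTransformInfo::HardwareLoopInfo HWLoopInfo(L);
  if (!HWLoopInfo.canAnalyze(LI))
    return false;
  // Checks, among other things, for an analyzable backedge-taken count.
  return HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT);
}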
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
static MemOp Set(uint64_t Size, bool DstAlignCanChange, Align DstAlign, bool IsZeroMemset, bool IsVolatile)
static MemOp Copy(uint64_t Size, bool DstAlignCanChange, Align DstAlign, Align SrcAlign, bool IsVolatile, bool MemcpyStrSrc=false)
SelectPatternFlavor Flavor
TargetLibraryInfo * TLI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
Flags describing the kind of vector reduction.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number of loop iterations is not known at compile time).
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminate compare and branch overhead).
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
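An illustrative sketch (the values are examples, not any target's actual tuning) of a hook filling the UnrollingPreferences fields documented above.
#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

static void tuneUnrolling(TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Partial = true;                // expand the body, not just kill branches
  UP.Runtime = true;                // allow unknown trip counts
  UP.UpperBound = true;             // may unroll using the trip-count bound
  UP.UnrollRemainder = true;        // fully unroll the runtime remainder loop
  UP.DefaultUnrollRuntimeCount = 4; // example default unroll factor
  UP.OptSizeThreshold = 0;          // example: no unrolling under -Os
}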
Type Conversion Cost Table.
Definition: CostTable.h:55