LLVM 20.0.0git
ARMTargetTransformInfo.cpp
Go to the documentation of this file.
1//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "ARMSubtarget.h"
12#include "llvm/ADT/APInt.h"
19#include "llvm/IR/BasicBlock.h"
20#include "llvm/IR/DataLayout.h"
22#include "llvm/IR/Instruction.h"
25#include "llvm/IR/Intrinsics.h"
26#include "llvm/IR/IntrinsicsARM.h"
28#include "llvm/IR/Type.h"
37#include <algorithm>
38#include <cassert>
39#include <cstdint>
40#include <optional>
41#include <utility>
42
43using namespace llvm;
44
45#define DEBUG_TYPE "armtti"
46
48 "enable-arm-maskedldst", cl::Hidden, cl::init(true),
49 cl::desc("Enable the generation of masked loads and stores"));
50
52 "disable-arm-loloops", cl::Hidden, cl::init(false),
53 cl::desc("Disable the generation of low-overhead loops"));
54
// Hidden boolean command-line option "allow-arm-wlsloops", initialised to
// true. Per its description string, it enables the generation of WLS loops.
55static cl::opt<bool>
56 AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
57 cl::desc("Enable the generation of WLS loops"));
58
60 "widen-global-strings", cl::Hidden, cl::init(true),
61 cl::desc("Enable the widening of global strings to alignment boundaries"));
62
64
66
68
69/// Convert a vector load intrinsic into a simple llvm load instruction.
70/// This is beneficial when the underlying object being addressed comes
71/// from a constant, since we get constant-folding for free.
72static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
73 InstCombiner::BuilderTy &Builder) {
74 auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
75
76 if (!IntrAlign)
77 return nullptr;
78
79 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
80 ? MemAlign
81 : IntrAlign->getLimitedValue();
82
83 if (!isPowerOf2_32(Alignment))
84 return nullptr;
85
86 auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
87 PointerType::get(II.getType(), 0));
88 return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
89}
90
92 const Function *Callee) const {
93 const TargetMachine &TM = getTLI()->getTargetMachine();
94 const FeatureBitset &CallerBits =
95 TM.getSubtargetImpl(*Caller)->getFeatureBits();
96 const FeatureBitset &CalleeBits =
97 TM.getSubtargetImpl(*Callee)->getFeatureBits();
98
99 // To inline a callee, all features not in the allowed list must match exactly.
100 bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
101 (CalleeBits & ~InlineFeaturesAllowed);
102 // For features in the allowed list, the callee's features must be a subset of
103 // the callers'.
104 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
105 (CalleeBits & InlineFeaturesAllowed);
106 return MatchExact && MatchSubset;
107}
108
111 ScalarEvolution *SE) const {
112 if (ST->hasMVEIntegerOps())
114
115 if (L->getHeader()->getParent()->hasOptSize())
116 return TTI::AMK_None;
117
118 if (ST->isMClass() && ST->isThumb2() &&
119 L->getNumBlocks() == 1)
120 return TTI::AMK_PreIndexed;
121
122 return TTI::AMK_None;
123}
124
125std::optional<Instruction *>
127 using namespace PatternMatch;
128 Intrinsic::ID IID = II.getIntrinsicID();
129 switch (IID) {
130 default:
131 break;
132 case Intrinsic::arm_neon_vld1: {
133 Align MemAlign =
134 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
136 if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
137 return IC.replaceInstUsesWith(II, V);
138 }
139 break;
140 }
141
142 case Intrinsic::arm_neon_vld2:
143 case Intrinsic::arm_neon_vld3:
144 case Intrinsic::arm_neon_vld4:
145 case Intrinsic::arm_neon_vld2lane:
146 case Intrinsic::arm_neon_vld3lane:
147 case Intrinsic::arm_neon_vld4lane:
148 case Intrinsic::arm_neon_vst1:
149 case Intrinsic::arm_neon_vst2:
150 case Intrinsic::arm_neon_vst3:
151 case Intrinsic::arm_neon_vst4:
152 case Intrinsic::arm_neon_vst2lane:
153 case Intrinsic::arm_neon_vst3lane:
154 case Intrinsic::arm_neon_vst4lane: {
155 Align MemAlign =
156 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
158 unsigned AlignArg = II.arg_size() - 1;
159 Value *AlignArgOp = II.getArgOperand(AlignArg);
160 MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
161 if (Align && *Align < MemAlign) {
162 return IC.replaceOperand(
163 II, AlignArg,
164 ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
165 false));
166 }
167 break;
168 }
169
170 case Intrinsic::arm_neon_vld1x2:
171 case Intrinsic::arm_neon_vld1x3:
172 case Intrinsic::arm_neon_vld1x4:
173 case Intrinsic::arm_neon_vst1x2:
174 case Intrinsic::arm_neon_vst1x3:
175 case Intrinsic::arm_neon_vst1x4: {
176 Align NewAlign =
177 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
179 Align OldAlign = II.getParamAlign(0).valueOrOne();
180 if (NewAlign > OldAlign)
181 II.addParamAttr(0,
182 Attribute::getWithAlignment(II.getContext(), NewAlign));
183 break;
184 }
185
186 case Intrinsic::arm_mve_pred_i2v: {
187 Value *Arg = II.getArgOperand(0);
188 Value *ArgArg;
189 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
190 PatternMatch::m_Value(ArgArg))) &&
191 II.getType() == ArgArg->getType()) {
192 return IC.replaceInstUsesWith(II, ArgArg);
193 }
194 Constant *XorMask;
195 if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
196 PatternMatch::m_Value(ArgArg)),
197 PatternMatch::m_Constant(XorMask))) &&
198 II.getType() == ArgArg->getType()) {
199 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
200 if (CI->getValue().trunc(16).isAllOnes()) {
201 auto TrueVector = IC.Builder.CreateVectorSplat(
202 cast<FixedVectorType>(II.getType())->getNumElements(),
203 IC.Builder.getTrue());
204 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
205 }
206 }
207 }
208 KnownBits ScalarKnown(32);
209 if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
210 ScalarKnown)) {
211 return &II;
212 }
213 break;
214 }
215 case Intrinsic::arm_mve_pred_v2i: {
216 Value *Arg = II.getArgOperand(0);
217 Value *ArgArg;
218 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
219 PatternMatch::m_Value(ArgArg)))) {
220 return IC.replaceInstUsesWith(II, ArgArg);
221 }
222
223 if (II.getMetadata(LLVMContext::MD_range))
224 break;
225
226 ConstantRange Range(APInt(32, 0), APInt(32, 0x10000));
227
228 if (auto CurrentRange = II.getRange()) {
229 Range = Range.intersectWith(*CurrentRange);
230 if (Range == CurrentRange)
231 break;
232 }
233
234 II.addRangeRetAttr(Range);
235 II.addRetAttr(Attribute::NoUndef);
236 return &II;
237 }
238 case Intrinsic::arm_mve_vadc:
239 case Intrinsic::arm_mve_vadc_predicated: {
240 unsigned CarryOp =
241 (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
242 assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
243 "Bad type for intrinsic!");
244
245 KnownBits CarryKnown(32);
246 if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
247 CarryKnown)) {
248 return &II;
249 }
250 break;
251 }
252 case Intrinsic::arm_mve_vmldava: {
253 Instruction *I = cast<Instruction>(&II);
254 if (I->hasOneUse()) {
255 auto *User = cast<Instruction>(*I->user_begin());
256 Value *OpZ;
257 if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
258 match(I->getOperand(3), m_Zero())) {
259 Value *OpX = I->getOperand(4);
260 Value *OpY = I->getOperand(5);
261 Type *OpTy = OpX->getType();
262
264 Value *V =
265 IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
266 {I->getOperand(0), I->getOperand(1),
267 I->getOperand(2), OpZ, OpX, OpY});
268
270 return IC.eraseInstFromFunction(*User);
271 }
272 }
273 return std::nullopt;
274 }
275 }
276 return std::nullopt;
277}
278
280 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
281 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
282 std::function<void(Instruction *, unsigned, APInt, APInt &)>
283 SimplifyAndSetOp) const {
284
285 // Compute the demanded bits for a narrowing MVE intrinsic. The TopOpc is the
286 // opcode specifying a Top/Bottom instruction, which can change between
287 // instructions.
288 auto SimplifyNarrowInstrTopBottom =[&](unsigned TopOpc) {
289 unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
290 unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
291
292 // Only the odd or even lanes of operand 0 will be demanded, depending
293 // on whether this is a top/bottom instruction.
294 APInt DemandedElts =
295 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
296 : APInt::getHighBitsSet(2, 1));
297 SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
298 // The other lanes will be defined from the inserted elements.
299 UndefElts &= APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
300 : APInt::getHighBitsSet(2, 1));
301 return std::nullopt;
302 };
303
304 switch (II.getIntrinsicID()) {
305 default:
306 break;
307 case Intrinsic::arm_mve_vcvt_narrow:
308 SimplifyNarrowInstrTopBottom(2);
309 break;
310 case Intrinsic::arm_mve_vqmovn:
311 SimplifyNarrowInstrTopBottom(4);
312 break;
313 case Intrinsic::arm_mve_vshrn:
314 SimplifyNarrowInstrTopBottom(7);
315 break;
316 }
317
318 return std::nullopt;
319}
320
323 assert(Ty->isIntegerTy());
324
325 unsigned Bits = Ty->getPrimitiveSizeInBits();
326 if (Bits == 0 || Imm.getActiveBits() >= 64)
327 return 4;
328
329 int64_t SImmVal = Imm.getSExtValue();
330 uint64_t ZImmVal = Imm.getZExtValue();
331 if (!ST->isThumb()) {
332 if ((SImmVal >= 0 && SImmVal < 65536) ||
333 (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
334 (ARM_AM::getSOImmVal(~ZImmVal) != -1))
335 return 1;
336 return ST->hasV6T2Ops() ? 2 : 3;
337 }
338 if (ST->isThumb2()) {
339 if ((SImmVal >= 0 && SImmVal < 65536) ||
340 (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
341 (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
342 return 1;
343 return ST->hasV6T2Ops() ? 2 : 3;
344 }
345 // Thumb1, any i8 imm cost 1.
346 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
347 return 1;
348 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
349 return 2;
350 // Load from constantpool.
351 return 3;
352}
353
354// Constants smaller than 256 fit in the immediate field of
355// Thumb1 instructions so we return a zero cost and 1 otherwise.
357 const APInt &Imm, Type *Ty) {
358 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
359 return 0;
360
361 return 1;
362}
363
364// Checks whether Inst is part of a min(max()) or max(min()) pattern
365// that will match to an SSAT instruction. Returns the instruction being
366// saturated, or null if no saturation pattern was found.
367static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
368 Value *LHS, *RHS;
369 ConstantInt *C;
371
372 if (InstSPF == SPF_SMAX &&
374 C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
375
376 auto isSSatMin = [&](Value *MinInst) {
377 if (isa<SelectInst>(MinInst)) {
378 Value *MinLHS, *MinRHS;
379 ConstantInt *MinC;
380 SelectPatternFlavor MinSPF =
381 matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
382 if (MinSPF == SPF_SMIN &&
384 MinC->getValue() == ((-Imm) - 1))
385 return true;
386 }
387 return false;
388 };
389
390 if (isSSatMin(Inst->getOperand(1)))
391 return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
392 if (Inst->hasNUses(2) &&
393 (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
394 return Inst->getOperand(1);
395 }
396 return nullptr;
397}
398
399// Look for a FP Saturation pattern, where the instruction can be simplified to
400// a fptosi.sat. max(min(fptosi)). The constant in this case is always free.
401static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
402 if (Imm.getBitWidth() != 64 ||
403 Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
404 return false;
405 Value *FP = isSSATMinMaxPattern(Inst, Imm);
406 if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
407 FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
408 if (!FP)
409 return false;
410 return isa<FPToSIInst>(FP);
411}
412
414 const APInt &Imm, Type *Ty,
416 Instruction *Inst) {
417 // Division by a constant can be turned into multiplication, but only if we
418 // know it's constant. So it's not so much that the immediate is cheap (it's
419 // not), but that the alternative is worse.
420 // FIXME: this is probably unneeded with GlobalISel.
421 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
422 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
423 Idx == 1)
424 return 0;
425
426 // Leave any gep offsets for the CodeGenPrepare, which will do a better job at
427 // splitting any large offsets.
428 if (Opcode == Instruction::GetElementPtr && Idx != 0)
429 return 0;
430
431 if (Opcode == Instruction::And) {
432 // UXTB/UXTH
433 if (Imm == 255 || Imm == 65535)
434 return 0;
435 // Conversion to BIC is free, and means we can use ~Imm instead.
436 return std::min(getIntImmCost(Imm, Ty, CostKind),
437 getIntImmCost(~Imm, Ty, CostKind));
438 }
439
440 if (Opcode == Instruction::Add)
441 // Conversion to SUB is free, and means we can use -Imm instead.
442 return std::min(getIntImmCost(Imm, Ty, CostKind),
443 getIntImmCost(-Imm, Ty, CostKind));
444
445 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
446 Ty->getIntegerBitWidth() == 32) {
447 int64_t NegImm = -Imm.getSExtValue();
448 if (ST->isThumb2() && NegImm < 1<<12)
449 // icmp X, #-C -> cmn X, #C
450 return 0;
451 if (ST->isThumb() && NegImm < 1<<8)
452 // icmp X, #-C -> adds X, #C
453 return 0;
454 }
455
456 // xor a, -1 can always be folded to MVN
457 if (Opcode == Instruction::Xor && Imm.isAllOnes())
458 return 0;
459
460 // Ensures negative constant of min(max()) or max(min()) patterns that
461 // match to SSAT instructions don't get hoisted
462 if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
463 Ty->getIntegerBitWidth() <= 32) {
464 if (isSSATMinMaxPattern(Inst, Imm) ||
465 (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
466 isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
467 return 0;
468 }
469
470 if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
471 return 0;
472
473 // We can convert <= -1 to < 0, which is generally quite cheap.
474 if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
475 ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
476 if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
477 return std::min(getIntImmCost(Imm, Ty, CostKind),
478 getIntImmCost(Imm + 1, Ty, CostKind));
479 }
480
481 return getIntImmCost(Imm, Ty, CostKind);
482}
483
486 const Instruction *I) {
488 (ST->hasNEON() || ST->hasMVEIntegerOps())) {
489 // FIXME: The vectorizer is highly sensitive to the cost of these
490 // instructions, which suggests that it may be using the costs incorrectly.
491 // But, for now, just make them free to avoid performance regressions for
492 // vector targets.
493 return 0;
494 }
495 return BaseT::getCFInstrCost(Opcode, CostKind, I);
496}
497
499 Type *Src,
502 const Instruction *I) {
503 int ISD = TLI->InstructionOpcodeToISD(Opcode);
504 assert(ISD && "Invalid opcode");
505
506 // TODO: Allow non-throughput costs that aren't binary.
507 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
509 return Cost == 0 ? 0 : 1;
510 return Cost;
511 };
512 auto IsLegalFPType = [this](EVT VT) {
513 EVT EltVT = VT.getScalarType();
514 return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
515 (EltVT == MVT::f64 && ST->hasFP64()) ||
516 (EltVT == MVT::f16 && ST->hasFullFP16());
517 };
518
519 EVT SrcTy = TLI->getValueType(DL, Src);
520 EVT DstTy = TLI->getValueType(DL, Dst);
521
522 if (!SrcTy.isSimple() || !DstTy.isSimple())
523 return AdjustCost(
524 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
525
526 // Extending masked load/Truncating masked stores is expensive because we
527 // currently don't split them. This means that we'll likely end up
528 // loading/storing each element individually (hence the high cost).
529 if ((ST->hasMVEIntegerOps() &&
530 (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
531 Opcode == Instruction::SExt)) ||
532 (ST->hasMVEFloatOps() &&
533 (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
534 IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
535 if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
536 return 2 * DstTy.getVectorNumElements() *
538
539 // The extend of other kinds of load is free
540 if (CCH == TTI::CastContextHint::Normal ||
542 static const TypeConversionCostTblEntry LoadConversionTbl[] = {
543 {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
544 {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
545 {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
546 {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
547 {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
548 {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
549 {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
550 {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
551 {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
552 {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
553 {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
554 {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
555 };
556 if (const auto *Entry = ConvertCostTableLookup(
557 LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
558 return AdjustCost(Entry->Cost);
559
560 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
561 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
562 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
563 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
564 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
565 {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
566 {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
567 // The following extend from a legal type to an illegal type, so need to
568 // split the load. This introduced an extra load operation, but the
569 // extend is still "free".
570 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
571 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
572 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
573 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
574 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
575 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
576 };
577 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
578 if (const auto *Entry =
579 ConvertCostTableLookup(MVELoadConversionTbl, ISD,
580 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
581 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
582 }
583
584 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
585 // FPExtends are similar but also require the VCVT instructions.
586 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
587 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
588 };
589 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
590 if (const auto *Entry =
591 ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
592 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
593 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
594 }
595
596 // The truncate of a store is free. This is the mirror of extends above.
597 static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
598 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
599 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
600 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
601 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
602 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
603 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
604 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
605 };
606 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
607 if (const auto *Entry =
608 ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
609 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
610 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
611 }
612
613 static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
614 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
615 {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
616 };
617 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
618 if (const auto *Entry =
619 ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
620 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
621 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
622 }
623 }
624
625 // NEON vector operations that can extend their inputs.
626 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
627 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
628 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
629 // vaddl
630 { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
631 { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
632 // vsubl
633 { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
634 { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
635 // vmull
636 { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
637 { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
638 // vshll
639 { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
640 { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
641 };
642
643 auto *User = cast<Instruction>(*I->user_begin());
644 int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
645 if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
646 DstTy.getSimpleVT(),
647 SrcTy.getSimpleVT())) {
648 return AdjustCost(Entry->Cost);
649 }
650 }
651
652 // Single to/from double precision conversions.
653 if (Src->isVectorTy() && ST->hasNEON() &&
654 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
655 DstTy.getScalarType() == MVT::f32) ||
656 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
657 DstTy.getScalarType() == MVT::f64))) {
658 static const CostTblEntry NEONFltDblTbl[] = {
659 // Vector fptrunc/fpext conversions.
660 {ISD::FP_ROUND, MVT::v2f64, 2},
661 {ISD::FP_EXTEND, MVT::v2f32, 2},
662 {ISD::FP_EXTEND, MVT::v4f32, 4}};
663
664 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
665 if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
666 return AdjustCost(LT.first * Entry->Cost);
667 }
668
669 // Some arithmetic, load and store operations have specific instructions
670 // to cast up/down their types automatically at no extra cost.
671 // TODO: Get these tables to know at least what the related operations are.
672 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
673 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
674 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
675 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
676 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
677 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
678 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
679
680 // The number of vmovl instructions for the extension.
681 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
682 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
683 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
684 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
685 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
686 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
687 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
688 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
689 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
690 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
691 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
692 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
693 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
694 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
695 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
696 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
697 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
698 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
699
700 // Operations that we legalize using splitting.
701 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
702 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
703
704 // Vector float <-> i32 conversions.
705 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
706 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
707
708 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
709 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
710 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
711 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
712 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
713 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
714 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
715 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
716 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
717 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
718 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
719 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
720 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
721 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
722 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
723 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
724 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
725 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
726 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
727 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
728
729 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
730 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
731 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
732 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
733 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
734 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
735
736 // Vector double <-> i32 conversions.
737 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
738 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
739
740 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
741 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
742 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
743 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
744 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
745 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
746
747 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
748 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
749 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
750 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
751 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
752 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
753 };
754
755 if (SrcTy.isVector() && ST->hasNEON()) {
756 if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
757 DstTy.getSimpleVT(),
758 SrcTy.getSimpleVT()))
759 return AdjustCost(Entry->Cost);
760 }
761
762 // Scalar float to integer conversions.
763 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
764 { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
765 { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
766 { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
767 { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
768 { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
769 { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
770 { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
771 { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
772 { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
773 { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
774 { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
775 { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
776 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
777 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
778 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
779 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
780 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
781 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
782 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
783 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
784 };
785 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
786 if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
787 DstTy.getSimpleVT(),
788 SrcTy.getSimpleVT()))
789 return AdjustCost(Entry->Cost);
790 }
791
792 // Scalar integer to float conversions.
793 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
794 { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
795 { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
796 { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
797 { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
798 { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
799 { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
800 { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
801 { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
802 { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
803 { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
804 { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
805 { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
806 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
807 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
808 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
809 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
810 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
811 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
812 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
813 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
814 };
815
816 if (SrcTy.isInteger() && ST->hasNEON()) {
817 if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
818 ISD, DstTy.getSimpleVT(),
819 SrcTy.getSimpleVT()))
820 return AdjustCost(Entry->Cost);
821 }
822
823 // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
824 // instruction, i8->i32 is two. i64 zexts are a VAND with a constant, sext
825 // are linearised so take more.
826 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
827 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
828 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
829 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
830 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
831 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
832 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
833 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
834 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
835 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
836 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
837 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
838 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
839 };
840
841 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
842 if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
843 ISD, DstTy.getSimpleVT(),
844 SrcTy.getSimpleVT()))
845 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
846 }
847
848 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
849 // As a general rule, fp converts that were not matched above are scalarized
850 // and cost 1 vcvt for each lane, so long as the instruction is available.
851 // If not it will become a series of function calls.
852 const InstructionCost CallCost =
853 getCallInstrCost(nullptr, Dst, {Src}, CostKind);
854 int Lanes = 1;
855 if (SrcTy.isFixedLengthVector())
856 Lanes = SrcTy.getVectorNumElements();
857
858 if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
859 return Lanes;
860 else
861 return Lanes * CallCost;
862 }
863
864 if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
865 SrcTy.isFixedLengthVector()) {
866 // Treat a truncate with larger than legal source (128bits for MVE) as
867 // expensive, 2 instructions per lane.
868 if ((SrcTy.getScalarType() == MVT::i8 ||
869 SrcTy.getScalarType() == MVT::i16 ||
870 SrcTy.getScalarType() == MVT::i32) &&
871 SrcTy.getSizeInBits() > 128 &&
872 SrcTy.getSizeInBits() > DstTy.getSizeInBits())
873 return SrcTy.getVectorNumElements() * 2;
874 }
875
876 // Scalar integer conversion costs.
877 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
878 // i16 -> i64 requires two dependent operations.
879 { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
880
881 // Truncates on i64 are assumed to be free.
882 { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
883 { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
884 { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
885 { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
886 };
887
888 if (SrcTy.isInteger()) {
889 if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
890 DstTy.getSimpleVT(),
891 SrcTy.getSimpleVT()))
892 return AdjustCost(Entry->Cost);
893 }
894
895 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
897 : 1;
898 return AdjustCost(
899 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
900}
901
/// Cost of a vector lane insert or extract.
///
/// Checked in order:
///  - InsertElement into a vector with <= 32-bit elements on a subtarget with
///    slow D-subregister loads costs 3.
///  - With NEON, insert/extract on an integer-element vector costs 3; other
///    vectors with <= 32-bit elements cost max(BaseT cost, 2).
///  - With MVE integer ops, insert/extract costs LT.first, times 4 when the
///    element type is an integer.
///  - Anything else defers to BaseT::getVectorInstrCost.
// NOTE(review): listing lines 902-903 (return type, function name and the
// leading parameters Opcode/ValTy/CostKind used below) are absent from this
// extract; the name is presumed from the BaseT fallback - confirm.
904 unsigned Index, Value *Op0,
905 Value *Op1) {
906 // Penalize inserting into a D-subregister. We end up with a three times
907 // lower estimated throughput on swift.
908 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
909 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
910 return 3;
911
912 if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
913 Opcode == Instruction::ExtractElement)) {
914 // Cross-class copies are expensive on many microarchitectures,
915 // so assume they are expensive by default.
916 if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
917 return 3;
918
919 // Even if it's not a cross class copy, this likely leads to mixing
920 // of NEON and VFP code and should be therefore penalized.
921 if (ValTy->isVectorTy() &&
922 ValTy->getScalarSizeInBits() <= 32)
923 return std::max<InstructionCost>(
924 BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1),
925 2U);
926 }
927
928 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
929 Opcode == Instruction::ExtractElement)) {
930 // Integer cross-lane moves are more expensive than float, which can
931 // sometimes just be vmovs. Integers involve being passed to GPR registers,
932 // causing more of a delay.
933 std::pair<InstructionCost, MVT> LT =
// NOTE(review): listing line 934 (the initializer of LT) is absent from this
// extract.
935 return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
936 }
937
938 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
939}
940
/// Cost of a compare or select instruction.
///
/// Special cases, checked in order before the generic fallback:
///  - Thumb scalar selects when costing for code size.
///  - Vector compares/selects that form an abs/min/max pattern: the select is
///    costed as the matching intrinsic and the compare feeding it is free.
///  - NEON vector selects: table lookup, otherwise the legalization cost.
///  - MVE vector compares, including scalarized fcmp when MVE float ops are
///    unavailable.
/// The fallback is BaseT::getCmpSelInstrCost scaled by the MVE vector cost
/// factor for vector types on MVE targets (factor 1 otherwise).
// NOTE(review): listing line 941 (return type and function name) is absent
// from this extract; the name is presumed from the BaseT fallback - confirm.
942 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
// NOTE(review): listing line 943 is absent; it presumably declares the
// CostKind and Op1Info parameters used below.
944 TTI::OperandValueInfo Op2Info, const Instruction *I) {
945 int ISD = TLI->InstructionOpcodeToISD(Opcode);
946
947 // Thumb scalar code size cost for select.
948 if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
949 ST->isThumb() && !ValTy->isVectorTy()) {
950 // Assume expensive structs.
951 if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
952 return TTI::TCC_Expensive;
953
954 // Select costs can vary because they:
955 // - may require one or more conditional mov (including an IT),
956 // - can't operate directly on immediates,
957 // - require live flags, which we can't copy around easily.
// NOTE(review): listing line 958 is absent; it presumably declares the Cost
// variable incremented below.
959
960 // Possible IT instruction for Thumb2, or more for Thumb1.
961 ++Cost;
962
963 // i1 values may need rematerialising by using mov immediates and/or
964 // flag setting instructions.
965 if (ValTy->isIntegerTy(1))
966 ++Cost;
967
968 return Cost;
969 }
970
971 // If this is a vector min/max/abs, use the cost of that intrinsic directly
972 // instead. Hopefully when min/max intrinsics are more prevalent this code
973 // will not be needed.
974 const Instruction *Sel = I;
// For a single-use compare, inspect its sole user instead of the compare.
975 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
976 Sel->hasOneUse())
977 Sel = cast<Instruction>(Sel->user_back());
978 if (Sel && ValTy->isVectorTy() &&
979 (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
980 const Value *LHS, *RHS;
// NOTE(review): listing line 981 is absent; it presumably computes SPF from
// Sel, LHS and RHS.
982 unsigned IID = 0;
983 switch (SPF) {
984 case SPF_ABS:
985 IID = Intrinsic::abs;
986 break;
987 case SPF_SMIN:
988 IID = Intrinsic::smin;
989 break;
990 case SPF_SMAX:
991 IID = Intrinsic::smax;
992 break;
993 case SPF_UMIN:
994 IID = Intrinsic::umin;
995 break;
996 case SPF_UMAX:
997 IID = Intrinsic::umax;
998 break;
999 case SPF_FMINNUM:
1000 IID = Intrinsic::minnum;
1001 break;
1002 case SPF_FMAXNUM:
1003 IID = Intrinsic::maxnum;
1004 break;
1005 default:
1006 break;
1007 }
1008 if (IID) {
1009 // The ICmp is free, the select gets the cost of the min/max/etc.
// Sel != I means I is the compare and Sel is the user that consumes it.
1010 if (Sel != I)
1011 return 0;
1012 IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
1013 return getIntrinsicInstrCost(CostAttrs, CostKind);
1014 }
1015 }
1016
1017 // On NEON a vector select gets lowered to vbsl.
1018 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
1019 // Lowering of some vector selects is currently far from perfect.
1020 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
1021 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
1022 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
1023 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
1024 };
1025
1026 EVT SelCondTy = TLI->getValueType(DL, CondTy);
1027 EVT SelValTy = TLI->getValueType(DL, ValTy);
1028 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1029 if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
1030 SelCondTy.getSimpleVT(),
1031 SelValTy.getSimpleVT()))
1032 return Entry->Cost;
1033 }
1034
1035 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1036 return LT.first;
1037 }
1038
1039 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1040 (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1041 cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
1042 FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
1043 FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
// No usable condition type supplied: derive the compare result type.
1044 if (!VecCondTy)
1045 VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
1046
1047 // If we don't have mve.fp any fp operations will need to be scalarized.
1048 if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1049 // One scalarization insert, one scalarization extract and the cost of the
1050 // fcmps.
1051 return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
1052 /*Extract*/ true, CostKind) +
1053 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1054 /*Extract*/ false, CostKind) +
1055 VecValTy->getNumElements() *
1056 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1057 VecCondTy->getScalarType(), VecPred,
1058 CostKind, Op1Info, Op2Info, I);
1059 }
1060
1061 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1062 int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1063 // There are two types - the input that specifies the type of the compare
1064 // and the output vXi1 type. Because we don't know how the output will be
1065 // split, we may need an expensive shuffle to get two in sync. This has the
1066 // effect of making larger than legal compares (v8i32 for example)
1067 // expensive.
1068 if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1069 if (LT.first > 1)
1070 return LT.first * BaseCost +
1071 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1072 /*Extract*/ false, CostKind);
1073 return BaseCost;
1074 }
1075 }
1076
1077 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1078 // for "multiple beats" potentially needed by MVE instructions.
1079 int BaseCost = 1;
1080 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1081 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1082
1083 return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred,
1084 CostKind, Op1Info, Op2Info, I);
1085}
1086
/// Cost of computing the address for a memory access of type Ty.
///
/// With NEON: returns NumVectorInstToHideOverhead (10) for a vector type when
/// SE is available and BaseT::isConstantStridedAccessLessThan(SE, Ptr,
/// MaxMergeDistance + 1) is false; returns 1 in every other NEON case.
/// Without NEON the query is forwarded to BaseT::getAddressComputationCost.
// NOTE(review): listing line 1087 (return type, function name and the Ty
// parameter) is absent from this extract; the name is presumed from the BaseT
// fallback - confirm.
1088 ScalarEvolution *SE,
1089 const SCEV *Ptr) {
1090 // Address computations in vectorized code with non-consecutive addresses will
1091 // likely result in more instructions compared to scalar code where the
1092 // computation can more often be merged into the index mode. The resulting
1093 // extra micro-ops can significantly decrease throughput.
1094 unsigned NumVectorInstToHideOverhead = 10;
1095 int MaxMergeDistance = 64;
1096
1097 if (ST->hasNEON()) {
1098 if (Ty->isVectorTy() && SE &&
1099 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1100 return NumVectorInstToHideOverhead;
1101
1102 // In many cases the address computation is not merged into the instruction
1103 // addressing mode.
1104 return 1;
1105 }
1106 return BaseT::getAddressComputationCost(Ty, SE, Ptr);
1107}
1108
/// Returns true when I is a call to one of the MVE VCTP intrinsics
/// (vctp8/16/32/64) and false for everything else.
// NOTE(review): listing line 1109 (the whole signature) is absent from this
// extract. Judging by the comment below this is presumably the hook LSR uses
// to ask whether an instruction is a profitable chain element - confirm
// against the class declaration.
1110 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1111 // If a VCTP is part of a chain, it's already profitable and shouldn't be
1112 // optimized, else LSR may block tail-predication.
1113 switch (II->getIntrinsicID()) {
1114 case Intrinsic::arm_mve_vctp8:
1115 case Intrinsic::arm_mve_vctp16:
1116 case Intrinsic::arm_mve_vctp32:
1117 case Intrinsic::arm_mve_vctp64:
1118 return true;
1119 default:
1120 break;
1121 }
1122 }
1123 return false;
1124}
1125
/// Returns true if a masked load of \p DataTy at \p Alignment is legal.
///
/// Always false when EnableMaskedLoadStores is off or the subtarget lacks MVE
/// integer ops. Fixed vectors are additionally rejected when they have exactly
/// two elements, or when they have floating-point elements and a total width
/// other than 128 bits. Otherwise the answer depends only on the scalar width:
/// 8-bit elements are always accepted, 16-bit elements need alignment >= 2 and
/// 32-bit elements need alignment >= 4; every other width is rejected.
1126bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
1127 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1128 return false;
1129
1130 if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1131 // Don't support v2i1 yet.
1132 if (VecTy->getNumElements() == 2)
1133 return false;
1134
1135 // We don't support extending fp types.
1136 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1137 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1138 return false;
1139 }
1140
// Element width and alignment must pair up: i32/4, i16/2, i8/any.
1141 unsigned EltWidth = DataTy->getScalarSizeInBits();
1142 return (EltWidth == 32 && Alignment >= 4) ||
1143 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1144}
1145
/// Returns true when EnableMaskedGatherScatters is on, the subtarget has MVE
/// integer ops, and the scalar width of Ty is 8 bits, 16 bits with
/// Alignment >= 2, or 32 bits with Alignment >= 4.
// NOTE(review): listing line 1146 (the whole signature) is absent from this
// extract; this is presumably the masked-gather legality query taking
// (Type *Ty, Align Alignment) - confirm against the class declaration.
1147 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1148 return false;
1149
1150 unsigned EltWidth = Ty->getScalarSizeInBits();
1151 return ((EltWidth == 32 && Alignment >= 4) ||
1152 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1153}
1154
1155/// Given a memcpy/memset/memmove instruction, return the number of memory
1156/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1157/// call is used.
///
/// -1 is returned when the length operand is not a ConstantInt, or when
/// findOptimalMemOpLowering fails within the per-intrinsic store limit.
/// On success the result is the number of lowered memory operations times
/// Factor, which is 2 for memcpy/memmove and 1 for memset.
// NOTE(review): listing line 1158 (the signature) is absent from this extract;
// the name getNumMemOps is taken from the call that follows this function.
1159 MemOp MOp;
1160 unsigned DstAddrSpace = ~0u;
1161 unsigned SrcAddrSpace = ~0u;
1162 const Function *F = I->getParent()->getParent();
1163
1164 if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1165 ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1166 // If 'size' is not a constant, a library call will be generated.
1167 if (!C)
1168 return -1;
1169
1170 const unsigned Size = C->getValue().getZExtValue();
// NOTE(review): both alignments are dereferenced unconditionally, so they are
// assumed to be set on the intrinsic - confirm.
1171 const Align DstAlign = *MC->getDestAlign();
1172 const Align SrcAlign = *MC->getSourceAlign();
1173
1174 MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1175 /*IsVolatile*/ false);
1176 DstAddrSpace = MC->getDestAddressSpace();
1177 SrcAddrSpace = MC->getSourceAddressSpace();
1178 }
1179 else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1180 ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1181 // If 'size' is not a constant, a library call will be generated.
1182 if (!C)
1183 return -1;
1184
1185 const unsigned Size = C->getValue().getZExtValue();
1186 const Align DstAlign = *MS->getDestAlign();
1187
1188 MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1189 /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1190 DstAddrSpace = MS->getDestAddressSpace();
1191 }
1192 else
1193 llvm_unreachable("Expected a memcpy/move or memset!");
1194
// Limit is the target's store budget for this intrinsic; the function's
// minsize setting is passed along to that query.
1195 unsigned Limit, Factor = 2;
1196 switch(I->getIntrinsicID()) {
1197 case Intrinsic::memcpy:
1198 Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1199 break;
1200 case Intrinsic::memmove:
1201 Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1202 break;
1203 case Intrinsic::memset:
1204 Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1205 Factor = 1;
1206 break;
1207 default:
1208 llvm_unreachable("Expected a memcpy/move or memset!");
1209 }
1210
1211 // MemOps will be populated with a list of data types that need to be
1212 // loaded and stored. That's why we multiply the number of elements by 2 to
1213 // get the cost for this memcpy (memset only stores, so its Factor is 1).
1214 std::vector<EVT> MemOps;
1215 if (getTLI()->findOptimalMemOpLowering(
1216 MemOps, Limit, MOp, DstAddrSpace,
1217 SrcAddrSpace, F->getAttributes()))
1218 return MemOps.size() * Factor;
1219
1220 // If we can't find an optimal memop lowering, return the default cost
1221 return -1;
1222}
1223
/// Cost of the memory intrinsic I: the operation count reported by
/// getNumMemOps, or 4 when getNumMemOps returns -1.
// NOTE(review): listing line 1224 (the whole signature) is absent from this
// extract; this is presumably the memcpy cost hook taking an Instruction
// pointer, which is cast to IntrinsicInst below - confirm.
1225 int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1226
1227 // To model the cost of a library call, we assume 1 for the call, and
1228 // 3 for the argument setup.
1229 if (NumOps == -1)
1230 return 4;
1231 return NumOps;
1232}
1233
/// Cost of a vector shuffle of kind Kind on type Tp.
///
/// The kind is first refined with improveShuffleKindFromMask. Then:
///  - With NEON, broadcast, reverse and select shuffles are looked up by
///    legalized type in the tables below and scaled by LT.first.
///  - With MVE integer ops, broadcasts use MVEDupTbl, and any mask that
///    isVREVMask accepts for a 16-, 32- or 64-bit block size costs the MVE
///    vector cost factor times LT.first.
///  - Everything else defers to BaseT::getShuffleCost, scaled by BaseCost.
// NOTE(review): listing lines 1234, 1236 and 1238 (return type, function name
// and the Kind/CostKind parameters, among others) are absent from this
// extract; the name is presumed from the BaseT fallback - confirm.
1235 VectorType *Tp, ArrayRef<int> Mask,
1237 int Index, VectorType *SubTp,
1239 const Instruction *CxtI) {
1240 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
1241 // Treat extractsubvector as single op permutation.
1242 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
1243 if (IsExtractSubvector)
// NOTE(review): listing line 1244 (the statement controlled by the `if`
// above) is absent from this extract, so the NEON block below only appears to
// be guarded by IsExtractSubvector. Presumably Kind is reassigned here.
1245 if (ST->hasNEON()) {
1246 if (Kind == TTI::SK_Broadcast) {
1247 static const CostTblEntry NEONDupTbl[] = {
1248 // VDUP handles these cases.
1249 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1250 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1251 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1252 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1253 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1254 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1255
1256 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1257 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1258 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1259 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
1260
1261 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1262 if (const auto *Entry =
1263 CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1264 return LT.first * Entry->Cost;
1265 }
1266 if (Kind == TTI::SK_Reverse) {
1267 static const CostTblEntry NEONShuffleTbl[] = {
1268 // Reverse shuffle cost one instruction if we are shuffling within a
1269 // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1270 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1271 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1272 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1273 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1274 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1275 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1276
1277 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1278 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1279 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
1280 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
1281
1282 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1283 if (const auto *Entry =
1284 CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1285 return LT.first * Entry->Cost;
1286 }
1287 if (Kind == TTI::SK_Select) {
1288 static const CostTblEntry NEONSelShuffleTbl[] = {
1289 // Select shuffle cost table for ARM. Cost is the number of
1290 // instructions
1291 // required to create the shuffled vector.
1292
1293 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1294 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1295 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1296 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1297
1298 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1299 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1300 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
1301
1302 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
1303
1304 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
1305
1306 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1307 if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1308 ISD::VECTOR_SHUFFLE, LT.second))
1309 return LT.first * Entry->Cost;
1310 }
1311 }
1312 if (ST->hasMVEIntegerOps()) {
1313 if (Kind == TTI::SK_Broadcast) {
1314 static const CostTblEntry MVEDupTbl[] = {
1315 // VDUP handles these cases.
1316 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1317 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1318 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
1319 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1320 {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
1321
1322 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
1323 if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1324 LT.second))
1325 return LT.first * Entry->Cost *
// NOTE(review): listing line 1326 (the final factor of this product) is
// absent from this extract.
1327 }
1328
1329 if (!Mask.empty()) {
1330 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
// A mask no longer than the legal vector that matches a VREV pattern is
// costed as one MVE vector op per legalized part.
1331 if (LT.second.isVector() &&
1332 Mask.size() <= LT.second.getVectorNumElements() &&
1333 (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1334 isVREVMask(Mask, LT.second, 64)))
1335 return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
1336 }
1337 }
1338
1339 // Restore optimal kind.
1340 if (IsExtractSubvector)
// NOTE(review): listing line 1341 (the statement controlled by the `if`
// above) is absent from this extract.
1342 int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
// NOTE(review): listing line 1343 (the true arm of this conditional) is
// absent from this extract.
1344 : 1;
1345 return BaseCost *
1346 BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
1347}
1348
/// Cost of an arithmetic/logical instruction with opcode Opcode on type Ty.
///
/// Checked in order:
///  - Thumb, code-size costing, i1 operands: AND/XOR cost 2, OR costs 3.
///  - NEON: vector div/rem come from CostTbl (scaled by LT.first); anything
///    else uses Cost, with 4 added for v2i64 when the second operand is a
///    uniform constant.
///  - A scalar shift that LooksLikeAFreeShift (see the lambda) costs 0.
///  - Operations that are legal, custom or promoted for the legalized type
///    cost LT.first * BaseCost.
///  - Remaining fixed-vector operations are costed as scalarized.
///  - Everything else costs BaseCost.
// NOTE(review): listing lines 1349 and 1351-1352 (return type, function name
// and the Op1Info/Op2Info/Args parameters used below) are absent from this
// extract; the name is presumed from the comment about
// BaseT::getArithmeticInstrCost further down - confirm.
1350 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1353 const Instruction *CxtI) {
1354 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1355 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1356 // Make operations on i1 relatively expensive as this often involves
1357 // combining predicates. AND and XOR should be easier to handle with IT
1358 // blocks.
1359 switch (ISDOpcode) {
1360 default:
1361 break;
1362 case ISD::AND:
1363 case ISD::XOR:
1364 return 2;
1365 case ISD::OR:
1366 return 3;
1367 }
1368 }
1369
1370 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1371
1372 if (ST->hasNEON()) {
1373 const unsigned FunctionCallDivCost = 20;
1374 const unsigned ReciprocalDivCost = 10;
1375 static const CostTblEntry CostTbl[] = {
1376 // Division.
1377 // These costs are somewhat random. Choose a cost of 20 to indicate that
1378 // vectorizing division (added function call) is going to be very expensive.
1379 // Double registers types.
1380 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1381 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1382 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1383 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1384 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1385 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1386 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1387 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1388 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1389 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1390 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1391 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1392 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1393 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1394 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1395 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1396 // Quad register types.
1397 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1398 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1399 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1400 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1401 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1402 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1403 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1404 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1405 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1406 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1407 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1408 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1409 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1410 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1411 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1412 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1413 // Multiplication.
1414 };
1415
1416 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1417 return LT.first * Entry->Cost;
1418
// NOTE(review): listing line 1419 is absent from this extract; it presumably
// declares Cost and opens the call whose arguments continue on the next line.
1420 Opcode, Ty, CostKind, Op1Info, Op2Info);
1421
1422 // This is somewhat of a hack. The problem that we are facing is that SROA
1423 // creates a sequence of shift, and, or instructions to construct values.
1424 // These sequences are recognized by the ISel and have zero-cost. Not so for
1425 // the vectorized code. Because we have support for v2i64 but not i64 those
1426 // sequences look particularly beneficial to vectorize.
1427 // To work around this we increase the cost of v2i64 operations to make them
1428 // seem less beneficial.
1429 if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
1430 Cost += 4;
1431
1432 return Cost;
1433 }
1434
1435 // If this operation is a shift on arm/thumb2, it might well be folded into
1436 // the following instruction, hence having a cost of 0.
1437 auto LooksLikeAFreeShift = [&]() {
1438 if (ST->isThumb1Only() || Ty->isVectorTy())
1439 return false;
1440
// Needs a context instruction that is a shift with exactly one use and a
// uniform constant second operand.
1441 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1442 return false;
1443 if (!Op2Info.isUniform() || !Op2Info.isConstant())
1444 return false;
1445
1446 // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1447 switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1448 case Instruction::Add:
1449 case Instruction::Sub:
1450 case Instruction::And:
1451 case Instruction::Xor:
1452 case Instruction::Or:
1453 case Instruction::ICmp:
1454 return true;
1455 default:
1456 return false;
1457 }
1458 };
1459 if (LooksLikeAFreeShift())
1460 return 0;
1461
1462 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1463 // for "multiple beats" potentially needed by MVE instructions.
1464 int BaseCost = 1;
1465 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1466 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1467
1468 // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
1469 // without treating floats as more expensive than scalars or increasing the
1470 // costs for custom operations. The result is also multiplied by the
1471 // MVEVectorCostFactor where appropriate.
1472 if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1473 return LT.first * BaseCost;
1474
1475 // Else this is expand, assume that we need to scalarize this op.
1476 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1477 unsigned Num = VTy->getNumElements();
// NOTE(review): listing lines 1478-1479 are absent from this extract; they
// presumably define the per-element Cost used below.
1480 // Return the cost of multiple scalar invocation plus the cost of
1481 // inserting and extracting the values.
1482 SmallVector<Type *> Tys(Args.size(), Ty);
1483 return BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind) +
1484 Num * Cost;
1485 }
1486
1487 return BaseCost;
1488}
1489
/// Cost of a load or store of type Src.
///
/// Special cases before the generic fallback:
///  - An early `return 1` whose guarding condition is not visible here.
///  - Types that map to MVT::Other are forwarded straight to BaseT.
///  - NEON vectors of double with a known alignment other than 16 cost
///    4 * LT.first.
///  - With MVE float ops, a 4 x half load whose single user is an fpext to
///    float, or a 4 x half store fed by an fptrunc from float, costs one MVE
///    vector op.
/// The fallback is BaseT::getMemoryOpCost scaled by BaseCost.
// NOTE(review): listing lines 1490 and 1493 (return type, function name and
// the Opcode/Src/CostKind parameters used below) are absent from this extract;
// the name is presumed from the BaseT fallback - confirm.
1491 MaybeAlign Alignment,
1492 unsigned AddressSpace,
1494 TTI::OperandValueInfo OpInfo,
1495 const Instruction *I) {
1496 // TODO: Handle other cost kinds.
// NOTE(review): listing line 1497 (the condition guarding this early return)
// is absent from this extract.
1498 return 1;
1499
1500 // Type legalization can't handle structs
1501 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1502 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1503 CostKind);
1504
1505 if (ST->hasNEON() && Src->isVectorTy() &&
1506 (Alignment && *Alignment != Align(16)) &&
1507 cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1508 // Unaligned loads/stores are extremely inefficient.
1509 // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
1510 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1511 return LT.first * 4;
1512 }
1513
1514 // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1515 // Same for stores.
1516 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1517 ((Opcode == Instruction::Load && I->hasOneUse() &&
1518 isa<FPExtInst>(*I->user_begin())) ||
1519 (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1520 FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
// DstTy is the wide type on the far side of the conversion: the fpext result
// for a load, the fptrunc input for a store.
1521 Type *DstTy =
1522 Opcode == Instruction::Load
1523 ? (*I->user_begin())->getType()
1524 : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1525 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1526 DstTy->getScalarType()->isFloatTy())
1527 return ST->getMVEVectorCostFactor(CostKind);
1528 }
1529
1530 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
// NOTE(review): listing line 1531 (the true arm of this conditional) is
// absent from this extract.
1532 : 1;
1533 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1534 CostKind, OpInfo, I);
1535}
1536
/// Cost of a masked load or store of type Src.
///
/// On MVE, a legal masked load or store (per isLegalMaskedLoad /
/// isLegalMaskedStore) costs one MVE vector op. Types that are not fixed
/// vectors defer to BaseT::getMaskedMemoryOpCost. Any other fixed vector is
/// costed at 8 per element.
// NOTE(review): listing line 1537 (presumably the return type) and line 1540
// (presumably the CostKind parameter and the opening brace) are absent from
// this extract.
1538ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1539 unsigned AddressSpace,
1541 if (ST->hasMVEIntegerOps()) {
1542 if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1543 return ST->getMVEVectorCostFactor(CostKind);
1544 if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1545 return ST->getMVEVectorCostFactor(CostKind);
1546 }
1547 if (!isa<FixedVectorType>(Src))
1548 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1549 CostKind);
1550 // Scalar cost, which is currently very high due to the poor efficiency of
1551 // the generated code.
1552 return cast<FixedVectorType>(Src)->getNumElements() * 8;
1553}
1554
/// Cost of an interleaved load/store group of Factor members over VecTy.
///
/// When the factor is supported, elements are not 64 bits wide and no masking
/// is requested:
///  - a legal interleaved access type costs
///    Factor * BaseCost * (number of interleaved accesses);
///  - on MVE, a factor-2 integer access whose sub-vector has more than two
///    elements and is at most 64 bits wide costs 2 * BaseCost.
/// Everything else defers to BaseT::getInterleavedMemoryOpCost.
// NOTE(review): listing line 1555 (return type and function name) is absent
// from this extract; the name is presumed from the BaseT fallback - confirm.
1556 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1557 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1558 bool UseMaskForCond, bool UseMaskForGaps) {
1559 assert(Factor >= 2 && "Invalid interleave factor");
1560 assert(isa<VectorType>(VecTy) && "Expect a vector type");
1561
1562 // vldN/vstN doesn't support vector types of i64/f64 element.
1563 bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1564
1565 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1566 !UseMaskForCond && !UseMaskForGaps) {
1567 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
// SubVecTy is the type of one member of the interleave group.
1568 auto *SubVecTy =
1569 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1570
1571 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1572 // Accesses having vector types that are a multiple of 128 bits can be
1573 // matched to more than one vldN/vstN instruction.
1574 int BaseCost =
1575 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1576 if (NumElts % Factor == 0 &&
1577 TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1578 return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1579
1580 // Some smaller than legal interleaved patterns are cheap as we can make
1581 // use of the vmovn or vrev patterns to interleave a standard load. This is
1582 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1583 // promoted differently). The cost of 2 here is then a load and vrev or
1584 // vmovn.
1585 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1586 VecTy->isIntOrIntVectorTy() &&
1587 DL.getTypeSizeInBits(SubVecTy).getFixedValue() <= 64)
1588 return 2 * BaseCost;
1589 }
1590
1591 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1592 Alignment, AddressSpace, CostKind,
1593 UseMaskForCond, UseMaskForGaps);
1594}
1595
/// Cost of a gather or scatter of DataTy through the pointer value Ptr.
///
/// Without MVE integer ops, or with EnableMaskedGatherScatters off, this
/// defers to BaseT::getGatherScatterOpCost. Otherwise the result is one of two
/// precomputed values:
///  - VectorCost when the access can stay a vector gather/scatter: 128 bits
///    in total after accounting for a single extending user or a truncated
///    stored value, at least four elements, and either 32-bit (extended)
///    elements or 8/16-bit elements addressed by a two-operand GEP with a
///    matching scale and a zero-extended index of small enough type;
///  - ScalarCost in every other case.
// NOTE(review): listing line 1596 (return type and function name) is absent
// from this extract; the name is presumed from the BaseT fallback - confirm.
1597 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1598 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
1599 using namespace PatternMatch;
1600 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1601 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1602 Alignment, CostKind, I);
1603
1604 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1605 auto *VTy = cast<FixedVectorType>(DataTy);
1606
1607 // TODO: Splitting, once we do that.
1608
1609 unsigned NumElems = VTy->getNumElements();
1610 unsigned EltSize = VTy->getScalarSizeInBits();
1611 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(DataTy);
1612
1613 // For now, it is assumed that for the MVE gather instructions the loads are
1614 // all effectively serialised. This means the cost is the scalar cost
1615 // multiplied by the number of elements being loaded. This is possibly very
1616 // conservative, but even so we still end up vectorising loops because the
1617 // cost per iteration for many loops is lower than for scalar loops.
1618 InstructionCost VectorCost =
1619 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1620 // The scalarization cost should be a lot higher. We use the number of vector
1621 // elements plus the scalarization overhead. If masking is required then a lot
1622 // of little blocks will be needed and potentially a scalarized p0 mask,
1623 // greatly increasing the cost.
1624 InstructionCost ScalarCost =
1625 NumElems * LT.first + (VariableMask ? NumElems * 5 : 0) +
1626 BaseT::getScalarizationOverhead(VTy, /*Insert*/ true, /*Extract*/ false,
1627 CostKind) +
1628 BaseT::getScalarizationOverhead(VTy, /*Insert*/ false, /*Extract*/ true,
1629 CostKind);
1630
// Sub-byte elements, or an alignment smaller than the element size in bytes,
// always take the scalar cost.
1631 if (EltSize < 8 || Alignment < EltSize / 8)
1632 return ScalarCost;
1633
1634 unsigned ExtSize = EltSize;
1635 // Check whether there's a single user that asks for an extended type
1636 if (I != nullptr) {
1637 // Depending on the caller of this function, a gather instruction will
1638 // either have opcode Instruction::Load or be a call to the masked_gather
1639 // intrinsic
1640 if ((I->getOpcode() == Instruction::Load ||
1641 match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1642 I->hasOneUse()) {
1643 const User *Us = *I->users().begin();
1644 if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1645 // only allow valid type combinations
1646 unsigned TypeSize =
1647 cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1648 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1649 (TypeSize == 16 && EltSize == 8)) &&
1650 TypeSize * NumElems == 128) {
1651 ExtSize = TypeSize;
1652 }
1653 }
1654 }
1655 // Check whether the input data needs to be truncated
1656 TruncInst *T;
1657 if ((I->getOpcode() == Instruction::Store ||
1658 match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1659 (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1660 // Only allow valid type combinations
1661 unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1662 if (((EltSize == 16 && TypeSize == 32) ||
1663 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1664 TypeSize * NumElems == 128)
1665 ExtSize = TypeSize;
1666 }
1667 }
1668
// The (possibly extended) vector must be exactly 128 bits wide and have at
// least four elements.
1669 if (ExtSize * NumElems != 128 || NumElems < 4)
1670 return ScalarCost;
1671
1672 // Any (aligned) i32 gather will not need to be scalarised.
1673 if (ExtSize == 32)
1674 return VectorCost;
1675 // For smaller types, we need to ensure that the gep's inputs are correctly
1676 // extended from a small enough value. Other sizes (including i64) are
1677 // scalarized for now.
1678 if (ExtSize != 8 && ExtSize != 16)
1679 return ScalarCost;
1680
// Look through a bitcast of the pointer before checking for a GEP.
1681 if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1682 Ptr = BC->getOperand(0);
1683 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1684 if (GEP->getNumOperands() != 2)
1685 return ScalarCost;
1686 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1687 // Scale needs to be correct (which is only relevant for i16s).
1688 if (Scale != 1 && Scale * 8 != ExtSize)
1689 return ScalarCost;
1690 // And we need to zext (not sext) the indexes from a small enough type.
1691 if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1692 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1693 return VectorCost;
1694 }
1695 return ScalarCost;
1696 }
1697 return ScalarCost;
1698}
1699
/// Cost of reducing the vector ValTy to a scalar with opcode Opcode.
///
/// Three paths are visible:
///  - FADD/FMUL with an element size the FP unit supports: vector halving
///    steps while the vector is wider than VecLimit (skipped for ordered
///    reductions), then per-element scalar work, plus lane extracts for f16.
///  - AND/OR/XOR on 8/16/32/64-bit elements: halving steps as above, an
///    optional MVE VREV step for i8/i16, then NumElts extracts and
///    NumElts - 1 scalar ops.
///  - ADD on MVE: table lookup on the legalized type, scaled by the MVE
///    vector cost factor and LT.first.
/// Anything else defers to BaseT::getArithmeticReductionCost.
// NOTE(review): listing lines 1700-1701 and 1703 (return type, function name
// and the Opcode/ValTy/CostKind parameters used below) are absent from this
// extract; the name is presumed from the BaseT fallback - confirm.
1702 std::optional<FastMathFlags> FMF,
1704
1705 EVT ValVT = TLI->getValueType(DL, ValTy);
1706 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1707 unsigned EltSize = ValVT.getScalarSizeInBits();
1708
1709 // In general floating point reductions are a series of elementwise
1710 // operations, with free extracts on each step. These are either in-order or
1711 // treewise depending on whether that is allowed by the fast math flags.
1712 if ((ISD == ISD::FADD || ISD == ISD::FMUL) &&
1713 ((EltSize == 32 && ST->hasVFP2Base()) ||
1714 (EltSize == 64 && ST->hasFP64()) ||
1715 (EltSize == 16 && ST->hasFullFP16()))) {
1716 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
// VecLimit is 128 with MVE float ops, 64 with NEON; the -1 case wraps to the
// largest unsigned value, so the halving loop below never runs.
1717 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1718 InstructionCost VecCost = 0;
1719 while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(NumElts) &&
1720 NumElts * EltSize > VecLimit) {
1721 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1722 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1723 NumElts /= 2;
1724 }
1725
1726 // For fp16 we need to extract the upper lane elements. MVE can add a
1727 // VREV+FMIN/MAX to perform another vector step instead.
1728 InstructionCost ExtractCost = 0;
1729 if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() &&
1730 ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) {
1731 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1732 NumElts /= 2;
1733 } else if (ValVT.getVectorElementType() == MVT::f16)
1734 ExtractCost = NumElts / 2;
1735
1736 return VecCost + ExtractCost +
1737 NumElts *
// NOTE(review): listing line 1738 (the per-element cost multiplied by NumElts)
// is absent from this extract.
1739 }
1740
1741 if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
1742 (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
1743 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1744 unsigned VecLimit =
1745 ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1746 InstructionCost VecCost = 0;
1747 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
1748 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1749 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1750 NumElts /= 2;
1751 }
1752 // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
1753 // step.
1754 if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
1755 NumElts * EltSize == 64) {
1756 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts);
1757 VecCost += ST->getMVEVectorCostFactor(CostKind) +
1758 getArithmeticInstrCost(Opcode, VecTy, CostKind);
1759 NumElts /= 2;
1760 }
1761
1762 // From here we extract the elements and perform the and/or/xor.
1763 InstructionCost ExtractCost = NumElts;
1764 return VecCost + ExtractCost +
1765 (NumElts - 1) * getArithmeticInstrCost(
1766 Opcode, ValTy->getElementType(), CostKind);
1767 }
1768
1769 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
// NOTE(review): listing line 1770 (the last operand of this condition) is
// absent from this extract.
1771 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1772
1773 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1774
1775 static const CostTblEntry CostTblAdd[]{
1776 {ISD::ADD, MVT::v16i8, 1},
1777 {ISD::ADD, MVT::v8i16, 1},
1778 {ISD::ADD, MVT::v4i32, 1},
1779 };
1780 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1781 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1782
1783 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1784}
1785
1787 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1789 EVT ValVT = TLI->getValueType(DL, ValTy);
1790 EVT ResVT = TLI->getValueType(DL, ResTy);
1791
1792 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1793
1794 switch (ISD) {
1795 case ISD::ADD:
1796 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1797 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1798
1799 // The legal cases are:
1800 // VADDV u/s 8/16/32
1801 // VADDLV u/s 32
1802 // Codegen currently cannot always handle larger than legal vectors very
1803 // well, especially for predicated reductions where the mask needs to be
1804 // split, so restrict to 128bit or smaller input types.
1805 unsigned RevVTSize = ResVT.getSizeInBits();
1806 if (ValVT.getSizeInBits() <= 128 &&
1807 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1808 (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1809 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1810 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1811 }
1812 break;
1813 default:
1814 break;
1815 }
1816 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, FMF,
1817 CostKind);
1818}
1819
1822 VectorType *ValTy,
1824 EVT ValVT = TLI->getValueType(DL, ValTy);
1825 EVT ResVT = TLI->getValueType(DL, ResTy);
1826
1827 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1828 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1829
1830 // The legal cases are:
1831 // VMLAV u/s 8/16/32
1832 // VMLALV u/s 16/32
1833 // Codegen currently cannot always handle larger than legal vectors very
1834 // well, especially for predicated reductions where the mask needs to be
1835 // split, so restrict to 128bit or smaller input types.
1836 unsigned RevVTSize = ResVT.getSizeInBits();
1837 if (ValVT.getSizeInBits() <= 128 &&
1838 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1839 (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
1840 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1841 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1842 }
1843
1844 return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, ValTy, CostKind);
1845}
1846
1849 FastMathFlags FMF,
1851 EVT ValVT = TLI->getValueType(DL, Ty);
1852
1853 // In general floating point reductions are a series of elementwise
1854 // operations, with free extracts on each step. These are either in-order or
1855 // treewise depending on whether that is allowed by the fast math flags.
1856 if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
1857 ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
1858 (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
1859 (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
1860 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
1861 unsigned EltSize = ValVT.getScalarSizeInBits();
1862 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1863 InstructionCost VecCost;
1864 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
1865 Type *VecTy = FixedVectorType::get(Ty->getElementType(), NumElts/2);
1866 IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
1867 VecCost += getIntrinsicInstrCost(ICA, CostKind);
1868 NumElts /= 2;
1869 }
1870
1871 // For fp16 we need to extract the upper lane elements. MVE can add a
1872 // VREV+FMIN/MAX to perform another vector step instead.
1873 InstructionCost ExtractCost = 0;
1874 if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
1875 NumElts == 8) {
1876 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1877 NumElts /= 2;
1878 } else if (ValVT.getVectorElementType() == MVT::f16)
1879 ExtractCost = cast<FixedVectorType>(Ty)->getNumElements() / 2;
1880
1882 {Ty->getElementType(), Ty->getElementType()},
1883 FMF);
1884 return VecCost + ExtractCost +
1885 (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
1886 }
1887
1888 if (IID == Intrinsic::smin || IID == Intrinsic::smax ||
1889 IID == Intrinsic::umin || IID == Intrinsic::umax) {
1890 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1891
1892 // All costs are the same for u/s min/max. These lower to vminv, which are
1893 // given a slightly higher cost as they tend to take multiple cycles for
1894 // smaller type sizes.
1895 static const CostTblEntry CostTblAdd[]{
1896 {ISD::SMIN, MVT::v16i8, 4},
1897 {ISD::SMIN, MVT::v8i16, 3},
1898 {ISD::SMIN, MVT::v4i32, 2},
1899 };
1900 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD::SMIN, LT.second))
1901 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1902 }
1903
1904 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1905}
1906
1910 unsigned Opc = ICA.getID();
1911 switch (Opc) {
1912 case Intrinsic::get_active_lane_mask:
1913 // Currently we make a somewhat optimistic assumption that
1914 // active_lane_mask's are always free. In reality it may be freely folded
1915 // into a tail predicated loop, expanded into a VCPT or expanded into a lot
1916 // of add/icmp code. We may need to improve this in the future, but being
1917 // able to detect if it is free or not involves looking at a lot of other
1918 // code. We currently assume that the vectorizer inserted these, and knew
1919 // what it was doing in adding one.
1920 if (ST->hasMVEIntegerOps())
1921 return 0;
1922 break;
1923 case Intrinsic::sadd_sat:
1924 case Intrinsic::ssub_sat:
1925 case Intrinsic::uadd_sat:
1926 case Intrinsic::usub_sat: {
1927 bool IsAdd = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
1928 bool IsSigned = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
1929 Type *RetTy = ICA.getReturnType();
1930
1931 if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
1932 if (IsSigned && ST->hasDSP() && ITy->getBitWidth() == 32)
1933 return 1; // qadd / qsub
1934 if (ST->hasDSP() && (ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16))
1935 return 2; // uqadd16 / qadd16 / uqsub16 / qsub16 + possible extend.
1936 // Otherwise return the cost of expanding the node. Generally an add +
1937 // icmp + sel.
1939 Type *CondTy = RetTy->getWithNewBitWidth(1);
1940 return getArithmeticInstrCost(IsAdd ? Instruction::Add : Instruction::Sub,
1941 RetTy, CostKind) +
1942 2 * getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, Pred,
1943 CostKind) +
1944 2 * getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, Pred,
1945 CostKind);
1946 }
1947
1948 if (!ST->hasMVEIntegerOps())
1949 break;
1950
1951 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
1952 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1953 LT.second == MVT::v16i8) {
1954 // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
1955 // need to extend the type, as it uses shr(qadd(shl, shl)).
1956 unsigned Instrs =
1957 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1
1958 : 4;
1959 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1960 }
1961 break;
1962 }
1963 case Intrinsic::abs:
1964 case Intrinsic::smin:
1965 case Intrinsic::smax:
1966 case Intrinsic::umin:
1967 case Intrinsic::umax: {
1968 if (!ST->hasMVEIntegerOps())
1969 break;
1970 Type *VT = ICA.getReturnType();
1971
1972 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1973 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1974 LT.second == MVT::v16i8)
1975 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1976 break;
1977 }
1978 case Intrinsic::minnum:
1979 case Intrinsic::maxnum: {
1980 if (!ST->hasMVEFloatOps())
1981 break;
1982 Type *VT = ICA.getReturnType();
1983 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1984 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1985 return LT.first * ST->getMVEVectorCostFactor(CostKind);
1986 break;
1987 }
1988 case Intrinsic::fptosi_sat:
1989 case Intrinsic::fptoui_sat: {
1990 if (ICA.getArgTypes().empty())
1991 break;
1992 bool IsSigned = Opc == Intrinsic::fptosi_sat;
1993 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1994 EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
1995 // Check for the legal types, with the correct subtarget features.
1996 if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
1997 (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
1998 (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
1999 return LT.first;
2000
2001 // Equally for MVE vector types
2002 if (ST->hasMVEFloatOps() &&
2003 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
2004 LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
2005 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2006
2007 // If we can we use a legal convert followed by a min+max
2008 if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
2009 (ST->hasFP64() && LT.second == MVT::f64) ||
2010 (ST->hasFullFP16() && LT.second == MVT::f16) ||
2011 (ST->hasMVEFloatOps() &&
2012 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
2013 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
2014 Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
2015 LT.second.getScalarSizeInBits());
2017 LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
2018 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
2019 : Intrinsic::umin,
2020 LegalTy, {LegalTy, LegalTy});
2022 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
2023 : Intrinsic::umax,
2024 LegalTy, {LegalTy, LegalTy});
2026 return LT.first * Cost;
2027 }
2028 // Otherwise we need to follow the default expansion that clamps the value
2029 // using a float min/max with a fcmp+sel for nan handling when signed.
2030 Type *FPTy = ICA.getArgTypes()[0];
2031 Type *RetTy = ICA.getReturnType();
2032 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
2034 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
2036 Cost +=
2037 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
2039 if (IsSigned) {
2040 Type *CondTy = RetTy->getWithNewBitWidth(1);
2041 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
2043 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2045 }
2046 return Cost;
2047 }
2048 }
2049
2051}
2052
2054 if (!F->isIntrinsic())
2055 return BaseT::isLoweredToCall(F);
2056
2057 // Assume all Arm-specific intrinsics map to an instruction.
2058 if (F->getName().starts_with("llvm.arm"))
2059 return false;
2060
2061 switch (F->getIntrinsicID()) {
2062 default: break;
2063 case Intrinsic::powi:
2064 case Intrinsic::sin:
2065 case Intrinsic::cos:
2066 case Intrinsic::sincos:
2067 case Intrinsic::pow:
2068 case Intrinsic::log:
2069 case Intrinsic::log10:
2070 case Intrinsic::log2:
2071 case Intrinsic::exp:
2072 case Intrinsic::exp2:
2073 return true;
2074 case Intrinsic::sqrt:
2075 case Intrinsic::fabs:
2076 case Intrinsic::copysign:
2077 case Intrinsic::floor:
2078 case Intrinsic::ceil:
2079 case Intrinsic::trunc:
2080 case Intrinsic::rint:
2081 case Intrinsic::nearbyint:
2082 case Intrinsic::round:
2083 case Intrinsic::canonicalize:
2084 case Intrinsic::lround:
2085 case Intrinsic::llround:
2086 case Intrinsic::lrint:
2087 case Intrinsic::llrint:
2088 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
2089 return true;
2090 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
2091 return true;
2092 // Some operations can be handled by vector instructions and assume
2093 // unsupported vectors will be expanded into supported scalar ones.
2094 // TODO Handle scalar operations properly.
2095 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
2096 case Intrinsic::masked_store:
2097 case Intrinsic::masked_load:
2098 case Intrinsic::masked_gather:
2099 case Intrinsic::masked_scatter:
2100 return !ST->hasMVEIntegerOps();
2101 case Intrinsic::sadd_with_overflow:
2102 case Intrinsic::uadd_with_overflow:
2103 case Intrinsic::ssub_with_overflow:
2104 case Intrinsic::usub_with_overflow:
2105 case Intrinsic::sadd_sat:
2106 case Intrinsic::uadd_sat:
2107 case Intrinsic::ssub_sat:
2108 case Intrinsic::usub_sat:
2109 return false;
2110 }
2111
2112 return BaseT::isLoweredToCall(F);
2113}
2114
2116 unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
2117 EVT VT = TLI->getValueType(DL, I.getType(), true);
2118 if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
2119 return true;
2120
2121 // Check if an intrinsic will be lowered to a call and assume that any
2122 // other CallInst will generate a bl.
2123 if (auto *Call = dyn_cast<CallInst>(&I)) {
2124 if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
2125 switch(II->getIntrinsicID()) {
2126 case Intrinsic::memcpy:
2127 case Intrinsic::memset:
2128 case Intrinsic::memmove:
2129 return getNumMemOps(II) == -1;
2130 default:
2131 if (const Function *F = Call->getCalledFunction())
2132 return isLoweredToCall(F);
2133 }
2134 }
2135 return true;
2136 }
2137
2138 // FPv5 provides conversions between integer, double-precision,
2139 // single-precision, and half-precision formats.
2140 switch (I.getOpcode()) {
2141 default:
2142 break;
2143 case Instruction::FPToSI:
2144 case Instruction::FPToUI:
2145 case Instruction::SIToFP:
2146 case Instruction::UIToFP:
2147 case Instruction::FPTrunc:
2148 case Instruction::FPExt:
2149 return !ST->hasFPARMv8Base();
2150 }
2151
2152 // FIXME: Unfortunately the approach of checking the Operation Action does
2153 // not catch all cases of Legalization that use library calls. Our
2154 // Legalization step categorizes some transformations into library calls as
2155 // Custom, Expand or even Legal when doing type legalization. So for now
2156 // we have to special case for instance the SDIV of 64bit integers and the
2157 // use of floating point emulation.
2158 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
2159 switch (ISD) {
2160 default:
2161 break;
2162 case ISD::SDIV:
2163 case ISD::UDIV:
2164 case ISD::SREM:
2165 case ISD::UREM:
2166 case ISD::SDIVREM:
2167 case ISD::UDIVREM:
2168 return true;
2169 }
2170 }
2171
2172 // Assume all other non-float operations are supported.
2173 if (!VT.isFloatingPoint())
2174 return false;
2175
2176 // We'll need a library call to handle most floats when using soft.
2177 if (TLI->useSoftFloat()) {
2178 switch (I.getOpcode()) {
2179 default:
2180 return true;
2181 case Instruction::Alloca:
2182 case Instruction::Load:
2183 case Instruction::Store:
2184 case Instruction::Select:
2185 case Instruction::PHI:
2186 return false;
2187 }
2188 }
2189
2190 // We'll need a libcall to perform double precision operations on a single
2191 // precision only FPU.
2192 if (I.getType()->isDoubleTy() && !ST->hasFP64())
2193 return true;
2194
2195 // Likewise for half precision arithmetic.
2196 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
2197 return true;
2198
2199 return false;
2200}
2201
2203 AssumptionCache &AC,
2204 TargetLibraryInfo *LibInfo,
2205 HardwareLoopInfo &HWLoopInfo) {
2206 // Low-overhead branches are only supported in the 'low-overhead branch'
2207 // extension of v8.1-m.
2208 if (!ST->hasLOB() || DisableLowOverheadLoops) {
2209 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2210 return false;
2211 }
2212
2214 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2215 return false;
2216 }
2217
2218 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2219 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
2220 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2221 return false;
2222 }
2223
2224 const SCEV *TripCountSCEV =
2225 SE.getAddExpr(BackedgeTakenCount,
2226 SE.getOne(BackedgeTakenCount->getType()));
2227
2228 // We need to store the trip count in LR, a 32-bit register.
2229 if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
2230 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2231 return false;
2232 }
2233
2234 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2235 // point in generating a hardware loop if that's going to happen.
2236
2237 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2238 if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
2239 switch (Call->getIntrinsicID()) {
2240 default:
2241 break;
2242 case Intrinsic::start_loop_iterations:
2243 case Intrinsic::test_start_loop_iterations:
2244 case Intrinsic::loop_decrement:
2245 case Intrinsic::loop_decrement_reg:
2246 return true;
2247 }
2248 }
2249 return false;
2250 };
2251
2252 // Scan the instructions to see if there's any that we know will turn into a
2253 // call or if this loop is already a low-overhead loop or will become a tail
2254 // predicated loop.
2255 bool IsTailPredLoop = false;
2256 auto ScanLoop = [&](Loop *L) {
2257 for (auto *BB : L->getBlocks()) {
2258 for (auto &I : *BB) {
2259 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2260 isa<InlineAsm>(I)) {
2261 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2262 return false;
2263 }
2264 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2265 IsTailPredLoop |=
2266 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2267 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2268 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2269 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2270 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2271 }
2272 }
2273 return true;
2274 };
2275
2276 // Visit inner loops.
2277 for (auto *Inner : *L)
2278 if (!ScanLoop(Inner))
2279 return false;
2280
2281 if (!ScanLoop(L))
2282 return false;
2283
2284 // TODO: Check whether the trip count calculation is expensive. If L is the
2285 // inner loop but we know it has a low trip count, calculating that trip
2286 // count (in the parent loop) may be detrimental.
2287
2288 LLVMContext &C = L->getHeader()->getContext();
2289 HWLoopInfo.CounterInReg = true;
2290 HWLoopInfo.IsNestingLegal = false;
2291 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2292 HWLoopInfo.CountType = Type::getInt32Ty(C);
2293 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2294 return true;
2295}
2296
2297static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2298 // We don't allow icmp's, and because we only look at single block loops,
2299 // we simply count the icmps, i.e. there should only be 1 for the backedge.
2300 if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2301 return false;
2302 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2303 // not currently canonical, but soon will be. Code without them uses icmp, and
2304 // so is not tail predicated as per the condition above. In order to get the
2305 // same performance we treat min and max the same as an icmp for tailpred
2306 // purposes for the moment (we often rely on non-tailpred and higher VF's to
2307 // pick more optimial instructions like VQDMULH. They need to be recognized
2308 // directly by the vectorizer).
2309 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2310 if ((II->getIntrinsicID() == Intrinsic::smin ||
2311 II->getIntrinsicID() == Intrinsic::smax ||
2312 II->getIntrinsicID() == Intrinsic::umin ||
2313 II->getIntrinsicID() == Intrinsic::umax) &&
2314 ++ICmpCount > 1)
2315 return false;
2316
2317 if (isa<FCmpInst>(&I))
2318 return false;
2319
2320 // We could allow extending/narrowing FP loads/stores, but codegen is
2321 // too inefficient so reject this for now.
2322 if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2323 return false;
2324
2325 // Extends have to be extending-loads
2326 if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
2327 if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2328 return false;
2329
2330 // Truncs have to be narrowing-stores
2331 if (isa<TruncInst>(&I) )
2332 if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2333 return false;
2334
2335 return true;
2336}
2337
2338// To set up a tail-predicated loop, we need to know the total number of
2339// elements processed by that loop. Thus, we need to determine the element
2340// size and:
2341// 1) it should be uniform for all operations in the vector loop, so we
2342// e.g. don't want any widening/narrowing operations.
2343// 2) it should be smaller than i64s because we don't have vector operations
2344// that work on i64s.
2345// 3) we don't want elements to be reversed or shuffled, to make sure the
2346// tail-predication masks/predicates the right lanes.
2347//
2349 const DataLayout &DL,
2350 const LoopAccessInfo *LAI) {
2351 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2352
2353 // If there are live-out values, it is probably a reduction. We can predicate
2354 // most reduction operations freely under MVE using a combination of
2355 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2356 // floating point and integer reductions, but don't check for operators
2357 // specifically here. If the value ends up not being a reduction (and so the
2358 // vectorizer cannot tailfold the loop), we should fall back to standard
2359 // vectorization automatically.
2361 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2362 bool ReductionsDisabled =
2365
2366 for (auto *I : LiveOuts) {
2367 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2368 !I->getType()->isHalfTy()) {
2369 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2370 "live-out value\n");
2371 return false;
2372 }
2373 if (ReductionsDisabled) {
2374 LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2375 return false;
2376 }
2377 }
2378
2379 // Next, check that all instructions can be tail-predicated.
2380 PredicatedScalarEvolution PSE = LAI->getPSE();
2382 int ICmpCount = 0;
2383
2384 for (BasicBlock *BB : L->blocks()) {
2385 for (Instruction &I : BB->instructionsWithoutDebug()) {
2386 if (isa<PHINode>(&I))
2387 continue;
2388 if (!canTailPredicateInstruction(I, ICmpCount)) {
2389 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2390 return false;
2391 }
2392
2393 Type *T = I.getType();
2394 if (T->getScalarSizeInBits() > 32) {
2395 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2396 return false;
2397 }
2398 if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2400 Type *AccessTy = getLoadStoreType(&I);
2401 int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0);
2402 if (NextStride == 1) {
2403 // TODO: for now only allow consecutive strides of 1. We could support
2404 // other strides as long as it is uniform, but let's keep it simple
2405 // for now.
2406 continue;
2407 } else if (NextStride == -1 ||
2408 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2409 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2411 << "Consecutive strides of 2 found, vld2/vstr2 can't "
2412 "be tail-predicated\n.");
2413 return false;
2414 // TODO: don't tail predicate if there is a reversed load?
2415 } else if (EnableMaskedGatherScatters) {
2416 // Gather/scatters do allow loading from arbitrary strides, at
2417 // least if they are loop invariant.
2418 // TODO: Loop variant strides should in theory work, too, but
2419 // this requires further testing.
2420 const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2421 if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2422 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2423 if (PSE.getSE()->isLoopInvariant(Step, L))
2424 continue;
2425 }
2426 }
2427 LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2428 "tail-predicate\n.");
2429 return false;
2430 }
2431 }
2432 }
2433
2434 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2435 return true;
2436}
2437
2439 if (!EnableTailPredication) {
2440 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2441 return false;
2442 }
2443
2444 // Creating a predicated vector loop is the first step for generating a
2445 // tail-predicated hardware loop, for which we need the MVE masked
2446 // load/stores instructions:
2447 if (!ST->hasMVEIntegerOps())
2448 return false;
2449
2450 LoopVectorizationLegality *LVL = TFI->LVL;
2451 Loop *L = LVL->getLoop();
2452
2453 // For now, restrict this to single block loops.
2454 if (L->getNumBlocks() > 1) {
2455 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2456 "loop.\n");
2457 return false;
2458 }
2459
2460 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2461
2462 LoopInfo *LI = LVL->getLoopInfo();
2463 HardwareLoopInfo HWLoopInfo(L);
2464 if (!HWLoopInfo.canAnalyze(*LI)) {
2465 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2466 "analyzable.\n");
2467 return false;
2468 }
2469
2472
2473 // This checks if we have the low-overhead branch architecture
2474 // extension, and if we will create a hardware-loop:
2475 if (!isHardwareLoopProfitable(L, *SE, *AC, TFI->TLI, HWLoopInfo)) {
2476 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2477 "profitable.\n");
2478 return false;
2479 }
2480
2481 DominatorTree *DT = LVL->getDominatorTree();
2482 if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT)) {
2483 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2484 "a candidate.\n");
2485 return false;
2486 }
2487
2488 return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI());
2489}
2490
2492ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const {
2493 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2495
2496 // Intrinsic @llvm.get.active.lane.mask is supported.
2497 // It is used in the MVETailPredication pass, which requires the number of
2498 // elements processed by this vector loop to setup the tail-predicated
2499 // loop.
2501}
2505 // Enable Upper bound unrolling universally, providing that we do not see an
2506 // active lane mask, which will be better kept as a loop to become tail
2507 // predicated than to be conditionally unrolled.
2508 UP.UpperBound =
2509 !ST->hasMVEIntegerOps() || !any_of(*L->getHeader(), [](Instruction &I) {
2510 return isa<IntrinsicInst>(I) &&
2511 cast<IntrinsicInst>(I).getIntrinsicID() ==
2512 Intrinsic::get_active_lane_mask;
2513 });
2514
2515 // Only currently enable these preferences for M-Class cores.
2516 if (!ST->isMClass())
2517 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2518
2519 // Disable loop unrolling for Oz and Os.
2520 UP.OptSizeThreshold = 0;
2522 if (L->getHeader()->getParent()->hasOptSize())
2523 return;
2524
2525 SmallVector<BasicBlock*, 4> ExitingBlocks;
2526 L->getExitingBlocks(ExitingBlocks);
2527 LLVM_DEBUG(dbgs() << "Loop has:\n"
2528 << "Blocks: " << L->getNumBlocks() << "\n"
2529 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2530
2531 // Only allow another exit other than the latch. This acts as an early exit
2532 // as it mirrors the profitability calculation of the runtime unroller.
2533 if (ExitingBlocks.size() > 2)
2534 return;
2535
2536 // Limit the CFG of the loop body for targets with a branch predictor.
2537 // Allowing 4 blocks permits if-then-else diamonds in the body.
2538 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2539 return;
2540
2541 // Don't unroll vectorized loops, including the remainder loop
2542 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2543 return;
2544
2545 // Scan the loop: don't unroll loops with calls as this could prevent
2546 // inlining.
2548 for (auto *BB : L->getBlocks()) {
2549 for (auto &I : *BB) {
2550 // Don't unroll vectorised loop. MVE does not benefit from it as much as
2551 // scalar code.
2552 if (I.getType()->isVectorTy())
2553 return;
2554
2555 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2556 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2557 if (!isLoweredToCall(F))
2558 continue;
2559 }
2560 return;
2561 }
2562
2563 SmallVector<const Value*, 4> Operands(I.operand_values());
2566 }
2567 }
2568
2569 // On v6m cores, there are very few registers available. We can easily end up
2570 // spilling and reloading more registers in an unrolled loop. Look at the
2571 // number of LCSSA phis as a rough measure of how many registers will need to
2572 // be live out of the loop, reducing the default unroll count if more than 1
2573 // value is needed. In the long run, all of this should be being learnt by a
2574 // machine.
2575 unsigned UnrollCount = 4;
2576 if (ST->isThumb1Only()) {
2577 unsigned ExitingValues = 0;
2579 L->getExitBlocks(ExitBlocks);
2580 for (auto *Exit : ExitBlocks) {
2581 // Count the number of LCSSA phis. Exclude values coming from GEP's as
2582 // only the last is expected to be needed for address operands.
2583 unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2584 return PH.getNumOperands() != 1 ||
2585 !isa<GetElementPtrInst>(PH.getOperand(0));
2586 });
2587 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2588 }
2589 if (ExitingValues)
2590 UnrollCount /= ExitingValues;
2591 if (UnrollCount <= 1)
2592 return;
2593 }
2594
2595 // For processors with low overhead branching (LOB), runtime unrolling the
2596 // innermost loop is often detrimental to performance. In these cases the loop
2597 // remainder gets unrolled into a series of compare-and-jump blocks, which in
2598 // deeply nested loops get executed multiple times, negating the benefits of
2599 // LOB. This is particularly noticeable when the loop trip count of the
2600 // innermost loop varies within the outer loop, such as in the case of
2601 // triangular matrix decompositions. In these cases we will prefer to not
2602 // unroll the innermost loop, with the intention for it to be executed as a
2603 // low overhead loop.
2604 bool Runtime = true;
2605 if (ST->hasLOB()) {
2607 const auto *BETC = SE.getBackedgeTakenCount(L);
2608 auto *Outer = L->getOutermostLoop();
2609 if ((L != Outer && Outer != L->getParentLoop()) ||
2610 (L != Outer && BETC && !SE.isLoopInvariant(BETC, Outer))) {
2611 Runtime = false;
2612 }
2613 }
2614 }
2615
2616 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2617 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2618
2619 UP.Partial = true;
2620 UP.Runtime = Runtime;
2621 UP.UnrollRemainder = true;
2623 UP.UnrollAndJam = true;
2625
2626 // Forcing small loops to unroll can be very useful because of the
2627 // branch-taken cost of the backedge.
2628 if (Cost < 12)
2629 UP.Force = true;
2630}
2631
2635}
2636
2637bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2638 TTI::ReductionFlags Flags) const {
2639 if (!ST->hasMVEIntegerOps())
2640 return false;
2641
2642 unsigned ScalarBits = Ty->getScalarSizeInBits();
2643 switch (Opcode) {
2644 case Instruction::Add:
2645 return ScalarBits <= 64;
2646 default:
2647 return false;
2648 }
2649}
2650
2652 unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2653 if (!ST->hasMVEIntegerOps())
2654 return false;
2655 return true;
2656}
2657
2659 StackOffset BaseOffset,
2660 bool HasBaseReg, int64_t Scale,
2661 unsigned AddrSpace) const {
2663 AM.BaseGV = BaseGV;
2664 AM.BaseOffs = BaseOffset.getFixed();
2665 AM.HasBaseReg = HasBaseReg;
2666 AM.Scale = Scale;
2667 AM.ScalableOffset = BaseOffset.getScalable();
2668 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) {
2669 if (ST->hasFPAO())
2670 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2671 return 0;
2672 }
2673 return -1;
2674}
2675
2676bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
2677 if (Thumb) {
2678 // B.W is available in any Thumb2-supporting target, and also in every
2679 // version of Armv8-M, even Baseline which does not include the rest of
2680 // Thumb2.
2681 return ST->isThumb2() || ST->hasV8MBaselineOps();
2682 } else {
2683 // B is available in all versions of the Arm ISA, so the only question is
2684 // whether that ISA is available at all.
2685 return ST->hasARMOps();
2686 }
2687}
2688
2689/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
2690/// of the vector elements.
2691static bool areExtractExts(Value *Ext1, Value *Ext2) {
2692 using namespace PatternMatch;
2693
2694 auto areExtDoubled = [](Instruction *Ext) {
2695 return Ext->getType()->getScalarSizeInBits() ==
2696 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
2697 };
2698
2699 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
2700 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
2701 !areExtDoubled(cast<Instruction>(Ext1)) ||
2702 !areExtDoubled(cast<Instruction>(Ext2)))
2703 return false;
2704
2705 return true;
2706}
2707
2708/// Check if sinking \p I's operands to I's basic block is profitable, because
2709/// the operands can be folded into a target instruction, e.g.
2710/// sext/zext can be folded into vsubl.
2712 SmallVectorImpl<Use *> &Ops) const {
2713 using namespace PatternMatch;
2714
2715 if (!I->getType()->isVectorTy())
2716 return false;
2717
2718 if (ST->hasNEON()) {
2719 switch (I->getOpcode()) {
2720 case Instruction::Sub:
2721 case Instruction::Add: {
2722 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
2723 return false;
2724 Ops.push_back(&I->getOperandUse(0));
2725 Ops.push_back(&I->getOperandUse(1));
2726 return true;
2727 }
2728 default:
2729 return false;
2730 }
2731 }
2732
2733 if (!ST->hasMVEIntegerOps())
2734 return false;
2735
2736 auto IsFMSMul = [&](Instruction *I) {
2737 if (!I->hasOneUse())
2738 return false;
2739 auto *Sub = cast<Instruction>(*I->users().begin());
2740 return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;
2741 };
2742 auto IsFMS = [&](Instruction *I) {
2743 if (match(I->getOperand(0), m_FNeg(m_Value())) ||
2744 match(I->getOperand(1), m_FNeg(m_Value())))
2745 return true;
2746 return false;
2747 };
2748
2749 auto IsSinker = [&](Instruction *I, int Operand) {
2750 switch (I->getOpcode()) {
2751 case Instruction::Add:
2752 case Instruction::Mul:
2753 case Instruction::FAdd:
2754 case Instruction::ICmp:
2755 case Instruction::FCmp:
2756 return true;
2757 case Instruction::FMul:
2758 return !IsFMSMul(I);
2759 case Instruction::Sub:
2760 case Instruction::FSub:
2761 case Instruction::Shl:
2762 case Instruction::LShr:
2763 case Instruction::AShr:
2764 return Operand == 1;
2765 case Instruction::Call:
2766 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
2767 switch (II->getIntrinsicID()) {
2768 case Intrinsic::fma:
2769 return !IsFMS(I);
2770 case Intrinsic::sadd_sat:
2771 case Intrinsic::uadd_sat:
2772 case Intrinsic::arm_mve_add_predicated:
2773 case Intrinsic::arm_mve_mul_predicated:
2774 case Intrinsic::arm_mve_qadd_predicated:
2775 case Intrinsic::arm_mve_vhadd:
2776 case Intrinsic::arm_mve_hadd_predicated:
2777 case Intrinsic::arm_mve_vqdmull:
2778 case Intrinsic::arm_mve_vqdmull_predicated:
2779 case Intrinsic::arm_mve_vqdmulh:
2780 case Intrinsic::arm_mve_qdmulh_predicated:
2781 case Intrinsic::arm_mve_vqrdmulh:
2782 case Intrinsic::arm_mve_qrdmulh_predicated:
2783 case Intrinsic::arm_mve_fma_predicated:
2784 return true;
2785 case Intrinsic::ssub_sat:
2786 case Intrinsic::usub_sat:
2787 case Intrinsic::arm_mve_sub_predicated:
2788 case Intrinsic::arm_mve_qsub_predicated:
2789 case Intrinsic::arm_mve_hsub_predicated:
2790 case Intrinsic::arm_mve_vhsub:
2791 return Operand == 1;
2792 default:
2793 return false;
2794 }
2795 }
2796 return false;
2797 default:
2798 return false;
2799 }
2800 };
2801
2802 for (auto OpIdx : enumerate(I->operands())) {
2803 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
2804 // Make sure we are not already sinking this operand
2805 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
2806 continue;
2807
2808 Instruction *Shuffle = Op;
2809 if (Shuffle->getOpcode() == Instruction::BitCast)
2810 Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0));
2811 // We are looking for a splat that can be sunk.
2812 if (!Shuffle || !match(Shuffle, m_Shuffle(m_InsertElt(m_Undef(), m_Value(),
2813 m_ZeroInt()),
2814 m_Undef(), m_ZeroMask())))
2815 continue;
2816 if (!IsSinker(I, OpIdx.index()))
2817 continue;
2818
2819 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
2820 // and vector registers
2821 for (Use &U : Op->uses()) {
2822 Instruction *Insn = cast<Instruction>(U.getUser());
2823 if (!IsSinker(Insn, U.getOperandNo()))
2824 return false;
2825 }
2826
2827 Ops.push_back(&Shuffle->getOperandUse(0));
2828 if (Shuffle != Op)
2829 Ops.push_back(&Op->getOperandUse(0));
2830 Ops.push_back(&OpIdx.value());
2831 }
2832 return true;
2833}
2834
2836 Type *ArrayType) const {
2837 if (!UseWidenGlobalArrays) {
2838 LLVM_DEBUG(dbgs() << "Padding global arrays disabled\n");
2839 return false;
2840 }
2841
2842 // Don't modify none integer array types
2843 if (!ArrayType || !ArrayType->isArrayTy() ||
2845 return 0;
2846
2847 // We pad to 4 byte boundaries
2848 if (Size % 4 == 0)
2849 return 0;
2850
2851 unsigned NumBytesToPad = 4 - (Size % 4);
2852 unsigned NewSize = Size + NumBytesToPad;
2853
2854 // Max number of bytes that memcpy allows for lowering to load/stores before
2855 // it uses library function (__aeabi_memcpy).
2856 unsigned MaxMemIntrinsicSize = getMaxMemIntrinsicInlineSizeThreshold();
2857
2858 if (NewSize > MaxMemIntrinsicSize)
2859 return 0;
2860
2861 return NumBytesToPad;
2862}
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
This file implements a class to represent arbitrary precision integral constant values and operations...
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), cl::init(2))
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static Value * isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm)
static cl::opt< bool > AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), cl::desc("Enable the generation of WLS loops"))
static Value * simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, InstCombiner::BuilderTy &Builder)
Convert a vector load intrinsic into a simple llvm load instruction.
static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm)
static cl::opt< bool > UseWidenGlobalArrays("widen-global-strings", cl::Hidden, cl::init(true), cl::desc("Enable the widening of global strings to alignment boundaries"))
cl::opt< bool > EnableMaskedGatherScatters
static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount)
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor
cl::opt< TailPredication::Mode > EnableTailPredication
static cl::opt< bool > DisableLowOverheadLoops("disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops"))
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, const LoopAccessInfo *LAI)
static cl::opt< bool > EnableMaskedLoadStores("enable-arm-maskedldst", cl::Hidden, cl::init(true), cl::desc("Enable the generation of masked loads and stores"))
This file defines a TargetTransformInfo::Concept conforming object specific to the ARM target machine.
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx. Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx.
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Size
Hexagon Common GEP
This file provides the interface for the instcombine pass implementation.
static cl::opt< unsigned > UnrollCount("unroll-count", cl::Hidden, cl::desc("Use this unroll count for all loops including those with " "unroll_count pragma values, for testing purposes"))
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V)
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
if(PassOpts->AAPipeline)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallVector class.
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
Value * RHS
Value * LHS
Class for arbitrary precision integers.
Definition: APInt.h:78
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1468
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:624
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:239
bool hasARMOps() const
Definition: ARMSubtarget.h:302
bool isThumb1Only() const
Definition: ARMSubtarget.h:403
bool hasFPARMv8Base() const
Definition: ARMSubtarget.h:311
bool isThumb2() const
Definition: ARMSubtarget.h:404
bool hasVFP2Base() const
Definition: ARMSubtarget.h:308
bool isMClass() const
Definition: ARMSubtarget.h:405
unsigned getMVEVectorCostFactor(TargetTransformInfo::TargetCostKind CostKind) const
Definition: ARMSubtarget.h:519
bool preferInLoopReduction(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr)
bool maybeLoweredToCall(Instruction &I)
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
bool isLegalMaskedStore(Type *DataTy, Align Alignment)
bool isLegalMaskedLoad(Type *DataTy, Align Alignment)
InstructionCost getMemcpyCost(const Instruction *I)
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
bool isLoweredToCall(const Function *F)
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *ValTy, TTI::TargetCostKind CostKind)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo)
int getNumMemOps(const IntrinsicInst *I) const
Given a memcpy/memset/memmove instruction, return the number of memory operations performed,...
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
getScalingFactorCost - Return the cost of the scaling used in addressing mode represented by AM.
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool hasArmWideBranch(bool Thumb) const
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
bool isLegalMaskedGather(Type *Ty, Align Alignment)
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
InstructionCost getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty)
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
bool isProfitableLSRChainElement(Instruction *I)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
bool useSoftFloat() const override
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
Class to represent array types.
Definition: DerivedTypes.h:395
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:234
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:623
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:695
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:923
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:807
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:959
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:380
static BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:988
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:683
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:148
This class represents a range of values.
Definition: ConstantRange.h:47
ConstantRange intersectWith(const ConstantRange &CR, PreferredRangeType Type=Smallest) const
Return the range that results from the intersection of this range with another range.
This is an important base class in LLVM.
Definition: Constant.h:42
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:457
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:617
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1830
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Definition: IRBuilder.cpp:1152
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:463
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:890
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2155
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:177
The core instruction combiner logic.
Definition: InstCombiner.h:48
const DataLayout & getDataLayout() const
Definition: InstCombiner.h:343
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
DominatorTree & getDominatorTree() const
Definition: InstCombiner.h:342
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:394
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, unsigned Depth, const SimplifyQuery &Q)=0
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:418
BuilderTy & Builder
Definition: InstCombiner.h:61
AssumptionCache & getAssumptionCache() const
Definition: InstCombiner.h:340
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:169
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
bool isShift() const
Definition: Instruction.h:282
const SmallVectorImpl< Type * > & getArgTypes() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
Drive the analysis of memory accesses in the loop.
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
AssumptionCache * getAssumptionCache() const
const LoopAccessInfo * getLAI() const
ScalarEvolution * getScalarEvolution() const
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
The optimization diagnostic interface.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
This class represents an analyzed expression in the program.
Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:43
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:42
Provides information about what library functions are available for the current target.
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
unsigned getMaxStoresPerMemcpy(bool OptSize) const
Get maximum # of store operations permitted for llvm.memcpy.
unsigned getMaxStoresPerMemmove(bool OptSize) const
Get maximum # of store operations permitted for llvm.memmove.
unsigned getMaxStoresPerMemset(bool OptSize) const
Get maximum # of store operations permitted for llvm.memset.
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
bool isLoweredToCall(const Function *F) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind)
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition: Type.h:261
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:243
Type * getArrayElementType() const
Definition: Type.h:411
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
const Use & getOperandUse(unsigned i) const
Definition: User.h:241
Value * getOperand(unsigned i) const
Definition: User.h:228
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
Type * getElementType() const
Definition: DerivedTypes.h:460
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into an shifter_operand immed...
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:615
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
Definition: PatternMatch.h:165
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:599
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:612
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
Definition: LoopInfo.cpp:1097
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
@ Runtime
Detect stack use after return if not disabled runtime with (ASAN_OPTIONS=detect_stack_use_after_retur...
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Align getKnownAlignment(Value *V, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to infer an alignment for the specified pointer.
Definition: Local.h:242
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
SmallVector< Instruction *, 8 > findDefsUsedOutsideOfLoop(Loop *L)
Returns the instructions that use values defined in the loop.
Definition: LoopUtils.cpp:123
SelectPatternFlavor
Specific patterns of select instructions we can match.
@ SPF_ABS
Absolute value.
@ SPF_FMAXNUM
Floating point maxnum.
@ SPF_UMIN
Unsigned minimum.
@ SPF_UMAX
Unsigned maximum.
@ SPF_SMAX
Signed maximum.
@ SPF_FMINNUM
Floating point minnum.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp=nullptr, unsigned Depth=0)
Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind and providing the out param...
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
DWARFExpression::Operation Op
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1945
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
InstructionCost Cost
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
@ Data
Use predicate only to mask operations on data in the loop.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool isFixedLengthVector() const
Definition: ValueTypes.h:181
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
Attributes of a target dependent hardware loop.
bool canAnalyze(LoopInfo &LI)
bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
static MemOp Set(uint64_t Size, bool DstAlignCanChange, Align DstAlign, bool IsZeroMemset, bool IsVolatile)
static MemOp Copy(uint64_t Size, bool DstAlignCanChange, Align DstAlign, Align SrcAlign, bool IsVolatile, bool MemcpyStrSrc=false)
SelectPatternFlavor Flavor
TargetLibraryInfo * TLI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
Flags describing the kind of vector reduction.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Type Conversion Cost Table.
Definition: CostTable.h:55