LLVM 22.0.0git
ARMTargetTransformInfo.cpp
Go to the documentation of this file.
1//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "ARMSubtarget.h"
12#include "llvm/ADT/APInt.h"
19#include "llvm/IR/BasicBlock.h"
20#include "llvm/IR/DataLayout.h"
22#include "llvm/IR/Instruction.h"
25#include "llvm/IR/Intrinsics.h"
26#include "llvm/IR/IntrinsicsARM.h"
28#include "llvm/IR/Type.h"
37#include <algorithm>
38#include <cassert>
39#include <cstdint>
40#include <optional>
41#include <utility>
42
43using namespace llvm;
44
45#define DEBUG_TYPE "armtti"
46
// Command-line flags controlling ARM-specific code-generation choices
// (masked load/store generation, low-overhead/WLS loops, global-string
// widening).
// NOTE(review): this extraction is missing the `static cl::opt<bool> Name(`
// declarator lines for several of the options below (original lines 47, 51,
// 59 are absent, as are the declarations at 63-68) — restore them from the
// upstream source before compiling.
48 "enable-arm-maskedldst", cl::Hidden, cl::init(true),
49 cl::desc("Enable the generation of masked loads and stores"));
50
52 "disable-arm-loloops", cl::Hidden, cl::init(false),
53 cl::desc("Disable the generation of low-overhead loops"));
54
55static cl::opt<bool>
56 AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
57 cl::desc("Enable the generation of WLS loops"));
58
60 "widen-global-strings", cl::Hidden, cl::init(true),
61 cl::desc("Enable the widening of global strings to alignment boundaries"));
62
64
66
68
69/// Convert a vector load intrinsic into a simple llvm load instruction.
70/// This is beneficial when the underlying object being addressed comes
71/// from a constant, since we get constant-folding for free.
72static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
73 InstCombiner::BuilderTy &Builder) {
74 auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
75
76 if (!IntrAlign)
77 return nullptr;
78
79 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
80 ? MemAlign
81 : IntrAlign->getLimitedValue();
82
83 if (!isPowerOf2_32(Alignment))
84 return nullptr;
85
86 return Builder.CreateAlignedLoad(II.getType(), II.getArgOperand(0),
87 Align(Alignment));
88}
89
// Decide whether a callee may be inlined into a caller based on subtarget
// feature bits: features outside InlineFeaturesAllowed must match exactly,
// while allowed features only require callee ⊆ caller.
// NOTE(review): the opening signature line (presumably
// `bool ARMTTIImpl::areInlineCompatible(const Function *Caller,`, original
// line 90) is missing from this extraction — confirm against upstream.
91 const Function *Callee) const {
92 const TargetMachine &TM = getTLI()->getTargetMachine();
93 const FeatureBitset &CallerBits =
94 TM.getSubtargetImpl(*Caller)->getFeatureBits();
95 const FeatureBitset &CalleeBits =
96 TM.getSubtargetImpl(*Callee)->getFeatureBits();
97
98 // To inline a callee, all features not in the allowed list must match exactly.
99 bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
100 (CalleeBits & ~InlineFeaturesAllowed);
101 // For features in the allowed list, the callee's features must be a subset of
102 // the callers'.
103 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
104 (CalleeBits & InlineFeaturesAllowed);
105 return MatchExact && MatchSubset;
106}
107
// Choose the preferred addressing mode for loop `L`: none when optimizing
// for size, pre-indexed for single-block Thumb2 M-class loops.
// NOTE(review): the signature head (original lines 108-109) and the return
// statement guarded by the hasMVEIntegerOps() check (original line 112,
// presumably `return TTI::AMK_PostIndexed;`) are missing from this
// extraction — as written the first `if` would dangle; restore from upstream.
110 ScalarEvolution *SE) const {
111 if (ST->hasMVEIntegerOps())
113
114 if (L->getHeader()->getParent()->hasOptSize())
115 return TTI::AMK_None;
116
117 if (ST->isMClass() && ST->isThumb2() &&
118 L->getNumBlocks() == 1)
119 return TTI::AMK_PreIndexed;
120
121 return TTI::AMK_None;
122}
123
// InstCombine hook for ARM intrinsics: simplifies NEON load/store intrinsics
// by improving their alignment information, and folds/annotates several MVE
// predicate and arithmetic intrinsics. Returns the replacement instruction,
// &II when II itself was modified in place, or std::nullopt for no change.
// NOTE(review): several original lines are absent from this extraction
// (the signature tail at 125, the getKnownAlignment argument lines at
// 134/156/177, the PatternMatch `m_Intrinsic` match-head lines at
// 188/194/217, and lines 252/262/268) — restore from upstream before
// compiling.
124std::optional<Instruction *>
 126 using namespace PatternMatch;
 127 Intrinsic::ID IID = II.getIntrinsicID();
 128 switch (IID) {
 129 default:
 130 break;
 131 case Intrinsic::arm_neon_vld1: {
 // If the pointer's known alignment allows it, turn vld1 into a plain
 // aligned load (see simplifyNeonVld1) so constant folding can apply.
 132 Align MemAlign =
 133 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
 135 if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
 136 return IC.replaceInstUsesWith(II, V);
 137 }
 138 break;
 139 }
 140
 141 case Intrinsic::arm_neon_vld2:
 142 case Intrinsic::arm_neon_vld3:
 143 case Intrinsic::arm_neon_vld4:
 144 case Intrinsic::arm_neon_vld2lane:
 145 case Intrinsic::arm_neon_vld3lane:
 146 case Intrinsic::arm_neon_vld4lane:
 147 case Intrinsic::arm_neon_vst1:
 148 case Intrinsic::arm_neon_vst2:
 149 case Intrinsic::arm_neon_vst3:
 150 case Intrinsic::arm_neon_vst4:
 151 case Intrinsic::arm_neon_vst2lane:
 152 case Intrinsic::arm_neon_vst3lane:
 153 case Intrinsic::arm_neon_vst4lane: {
 // These intrinsics carry their alignment as the last argument; raise it
 // to the alignment proven for the pointer operand when that is larger.
 154 Align MemAlign =
 155 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
 157 unsigned AlignArg = II.arg_size() - 1;
 158 Value *AlignArgOp = II.getArgOperand(AlignArg);
 159 MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
 160 if (Align && *Align < MemAlign) {
 161 return IC.replaceOperand(
 162 II, AlignArg,
 163 ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
 164 false));
 165 }
 166 break;
 167 }
 168
 169 case Intrinsic::arm_neon_vld1x2:
 170 case Intrinsic::arm_neon_vld1x3:
 171 case Intrinsic::arm_neon_vld1x4:
 172 case Intrinsic::arm_neon_vst1x2:
 173 case Intrinsic::arm_neon_vst1x3:
 174 case Intrinsic::arm_neon_vst1x4: {
 // The vld1x/vst1x forms carry alignment as a parameter attribute on the
 // pointer argument instead of an explicit operand; upgrade that attribute.
 175 Align NewAlign =
 176 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
 178 Align OldAlign = II.getParamAlign(0).valueOrOne();
 179 if (NewAlign > OldAlign)
 180 II.addParamAttr(0,
 181 Attribute::getWithAlignment(II.getContext(), NewAlign));
 182 break;
 183 }
 184
 185 case Intrinsic::arm_mve_pred_i2v: {
 // Fold i2v(v2i(x)) -> x, and i2v(v2i(x) ^ 0xFFFF) -> xor(x, all-true),
 // then let SimplifyDemandedBits trim the scalar to its low 16 bits.
 186 Value *Arg = II.getArgOperand(0);
 187 Value *ArgArg;
 189 PatternMatch::m_Value(ArgArg))) &&
 190 II.getType() == ArgArg->getType()) {
 191 return IC.replaceInstUsesWith(II, ArgArg);
 192 }
 193 Constant *XorMask;
 195 PatternMatch::m_Value(ArgArg)),
 196 PatternMatch::m_Constant(XorMask))) &&
 197 II.getType() == ArgArg->getType()) {
 198 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
 199 if (CI->getValue().trunc(16).isAllOnes()) {
 200 auto TrueVector = IC.Builder.CreateVectorSplat(
 201 cast<FixedVectorType>(II.getType())->getNumElements(),
 202 IC.Builder.getTrue());
 203 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
 204 }
 205 }
 206 }
 207 KnownBits ScalarKnown(32);
 208 if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
 209 ScalarKnown)) {
 210 return &II;
 211 }
 212 break;
 213 }
 214 case Intrinsic::arm_mve_pred_v2i: {
 // Fold v2i(i2v(x)) -> x; otherwise annotate the result with a
 // [0, 0x10000) range attribute (the predicate fits in 16 bits).
 215 Value *Arg = II.getArgOperand(0);
 216 Value *ArgArg;
 218 PatternMatch::m_Value(ArgArg)))) {
 219 return IC.replaceInstUsesWith(II, ArgArg);
 220 }
 221
 222 if (II.getMetadata(LLVMContext::MD_range))
 223 break;
 224
 225 ConstantRange Range(APInt(32, 0), APInt(32, 0x10000));
 226
 227 if (auto CurrentRange = II.getRange()) {
 228 Range = Range.intersectWith(*CurrentRange);
 229 if (Range == CurrentRange)
 230 break;
 231 }
 232
 233 II.addRangeRetAttr(Range);
 234 II.addRetAttr(Attribute::NoUndef);
 235 return &II;
 236 }
 237 case Intrinsic::arm_mve_vadc:
 238 case Intrinsic::arm_mve_vadc_predicated: {
 // Only bit 29 of the carry-in operand matters; let SimplifyDemandedBits
 // discard the rest. The carry operand index depends on predication.
 239 unsigned CarryOp =
 240 (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
 241 assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
 242 "Bad type for intrinsic!");
 243
 244 KnownBits CarryKnown(32);
 245 if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
 246 CarryKnown)) {
 247 return &II;
 248 }
 249 break;
 250 }
 251 case Intrinsic::arm_mve_vmldava: {
 // Fold (add (vmldava ..., 0, x, y), z) into vmldava ..., z, x, y when the
 // accumulator operand is zero and the intrinsic has a single add user.
 253 if (I->hasOneUse()) {
 254 auto *User = cast<Instruction>(*I->user_begin());
 255 Value *OpZ;
 256 if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
 257 match(I->getOperand(3), m_Zero())) {
 258 Value *OpX = I->getOperand(4);
 259 Value *OpY = I->getOperand(5);
 260 Type *OpTy = OpX->getType();
 261
 263 Value *V =
 264 IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
 265 {I->getOperand(0), I->getOperand(1),
 266 I->getOperand(2), OpZ, OpX, OpY});
 267
 269 return IC.eraseInstFromFunction(*User);
 270 }
 271 }
 272 return std::nullopt;
 273 }
 274 }
 275 return std::nullopt;
 276}
277
// Simplify demanded vector elements for MVE narrowing intrinsics
// (vcvt_narrow / vqmovn / vshrn): only the top or bottom lanes of operand 0
// are demanded, depending on the Top/Bottom immediate operand.
// NOTE(review): the signature head (original line 278, presumably
// `std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(`)
// is missing from this extraction — confirm against upstream.
279 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
280 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
281 std::function<void(Instruction *, unsigned, APInt, APInt &)>
282 SimplifyAndSetOp) const {
283
284 // Compute the demanded bits for a narrowing MVE intrinsic. The TopOpc is the
285 // opcode specifying a Top/Bottom instruction, which can change between
286 // instructions.
287 auto SimplifyNarrowInstrTopBottom =[&](unsigned TopOpc) {
288 unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
289 unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
290
291 // The only odd/even lanes of operand 0 will only be demanded depending
292 // on whether this is a top/bottom instruction.
293 APInt DemandedElts =
294 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
295 : APInt::getHighBitsSet(2, 1));
296 SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
297 // The other lanes will be defined from the inserted elements.
298 UndefElts &= APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
299 : APInt::getHighBitsSet(2, 1));
300 return std::nullopt;
301 };
302
 // The Top/Bottom immediate sits at a different operand index for each
 // intrinsic, hence the per-case argument below.
303 switch (II.getIntrinsicID()) {
304 default:
305 break;
306 case Intrinsic::arm_mve_vcvt_narrow:
307 SimplifyNarrowInstrTopBottom(2);
308 break;
309 case Intrinsic::arm_mve_vqmovn:
310 SimplifyNarrowInstrTopBottom(4);
311 break;
312 case Intrinsic::arm_mve_vshrn:
313 SimplifyNarrowInstrTopBottom(7);
314 break;
315 }
316
317 return std::nullopt;
318}
319
// Cost of materializing integer immediate `Imm` of type `Ty`: 1 when a
// single mov/mvn-style instruction suffices, more when a two-instruction
// sequence or a constant-pool load is required.
// NOTE(review): the signature head (original lines 320-321, presumably
// `InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
// TTI::TargetCostKind CostKind) const {`) is missing from this extraction —
// confirm against upstream.
322 assert(Ty->isIntegerTy());
323
 // Immediates wider than 64 active bits (or an opaque zero-width type) are
 // treated as expensive.
324 unsigned Bits = Ty->getPrimitiveSizeInBits();
325 if (Bits == 0 || Imm.getActiveBits() >= 64)
326 return 4;
327
328 int64_t SImmVal = Imm.getSExtValue();
329 uint64_t ZImmVal = Imm.getZExtValue();
 // ARM mode: movw range, or a (possibly inverted) shifter-operand immediate.
330 if (!ST->isThumb()) {
331 if ((SImmVal >= 0 && SImmVal < 65536) ||
332 (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
333 (ARM_AM::getSOImmVal(~ZImmVal) != -1))
334 return 1;
335 return ST->hasV6T2Ops() ? 2 : 3;
336 }
 // Thumb2: same idea with the T2 modified-immediate encoding.
337 if (ST->isThumb2()) {
338 if ((SImmVal >= 0 && SImmVal < 65536) ||
339 (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
340 (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
341 return 1;
342 return ST->hasV6T2Ops() ? 2 : 3;
343 }
344 // Thumb1, any i8 imm cost 1.
345 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
346 return 1;
 // mvn + mov, or a shifted 8-bit immediate: two instructions.
347 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
348 return 2;
349 // Load from constantpool.
350 return 3;
351}
352
353// Constants smaller than 256 fit in the immediate field of
354// Thumb1 instructions so we return a zero cost and 1 otherwise.
// NOTE(review): the signature head (original line 355, presumably
// `InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode,
// unsigned Idx,`) is missing from this extraction — confirm against upstream.
356 const APInt &Imm,
357 Type *Ty) const {
358 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
359 return 0;
360
361 return 1;
362}
363
364// Checks whether Inst is part of a min(max()) or max(min()) pattern
365// that will match to an SSAT instruction. Returns the instruction being
366// saturated, or null if no saturation pattern was found.
// NOTE(review): this extraction is missing the matchSelectPattern line that
// initializes InstSPF (original line 370) and the m_ConstantInt match lines
// inside both conditions (original lines 373 and 383) — restore from
// upstream before compiling.
367static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
368 Value *LHS, *RHS;
369 ConstantInt *C;
371
 // Outer smax against a negative power-of-two constant -Imm; the matching
 // inner smin must compare against (-Imm) - 1 for a valid SSAT range.
372 if (InstSPF == SPF_SMAX &&
374 C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
375
376 auto isSSatMin = [&](Value *MinInst) {
377 if (isa<SelectInst>(MinInst)) {
378 Value *MinLHS, *MinRHS;
379 ConstantInt *MinC;
380 SelectPatternFlavor MinSPF =
381 matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
382 if (MinSPF == SPF_SMIN &&
384 MinC->getValue() == ((-Imm) - 1))
385 return true;
386 }
387 return false;
388 };
389
 // The smin may be an operand of the smax, or (for two-use patterns) one
 // of the smax's users.
390 if (isSSatMin(Inst->getOperand(1)))
391 return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
392 if (Inst->hasNUses(2) &&
393 (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
394 return Inst->getOperand(1);
395 }
396 return nullptr;
397}
398
399// Look for a FP Saturation pattern, where the instruction can be simplified to
400// a fptosi.sat. max(min(fptosi)). The constant in this case is always free.
// NOTE(review): original line 407 (the recursive isSSATMinMaxPattern call on
// the ICmp's single user that reassigns FP) is missing from this extraction —
// restore from upstream before compiling.
401static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
 // Only the exact INT32_MIN bound (as a 64-bit immediate) forms this pattern.
402 if (Imm.getBitWidth() != 64 ||
403 Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
404 return false;
405 Value *FP = isSSATMinMaxPattern(Inst, Imm);
406 if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
408 if (!FP)
409 return false;
 // The saturated value must ultimately come from an fptosi.
410 return isa<FPToSIInst>(FP);
411}
412
// Cost of immediate `Imm` when used as operand `Idx` of an instruction of
// kind `Opcode`: returns 0 when the immediate folds into the instruction
// (div-by-constant, gep offsets, UXTB/UXTH masks, cmn/adds forms, MVN,
// SSAT patterns), otherwise defers to getIntImmCost.
// NOTE(review): this extraction is missing the `TTI::TargetCostKind CostKind`
// parameter line (original line 415) and the isSSATMinMaxPattern call on the
// ICmp's user inside the SSAT condition (original line 466) — restore from
// upstream before compiling.
413InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
 414 const APInt &Imm, Type *Ty,
 416 Instruction *Inst) const {
 417 // Division by a constant can be turned into multiplication, but only if we
 418 // know it's constant. So it's not so much that the immediate is cheap (it's
 419 // not), but that the alternative is worse.
 420 // FIXME: this is probably unneeded with GlobalISel.
 421 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
 422 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
 423 Idx == 1)
 424 return 0;
 425
 426 // Leave any gep offsets for the CodeGenPrepare, which will do a better job at
 427 // splitting any large offsets.
 428 if (Opcode == Instruction::GetElementPtr && Idx != 0)
 429 return 0;
 430
 431 if (Opcode == Instruction::And) {
 432 // UXTB/UXTH
 433 if (Imm == 255 || Imm == 65535)
 434 return 0;
 435 // Conversion to BIC is free, and means we can use ~Imm instead.
 436 return std::min(getIntImmCost(Imm, Ty, CostKind),
 437 getIntImmCost(~Imm, Ty, CostKind))
 438 }
 439
 440 if (Opcode == Instruction::Add)
 441 // Conversion to SUB is free, and means we can use -Imm instead.
 442 return std::min(getIntImmCost(Imm, Ty, CostKind),
 443 getIntImmCost(-Imm, Ty, CostKind));
 444
 445 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
 446 Ty->getIntegerBitWidth() == 32) {
 447 int64_t NegImm = -Imm.getSExtValue();
 448 if (ST->isThumb2() && NegImm < 1<<12)
 449 // icmp X, #-C -> cmn X, #C
 450 return 0;
 451 if (ST->isThumb() && NegImm < 1<<8)
 452 // icmp X, #-C -> adds X, #C
 453 return 0;
 454 }
 455
 456 // xor a, -1 can always be folded to MVN
 457 if (Opcode == Instruction::Xor && Imm.isAllOnes())
 458 return 0;
 459
 460 // Ensures negative constant of min(max()) or max(min()) patterns that
 461 // match to SSAT instructions don't get hoisted
 462 if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
 463 Ty->getIntegerBitWidth() <= 32) {
 464 if (isSSATMinMaxPattern(Inst, Imm) ||
 465 (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
 467 return 0;
 468 }
 469
 470 if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
 471 return 0;
 472
 473 // We can convert <= -1 to < 0, which is generally quite cheap.
 474 if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
 475 ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
 476 if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
 477 return std::min(getIntImmCost(Imm, Ty, CostKind),
 478 getIntImmCost(Imm + 1, Ty, CostKind));
 479 }
 480
 481 return getIntImmCost(Imm, Ty, CostKind);
 482}
483
// Cost of control-flow instructions: made free for vector targets
// (NEON/MVE) to avoid cost-model-driven vectorizer regressions; otherwise
// defers to the base implementation.
// NOTE(review): the signature head (original lines 484-485, presumably
// `InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
// TTI::TargetCostKind CostKind,`) and the first half of the condition
// (original line 487) are missing from this extraction — confirm upstream.
486 const Instruction *I) const {
488 (ST->hasNEON() || ST->hasMVEIntegerOps())) {
489 // FIXME: The vectorizer is highly sensistive to the cost of these
490 // instructions, which suggests that it may be using the costs incorrectly.
491 // But, for now, just make them free to avoid performance regressions for
492 // vector targets.
493 return 0;
494 }
495 return BaseT::getCFInstrCost(Opcode, CostKind, I);
496}
497
// Cost of a cast instruction (ext/trunc/fpext/fptrunc/int<->fp) from `Src`
// to `Dst` on ARM: consults a series of NEON/MVE/scalar conversion cost
// tables, models free extends/truncs folded into loads/stores, and falls
// back to the base implementation scaled by the MVE cost factor.
// NOTE(review): this extraction is missing the signature head (original line
// 498, presumably `InstructionCost ARMTTIImpl::getCastInstrCost(unsigned
// Opcode, Type *Dst,`), the `TTI::CastContextHint CCH` /
// `TTI::TargetCostKind CostKind` parameter lines (500-501), the CostKind
// check inside AdjustCost (508), and the `CCH == ...Masked) {` tail of the
// load/store-context condition (541) — restore from upstream before
// compiling.
499 Type *Src,
 502 const Instruction *I) const {
 503 int ISD = TLI->InstructionOpcodeToISD(Opcode);
 504 assert(ISD && "Invalid opcode");
 505
 506 // TODO: Allow non-throughput costs that aren't binary.
 507 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
 509 return Cost == 0 ? 0 : 1;
 510 return Cost;
 511 };
 // A FP element type is "legal" when the subtarget has hardware support
 // for that width (VFP2 for f32, FP64 for f64, FullFP16 for f16).
 512 auto IsLegalFPType = [this](EVT VT) {
 513 EVT EltVT = VT.getScalarType();
 514 return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
 515 (EltVT == MVT::f64 && ST->hasFP64()) ||
 516 (EltVT == MVT::f16 && ST->hasFullFP16());
 517 };
 518
 519 EVT SrcTy = TLI->getValueType(DL, Src);
 520 EVT DstTy = TLI->getValueType(DL, Dst);
 521
 522 if (!SrcTy.isSimple() || !DstTy.isSimple())
 523 return AdjustCost(
 524 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
 525
 526 // Extending masked load/Truncating masked stores is expensive because we
 527 // currently don't split them. This means that we'll likely end up
 528 // loading/storing each element individually (hence the high cost).
 529 if ((ST->hasMVEIntegerOps() &&
 530 (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
 531 Opcode == Instruction::SExt)) ||
 532 (ST->hasMVEFloatOps() &&
 533 (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
 534 IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
 535 if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
 536 return 2 * DstTy.getVectorNumElements() *
 537 ST->getMVEVectorCostFactor(CostKind);
 538
 539 // The extend of other kinds of load is free
 540 if (CCH == TTI::CastContextHint::Normal ||
 542 static const TypeConversionCostTblEntry LoadConversionTbl[] = {
 543 {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
 544 {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
 545 {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
 546 {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
 547 {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
 548 {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
 549 {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
 550 {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
 551 {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
 552 {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
 553 {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
 554 {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
 555 };
 556 if (const auto *Entry = ConvertCostTableLookup(
 557 LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
 558 return AdjustCost(Entry->Cost);
 559
 560 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
 561 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
 562 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
 563 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
 564 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
 565 {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
 566 {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
 567 // The following extend from a legal type to an illegal type, so need to
 568 // split the load. This introduced an extra load operation, but the
 569 // extend is still "free".
 570 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
 571 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
 572 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
 573 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
 574 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
 575 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
 576 };
 577 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
 578 if (const auto *Entry =
 579 ConvertCostTableLookup(MVELoadConversionTbl, ISD,
 580 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
 581 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
 582 }
 583
 584 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
 585 // FPExtends are similar but also require the VCVT instructions.
 586 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
 587 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
 588 };
 589 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
 590 if (const auto *Entry =
 591 ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
 592 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
 593 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
 594 }
 595
 596 // The truncate of a store is free. This is the mirror of extends above.
 597 static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
 598 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
 599 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
 600 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
 601 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
 602 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
 603 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
 604 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
 605 };
 606 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
 607 if (const auto *Entry =
 608 ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
 609 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
 610 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
 611 }
 612
 613 static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
 614 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
 615 {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
 616 };
 617 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
 618 if (const auto *Entry =
 619 ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
 620 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
 621 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
 622 }
 623 }
 624
 625 // NEON vector operations that can extend their inputs.
 626 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
 627 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
 628 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
 629 // vaddl
 630 { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
 631 { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
 632 // vsubl
 633 { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
 634 { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
 635 // vmull
 636 { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
 637 { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
 638 // vshll
 639 { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
 640 { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
 641 };
 642
 // The extend is free if its single user is an operation listed above
 // (keyed on the user's opcode, not the cast's).
 643 auto *User = cast<Instruction>(*I->user_begin());
 644 int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
 645 if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
 646 DstTy.getSimpleVT(),
 647 SrcTy.getSimpleVT())) {
 648 return AdjustCost(Entry->Cost);
 649 }
 650 }
 651
 652 // Single to/from double precision conversions.
 653 if (Src->isVectorTy() && ST->hasNEON() &&
 654 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
 655 DstTy.getScalarType() == MVT::f32) ||
 656 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
 657 DstTy.getScalarType() == MVT::f64))) {
 658 static const CostTblEntry NEONFltDblTbl[] = {
 659 // Vector fptrunc/fpext conversions.
 660 {ISD::FP_ROUND, MVT::v2f64, 2},
 661 {ISD::FP_EXTEND, MVT::v2f32, 2},
 662 {ISD::FP_EXTEND, MVT::v4f32, 4}};
 663
 664 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
 665 if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
 666 return AdjustCost(LT.first * Entry->Cost);
 667 }
 668
 669 // Some arithmetic, load and store operations have specific instructions
 670 // to cast up/down their types automatically at no extra cost.
 671 // TODO: Get these tables to know at least what the related operations are.
 672 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
 673 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
 674 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
 675 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
 676 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
 677 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
 678 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
 679
 680 // The number of vmovl instructions for the extension.
 681 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
 682 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
 683 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
 684 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
 685 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
 686 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
 687 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
 688 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
 689 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
 690 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
 691 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
 692 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
 693 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
 694 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
 695 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
 696 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
 697 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
 698 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
 699
 700 // Operations that we legalize using splitting.
 701 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
 702 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
 703
 704 // Vector float <-> i32 conversions.
 705 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
 706 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
 707
 708 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
 709 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
 710 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
 711 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
 712 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
 713 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
 714 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
 715 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
 716 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
 717 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
 718 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
 719 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
 720 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
 721 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
 722 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
 723 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
 724 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
 725 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
 726 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
 727 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
 728
 729 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
 730 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
 731 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
 732 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
 733 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
 734 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
 735
 736 // Vector double <-> i32 conversions.
 737 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
 738 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
 739
 740 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
 741 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
 742 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
 743 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
 744 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
 745 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
 746
 747 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
 748 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
 749 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
 750 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
 751 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
 752 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
 753 };
 754
 755 if (SrcTy.isVector() && ST->hasNEON()) {
 756 if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
 757 DstTy.getSimpleVT(),
 758 SrcTy.getSimpleVT()))
 759 return AdjustCost(Entry->Cost);
 760 }
 761
 762 // Scalar float to integer conversions.
 763 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
 764 { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
 765 { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
 766 { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
 767 { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
 768 { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
 769 { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
 770 { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
 771 { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
 772 { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
 773 { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
 774 { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
 775 { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
 776 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
 777 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
 778 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
 779 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
 780 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
 781 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
 782 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
 783 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
 784 };
 785 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
 786 if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
 787 DstTy.getSimpleVT(),
 788 SrcTy.getSimpleVT()))
 789 return AdjustCost(Entry->Cost);
 790 }
 791
 792 // Scalar integer to float conversions.
 793 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
 794 { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
 795 { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
 796 { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
 797 { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
 798 { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
 799 { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
 800 { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
 801 { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
 802 { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
 803 { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
 804 { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
 805 { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
 806 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
 807 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
 808 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
 809 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
 810 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
 811 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
 812 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
 813 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
 814 };
 815
 816 if (SrcTy.isInteger() && ST->hasNEON()) {
 817 if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
 818 ISD, DstTy.getSimpleVT(),
 819 SrcTy.getSimpleVT()))
 820 return AdjustCost(Entry->Cost);
 821 }
 822
 823 // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
 824 // instruction, i8->i32 is two. i64 zexts are an VAND with a constant, sext
 825 // are linearised so take more.
 826 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
 827 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
 828 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
 829 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
 830 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
 831 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
 832 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
 833 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
 834 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
 835 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
 836 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
 837 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
 838 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
 839 };
 840
 841 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
 842 if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
 843 ISD, DstTy.getSimpleVT(),
 844 SrcTy.getSimpleVT()))
 845 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
 846 }
 847
 848 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
 849 // As general rule, fp converts that were not matched above are scalarized
 850 // and cost 1 vcvt for each lane, so long as the instruction is available.
 851 // If not it will become a series of function calls.
 852 const InstructionCost CallCost =
 853 getCallInstrCost(nullptr, Dst, {Src}, CostKind);
 854 int Lanes = 1;
 855 if (SrcTy.isFixedLengthVector())
 856 Lanes = SrcTy.getVectorNumElements();
 857
 858 if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
 859 return Lanes;
 860 else
 861 return Lanes * CallCost;
 862 }
 863
 864 if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
 865 SrcTy.isFixedLengthVector()) {
 866 // Treat a truncate with larger than legal source (128bits for MVE) as
 867 // expensive, 2 instructions per lane.
 868 if ((SrcTy.getScalarType() == MVT::i8 ||
 869 SrcTy.getScalarType() == MVT::i16 ||
 870 SrcTy.getScalarType() == MVT::i32) &&
 871 SrcTy.getSizeInBits() > 128 &&
 872 SrcTy.getSizeInBits() > DstTy.getSizeInBits())
 873 return SrcTy.getVectorNumElements() * 2;
 874 }
 875
 876 // Scalar integer conversion costs.
 877 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
 878 // i16 -> i64 requires two dependent operations.
 879 { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
 880
 881 // Truncates on i64 are assumed to be free.
 882 { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
 883 { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
 884 { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
 885 { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
 886 };
 887
 888 if (SrcTy.isInteger()) {
 889 if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
 890 DstTy.getSimpleVT(),
 891 SrcTy.getSimpleVT()))
 892 return AdjustCost(Entry->Cost);
 893 }
 894
 // Fallback: base-class cost, scaled by the MVE factor for vector types.
 895 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
 896 ? ST->getMVEVectorCostFactor(CostKind)
 897 : 1;
 898 return AdjustCost(
 899 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
 900}
901
// Cost of vector insert/extract-element on ARM: penalizes D-subregister
// inserts on Swift-like cores, NEON cross-class (GPR<->NEON) moves, and MVE
// integer cross-lane moves; otherwise defers to the base implementation.
// NOTE(review): the signature head (original lines 902-903, presumably
// `InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode,
// Type *ValTy, TTI::TargetCostKind CostKind,`) is missing from this
// extraction — confirm against upstream.
904 unsigned Index, const Value *Op0,
905 const Value *Op1) const {
906 // Penalize inserting into an D-subregister. We end up with a three times
907 // lower estimated throughput on swift.
908 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
909 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
910 return 3;
911
912 if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
913 Opcode == Instruction::ExtractElement)) {
914 // Cross-class copies are expensive on many microarchitectures,
915 // so assume they are expensive by default.
916 if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
917 return 3;
918
919 // Even if it's not a cross class copy, this likely leads to mixing
920 // of NEON and VFP code and should be therefore penalized.
921 if (ValTy->isVectorTy() &&
922 ValTy->getScalarSizeInBits() <= 32)
923 return std::max<InstructionCost>(
924 BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1),
925 2U);
926 }
927
928 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
929 Opcode == Instruction::ExtractElement)) {
930 // Integer cross-lane moves are more expensive than float, which can
931 // sometimes just be vmovs. Integer involve being passes to GPR registers,
932 // causing more of a delay.
933 std::pair<InstructionCost, MVT> LT =
934 getTypeLegalizationCost(ValTy->getScalarType());
935 return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
936 }
937
938 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
939}
940
 942 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
 944 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
 // Cost of compare/select instructions on ARM (Thumb scalar, NEON, MVE).
 // NOTE(review): parts of this signature (CostKind/Op1Info parameters) are
 // truncated in this listing.
 945 int ISD = TLI->InstructionOpcodeToISD(Opcode);
 946
 947 // Thumb scalar code size cost for select.
 // NOTE(review): the guard condition opening this block (checking for
 // TCK_CodeSize and ISD::SELECT) is truncated in this listing.
 949 ST->isThumb() && !ValTy->isVectorTy()) {
 950 // Assume expensive structs.
 951 if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
 952 return TTI::TCC_Expensive;
 953
 954 // Select costs can vary because they:
 955 // - may require one or more conditional mov (including an IT),
 956 // - can't operate directly on immediates,
 957 // - require live flags, which we can't copy around easily.
 // NOTE(review): the declaration of the accumulator `Cost` is truncated here.
 959
 960 // Possible IT instruction for Thumb2, or more for Thumb1.
 961 ++Cost;
 962
 963 // i1 values may need rematerialising by using mov immediates and/or
 964 // flag setting instructions.
 965 if (ValTy->isIntegerTy(1))
 966 ++Cost;
 967
 968 return Cost;
 969 }
 970
 971 // If this is a vector min/max/abs, use the cost of that intrinsic directly
 972 // instead. Hopefully when min/max intrinsics are more prevalent this code
 973 // will not be needed.
 974 const Instruction *Sel = I;
 975 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
 976 Sel->hasOneUse())
 977 Sel = cast<Instruction>(Sel->user_back());
 978 if (Sel && ValTy->isVectorTy() &&
 979 (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
 980 const Value *LHS, *RHS;
 981 SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
 982 unsigned IID = 0;
 // Map the recognized select pattern to the equivalent intrinsic ID.
 983 switch (SPF) {
 984 case SPF_ABS:
 985 IID = Intrinsic::abs;
 986 break;
 987 case SPF_SMIN:
 988 IID = Intrinsic::smin;
 989 break;
 990 case SPF_SMAX:
 991 IID = Intrinsic::smax;
 992 break;
 993 case SPF_UMIN:
 994 IID = Intrinsic::umin;
 995 break;
 996 case SPF_UMAX:
 997 IID = Intrinsic::umax;
 998 break;
 999 case SPF_FMINNUM:
1000 IID = Intrinsic::minnum;
1001 break;
1002 case SPF_FMAXNUM:
1003 IID = Intrinsic::maxnum;
1004 break;
1005 default:
1006 break;
1007 }
1008 if (IID) {
1009 // The ICmp is free, the select gets the cost of the min/max/etc
1010 if (Sel != I)
1011 return 0;
1012 IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
1013 return getIntrinsicInstrCost(CostAttrs, CostKind);
1014 }
1015 }
1016
1017 // On NEON a vector select gets lowered to vbsl.
1018 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
1019 // Lowering of some vector selects is currently far from perfect.
1020 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
1021 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
1022 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
1023 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
1024 };
1025
1026 EVT SelCondTy = TLI->getValueType(DL, CondTy);
1027 EVT SelValTy = TLI->getValueType(DL, ValTy);
1028 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1029 if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
1030 SelCondTy.getSimpleVT(),
1031 SelValTy.getSimpleVT()))
1032 return Entry->Cost;
1033 }
1034
1035 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1036 return LT.first;
1037 }
1038
1039 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1040 (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1041 cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
1042 FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
 // NOTE(review): the definition of VecCondTy and the early-return value for
 // the !VecCondTy case are truncated in this listing.
1044 if (!VecCondTy)
1046
1047 // If we don't have mve.fp any fp operations will need to be scalarized.
1048 if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1049 // One scalarization insert, one scalarization extract and the cost of the
1050 // fcmps.
1051 return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
1052 /*Extract*/ true, CostKind) +
1053 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1054 /*Extract*/ false, CostKind) +
1055 VecValTy->getNumElements() *
1056 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1057 VecCondTy->getScalarType(), VecPred,
1058 CostKind, Op1Info, Op2Info, I);
1059 }
1060
1061 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1062 int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1063 // There are two types - the input that specifies the type of the compare
1064 // and the output vXi1 type. Because we don't know how the output will be
1065 // split, we may need an expensive shuffle to get two in sync. This has the
1066 // effect of making larger than legal compares (v8i32 for example)
1067 // expensive.
1068 if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1069 if (LT.first > 1)
1070 return LT.first * BaseCost +
1071 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1072 /*Extract*/ false, CostKind);
1073 return BaseCost;
1074 }
1075 }
1076
1077 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1078 // for "multiple beats" potentially needed by MVE instructions.
1079 int BaseCost = 1;
1080 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1081 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1082
1083 return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred,
1084 CostKind, Op1Info, Op2Info, I);
1085}
1086
1089 const SCEV *Ptr,
 // Cost of address computation for (vector) memory accesses.
 // NOTE(review): the opening of this signature
 // (getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, ...)) is
 // truncated in this listing.
1091 // Address computations in vectorized code with non-consecutive addresses will
1092 // likely result in more instructions compared to scalar code where the
1093 // computation can more often be merged into the index mode. The resulting
1094 // extra micro-ops can significantly decrease throughput.
1095 unsigned NumVectorInstToHideOverhead = 10;
1096 int MaxMergeDistance = 64;
1097
1098 if (ST->hasNEON()) {
 // Strided accesses beyond the merge distance pay the full overhead.
1099 if (PtrTy->isVectorTy() && SE &&
1100 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1101 return NumVectorInstToHideOverhead;
1102
1103 // In many cases the address computation is not merged into the instruction
1104 // addressing mode.
1105 return 1;
1106 }
1107 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1108}
1109
 // Returns true for MVE VCTP intrinsics so LSR keeps them in the chain and
 // does not block tail-predication.
 // NOTE(review): the function signature and the enclosing
 // dyn_cast<IntrinsicInst> guard are truncated in this listing — verify
 // against upstream.
1112 // If a VCTP is part of a chain, it's already profitable and shouldn't be
1113 // optimized, else LSR may block tail-predication.
1114 switch (II->getIntrinsicID()) {
1115 case Intrinsic::arm_mve_vctp8:
1116 case Intrinsic::arm_mve_vctp16:
1117 case Intrinsic::arm_mve_vctp32:
1118 case Intrinsic::arm_mve_vctp64:
1119 return true;
1120 default:
1121 break;
1122 }
1123 }
1124 return false;
1125}
1126
1128 unsigned /*AddressSpace*/,
1129 TTI::MaskKind /*MaskKind*/) const {
 // Legality of MVE masked loads/stores: requires MVE integer ops and the
 // enable-arm-maskedldst option, plus a naturally-aligned 8/16/32-bit
 // element type. NOTE(review): the opening of this signature (presumably
 // isLegalMaskedLoad/Store(Type *DataTy, Align Alignment, ...)) is truncated
 // in this listing.
1130 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1131 return false;
1132
1133 if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1134 // Don't support v2i1 yet.
1135 if (VecTy->getNumElements() == 2)
1136 return false;
1137
1138 // We don't support extending fp types.
1139 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1140 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1141 return false;
1142 }
1143
1144 unsigned EltWidth = DataTy->getScalarSizeInBits();
1145 return (EltWidth == 32 && Alignment >= 4) ||
1146 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1147}
1148
1149bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) const {
1150 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1151 return false;
1152
1153 unsigned EltWidth = Ty->getScalarSizeInBits();
1154 return ((EltWidth == 32 && Alignment >= 4) ||
1155 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1156}
1157
1158/// Given a memcpy/memset/memmove instruction, return the number of memory
1159/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1160/// call is used.
 // NOTE(review): the signature line (getNumMemOps(const IntrinsicInst *I))
 // is truncated in this listing.
1162 MemOp MOp;
1163 unsigned DstAddrSpace = ~0u;
1164 unsigned SrcAddrSpace = ~0u;
1165 const Function *F = I->getParent()->getParent();
1166
 // Build the MemOp descriptor for memcpy/memmove (transfers) ...
1167 if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1168 ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1169 // If 'size' is not a constant, a library call will be generated.
1170 if (!C)
1171 return -1;
1172
1173 const unsigned Size = C->getValue().getZExtValue();
1174 const Align DstAlign = MC->getDestAlign().valueOrOne();
1175 const Align SrcAlign = MC->getSourceAlign().valueOrOne();
1176
1177 MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1178 /*IsVolatile*/ false);
1179 DstAddrSpace = MC->getDestAddressSpace();
1180 SrcAddrSpace = MC->getSourceAddressSpace();
1181 }
 // ... or for memset (no source operand).
1182 else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1183 ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1184 // If 'size' is not a constant, a library call will be generated.
1185 if (!C)
1186 return -1;
1187
1188 const unsigned Size = C->getValue().getZExtValue();
1189 const Align DstAlign = MS->getDestAlign().valueOrOne();
1190
1191 MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1192 /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1193 DstAddrSpace = MS->getDestAddressSpace();
1194 }
1195 else
1196 llvm_unreachable("Expected a memcpy/move or memset!");
1197
 // Pick the store-count limit for this intrinsic; memset needs no loads, so
 // its per-op factor is 1 rather than 2 (load + store).
1198 unsigned Limit, Factor = 2;
1199 switch(I->getIntrinsicID()) {
1200 case Intrinsic::memcpy:
1201 Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1202 break;
1203 case Intrinsic::memmove:
1204 Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1205 break;
1206 case Intrinsic::memset:
1207 Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1208 Factor = 1;
1209 break;
1210 default:
1211 llvm_unreachable("Expected a memcpy/move or memset!");
1212 }
1213
1214 // MemOps will be populated with a list of data types that needs to be
1215 // loaded and stored. That's why we multiply the number of elements by 2 to
1216 // get the cost for this memcpy.
1217 std::vector<EVT> MemOps;
1218 LLVMContext &C = F->getContext();
1219 if (getTLI()->findOptimalMemOpLowering(C, MemOps, Limit, MOp, DstAddrSpace,
1220 SrcAddrSpace, F->getAttributes()))
1221 return MemOps.size() * Factor;
1222
1223 // If we can't find an optimal memop lowering, return the default cost
1224 return -1;
1225}
1226
1229
 // NOTE(review): the signature and the line computing NumOps (presumably
 // NumOps = getNumMemOps(cast<MemIntrinsic>(I))) are truncated in this
 // listing.
1230 // To model the cost of a library call, we assume 1 for the call, and
1231 // 3 for the argument setup.
1232 if (NumOps == -1)
1233 return 4;
1234 return NumOps;
1235}
1236
1238 VectorType *DstTy, VectorType *SrcTy,
1239 ArrayRef<int> Mask,
1241 int Index, VectorType *SubTp,
1243 const Instruction *CxtI) const {
 // Shuffle cost model for NEON (dup/rev/select tables) and MVE (vdup, vrev,
 // and interleaving patterns). NOTE(review): the opening of this signature
 // (getShuffleCost(TTI::ShuffleKind Kind, ...)) and the CostKind/Args
 // parameters are truncated in this listing.
1244 assert((Mask.empty() || DstTy->isScalableTy() ||
1245 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
1246 "Expected the Mask to match the return size if given");
1247 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
1248 "Expected the same scalar types");
1249
1250 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1251 // Treat extractsubvector as single op permutation.
1252 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
 // NOTE(review): the statement executed for IsExtractSubvector (presumably
 // remapping Kind to SK_PermuteSingleSrc) is truncated in this listing.
1253 if (IsExtractSubvector)
1255 if (ST->hasNEON()) {
1256 if (Kind == TTI::SK_Broadcast) {
1257 static const CostTblEntry NEONDupTbl[] = {
1258 // VDUP handles these cases.
1259 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1260 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1261 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1262 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1263 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1264 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1265
1266 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1267 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1268 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1269 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
1270
1271 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1272 if (const auto *Entry =
1273 CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1274 return LT.first * Entry->Cost;
1275 }
1276 if (Kind == TTI::SK_Reverse) {
1277 static const CostTblEntry NEONShuffleTbl[] = {
1278 // Reverse shuffle cost one instruction if we are shuffling within a
1279 // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1280 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1281 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1282 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1283 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1284 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1285 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1286
1287 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1288 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1289 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
1290 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
1291
1292 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1293 if (const auto *Entry =
1294 CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1295 return LT.first * Entry->Cost;
1296 }
1297 if (Kind == TTI::SK_Select) {
1298 static const CostTblEntry NEONSelShuffleTbl[] = {
1299 // Select shuffle cost table for ARM. Cost is the number of
1300 // instructions
1301 // required to create the shuffled vector.
1302
1303 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1304 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1305 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1306 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1307
1308 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1309 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1310 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
1311
1312 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
1313
1314 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
1315
1316 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1317 if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1318 ISD::VECTOR_SHUFFLE, LT.second))
1319 return LT.first * Entry->Cost;
1320 }
1321 }
1322 if (ST->hasMVEIntegerOps()) {
1323 if (Kind == TTI::SK_Broadcast) {
1324 static const CostTblEntry MVEDupTbl[] = {
1325 // VDUP handles these cases.
1326 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1327 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1328 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
1329 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1330 {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
1331
1332 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1333 if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1334 LT.second))
1335 return LT.first * Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
1336 }
1337
1338 if (!Mask.empty()) {
1339 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1340 // Check for LD2/LD4 instructions, which are represented in llvm IR as
1341 // deinterleaving-shuffle(load). The shuffle cost could potentially be
1342 // free, but we model it with a cost of LT.first so that LD2/LD4 have a
1343 // higher cost than just the load.
 // NOTE(review): the deinterleave-mask predicate calls inside this
 // condition are truncated in this listing.
1344 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
1345 (LT.second.getScalarSizeInBits() == 8 ||
1346 LT.second.getScalarSizeInBits() == 16 ||
1347 LT.second.getScalarSizeInBits() == 32) &&
1348 LT.second.getSizeInBits() == 128 &&
1349 ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
1351 (TLI->getMaxSupportedInterleaveFactor() == 4 &&
1353 return ST->getMVEVectorCostFactor(CostKind) *
1354 std::max<InstructionCost>(1, LT.first / 4);
1355
1356 // Check for ST2/ST4 instructions, which are represented in llvm IR as
1357 // store(interleaving-shuffle). The shuffle cost could potentially be
1358 // free, but we model it with a cost of LT.first so that ST2/ST4 have a
1359 // higher cost than just the store.
 // NOTE(review): the interleave-mask predicate calls inside this
 // condition are truncated in this listing.
1360 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
1361 (LT.second.getScalarSizeInBits() == 8 ||
1362 LT.second.getScalarSizeInBits() == 16 ||
1363 LT.second.getScalarSizeInBits() == 32) &&
1364 LT.second.getSizeInBits() == 128 &&
1365 ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
1367 Mask, 2, SrcTy->getElementCount().getKnownMinValue() * 2)) ||
1368 (TLI->getMaxSupportedInterleaveFactor() == 4 &&
1370 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2))))
1371 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1372
 // VREV-style masks (16/32/64-bit groups) map to a single MVE vrev.
1373 if (LT.second.isVector() &&
1374 Mask.size() <= LT.second.getVectorNumElements() &&
1375 (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1376 isVREVMask(Mask, LT.second, 64)))
1377 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1378 }
1379 }
1380
1381 // Restore optimal kind.
 // NOTE(review): the statement restoring Kind (presumably back to
 // SK_ExtractSubvector) is truncated in this listing.
1382 if (IsExtractSubvector)
1384 int BaseCost = ST->hasMVEIntegerOps() && SrcTy->isVectorTy()
1385 ? ST->getMVEVectorCostFactor(CostKind)
1386 : 1;
1387 return BaseCost * BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind,
1388 Index, SubTp);
1389}
1390
1392 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1394 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
 // Arithmetic instruction cost model: Thumb i1 logic, NEON division tables,
 // free-shift folding, DSP (S/U)MLAL patterns and the MVE beat factor.
 // NOTE(review): the opening of this signature and the Op1Info/Op2Info
 // parameters are truncated in this listing.
1395 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1396 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1397 // Make operations on i1 relatively expensive as this often involves
1398 // combining predicates. AND and XOR should be easier to handle with IT
1399 // blocks.
1400 switch (ISDOpcode) {
1401 default:
1402 break;
1403 case ISD::AND:
1404 case ISD::XOR:
1405 return 2;
1406 case ISD::OR:
1407 return 3;
1408 }
1409 }
1410
1411 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1412
1413 if (ST->hasNEON()) {
1414 const unsigned FunctionCallDivCost = 20;
1415 const unsigned ReciprocalDivCost = 10;
1416 static const CostTblEntry CostTbl[] = {
1417 // Division.
1418 // These costs are somewhat random. Choose a cost of 20 to indicate that
1419 // vectorizing division (added function call) is going to be very expensive.
1420 // Double registers types.
1421 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1422 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1423 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1424 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1425 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1426 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1427 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1428 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1429 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1430 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1431 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1432 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1433 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1434 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1435 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1436 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1437 // Quad register types.
1438 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1439 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1440 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1441 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1442 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1443 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1444 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1445 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1446 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1447 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1448 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1449 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1450 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1451 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1452 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1453 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1454 // Multiplication.
1455 };
1456
1457 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1458 return LT.first * Entry->Cost;
1459
 // NOTE(review): the declaration of Cost (presumably from
 // BaseT::getArithmeticInstrCost) is truncated in this listing.
1461 Opcode, Ty, CostKind, Op1Info, Op2Info);
1462
1463 // This is somewhat of a hack. The problem that we are facing is that SROA
1464 // creates a sequence of shift, and, or instructions to construct values.
1465 // These sequences are recognized by the ISel and have zero-cost. Not so for
1466 // the vectorized code. Because we have support for v2i64 but not i64 those
1467 // sequences look particularly beneficial to vectorize.
1468 // To work around this we increase the cost of v2i64 operations to make them
1469 // seem less beneficial.
1470 if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
1471 Cost += 4;
1472
1473 return Cost;
1474 }
1475
1476 // If this operation is a shift on arm/thumb2, it might well be folded into
1477 // the following instruction, hence having a cost of 0.
1478 auto LooksLikeAFreeShift = [&]() {
1479 if (ST->isThumb1Only() || Ty->isVectorTy())
1480 return false;
1481
1482 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1483 return false;
1484 if (!Op2Info.isUniform() || !Op2Info.isConstant())
1485 return false;
1486
1487 // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1488 switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1489 case Instruction::Add:
1490 case Instruction::Sub:
1491 case Instruction::And:
1492 case Instruction::Xor:
1493 case Instruction::Or:
1494 case Instruction::ICmp:
1495 return true;
1496 default:
1497 return false;
1498 }
1499 };
1500 if (LooksLikeAFreeShift())
1501 return 0;
1502
1503 // When targets have both DSP and MVE we find that the
1504 // the compiler will attempt to vectorize as well as using
1505 // scalar (S/U)MLAL operations. This is in cases where we have
1506 // the pattern ext(mul(ext(i16), ext(i16))) we find
1507 // that codegen performs better when only using (S/U)MLAL scalar
1508 // ops instead of trying to mix vector ops with (S/U)MLAL ops. We therefore
1509 // check if a mul instruction is used in a (U/S)MLAL pattern.
1510 auto MulInDSPMLALPattern = [&](const Instruction *I, unsigned Opcode,
1511 Type *Ty) -> bool {
1512 if (!ST->hasDSP())
1513 return false;
1514
1515 if (!I)
1516 return false;
1517
1518 if (Opcode != Instruction::Mul)
1519 return false;
1520
1521 if (Ty->isVectorTy())
1522 return false;
1523
1524 auto ValueOpcodesEqual = [](const Value *LHS, const Value *RHS) -> bool {
1525 return cast<Instruction>(LHS)->getOpcode() ==
1526 cast<Instruction>(RHS)->getOpcode();
1527 };
1528 auto IsExtInst = [](const Value *V) -> bool {
1529 return isa<ZExtInst>(V) || isa<SExtInst>(V);
1530 };
1531 auto IsExtensionFromHalf = [](const Value *V) -> bool {
1532 return cast<Instruction>(V)->getOperand(0)->getType()->isIntegerTy(16);
1533 };
1534
1535 // We check the arguments of the instruction to see if they're extends
1536 auto *BinOp = dyn_cast<BinaryOperator>(I);
1537 if (!BinOp)
1538 return false;
1539 Value *Op0 = BinOp->getOperand(0);
1540 Value *Op1 = BinOp->getOperand(1);
1541 if (IsExtInst(Op0) && IsExtInst(Op1) && ValueOpcodesEqual(Op0, Op1)) {
1542 // We're interested in an ext of an i16
1543 if (!I->getType()->isIntegerTy(32) || !IsExtensionFromHalf(Op0) ||
1544 !IsExtensionFromHalf(Op1))
1545 return false;
1546 // We need to check if this result will be further extended to i64
1547 // and that all these uses are SExt
1548 for (auto *U : I->users())
1549 if (!IsExtInst(U))
1550 return false;
1551 return true;
1552 }
1553
1554 return false;
1555 };
1556
1557 if (MulInDSPMLALPattern(CxtI, Opcode, Ty))
1558 return 0;
1559
1560 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1561 // for "multiple beats" potentially needed by MVE instructions.
1562 int BaseCost = 1;
1563 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1564 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1565
1566 // The rest of this mostly follows what is done in
1567 // BaseT::getArithmeticInstrCost, without treating floats as more expensive
1568 // that scalars or increasing the costs for custom operations. The results is
1569 // also multiplied by the MVEVectorCostFactor where appropriate.
1570 if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1571 return LT.first * BaseCost;
1572
1573 // Else this is expand, assume that we need to scalarize this op.
1574 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1575 unsigned Num = VTy->getNumElements();
 // NOTE(review): the declaration of Cost (per-scalar arithmetic cost) is
 // truncated in this listing.
1577 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1578 // Return the cost of multiple scalar invocation plus the cost of
1579 // inserting and extracting the values.
1580 SmallVector<Type *> Tys(Args.size(), Ty);
1581 return BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind) +
1582 Num * Cost;
1583 }
1584
1585 return BaseCost;
1586}
1587
1589 Align Alignment,
1590 unsigned AddressSpace,
1592 TTI::OperandValueInfo OpInfo,
1593 const Instruction *I) const {
 // Load/store cost model. NOTE(review): the opening of this signature
 // (getMemoryOpCost(unsigned Opcode, Type *Src, ...)) and the CostKind
 // parameter are truncated in this listing.
1594 // TODO: Handle other cost kinds.
 // NOTE(review): the cost-kind guard for this early return is truncated in
 // this listing.
1596 return 1;
1597
1598 // Type legalization can't handle structs
1599 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1600 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1601 CostKind);
1602
1603 if (ST->hasNEON() && Src->isVectorTy() && Alignment != Align(16) &&
1604 cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1605 // Unaligned loads/stores are extremely inefficient.
1606 // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
1607 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1608 return LT.first * 4;
1609 }
1610
1611 // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1612 // Same for stores.
1613 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1614 ((Opcode == Instruction::Load && I->hasOneUse() &&
1615 isa<FPExtInst>(*I->user_begin())) ||
1616 (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
 // NOTE(review): the declaration of SrcVTy (cast of Src) is truncated in
 // this listing.
1618 Type *DstTy =
1619 Opcode == Instruction::Load
1620 ? (*I->user_begin())->getType()
1621 : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1622 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1623 DstTy->getScalarType()->isFloatTy())
1624 return ST->getMVEVectorCostFactor(CostKind);
1625 }
1626
1627 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1628 ? ST->getMVEVectorCostFactor(CostKind)
1629 : 1;
1630 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1631 CostKind, OpInfo, I);
1632}
1633
 // Cost of masked load/store intrinsics: cheap (one MVE beat factor) when the
 // target can lower them natively, otherwise a high scalarization estimate.
 // NOTE(review): the signature opening (taking a MICA descriptor and
 // CostKind) is truncated in this listing.
1637 unsigned IID = MICA.getID();
1638 Type *Src = MICA.getDataType();
1639 Align Alignment = MICA.getAlignment();
1640 unsigned AddressSpace = MICA.getAddressSpace();
1641 if (ST->hasMVEIntegerOps()) {
1642 if (IID == Intrinsic::masked_load &&
1643 isLegalMaskedLoad(Src, Alignment, AddressSpace))
1644 return ST->getMVEVectorCostFactor(CostKind);
1645 if (IID == Intrinsic::masked_store &&
1646 isLegalMaskedStore(Src, Alignment, AddressSpace))
1647 return ST->getMVEVectorCostFactor(CostKind);
1648 }
 // NOTE(review): the fallback return for non-fixed-vector types is truncated
 // in this listing.
1649 if (!isa<FixedVectorType>(Src))
1651 // Scalar cost, which is currently very high due to the efficiency of the
1652 // generated code.
1653 return cast<FixedVectorType>(Src)->getNumElements() * 8;
1654}
1655
1657 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1658 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1659 bool UseMaskForCond, bool UseMaskForGaps) const {
 // Cost of vldN/vstN-style interleaved accesses.
 // NOTE(review): the first line of this signature
 // (getInterleavedMemoryOpCost(...) is truncated in this listing.
1660 assert(Factor >= 2 && "Invalid interleave factor");
1661 assert(isa<VectorType>(VecTy) && "Expect a vector type");
1662
1663 // vldN/vstN doesn't support vector types of i64/f64 element.
1664 bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1665
1666 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1667 !UseMaskForCond && !UseMaskForGaps) {
1668 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1669 auto *SubVecTy =
1670 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1671
1672 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1673 // Accesses having vector types that are a multiple of 128 bits can be
1674 // matched to more than one vldN/vstN instruction.
1675 int BaseCost =
1676 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1677 if (NumElts % Factor == 0 &&
1678 TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1679 return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1680
1681 // Some smaller than legal interleaved patterns are cheap as we can make
1682 // use of the vmovn or vrev patterns to interleave a standard load. This is
1683 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1684 // promoted differently). The cost of 2 here is then a load and vrev or
1685 // vmovn.
1686 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1687 VecTy->isIntOrIntVectorTy() &&
1688 DL.getTypeSizeInBits(SubVecTy).getFixedValue() <= 64)
1689 return 2 * BaseCost;
1690 }
1691
1692 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1693 Alignment, AddressSpace, CostKind,
1694 UseMaskForCond, UseMaskForGaps);
1695}
1696
1698 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1699 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
 // Cost of MVE gather/scatter accesses: compares a serialized "vector" cost
 // against a scalarized cost and returns whichever models the lowering.
 // NOTE(review): the first line of this signature
 // (getGatherScatterOpCost(...) is truncated in this listing.
1700 using namespace PatternMatch;
1701 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1702 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1703 Alignment, CostKind, I);
1704
1705 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1706 auto *VTy = cast<FixedVectorType>(DataTy);
1707
1708 // TODO: Splitting, once we do that.
1709
1710 unsigned NumElems = VTy->getNumElements();
1711 unsigned EltSize = VTy->getScalarSizeInBits();
1712 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(DataTy);
1713
1714 // For now, it is assumed that for the MVE gather instructions the loads are
1715 // all effectively serialised. This means the cost is the scalar cost
1716 // multiplied by the number of elements being loaded. This is possibly very
1717 // conservative, but even so we still end up vectorising loops because the
1718 // cost per iteration for many loops is lower than for scalar loops.
1719 InstructionCost VectorCost =
1720 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1721 // The scalarization cost should be a lot higher. We use the number of vector
1722 // elements plus the scalarization overhead. If masking is required then a lot
1723 // of little blocks will be needed and potentially a scalarized p0 mask,
1724 // greatly increasing the cost.
1725 InstructionCost ScalarCost =
1726 NumElems * LT.first + (VariableMask ? NumElems * 5 : 0) +
1727 BaseT::getScalarizationOverhead(VTy, /*Insert*/ true, /*Extract*/ false,
1728 CostKind) +
1729 BaseT::getScalarizationOverhead(VTy, /*Insert*/ false, /*Extract*/ true,
1730 CostKind);
1731
1732 if (EltSize < 8 || Alignment < EltSize / 8)
1733 return ScalarCost;
1734
1735 unsigned ExtSize = EltSize;
1736 // Check whether there's a single user that asks for an extended type
1737 if (I != nullptr) {
1738 // Dependent of the caller of this function, a gather instruction will
1739 // either have opcode Instruction::Load or be a call to the masked_gather
1740 // intrinsic
 // NOTE(review): the masked_gather intrinsic-ID comparison in this
 // condition is truncated in this listing.
1741 if ((I->getOpcode() == Instruction::Load ||
1743 I->hasOneUse()) {
1744 const User *Us = *I->users().begin();
1745 if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1746 // only allow valid type combinations
1747 unsigned TypeSize =
1748 cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1749 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1750 (TypeSize == 16 && EltSize == 8)) &&
1751 TypeSize * NumElems == 128) {
1752 ExtSize = TypeSize;
1753 }
1754 }
1755 }
1756 // Check whether the input data needs to be truncated
1757 TruncInst *T;
 // NOTE(review): the masked_scatter intrinsic-ID comparison in this
 // condition is truncated in this listing.
1758 if ((I->getOpcode() == Instruction::Store ||
1760 (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1761 // Only allow valid type combinations
1762 unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1763 if (((EltSize == 16 && TypeSize == 32) ||
1764 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1765 TypeSize * NumElems == 128)
1766 ExtSize = TypeSize;
1767 }
1768 }
1769
1770 if (ExtSize * NumElems != 128 || NumElems < 4)
1771 return ScalarCost;
1772
1773 // Any (aligned) i32 gather will not need to be scalarised.
1774 if (ExtSize == 32)
1775 return VectorCost;
1776 // For smaller types, we need to ensure that the gep's inputs are correctly
1777 // extended from a small enough value. Other sizes (including i64) are
1778 // scalarized for now.
1779 if (ExtSize != 8 && ExtSize != 16)
1780 return ScalarCost;
1781
1782 if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1783 Ptr = BC->getOperand(0);
1784 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1785 if (GEP->getNumOperands() != 2)
1786 return ScalarCost;
1787 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1788 // Scale needs to be correct (which is only relevant for i16s).
1789 if (Scale != 1 && Scale * 8 != ExtSize)
1790 return ScalarCost;
1791 // And we need to zext (not sext) the indexes from a small enough type.
1792 if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1793 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1794 return VectorCost;
1795 }
1796 return ScalarCost;
1797 }
1798 return ScalarCost;
1799}
1800
// ARM cost model for vector arithmetic reductions (vector.reduce.fadd/fmul,
// and/or/xor, add). FP and bitwise reductions are costed as a log2 sequence of
// halving vector ops down to the target's vector width, then scalar steps;
// legal MVE integer ADD reductions map to a single VADDV via the cost table.
// NOTE(review): this is a doxygen-extracted listing — the opening signature
// lines (InstructionCost ARMTTIImpl::getArithmeticReductionCost(...)) and
// hyperlinked lines (e.g. original 1804, 1871) were dropped by the extraction;
// verify against the upstream file before editing.
1803 std::optional<FastMathFlags> FMF,
1805
1806 EVT ValVT = TLI->getValueType(DL, ValTy);
1807 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1808 unsigned EltSize = ValVT.getScalarSizeInBits();
1809
1810 // In general floating point reductions are a series of elementwise
1811 // operations, with free extracts on each step. These are either in-order or
1812 // treewise depending on whether that is allowed by the fast math flags.
1813 if ((ISD == ISD::FADD || ISD == ISD::FMUL) &&
1814 ((EltSize == 32 && ST->hasVFP2Base()) ||
1815 (EltSize == 64 && ST->hasFP64()) ||
1816 (EltSize == 16 && ST->hasFullFP16()))) {
1817 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
// VecLimit is the widest vector the target supports: 128 for MVE, 64 for
// NEON; -1 (wraps to UINT_MAX) disables the vector-halving loop entirely.
1818 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1819 InstructionCost VecCost = 0;
// Tree-wise halving is only legal when the reduction is not required to be
// in-order (no strict FMF) and the element count is a power of two.
1820 while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(NumElts) &&
1821 NumElts * EltSize > VecLimit) {
1822 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1823 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1824 NumElts /= 2;
1825 }
1826
1827 // For fp16 we need to extract the upper lane elements. MVE can add a
1828 // VREV+FMIN/MAX to perform another vector step instead.
1829 InstructionCost ExtractCost = 0;
1830 if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() &&
1831 ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) {
1832 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1833 NumElts /= 2;
1834 } else if (ValVT.getVectorElementType() == MVT::f16)
1835 ExtractCost = NumElts / 2;
1836
// Remaining NumElts elements are reduced with scalar ops.
1837 return VecCost + ExtractCost +
1838 NumElts *
1839 getArithmeticInstrCost(Opcode, ValTy->getElementType(), CostKind);
1840 }
1841
1842 if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
1843 (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
1844 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1845 unsigned VecLimit =
1846 ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1847 InstructionCost VecCost = 0;
// Bitwise reductions are always reassociable, so no ordered-reduction check.
1848 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
1849 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1850 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1851 NumElts /= 2;
1852 }
1853 // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
1854 // step.
1855 if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
1856 NumElts * EltSize == 64) {
1857 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts);
1858 VecCost += ST->getMVEVectorCostFactor(CostKind) +
1859 getArithmeticInstrCost(Opcode, VecTy, CostKind);
1860 NumElts /= 2;
1861 }
1862
1863 // From here we extract the elements and perform the and/or/xor.
1864 InstructionCost ExtractCost = NumElts;
1865 return VecCost + ExtractCost +
1866 (NumElts - 1) * getArithmeticInstrCost(
1867 Opcode, ValTy->getElementType(), CostKind);
1868 }
1869
// NOTE(review): the continuation of this condition (original line 1871) was
// dropped by the extraction; upstream it also checks the legalized type.
1870 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
1872 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1873
1874 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1875
// Legal MVE VADDV cases: one instruction per legalized 128-bit vector.
1876 static const CostTblEntry CostTblAdd[]{
1877 {ISD::ADD, MVT::v16i8, 1},
1878 {ISD::ADD, MVT::v8i16, 1},
1879 {ISD::ADD, MVT::v4i32, 1},
1880 };
1881 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1882 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1883
1884 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1885}
1886
// Cost of an extended reduction (zext/sext of the input followed by an ADD
// reduction). Legal MVE cases lower to a single VADDV/VADDLV; everything else
// falls back to the generic base-class model.
// NOTE(review): the leading signature line (InstructionCost
// ARMTTIImpl::getExtendedReductionCost(, original 1887) was dropped by the
// doxygen extraction — verify against the upstream file.
1888 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1889 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
1890 EVT ValVT = TLI->getValueType(DL, ValTy);
1891 EVT ResVT = TLI->getValueType(DL, ResTy);
1892
1893 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1894
1895 switch (ISD) {
1896 case ISD::ADD:
1897 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1898 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1899
1900 // The legal cases are:
1901 // VADDV u/s 8/16/32
1902 // VADDLV u/s 32
1903 // Codegen currently cannot always handle larger than legal vectors very
1904 // well, especially for predicated reductions where the mask needs to be
1905 // split, so restrict to 128bit or smaller input types.
1906 unsigned RevVTSize = ResVT.getSizeInBits();
1907 if (ValVT.getSizeInBits() <= 128 &&
1908 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1909 (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1910 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1911 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1912 }
1913 break;
1914 default:
1915 break;
1916 }
1917 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, FMF,
1918 CostKind);
1919}
1920
// Cost of a multiply-accumulate reduction (mul feeding an ADD reduction).
// Legal MVE cases lower to a single VMLAV/VMLALV; everything else uses the
// generic base-class model. Only the Add reduction opcode is handled.
// NOTE(review): the return-type line (original 1921) and the body of the
// RedOpcode early-return (original 1926) were dropped by the doxygen
// extraction — upstream that early-return defers to the base implementation.
1922ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
1923 Type *ResTy, VectorType *ValTy,
1925 if (RedOpcode != Instruction::Add)
1927 EVT ValVT = TLI->getValueType(DL, ValTy);
1928 EVT ResVT = TLI->getValueType(DL, ResTy);
1929
1930 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1931 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1932
1933 // The legal cases are:
1934 // VMLAV u/s 8/16/32
1935 // VMLALV u/s 16/32
1936 // Codegen currently cannot always handle larger than legal vectors very
1937 // well, especially for predicated reductions where the mask needs to be
1938 // split, so restrict to 128bit or smaller input types.
1939 unsigned RevVTSize = ResVT.getSizeInBits();
1940 if (ValVT.getSizeInBits() <= 128 &&
1941 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1942 (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
1943 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1944 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1945 }
1946
1947 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, ValTy,
1948 CostKind);
1949}
1950
// Cost of min/max reductions. FP minnum/maxnum reductions are modelled as a
// log2 halving sequence of vector min/max intrinsics plus scalar steps;
// integer s/u min/max reductions use a small cost table for MVE VMINV/VMAXV.
// NOTE(review): the opening signature lines (InstructionCost
// ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, and
// the CostKind parameter, originals 1951-52/1954) were dropped by the doxygen
// extraction — verify against the upstream file.
1953 FastMathFlags FMF,
1955 EVT ValVT = TLI->getValueType(DL, Ty);
1956
1957 // In general floating point reductions are a series of elementwise
1958 // operations, with free extracts on each step. These are either in-order or
1959 // treewise depending on whether that is allowed by the fast math flags.
1960 if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
1961 ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
1962 (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
1963 (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
1964 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
1965 unsigned EltSize = ValVT.getScalarSizeInBits();
// Widest supported vector: 128 bits on MVE, 64 on NEON; -1 disables halving.
1966 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1967 InstructionCost VecCost;
1968 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
1969 Type *VecTy = FixedVectorType::get(Ty->getElementType(), NumElts/2);
1970 IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
1971 VecCost += getIntrinsicInstrCost(ICA, CostKind);
1972 NumElts /= 2;
1973 }
1974
1975 // For fp16 we need to extract the upper lane elements. MVE can add a
1976 // VREV+FMIN/MAX to perform another vector step instead.
1977 InstructionCost ExtractCost = 0;
1978 if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
1979 NumElts == 8) {
1980 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1981 NumElts /= 2;
1982 } else if (ValVT.getVectorElementType() == MVT::f16)
1983 ExtractCost = cast<FixedVectorType>(Ty)->getNumElements() / 2;
1984
// Remaining elements are combined with NumElts-1 scalar min/max calls.
1985 IntrinsicCostAttributes ICA(IID, Ty->getElementType(),
1986 {Ty->getElementType(), Ty->getElementType()},
1987 FMF);
1988 return VecCost + ExtractCost +
1989 (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
1990 }
1991
1992 if (IID == Intrinsic::smin || IID == Intrinsic::smax ||
1993 IID == Intrinsic::umin || IID == Intrinsic::umax) {
1994 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1995
1996 // All costs are the same for u/s min/max. These lower to vminv, which are
1997 // given a slightly higher cost as they tend to take multiple cycles for
1998 // smaller type sizes.
1999 static const CostTblEntry CostTblAdd[]{
2000 {ISD::SMIN, MVT::v16i8, 4},
2001 {ISD::SMIN, MVT::v8i16, 3},
2002 {ISD::SMIN, MVT::v4i32, 2},
2003 };
// SMIN is used as the lookup key for all four intrinsics (see comment above).
2004 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD::SMIN, LT.second))
2005 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
2006 }
2007
2008 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
2009}
2010
// ARM-specific intrinsic costs: active_lane_mask (free under MVE), saturating
// add/sub (DSP QADD/QSUB or MVE VQADD), abs/min/max, FP minnum/maxnum, and
// saturating FP-to-int conversions. Unhandled intrinsics defer to the base.
// NOTE(review): this doxygen-extracted listing dropped the signature lines
// (originals 2011-13) and several hyperlinked expression lines (e.g. 2042,
// 2120, 2125, 2129, 2137, 2139, 2146, 2148, 2154 — predicate setup, Cost
// accumulation, and the final BaseT fallback) — verify against upstream.
2014 unsigned Opc = ICA.getID();
2015 switch (Opc) {
2016 case Intrinsic::get_active_lane_mask:
2017 // Currently we make a somewhat optimistic assumption that
2018 // active_lane_mask's are always free. In reality it may be freely folded
2019 // into a tail predicated loop, expanded into a VCPT or expanded into a lot
2020 // of add/icmp code. We may need to improve this in the future, but being
2021 // able to detect if it is free or not involves looking at a lot of other
2022 // code. We currently assume that the vectorizer inserted these, and knew
2023 // what it was doing in adding one.
2024 if (ST->hasMVEIntegerOps())
2025 return 0;
2026 break;
2027 case Intrinsic::sadd_sat:
2028 case Intrinsic::ssub_sat:
2029 case Intrinsic::uadd_sat:
2030 case Intrinsic::usub_sat: {
2031 bool IsAdd = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
2032 bool IsSigned = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
2033 Type *RetTy = ICA.getReturnType();
2034
// Scalar saturating ops map directly onto DSP-extension instructions.
2035 if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
2036 if (IsSigned && ST->hasDSP() && ITy->getBitWidth() == 32)
2037 return 1; // qadd / qsub
2038 if (ST->hasDSP() && (ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16))
2039 return 2; // uqadd16 / qadd16 / uqsub16 / qsub16 + possible extend.
2040 // Otherwise return the cost of expanding the node. Generally an add +
2041 // icmp + sel.
2043 Type *CondTy = RetTy->getWithNewBitWidth(1);
2044 return getArithmeticInstrCost(IsAdd ? Instruction::Add : Instruction::Sub,
2045 RetTy, CostKind) +
2046 2 * getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, Pred,
2047 CostKind) +
2048 2 * getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, Pred,
2049 CostKind);
2050 }
2051
2052 if (!ST->hasMVEIntegerOps())
2053 break;
2054
2055 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
2056 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2057 LT.second == MVT::v16i8) {
2058 // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
2059 // need to extend the type, as it uses shr(qadd(shl, shl)).
2060 unsigned Instrs =
2061 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1
2062 : 4;
2063 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
2064 }
2065 break;
2066 }
2067 case Intrinsic::abs:
2068 case Intrinsic::smin:
2069 case Intrinsic::smax:
2070 case Intrinsic::umin:
2071 case Intrinsic::umax: {
2072 if (!ST->hasMVEIntegerOps())
2073 break;
2074 Type *VT = ICA.getReturnType();
2075
// Legal 128-bit MVE integer types get a single VABS/VMIN/VMAX instruction.
2076 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
2077 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2078 LT.second == MVT::v16i8)
2079 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2080 break;
2081 }
2082 case Intrinsic::minnum:
2083 case Intrinsic::maxnum: {
2084 if (!ST->hasMVEFloatOps())
2085 break;
2086 Type *VT = ICA.getReturnType();
2087 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
2088 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
2089 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2090 break;
2091 }
2092 case Intrinsic::fptosi_sat:
2093 case Intrinsic::fptoui_sat: {
2094 if (ICA.getArgTypes().empty())
2095 break;
2096 bool IsSigned = Opc == Intrinsic::fptosi_sat;
2097 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
2098 EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
2099 // Check for the legal types, with the corect subtarget features.
2100 if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
2101 (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
2102 (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
2103 return LT.first;
2104
2105 // Equally for MVE vector types
2106 if (ST->hasMVEFloatOps() &&
2107 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
2108 LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
2109 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2110
2111 // If we can we use a legal convert followed by a min+max
2112 if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
2113 (ST->hasFP64() && LT.second == MVT::f64) ||
2114 (ST->hasFullFP16() && LT.second == MVT::f16) ||
2115 (ST->hasMVEFloatOps() &&
2116 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
2117 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
2118 Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
2119 LT.second.getScalarSizeInBits());
2121 LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
2122 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
2123 : Intrinsic::umin,
2124 LegalTy, {LegalTy, LegalTy});
2126 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
2127 : Intrinsic::umax,
2128 LegalTy, {LegalTy, LegalTy});
2130 return LT.first * Cost;
2131 }
2132 // Otherwise we need to follow the default expansion that clamps the value
2133 // using a float min/max with a fcmp+sel for nan handling when signed.
2134 Type *FPTy = ICA.getArgTypes()[0];
2135 Type *RetTy = ICA.getReturnType();
2136 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
2138 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
2140 Cost +=
2141 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
2142 RetTy, FPTy, TTI::CastContextHint::None, CostKind);
2143 if (IsSigned) {
2144 Type *CondTy = RetTy->getWithNewBitWidth(1);
2145 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
2147 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2149 }
2150 return Cost;
2151 }
2152 }
2153
2155}
2156
// Returns true if calling F will be lowered to an actual library/runtime call
// (a bl) rather than inline instructions. Arm-specific intrinsics are assumed
// to map to instructions; selected math/FP intrinsics depend on the FP and
// MVE subtarget features; everything else defers to the base implementation.
// NOTE(review): the signature line (bool ARMTTIImpl::isLoweredToCall(const
// Function *F) const {, original 2157) was dropped by the doxygen extraction.
2158 if (!F->isIntrinsic())
2159 return BaseT::isLoweredToCall(F);
2160
2161 // Assume all Arm-specific intrinsics map to an instruction.
2162 if (F->getName().starts_with("llvm.arm"))
2163 return false;
2164
2165 switch (F->getIntrinsicID()) {
2166 default: break;
// Transcendental-style math intrinsics always become libcalls on ARM.
2167 case Intrinsic::powi:
2168 case Intrinsic::sin:
2169 case Intrinsic::cos:
2170 case Intrinsic::sincos:
2171 case Intrinsic::pow:
2172 case Intrinsic::log:
2173 case Intrinsic::log10:
2174 case Intrinsic::log2:
2175 case Intrinsic::exp:
2176 case Intrinsic::exp2:
2177 return true;
// These are instructions when the FP unit supports the element type,
// otherwise they expand to libcalls.
2178 case Intrinsic::sqrt:
2179 case Intrinsic::fabs:
2180 case Intrinsic::copysign:
2181 case Intrinsic::floor:
2182 case Intrinsic::ceil:
2183 case Intrinsic::trunc:
2184 case Intrinsic::rint:
2185 case Intrinsic::nearbyint:
2186 case Intrinsic::round:
2187 case Intrinsic::canonicalize:
2188 case Intrinsic::lround:
2189 case Intrinsic::llround:
2190 case Intrinsic::lrint:
2191 case Intrinsic::llrint:
2192 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
2193 return true;
2194 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
2195 return true;
2196 // Some operations can be handled by vector instructions and assume
2197 // unsupported vectors will be expanded into supported scalar ones.
2198 // TODO Handle scalar operations properly.
2199 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
// Masked memory ops require MVE; without it they are expanded via calls.
2200 case Intrinsic::masked_store:
2201 case Intrinsic::masked_load:
2202 case Intrinsic::masked_gather:
2203 case Intrinsic::masked_scatter:
2204 return !ST->hasMVEIntegerOps();
// Overflow and saturation intrinsics always expand inline, never call.
2205 case Intrinsic::sadd_with_overflow:
2206 case Intrinsic::uadd_with_overflow:
2207 case Intrinsic::ssub_with_overflow:
2208 case Intrinsic::usub_with_overflow:
2209 case Intrinsic::sadd_sat:
2210 case Intrinsic::uadd_sat:
2211 case Intrinsic::ssub_sat:
2212 case Intrinsic::usub_sat:
2213 return false;
2214 }
2215
2216 return BaseT::isLoweredToCall(F);
2217}
2218
// Conservatively determines whether instruction I might be lowered to a call
// (which would clobber LR and defeat low-overhead loop generation). Checks
// the operation action, call/intrinsic lowering, 64-bit integer division, and
// soft-float / single-precision-only / no-fp16 library-call cases.
// NOTE(review): the signature line (bool ARMTTIImpl::maybeLoweredToCall(
// Instruction &I) const {, original 2219) was dropped by the extraction.
2220 unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
2221 EVT VT = TLI->getValueType(DL, I.getType(), true);
2222 if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
2223 return true;
2224
2225 // Check if an intrinsic will be lowered to a call and assume that any
2226 // other CallInst will generate a bl.
2227 if (auto *Call = dyn_cast<CallInst>(&I)) {
2228 if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
2229 switch(II->getIntrinsicID()) {
// mem* intrinsics are fine only if they expand to inline loads/stores;
// getNumMemOps returns -1 when a libcall would be emitted instead.
2230 case Intrinsic::memcpy:
2231 case Intrinsic::memset:
2232 case Intrinsic::memmove:
2233 return getNumMemOps(II) == -1;
2234 default:
2235 if (const Function *F = Call->getCalledFunction())
2236 return isLoweredToCall(F);
2237 }
2238 }
2239 return true;
2240 }
2241
2242 // FPv5 provides conversions between integer, double-precision,
2243 // single-precision, and half-precision formats.
2244 switch (I.getOpcode()) {
2245 default:
2246 break;
2247 case Instruction::FPToSI:
2248 case Instruction::FPToUI:
2249 case Instruction::SIToFP:
2250 case Instruction::UIToFP:
2251 case Instruction::FPTrunc:
2252 case Instruction::FPExt:
2253 return !ST->hasFPARMv8Base();
2254 }
2255
2256 // FIXME: Unfortunately the approach of checking the Operation Action does
2257 // not catch all cases of Legalization that use library calls. Our
2258 // Legalization step categorizes some transformations into library calls as
2259 // Custom, Expand or even Legal when doing type legalization. So for now
2260 // we have to special case for instance the SDIV of 64bit integers and the
2261 // use of floating point emulation.
2262 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
2263 switch (ISD) {
2264 default:
2265 break;
2266 case ISD::SDIV:
2267 case ISD::UDIV:
2268 case ISD::SREM:
2269 case ISD::UREM:
2270 case ISD::SDIVREM:
2271 case ISD::UDIVREM:
2272 return true;
2273 }
2274 }
2275
2276 // Assume all other non-float operations are supported.
2277 if (!VT.isFloatingPoint())
2278 return false;
2279
2280 // We'll need a library call to handle most floats when using soft.
2281 if (TLI->useSoftFloat()) {
2282 switch (I.getOpcode()) {
2283 default:
2284 return true;
// Data-movement-only operations never need FP arithmetic libcalls.
2285 case Instruction::Alloca:
2286 case Instruction::Load:
2287 case Instruction::Store:
2288 case Instruction::Select:
2289 case Instruction::PHI:
2290 return false;
2291 }
2292 }
2293
2294 // We'll need a libcall to perform double precision operations on a single
2295 // precision only FPU.
2296 if (I.getType()->isDoubleTy() && !ST->hasFP64())
2297 return true;
2298
2299 // Likewise for half precision arithmetic.
2300 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
2301 return true;
2302
2303 return false;
2304}
2305
// Decides whether L can profitably become a v8.1-M low-overhead hardware loop
// (DLS/WLS + LE). Requires the LOB extension, a computable 32-bit trip count,
// and no instruction in the loop nest that could clobber LR (calls, inline
// asm, or pre-existing hardware-loop intrinsics). On success fills HWLoopInfo.
// NOTE(review): the opening signature lines (bool
// ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, ...) and
// the loop-invariant-BETC guard line (original 2317) were dropped by the
// doxygen extraction — verify against the upstream file.
2307 AssumptionCache &AC,
2308 TargetLibraryInfo *LibInfo,
2309 HardwareLoopInfo &HWLoopInfo) const {
2310 // Low-overhead branches are only supported in the 'low-overhead branch'
2311 // extension of v8.1-m.
2312 if (!ST->hasLOB() || DisableLowOverheadLoops) {
2313 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2314 return false;
2315 }
2316
2318 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2319 return false;
2320 }
2321
2322 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2323 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
2324 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2325 return false;
2326 }
2327
// Trip count = backedge-taken count + 1.
2328 const SCEV *TripCountSCEV =
2329 SE.getAddExpr(BackedgeTakenCount,
2330 SE.getOne(BackedgeTakenCount->getType()));
2331
2332 // We need to store the trip count in LR, a 32-bit register.
2333 if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
2334 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2335 return false;
2336 }
2337
2338 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2339 // point in generating a hardware loop if that's going to happen.
2340
2341 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2342 if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
2343 switch (Call->getIntrinsicID()) {
2344 default:
2345 break;
2346 case Intrinsic::start_loop_iterations:
2347 case Intrinsic::test_start_loop_iterations:
2348 case Intrinsic::loop_decrement:
2349 case Intrinsic::loop_decrement_reg:
2350 return true;
2351 }
2352 }
2353 return false;
2354 };
2355
2356 // Scan the instructions to see if there's any that we know will turn into a
2357 // call or if this loop is already a low-overhead loop or will become a tail
2358 // predicated loop.
2359 bool IsTailPredLoop = false;
2360 auto ScanLoop = [&](Loop *L) {
2361 for (auto *BB : L->getBlocks()) {
2362 for (auto &I : *BB) {
2363 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2364 isa<InlineAsm>(I)) {
2365 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2366 return false;
2367 }
// VCTP / active_lane_mask intrinsics signal a (future) tail-predicated
// loop; remembered so the WLS entry test can be suppressed below.
2368 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2369 IsTailPredLoop |=
2370 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2371 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2372 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2373 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2374 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2375 }
2376 }
2377 return true;
2378 };
2379
2380 // Visit inner loops.
2381 for (auto *Inner : *L)
2382 if (!ScanLoop(Inner))
2383 return false;
2384
2385 if (!ScanLoop(L))
2386 return false;
2387
2388 // TODO: Check whether the trip count calculation is expensive. If L is the
2389 // inner loop but we know it has a low trip count, calculating that trip
2390 // count (in the parent loop) may be detrimental.
2391
2392 LLVMContext &C = L->getHeader()->getContext();
2393 HWLoopInfo.CounterInReg = true;
2394 HWLoopInfo.IsNestingLegal = false;
2395 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2396 HWLoopInfo.CountType = Type::getInt32Ty(C);
2397 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2398 return true;
2399}
2400
// Returns true if instruction I is compatible with MVE tail predication.
// ICmpCount tracks how many icmp-like instructions have been seen in the
// (single-block) loop; only one — the backedge compare — is allowed.
// NOTE(review): one condition line (original 2426, the FP ext/trunc check
// guarding the "return false" below) was dropped by the doxygen extraction.
2401static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2402 // We don't allow icmp's, and because we only look at single block loops,
2403 // we simply count the icmps, i.e. there should only be 1 for the backedge.
2404 if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2405 return false;
2406 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2407 // not currently canonical, but soon will be. Code without them uses icmp, and
2408 // so is not tail predicated as per the condition above. In order to get the
2409 // same performance we treat min and max the same as an icmp for tailpred
2410 // purposes for the moment (we often rely on non-tailpred and higher VF's to
2411 // pick more optimial instructions like VQDMULH. They need to be recognized
2412 // directly by the vectorizer).
2413 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2414 if ((II->getIntrinsicID() == Intrinsic::smin ||
2415 II->getIntrinsicID() == Intrinsic::smax ||
2416 II->getIntrinsicID() == Intrinsic::umin ||
2417 II->getIntrinsicID() == Intrinsic::umax) &&
2418 ++ICmpCount > 1)
2419 return false;
2420
2421 if (isa<FCmpInst>(&I))
2422 return false;
2423
2424 // We could allow extending/narrowing FP loads/stores, but codegen is
2425 // too inefficient so reject this for now.
2427 return false;
2428
2429 // Extends have to be extending-loads
2430 if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
2431 if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2432 return false;
2433
2434 // Truncs have to be narrowing-stores
2435 if (isa<TruncInst>(&I) )
2436 if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2437 return false;
2438
2439 return true;
2440}
2441
2442// To set up a tail-predicated loop, we need to know the total number of
2443// elements processed by that loop. Thus, we need to determine the element
2444// size and:
2445// 1) it should be uniform for all operations in the vector loop, so we
2446// e.g. don't want any widening/narrowing operations.
2447// 2) it should be smaller than i64s because we don't have vector operations
2448// that work on i64s.
2449// 3) we don't want elements to be reversed or shuffled, to make sure the
2450// tail-predication masks/predicates the right lanes.
2451//
// Checks every instruction of loop L for tail-predication compatibility:
// live-out values must be integer/float/half reductions, element types must
// be <= 32 bits, and memory accesses need stride 1 (or a loop-invariant
// stride when masked gather/scatter generation is enabled).
// NOTE(review): this doxygen-extracted listing dropped the opening signature
// line (static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ...)) and
// several hyperlinked lines (originals 2465, 2468-69, 2503, 2515 — the
// LiveOuts declaration, the ReductionsDisabled initializer, the Ptr operand
// fetch, and an LLVM_DEBUG opener) — verify against the upstream file.
2453 const DataLayout &DL,
2454 const LoopAccessInfo *LAI,
2455 const DominatorTree &DT) {
2456 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2457
2458 // If there are live-out values, it is probably a reduction. We can predicate
2459 // most reduction operations freely under MVE using a combination of
2460 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2461 // floating point and integer reductions, but don't check for operators
2462 // specifically here. If the value ends up not being a reduction (and so the
2463 // vectorizer cannot tailfold the loop), we should fall back to standard
2464 // vectorization automatically.
2466 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2467 bool ReductionsDisabled =
2470
2471 for (auto *I : LiveOuts) {
2472 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2473 !I->getType()->isHalfTy()) {
2474 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2475 "live-out value\n");
2476 return false;
2477 }
2478 if (ReductionsDisabled) {
2479 LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2480 return false;
2481 }
2482 }
2483
2484 // Next, check that all instructions can be tail-predicated.
2485 PredicatedScalarEvolution PSE = LAI->getPSE();
2486 int ICmpCount = 0;
2487
2488 for (BasicBlock *BB : L->blocks()) {
2489 for (Instruction &I : BB->instructionsWithoutDebug()) {
2490 if (isa<PHINode>(&I))
2491 continue;
2492 if (!canTailPredicateInstruction(I, ICmpCount)) {
2493 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2494 return false;
2495 }
2496
2497 Type *T = I.getType();
2498 if (T->getScalarSizeInBits() > 32) {
2499 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2500 return false;
2501 }
2502 if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2504 Type *AccessTy = getLoadStoreType(&I);
2505 int64_t NextStride =
2506 getPtrStride(PSE, AccessTy, Ptr, L, DT).value_or(0);
2507 if (NextStride == 1) {
2508 // TODO: for now only allow consecutive strides of 1. We could support
2509 // other strides as long as it is uniform, but let's keep it simple
2510 // for now.
2511 continue;
2512 } else if (NextStride == -1 ||
2513 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2514 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2516 << "Consecutive strides of 2 found, vld2/vstr2 can't "
2517 "be tail-predicated\n.");
2518 return false;
2519 // TODO: don't tail predicate if there is a reversed load?
2520 } else if (EnableMaskedGatherScatters) {
2521 // Gather/scatters do allow loading from arbitrary strides, at
2522 // least if they are loop invariant.
2523 // TODO: Loop variant strides should in theory work, too, but
2524 // this requires further testing.
2525 const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2526 if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2527 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2528 if (PSE.getSE()->isLoopInvariant(Step, L))
2529 continue;
2530 }
2531 }
2532 LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2533 "tail-predicate\n.");
2534 return false;
2535 }
2536 }
2537 }
2538
2539 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2540 return true;
2541}
2542
// Decides whether the vectorizer should tail-fold (predicate) the loop rather
// than emit a scalar epilogue: requires tail-predication enabled, MVE, a
// single-block innermost loop, a profitable analyzable hardware loop, and a
// body that canTailPredicateLoop accepts.
// NOTE(review): the signature line (bool
// ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {,
// original 2543) and the AC/SE local declarations (originals 2575-76) were
// dropped by the doxygen extraction — verify against the upstream file.
2544 if (!EnableTailPredication) {
2545 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2546 return false;
2547 }
2548
2549 // Creating a predicated vector loop is the first step for generating a
2550 // tail-predicated hardware loop, for which we need the MVE masked
2551 // load/stores instructions:
2552 if (!ST->hasMVEIntegerOps())
2553 return false;
2554
2555 LoopVectorizationLegality *LVL = TFI->LVL;
2556 Loop *L = LVL->getLoop();
2557
2558 // For now, restrict this to single block loops.
2559 if (L->getNumBlocks() > 1) {
2560 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2561 "loop.\n");
2562 return false;
2563 }
2564
2565 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2566
2567 LoopInfo *LI = LVL->getLoopInfo();
2568 HardwareLoopInfo HWLoopInfo(L);
2569 if (!HWLoopInfo.canAnalyze(*LI)) {
2570 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2571 "analyzable.\n");
2572 return false;
2573 }
2574
2577
2578 // This checks if we have the low-overhead branch architecture
2579 // extension, and if we will create a hardware-loop:
2580 if (!isHardwareLoopProfitable(L, *SE, *AC, TFI->TLI, HWLoopInfo)) {
2581 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2582 "profitable.\n");
2583 return false;
2584 }
2585
2586 DominatorTree *DT = LVL->getDominatorTree();
2587 if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT)) {
2588 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2589 "a candidate.\n");
2590 return false;
2591 }
2592
// All structural checks passed — the per-instruction scan has the final say.
2593 return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI(),
2594 *LVL->getDominatorTree());
2595}
2596
// Reports the tail-folding style the vectorizer should use on this target.
// NOTE(review): the doxygen extraction dropped the return-type line (original
// 2597) and both return statements (originals 2600 and 2606) — upstream, the
// guard returns a no-predication style and the fall-through returns the
// style using @llvm.get.active.lane.mask; verify against the upstream file.
2598ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const {
2599 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2601
2602 // Intrinsic @llvm.get.active.lane.mask is supported.
2603 // It is used in the MVETailPredication pass, which requires the number of
2604 // elements processed by this vector loop to setup the tail-predicated
2605 // loop.
2607}
// Tune loop-unrolling preferences for ARM (M-class specific heuristics).
// NOTE(review): the extraction dropped the opening signature lines (2608-2609,
// "void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
// TTI::UnrollingPreferences &UP," per the declaration index) and several
// interior lines flagged below — restore them from upstream before compiling.
2610 OptimizationRemarkEmitter *ORE) const {
2611 // Enable Upper bound unrolling universally, providing that we do not see an
2612 // active lane mask, which will be better kept as a loop to become tail
2613 // predicated than to be conditionally unrolled.
2614 UP.UpperBound =
2615 !ST->hasMVEIntegerOps() || !any_of(*L->getHeader(), [](Instruction &I) {
2616 return isa<IntrinsicInst>(I) &&
2617 cast<IntrinsicInst>(I).getIntrinsicID() ==
2618 Intrinsic::get_active_lane_mask;
2619 });
2620
2621 // Only currently enable these preferences for M-Class cores.
2622 if (!ST->isMClass())
2623 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2624
2625 // Disable loop unrolling for Oz and Os.
2626 UP.OptSizeThreshold = 0;
// NOTE(review): line 2627 dropped here (presumably zeroing the partial
// opt-size threshold as well) — confirm against upstream.
2628 if (L->getHeader()->getParent()->hasOptSize())
2629 return;
2630
2631 SmallVector<BasicBlock*, 4> ExitingBlocks;
2632 L->getExitingBlocks(ExitingBlocks);
2633 LLVM_DEBUG(dbgs() << "Loop has:\n"
2634 << "Blocks: " << L->getNumBlocks() << "\n"
2635 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2636
2637 // Only allow another exit other than the latch. This acts as an early exit
2638 // as it mirrors the profitability calculation of the runtime unroller.
2639 if (ExitingBlocks.size() > 2)
2640 return;
2641
2642 // Limit the CFG of the loop body for targets with a branch predictor.
2643 // Allowing 4 blocks permits if-then-else diamonds in the body.
2644 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2645 return;
2646
2647 // Don't unroll vectorized loops, including the remainder loop
2648 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2649 return;
2650
2651 // Scan the loop: don't unroll loops with calls as this could prevent
2652 // inlining.
// NOTE(review): line 2653 dropped here — it must declare the `Cost`
// accumulator used below (an InstructionCost initialized to 0, presumably).
2654 for (auto *BB : L->getBlocks()) {
2655 for (auto &I : *BB) {
2656 // Don't unroll vectorised loop. MVE does not benefit from it as much as
2657 // scalar code.
2658 if (I.getType()->isVectorTy())
2659 return;
2660
2661 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2662 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
// Direct calls that do not lower to real calls (intrinsics etc.) are fine.
2663 if (!isLoweredToCall(F))
2664 continue;
2665 }
2666 return;
2667 }
2668
2669 SmallVector<const Value*, 4> Operands(I.operand_values());
2670 Cost += getInstructionCost(&I, Operands,
// NOTE(review): line 2671 dropped here (the TargetCostKind argument closing
// this call) — confirm against upstream.
2672 }
2673 }
2674
2675 // On v6m cores, there are very few registers available. We can easily end up
2676 // spilling and reloading more registers in an unrolled loop. Look at the
2677 // number of LCSSA phis as a rough measure of how many registers will need to
2678 // be live out of the loop, reducing the default unroll count if more than 1
2679 // value is needed. In the long run, all of this should be being learnt by a
2680 // machine.
2681 unsigned UnrollCount = 4;
2682 if (ST->isThumb1Only()) {
2683 unsigned ExitingValues = 0;
// NOTE(review): line 2684 dropped here — it must declare the `ExitBlocks`
// SmallVector filled by the next call.
2685 L->getExitBlocks(ExitBlocks);
2686 for (auto *Exit : ExitBlocks) {
2687 // Count the number of LCSSA phis. Exclude values coming from GEP's as
2688 // only the last is expected to be needed for address operands.
2689 unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2690 return PH.getNumOperands() != 1 ||
2691 !isa<GetElementPtrInst>(PH.getOperand(0));
2692 });
2693 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2694 }
2695 if (ExitingValues)
2696 UnrollCount /= ExitingValues;
2697 if (UnrollCount <= 1)
2698 return;
2699 }
2700
2701 // For processors with low overhead branching (LOB), runtime unrolling the
2702 // innermost loop is often detrimental to performance. In these cases the loop
2703 // remainder gets unrolled into a series of compare-and-jump blocks, which in
2704 // deeply nested loops get executed multiple times, negating the benefits of
2705 // LOB. This is particularly noticable when the loop trip count of the
2706 // innermost loop varies within the outer loop, such as in the case of
2707 // triangular matrix decompositions. In these cases we will prefer to not
2708 // unroll the innermost loop, with the intention for it to be executed as a
2709 // low overhead loop.
2710 bool Runtime = true;
2711 if (ST->hasLOB()) {
// NOTE(review): line 2712 dropped here (an inner `if` opening the block
// closed at line 2719, likely guarding on an analyzable backedge count) —
// confirm against upstream.
2713 const auto *BETC = SE.getBackedgeTakenCount(L);
2714 auto *Outer = L->getOutermostLoop();
2715 if ((L != Outer && Outer != L->getParentLoop()) ||
2716 (L != Outer && BETC && !SE.isLoopInvariant(BETC, Outer))) {
2717 Runtime = false;
2718 }
2719 }
2720 }
2721
2722 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2723 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2724
2725 UP.Partial = true;
2726 UP.Runtime = Runtime;
2727 UP.UnrollRemainder = true;
// NOTE(review): line 2728 dropped here (presumably setting the default
// runtime unroll count from UnrollCount) — confirm against upstream.
2729 UP.UnrollAndJam = true;
// NOTE(review): line 2730 dropped here (presumably the unroll-and-jam inner
// loop threshold) — confirm against upstream.
2731
2732 // Force unrolling small loops can be very useful because of the branch
2733 // taken cost of the backedge.
2734 if (Cost < 12)
2735 UP.Force = true;
2736}
2737
2742
// Prefer in-loop reductions only for MVE integer adds of scalar width up to
// 64 bits; everything else is left to the default out-of-loop strategy.
// NOTE(review): the extraction dropped line 2743, the signature
// ("bool ARMTTIImpl::preferInLoopReduction(RecurKind Kind, Type *Ty) const {"
// per the declaration index) — restore it before compiling.
2744 if (!ST->hasMVEIntegerOps())
2745 return false;
2746
2747 unsigned ScalarBits = Ty->getScalarSizeInBits();
2748 switch (Kind) {
2749 case RecurKind::Add:
2750 return ScalarBits <= 64;
2751 default:
2752 return false;
2753 }
2754}
2755
// Predicated reduction selects are preferred whenever MVE integer ops exist.
// NOTE(review): the extraction dropped line 2756, the signature
// ("bool ARMTTIImpl::preferPredicatedReductionSelect() const {" per the
// declaration index) — restore it before compiling.
2757 if (!ST->hasMVEIntegerOps())
2758 return false;
2759 return true;
2760}
2761
// getScalingFactorCost - Return the cost of the scaling used in the
// addressing mode represented by the assembled AddrMode below. Returns 0 when
// the mode is free (1 on FPAO cores for negative offsets, which execute
// slower there).
// NOTE(review): the extraction dropped line 2762, the opening signature
// ("InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue
// *BaseGV," per the declaration index) — restore it before compiling.
2763 StackOffset BaseOffset,
2764 bool HasBaseReg, int64_t Scale,
2765 unsigned AddrSpace) const {
// NOTE(review): line 2766 dropped here — it must declare `AM`, the
// TargetLoweringBase::AddrMode populated below.
2767 AM.BaseGV = BaseGV;
2768 AM.BaseOffs = BaseOffset.getFixed();
2769 AM.HasBaseReg = HasBaseReg;
2770 AM.Scale = Scale;
2771 AM.ScalableOffset = BaseOffset.getScalable();
2772 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) {
2773 if (ST->hasFPAO())
2774 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2775 return 0;
2776 }
// NOTE(review): line 2777 dropped here — the return for the illegal-mode
// path (presumably an invalid/expensive InstructionCost) — confirm against
// upstream.
2778}
2779
2780bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
2781 if (Thumb) {
2782 // B.W is available in any Thumb2-supporting target, and also in every
2783 // version of Armv8-M, even Baseline which does not include the rest of
2784 // Thumb2.
2785 return ST->isThumb2() || ST->hasV8MBaselineOps();
2786 } else {
2787 // B is available in all versions of the Arm ISA, so the only question is
2788 // whether that ISA is available at all.
2789 return ST->hasARMOps();
2790 }
2791}
2792
2793/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
2794/// of the vector elements.
2795static bool areExtractExts(Value *Ext1, Value *Ext2) {
2796 using namespace PatternMatch;
2797
2798 auto areExtDoubled = [](Instruction *Ext) {
2799 return Ext->getType()->getScalarSizeInBits() ==
2800 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
2801 };
2802
2803 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
2804 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
2805 !areExtDoubled(cast<Instruction>(Ext1)) ||
2806 !areExtDoubled(cast<Instruction>(Ext2)))
2807 return false;
2808
2809 return true;
2810}
2811
2812/// Check if sinking \p I's operands to I's basic block is profitable, because
2813/// the operands can be folded into a target instruction, e.g.
2814/// sext/zext can be folded into vsubl.
// NOTE(review): the extraction dropped line 2815, the opening signature
// ("bool ARMTTIImpl::isProfitableToSinkOperands(Instruction *I," per the
// declaration index) — restore it before compiling.
2816 SmallVectorImpl<Use *> &Ops) const {
2817 using namespace PatternMatch;
2818
// Only vector instructions can fold a sunk splat/extend operand.
2819 if (!I->getType()->isVectorTy())
2820 return false;
2821
// NEON: only sink the extends feeding a widening add/sub (vaddl/vsubl).
2822 if (ST->hasNEON()) {
2823 switch (I->getOpcode()) {
2824 case Instruction::Sub:
2825 case Instruction::Add: {
2826 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
2827 return false;
2828 Ops.push_back(&I->getOperandUse(0));
2829 Ops.push_back(&I->getOperandUse(1));
2830 return true;
2831 }
2832 default:
2833 return false;
2834 }
2835 }
2836
2837 if (!ST->hasMVEIntegerOps())
2838 return false;
2839
// True when this FMul's single user is an FSub taking it as operand 1, i.e.
// the pair will form a fused multiply-subtract.
2840 auto IsFMSMul = [&](Instruction *I) {
2841 if (!I->hasOneUse())
2842 return false;
2843 auto *Sub = cast<Instruction>(*I->users().begin());
2844 return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;
2845 };
// True when either multiplicand of an fma is negated (an FMS-shaped fma).
2846 auto IsFMS = [&](Instruction *I) {
2847 if (match(I->getOperand(0), m_FNeg(m_Value())) ||
2848 match(I->getOperand(1), m_FNeg(m_Value())))
2849 return true;
2850 return false;
2851 };
2852
// Whether instruction I can absorb a sunk splat in operand slot Operand —
// commutative ops accept either side, shifts/subs only the RHS, and only the
// listed intrinsics have suitable scalar-operand forms.
2853 auto IsSinker = [&](Instruction *I, int Operand) {
2854 switch (I->getOpcode()) {
2855 case Instruction::Add:
2856 case Instruction::Mul:
2857 case Instruction::FAdd:
2858 case Instruction::ICmp:
2859 case Instruction::FCmp:
2860 return true;
2861 case Instruction::FMul:
2862 return !IsFMSMul(I);
2863 case Instruction::Sub:
2864 case Instruction::FSub:
2865 case Instruction::Shl:
2866 case Instruction::LShr:
2867 case Instruction::AShr:
2868 return Operand == 1;
2869 case Instruction::Call:
2870 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
2871 switch (II->getIntrinsicID()) {
2872 case Intrinsic::fma:
2873 return !IsFMS(I);
2874 case Intrinsic::sadd_sat:
2875 case Intrinsic::uadd_sat:
2876 case Intrinsic::arm_mve_add_predicated:
2877 case Intrinsic::arm_mve_mul_predicated:
2878 case Intrinsic::arm_mve_qadd_predicated:
2879 case Intrinsic::arm_mve_vhadd:
2880 case Intrinsic::arm_mve_hadd_predicated:
2881 case Intrinsic::arm_mve_vqdmull:
2882 case Intrinsic::arm_mve_vqdmull_predicated:
2883 case Intrinsic::arm_mve_vqdmulh:
2884 case Intrinsic::arm_mve_qdmulh_predicated:
2885 case Intrinsic::arm_mve_vqrdmulh:
2886 case Intrinsic::arm_mve_qrdmulh_predicated:
2887 case Intrinsic::arm_mve_fma_predicated:
2888 return true;
2889 case Intrinsic::ssub_sat:
2890 case Intrinsic::usub_sat:
2891 case Intrinsic::arm_mve_sub_predicated:
2892 case Intrinsic::arm_mve_qsub_predicated:
2893 case Intrinsic::arm_mve_hsub_predicated:
2894 case Intrinsic::arm_mve_vhsub:
2895 return Operand == 1;
2896 default:
2897 return false;
2898 }
2899 }
2900 return false;
2901 default:
2902 return false;
2903 }
2904 };
2905
2906 for (auto OpIdx : enumerate(I->operands())) {
2907 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
2908 // Make sure we are not already sinking this operand
2909 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
2910 continue;
2911
// Look through a bitcast to find the splat shuffle itself.
2912 Instruction *Shuffle = Op;
2913 if (Shuffle->getOpcode() == Instruction::BitCast)
2914 Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0));
2915 // We are looking for a splat that can be sunk.
2916 if (!Shuffle || !match(Shuffle, m_Shuffle(m_InsertElt(m_Undef(), m_Value(),
2917 m_ZeroInt()),
2918 m_Undef(), m_ZeroMask())))
2919 continue;
2920 if (!IsSinker(I, OpIdx.index()))
2921 continue;
2922
2923 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
2924 // and vector registers
2925 for (Use &U : Op->uses()) {
2926 Instruction *Insn = cast<Instruction>(U.getUser());
2927 if (!IsSinker(Insn, U.getOperandNo()))
2928 return false;
2929 }
2930
// Sink the splat shuffle, the bitcast (if any), and the use itself.
2931 Ops.push_back(&Shuffle->getOperandUse(0));
2932 if (Shuffle != Op)
2933 Ops.push_back(&Op->getOperandUse(0));
2934 Ops.push_back(&OpIdx.value());
2935 }
2936 return true;
2937}
2938
// Return how many bytes of padding would widen an integer global array to a
// 4-byte boundary (0 when no padding should be applied).
// NOTE(review): the extraction dropped line 2939, the opening signature
// ("unsigned ARMTTIImpl::getNumBytesToPadGlobalArray(unsigned Size," per the
// declaration index), and line 2948, the second half of the type check below
// (presumably rejecting arrays whose element type is not an integer type) —
// restore both before compiling.
2940 Type *ArrayType) const {
2941 if (!UseWidenGlobalArrays) {
2942 LLVM_DEBUG(dbgs() << "Padding global arrays disabled\n");
// NOTE(review): returns false (0) from a function whose other paths return
// the literal 0; harmless but inconsistent.
2943 return false;
2944 }
2945
2946 // Don't modify non-integer array types.
2947 if (!ArrayType || !ArrayType->isArrayTy() ||
2949 return 0;
2950
2951 // We pad to 4 byte boundaries
2952 if (Size % 4 == 0)
2953 return 0;
2954
2955 unsigned NumBytesToPad = 4 - (Size % 4);
2956 unsigned NewSize = Size + NumBytesToPad;
2957
2958 // Max number of bytes that memcpy allows for lowering to load/stores before
2959 // it uses library function (__aeabi_memcpy).
2960 unsigned MaxMemIntrinsicSize = getMaxMemIntrinsicInlineSizeThreshold();
2961
// Padding is only worthwhile if the widened copy can still be lowered to
// inline loads/stores rather than a library call.
2962 if (NewSize > MaxMemIntrinsicSize)
2963 return 0;
2964
2965 return NumBytesToPad;
2966}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
This file implements a class to represent arbitrary precision integral constant values and operations...
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), cl::init(2))
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static Value * isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm)
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, const LoopAccessInfo *LAI, const DominatorTree &DT)
static cl::opt< bool > AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), cl::desc("Enable the generation of WLS loops"))
static Value * simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, InstCombiner::BuilderTy &Builder)
Convert a vector load intrinsic into a simple llvm load instruction.
static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm)
static cl::opt< bool > UseWidenGlobalArrays("widen-global-strings", cl::Hidden, cl::init(true), cl::desc("Enable the widening of global strings to alignment boundaries"))
cl::opt< bool > EnableMaskedGatherScatters
static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount)
cl::opt< TailPredication::Mode > EnableTailPredication
static cl::opt< bool > DisableLowOverheadLoops("disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops"))
static cl::opt< bool > EnableMaskedLoadStores("enable-arm-maskedldst", cl::Hidden, cl::init(true), cl::desc("Enable the generation of masked loads and stores"))
This file a TargetTransformInfoImplBase conforming object specific to the ARM target machine.
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
Hexagon Common GEP
This file provides the interface for the instcombine pass implementation.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static cl::opt< unsigned > UnrollCount("unroll-count", cl::Hidden, cl::desc("Use this unroll count for all loops including those with " "unroll_count pragma values, for testing purposes"))
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
#define T
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:114
Value * RHS
Value * LHS
Class for arbitrary precision integers.
Definition APInt.h:78
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1489
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getMemcpyCost(const Instruction *I) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
bool maybeLoweredToCall(Instruction &I) const
bool preferInLoopReduction(RecurKind Kind, Type *Ty) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *ValTy, TTI::TargetCostKind CostKind) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool hasArmWideBranch(bool Thumb) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
int getNumMemOps(const IntrinsicInst *I) const
Given a memcpy/memset/memmove instruction, return the number of memory operations performed,...
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty) const override
bool isLoweredToCall(const Function *F) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
bool isLegalMaskedStore(Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::MaskKind MaskKind=TTI::MaskKind::VariableOrConstantMask) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
bool isLegalMaskedLoad(Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::MaskKind MaskKind=TTI::MaskKind::VariableOrConstantMask) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const override
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
bool preferPredicatedReductionSelect() const override
bool isLegalMaskedGather(Type *Ty, Align Alignment) const override
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const override
bool isProfitableLSRChainElement(Instruction *I) const override
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
getScalingFactorCost - Return the cost of the scaling used in addressing mode represented by AM.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
Class to represent array types.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
static LLVM_ABI BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:154
This class represents a range of values.
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition IRBuilder.h:502
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
The core instruction combiner logic.
const DataLayout & getDataLayout() const
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
IRBuilder< TargetFolder, IRBuilderCallbackInserter > BuilderTy
An IRBuilder that automatically inserts new instructions into the worklist.
DominatorTree & getDominatorTree() const
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, const SimplifyQuery &Q, unsigned Depth=0)=0
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
AssumptionCache & getAssumptionCache() const
static InstructionCost getInvalid(CostType Val=0)
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isShift() const
const SmallVectorImpl< Type * > & getArgTypes() const
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Drive the analysis of memory accesses in the loop.
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
AssumptionCache * getAssumptionCache() const
const LoopAccessInfo * getLAI() const
ScalarEvolution * getScalarEvolution() const
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
const FeatureBitset & getFeatureBits() const
Information for memory intrinsic cost model.
The optimization diagnostic interface.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
Provides information about what library functions are available for the current target.
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual bool isLoweredToCall(const Function *F) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
MaskKind
Some targets only support masked load/store with a constant mask.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
@ AMK_PreIndexed
Prefer pre-indexed addressing mode.
@ AMK_None
Don't prefer any addressing mode.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition Type.h:264
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:246
Type * getArrayElementType() const
Definition Type.h:408
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into an shifter_operand immed...
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:784
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:642
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:724
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
auto m_Undef()
Match an arbitrary undef constant.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Runtime
Detect stack use after return if not disabled runtime with (ASAN_OPTIONS=detect_stack_use_after_retur...
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
Align getKnownAlignment(Value *V, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to infer an alignment for the specified pointer.
Definition Local.h:252
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
LLVM_ABI SmallVector< Instruction *, 8 > findDefsUsedOutsideOfLoop(Loop *L)
Returns the instructions that use values defined in the loop.
SelectPatternFlavor
Specific patterns of select instructions we can match.
@ SPF_ABS
Absolute value.
@ SPF_FMAXNUM
Floating point maxnum.
@ SPF_UMIN
Unsigned minimum.
@ SPF_UMAX
Unsigned maximum.
@ SPF_SMAX
Signed maximum.
@ SPF_FMINNUM
Floating point minnum.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp=nullptr, unsigned Depth=0)
Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind and providing the out param...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
RecurKind
These are the kinds of recurrences that we support.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1961
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
@ Data
Use predicate only to mask operations on data in the loop.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
Attributes of a target dependent hardware loop.
LLVM_ABI bool canAnalyze(LoopInfo &LI)
LLVM_ABI bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
static MemOp Set(uint64_t Size, bool DstAlignCanChange, Align DstAlign, bool IsZeroMemset, bool IsVolatile)
static MemOp Copy(uint64_t Size, bool DstAlignCanChange, Align DstAlign, Align SrcAlign, bool IsVolatile, bool MemcpyStrSrc=false)
SelectPatternFlavor Flavor
TargetLibraryInfo * TLI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).