LLVM 23.0.0git
ARMTargetTransformInfo.cpp
Go to the documentation of this file.
1//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "ARMSubtarget.h"
12#include "llvm/ADT/APInt.h"
19#include "llvm/IR/BasicBlock.h"
20#include "llvm/IR/DataLayout.h"
22#include "llvm/IR/Instruction.h"
25#include "llvm/IR/Intrinsics.h"
26#include "llvm/IR/IntrinsicsARM.h"
28#include "llvm/IR/Type.h"
37#include <algorithm>
38#include <cassert>
39#include <cstdint>
40#include <optional>
41#include <utility>
42
43using namespace llvm;
44
45#define DEBUG_TYPE "armtti"
46
48 "enable-arm-maskedldst", cl::Hidden, cl::init(true),
49 cl::desc("Enable the generation of masked loads and stores"));
50
52 "disable-arm-loloops", cl::Hidden, cl::init(false),
53 cl::desc("Disable the generation of low-overhead loops"));
54
// Hidden boolean command-line option "allow-arm-wlsloops"; initialised to
// true, i.e. WLS loop generation is enabled unless the flag is set to false.
55static cl::opt<bool>
56 AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
57 cl::desc("Enable the generation of WLS loops"));
58
60 "widen-global-strings", cl::Hidden, cl::init(true),
61 cl::desc("Enable the widening of global strings to alignment boundaries"));
62
64
66
68
70 "arm-force-unroll-threshold", cl::init(12), cl::Hidden,
72 "Threshold for forced unrolling of small loops in Arm architecture"));
73
74/// Convert a vector load intrinsic into a simple llvm load instruction.
75/// This is beneficial when the underlying object being addressed comes
76/// from a constant, since we get constant-folding for free.
77static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
78 InstCombiner::BuilderTy &Builder) {
79 auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
80
81 if (!IntrAlign)
82 return nullptr;
83
84 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
85 ? MemAlign
86 : IntrAlign->getLimitedValue();
87
88 if (!isPowerOf2_32(Alignment))
89 return nullptr;
90
91 return Builder.CreateAlignedLoad(II.getType(), II.getArgOperand(0),
92 Align(Alignment));
93}
94
96 const Function *Callee) const {
 // Decides whether Callee may be inlined into Caller by comparing the
 // subtarget feature bits of the two functions. Returns true only when both
 // the exact-match and the subset checks below succeed.
 // NOTE(review): the first line of the signature (listing line 95) is absent
 // from this copy of the file.
97 const TargetMachine &TM = getTLI()->getTargetMachine();
98 const FeatureBitset &CallerBits =
99 TM.getSubtargetImpl(*Caller)->getFeatureBits();
100 const FeatureBitset &CalleeBits =
101 TM.getSubtargetImpl(*Callee)->getFeatureBits();
102
103 // To inline a callee, all features not in the allowed list must match exactly.
104 bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
105 (CalleeBits & ~InlineFeaturesAllowed);
106 // For features in the allowed list, the callee's features must be a subset of
107 // the callers'.
108 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
109 (CalleeBits & InlineFeaturesAllowed);
110
 // Debug-only diagnostics: when either check fails, print both function
 // names, how many feature bits are exclusive to each side, and the indices
 // of those bits.
111 LLVM_DEBUG({
112 if (!MatchExact || !MatchSubset) {
113 dbgs() << "=== Inline compatibility debug ===\n";
114 dbgs() << "Caller: " << Caller->getName() << "\n";
115 dbgs() << "Callee: " << Callee->getName() << "\n";
116
117 // Bit diffs
118 FeatureBitset MissingInCaller = CalleeBits & ~CallerBits; // callee-only
119 FeatureBitset ExtraInCaller = CallerBits & ~CalleeBits; // caller-only
120
121 // Counts
122 dbgs() << "Only-in-caller bit count: " << ExtraInCaller.count() << "\n";
123 dbgs() << "Only-in-callee bit count: " << MissingInCaller.count() << "\n";
124
125 dbgs() << "Only-in-caller feature indices [";
126 {
 // First suppresses the ", " separator before the first printed index.
127 bool First = true;
128 for (size_t I = 0, E = ExtraInCaller.size(); I < E; ++I) {
129 if (ExtraInCaller.test(I)) {
130 if (!First)
131 dbgs() << ", ";
132 dbgs() << I;
133 First = false;
134 }
135 }
136 }
137 dbgs() << "]\n";
138
139 dbgs() << "Only-in-callee feature indices [";
140 {
141 bool First = true;
142 for (size_t I = 0, E = MissingInCaller.size(); I < E; ++I) {
143 if (MissingInCaller.test(I)) {
144 if (!First)
145 dbgs() << ", ";
146 dbgs() << I;
147 First = false;
148 }
149 }
150 }
151 dbgs() << "]\n";
152
153 // Indices map to features as found in
154 // llvm-project/(your_build)/lib/Target/ARM/ARMGenSubtargetInfo.inc
155 dbgs() << "MatchExact=" << (MatchExact ? "true" : "false")
156 << " MatchSubset=" << (MatchSubset ? "true" : "false") << "\n";
157 }
158 });
159 return MatchExact && MatchSubset;
160}
161
164 ScalarEvolution *SE) const {
 // Chooses an addressing-mode kind for loop L based on the subtarget.
 // NOTE(review): the start of the signature (listing lines 162-163) and the
 // statement controlled by the MVE check below (listing line 166) are absent
 // from this copy of the file; presumably that statement is an early return.
165 if (ST->hasMVEIntegerOps())
167
 // No preferred addressing mode when the enclosing function has the
 // optimise-for-size attribute.
168 if (L->getHeader()->getParent()->hasOptSize())
169 return TTI::AMK_None;
170
 // Pre-indexed addressing is preferred only for single-block loops on
 // M-class Thumb2 subtargets.
171 if (ST->isMClass() && ST->isThumb2() &&
172 L->getNumBlocks() == 1)
173 return TTI::AMK_PreIndexed;
174
175 return TTI::AMK_None;
176}
177
// Target-specific InstCombine folds for ARM NEON and MVE intrinsics.
// Returns the instruction produced by a fold, or std::nullopt when the
// intrinsic is not handled or no fold applies.
// NOTE(review): several lines of this function (listing lines 179, 188, 210,
// 231, 242, 248, 271, 306, 316 and 322) are absent from this copy of the
// file; comments near those gaps are hedged accordingly.
178std::optional<Instruction *>
180 using namespace PatternMatch;
181 Intrinsic::ID IID = II.getIntrinsicID();
182 switch (IID) {
183 default:
184 break;
 // vld1: try to replace the intrinsic with a plain aligned load.
185 case Intrinsic::arm_neon_vld1: {
186 Align MemAlign =
187 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
189 if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
190 return IC.replaceInstUsesWith(II, V);
191 }
192 break;
193 }
194
 // Structured loads/stores: the alignment is the last call argument. If it
 // is non-zero and smaller than the alignment known for the pointer, replace
 // it with the known alignment.
195 case Intrinsic::arm_neon_vld2:
196 case Intrinsic::arm_neon_vld3:
197 case Intrinsic::arm_neon_vld4:
198 case Intrinsic::arm_neon_vld2lane:
199 case Intrinsic::arm_neon_vld3lane:
200 case Intrinsic::arm_neon_vld4lane:
201 case Intrinsic::arm_neon_vst1:
202 case Intrinsic::arm_neon_vst2:
203 case Intrinsic::arm_neon_vst3:
204 case Intrinsic::arm_neon_vst4:
205 case Intrinsic::arm_neon_vst2lane:
206 case Intrinsic::arm_neon_vst3lane:
207 case Intrinsic::arm_neon_vst4lane: {
208 Align MemAlign =
209 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
211 unsigned AlignArg = II.arg_size() - 1;
212 Value *AlignArgOp = II.getArgOperand(AlignArg);
213 MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
214 if (Align && *Align < MemAlign) {
215 return IC.replaceOperand(
216 II, AlignArg,
217 ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
218 false));
219 }
220 break;
221 }
222
 // vld1xN/vst1xN: alignment is carried as a parameter attribute on the
 // pointer argument; strengthen it when a larger alignment is known.
223 case Intrinsic::arm_neon_vld1x2:
224 case Intrinsic::arm_neon_vld1x3:
225 case Intrinsic::arm_neon_vld1x4:
226 case Intrinsic::arm_neon_vst1x2:
227 case Intrinsic::arm_neon_vst1x3:
228 case Intrinsic::arm_neon_vst1x4: {
229 Align NewAlign =
230 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
232 Align OldAlign = II.getParamAlign(0).valueOrOne();
233 if (NewAlign > OldAlign)
234 II.addParamAttr(0,
235 Attribute::getWithAlignment(II.getContext(), NewAlign));
236 break;
237 }
238
239 case Intrinsic::arm_mve_pred_i2v: {
240 Value *Arg = II.getArgOperand(0);
241 Value *ArgArg;
 // NOTE(review): the opening of this match (listing line 242) is missing.
 // When it succeeds and the types agree, the intrinsic is replaced by the
 // matched inner value ArgArg.
243 PatternMatch::m_Value(ArgArg))) &&
244 II.getType() == ArgArg->getType()) {
245 return IC.replaceInstUsesWith(II, ArgArg);
246 }
247 Constant *XorMask;
 // NOTE(review): the opening of this match (listing line 248) is missing.
 // When the matched constant has its low 16 bits all set, the result is
 // rewritten as an xor of ArgArg with an all-true vector.
249 PatternMatch::m_Value(ArgArg)),
250 PatternMatch::m_Constant(XorMask))) &&
251 II.getType() == ArgArg->getType()) {
252 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
253 if (CI->getValue().trunc(16).isAllOnes()) {
254 auto TrueVector = IC.Builder.CreateVectorSplat(
255 cast<FixedVectorType>(II.getType())->getNumElements(),
256 IC.Builder.getTrue());
257 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
258 }
259 }
260 }
 // Only the low 16 bits of the 32-bit scalar operand are demanded.
261 KnownBits ScalarKnown(32);
262 if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
263 ScalarKnown)) {
264 return &II;
265 }
266 break;
267 }
268 case Intrinsic::arm_mve_pred_v2i: {
269 Value *Arg = II.getArgOperand(0);
270 Value *ArgArg;
 // NOTE(review): the opening of this match (listing line 271) is missing.
272 PatternMatch::m_Value(ArgArg)))) {
273 return IC.replaceInstUsesWith(II, ArgArg);
274 }
275
 // Leave calls that already carry !range metadata untouched.
276 if (II.getMetadata(LLVMContext::MD_range))
277 break;
278
 // The result is constrained to the half-open range [0, 0x10000).
279 ConstantRange Range(APInt(32, 0), APInt(32, 0x10000));
280
 // Intersect with any existing range attribute; stop if that adds nothing.
281 if (auto CurrentRange = II.getRange()) {
282 Range = Range.intersectWith(*CurrentRange);
283 if (Range == CurrentRange)
284 break;
285 }
286
287 II.addRangeRetAttr(Range);
288 II.addRetAttr(Attribute::NoUndef);
289 return &II;
290 }
291 case Intrinsic::arm_mve_vadc:
292 case Intrinsic::arm_mve_vadc_predicated: {
 // The carry operand is argument 3 for the predicated form, else argument 2.
293 unsigned CarryOp =
294 (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
295 assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
296 "Bad type for intrinsic!");
297
 // Only bit 29 of the carry operand is demanded.
298 KnownBits CarryKnown(32);
299 if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
300 CarryKnown)) {
301 return &II;
302 }
303 break;
304 }
305 case Intrinsic::arm_mve_vmldava: {
 // NOTE(review): the definition of I (listing line 306) is missing;
 // presumably it refers to this intrinsic call.
 // Fold add(vmldava(..., 0, X, Y), Z) into vmldava(..., Z, X, Y) when the
 // add is the only user and operand 3 of the intrinsic is zero.
307 if (I->hasOneUse()) {
308 auto *User = cast<Instruction>(*I->user_begin());
309 Value *OpZ;
310 if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
311 match(I->getOperand(3), m_Zero())) {
312 Value *OpX = I->getOperand(4);
313 Value *OpY = I->getOperand(5);
314 Type *OpTy = OpX->getType();
315
317 Value *V =
318 IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
319 {I->getOperand(0), I->getOperand(1),
320 I->getOperand(2), OpZ, OpX, OpY});
321
 // NOTE(review): listing line 322 is missing; presumably it redirects the
 // uses of User to V before User is erased — confirm against upstream.
323 return IC.eraseInstFromFunction(*User);
324 }
325 }
326 return std::nullopt;
327 }
328 }
329 return std::nullopt;
330}
331
333 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
334 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
335 std::function<void(Instruction *, unsigned, APInt, APInt &)>
336 SimplifyAndSetOp) const {
 // Demanded-vector-elements simplification for MVE narrowing intrinsics.
 // Operand 0 is simplified through SimplifyAndSetOp and UndefElts is updated;
 // every path visible here returns std::nullopt.
 // NOTE(review): the first line of the signature (listing line 332) is absent
 // from this copy of the file.
337
338 // Compute the demanded bits for a narrowing MVE intrinsic. The TopOpc is the
339 // opcode specifying a Top/Bottom instruction, which can change between
340 // instructions.
341 auto SimplifyNarrowInstrTopBottom =[&](unsigned TopOpc) {
342 unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
343 unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
344
345 // Only the odd or the even lanes of operand 0 are demanded, depending on
346 // whether this is a top or a bottom instruction (even lanes when IsTop).
347 APInt DemandedElts =
348 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
349 : APInt::getHighBitsSet(2, 1));
350 SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
351 // The other lanes will be defined from the inserted elements.
352 UndefElts &= APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
353 : APInt::getHighBitsSet(2, 1));
354 return std::nullopt;
355 };
356
 // The argument passed to the helper is the index of the operand holding
 // the top/bottom flag for that intrinsic.
357 switch (II.getIntrinsicID()) {
358 default:
359 break;
360 case Intrinsic::arm_mve_vcvt_narrow:
361 SimplifyNarrowInstrTopBottom(2);
362 break;
363 case Intrinsic::arm_mve_vqmovn:
364 SimplifyNarrowInstrTopBottom(4);
365 break;
366 case Intrinsic::arm_mve_vshrn:
367 SimplifyNarrowInstrTopBottom(7);
368 break;
369 }
370
371 return std::nullopt;
372}
373
// Cost of materialising the integer immediate Imm of type Ty. The answer
// depends on the instruction set in use (ARM, Thumb2 or Thumb1).
// NOTE(review): the signature (listing lines 374-375) is absent from this
// copy of the file.
376 assert(Ty->isIntegerTy());
377
378 unsigned Bits = Ty->getPrimitiveSizeInBits();
 // Zero-width types and immediates needing 64 or more active bits get a
 // fixed cost of 4.
379 if (Bits == 0 || Imm.getActiveBits() >= 64)
380 return 4;
381
382 int64_t SImmVal = Imm.getSExtValue();
383 uint64_t ZImmVal = Imm.getZExtValue();
 // ARM mode: cost 1 for values in [0, 65536) or when the value or its
 // bitwise complement is encodable as a shifter-operand immediate.
384 if (!ST->isThumb()) {
385 if ((SImmVal >= 0 && SImmVal < 65536) ||
386 (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
387 (ARM_AM::getSOImmVal(~ZImmVal) != -1))
388 return 1;
389 return ST->hasV6T2Ops() ? 2 : 3;
390 }
 // Thumb2: same structure, using the Thumb2 immediate encoding check.
391 if (ST->isThumb2()) {
392 if ((SImmVal >= 0 && SImmVal < 65536) ||
393 (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
394 (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
395 return 1;
396 return ST->hasV6T2Ops() ? 2 : 3;
397 }
398 // Thumb1, any i8 imm cost 1.
399 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
400 return 1;
 // NOTE(review): SImmVal is a signed int64_t, so ~SImmVal is negative (and
 // therefore < 256) for every non-negative SImmVal. Every positive immediate
 // that reaches this point thus costs 2; only values below -256 that are not
 // shifted immediates fall through to 3. Confirm this is intended.
401 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
402 return 2;
403 // Load from constantpool.
404 return 3;
405}
406
407// Non-negative constants smaller than 256 fit in the immediate field of
408// Thumb1 instructions, so they cost 0; every other constant costs 1.
// NOTE(review): the first line of the signature (listing line 409) is absent
// from this copy of the file.
410 const APInt &Imm,
411 Type *Ty) const {
412 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
413 return 0;
414
415 return 1;
416}
417
418// Checks whether Inst is part of a min(max()) or max(min()) pattern
419// that will match to an SSAT instruction. Returns the instruction being
420// saturated, or null if no saturation pattern was found.
// NOTE(review): listing lines 424, 427 and 437 (the definition of InstSPF and
// parts of the two conditions) are absent from this copy of the file.
421static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
422 Value *LHS, *RHS;
423 ConstantInt *C;
425
 // The outer operation must be a signed max against Imm, where Imm is a
 // negative value whose negation is a power of two.
426 if (InstSPF == SPF_SMAX &&
428 C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
429
 // True when MinInst is a select forming a signed min whose constant equals
 // (-Imm) - 1, i.e. the matching upper saturation bound.
430 auto isSSatMin = [&](Value *MinInst) {
431 if (isa<SelectInst>(MinInst)) {
432 Value *MinLHS, *MinRHS;
433 ConstantInt *MinC;
434 SelectPatternFlavor MinSPF =
435 matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
436 if (MinSPF == SPF_SMIN &&
438 MinC->getValue() == ((-Imm) - 1))
439 return true;
440 }
441 return false;
442 };
443
 // Case 1: the min is operand 1 of Inst, so the saturated value is that
 // min's own operand 1.
444 if (isSSatMin(Inst->getOperand(1)))
445 return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
 // Case 2: Inst has exactly two uses and one of its first two users is the
 // min; the saturated value is then operand 1 of Inst.
446 if (Inst->hasNUses(2) &&
447 (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
448 return Inst->getOperand(1);
449 }
450 return nullptr;
451}
452
453// Look for a FP Saturation pattern, where the instruction can be simplified to
454// a fptosi.sat. max(min(fptosi)). The constant in this case is always free.
// Returns true only for the 64-bit immediate -2147483648 when the saturated
// value found is an fptosi instruction.
455static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
456 if (Imm.getBitWidth() != 64 ||
457 Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
458 return false;
459 Value *FP = isSSATMinMaxPattern(Inst, Imm);
 // NOTE(review): the statement controlled by this `if` (listing line 461) is
 // absent from this copy; presumably it retries the match on the single user
 // of the compare — confirm against upstream.
460 if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
462 if (!FP)
463 return false;
464 return isa<FPToSIInst>(FP);
465}
466
// Cost of the immediate Imm when it is operand Idx of an instruction with
// the given Opcode. A series of special cases returns 0 (or a reduced cost)
// where the immediate can be folded or cheaply rewritten; everything else
// falls back to getIntImmCost.
// NOTE(review): listing lines 469 (part of the parameter list) and 520 (the
// end of the SSAT condition) are absent from this copy of the file.
467InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
468 const APInt &Imm, Type *Ty,
470 Instruction *Inst) const {
471 // Division by a constant can be turned into multiplication, but only if we
472 // know it's constant. So it's not so much that the immediate is cheap (it's
473 // not), but that the alternative is worse.
474 // FIXME: this is probably unneeded with GlobalISel.
475 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
476 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
477 Idx == 1)
478 return 0;
479
480 // Leave any gep offsets for the CodeGenPrepare, which will do a better job at
481 // splitting any large offsets.
482 if (Opcode == Instruction::GetElementPtr && Idx != 0)
483 return 0;
484
485 if (Opcode == Instruction::And) {
486 // UXTB/UXTH
487 if (Imm == 255 || Imm == 65535)
488 return 0;
489 // Conversion to BIC is free, and means we can use ~Imm instead.
490 return std::min(getIntImmCost(Imm, Ty, CostKind),
491 getIntImmCost(~Imm, Ty, CostKind));
492 }
493
494 if (Opcode == Instruction::Add)
495 // Conversion to SUB is free, and means we can use -Imm instead.
496 return std::min(getIntImmCost(Imm, Ty, CostKind),
497 getIntImmCost(-Imm, Ty, CostKind));
498
 // 32-bit compares against a negative constant are free when the negated
 // constant is below 1<<12 on Thumb2, or below 1<<8 on any Thumb subtarget.
499 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
500 Ty->getIntegerBitWidth() == 32) {
501 int64_t NegImm = -Imm.getSExtValue();
502 if (ST->isThumb2() && NegImm < 1<<12)
503 // icmp X, #-C -> cmn X, #C
504 return 0;
505 if (ST->isThumb() && NegImm < 1<<8)
506 // icmp X, #-C -> adds X, #C
507 return 0;
508 }
509
510 // xor a, -1 can always be folded to MVN
511 if (Opcode == Instruction::Xor && Imm.isAllOnes())
512 return 0;
513
514 // Ensures negative constant of min(max()) or max(min()) patterns that
515 // match to SSAT instructions don't get hoisted
516 if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
517 Ty->getIntegerBitWidth() <= 32) {
518 if (isSSATMinMaxPattern(Inst, Imm) ||
519 (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
521 return 0;
522 }
523
 // Constants feeding an FP saturation pattern are free when VFP2 is present.
524 if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
525 return 0;
526
527 // We can convert <= -1 to < 0, which is generally quite cheap.
528 if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
529 ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
530 if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
531 return std::min(getIntImmCost(Imm, Ty, CostKind),
532 getIntImmCost(Imm + 1, Ty, CostKind));
533 }
534
535 return getIntImmCost(Imm, Ty, CostKind);
536}
537
540 const Instruction *I) const {
 // Control-flow instruction cost: 0 under the condition below when NEON or
 // MVE integer ops are available, otherwise the base implementation's cost.
 // NOTE(review): the start of the signature (listing lines 538-539) and the
 // first half of this condition (listing line 541) are absent from this copy
 // of the file.
542 (ST->hasNEON() || ST->hasMVEIntegerOps())) {
543 // FIXME: The vectorizer is highly sensitive to the cost of these
544 // instructions, which suggests that it may be using the costs incorrectly.
545 // But, for now, just make them free to avoid performance regressions for
546 // vector targets.
547 return 0;
548 }
549 return BaseT::getCFInstrCost(Opcode, CostKind, I);
550}
551
553 Type *Src,
556 const Instruction *I) const {
 // Cost of a cast with the given Opcode from Src to Dst. A sequence of
 // target-specific cost tables is consulted in order (load/store context, NEON
 // widening users, NEON and MVE vector conversions, scalar conversions); the
 // first matching entry wins, and unmatched casts fall back to the base
 // implementation scaled by an MVE factor where applicable.
 // NOTE(review): parts of the signature (listing lines 552, 554-555) and
 // listing lines 562 and 595 are absent from this copy of the file.
557 int ISD = TLI->InstructionOpcodeToISD(Opcode);
558 assert(ISD && "Invalid opcode");
559
560 // TODO: Allow non-throughput costs that aren't binary.
 // NOTE(review): the condition guarding the first return (listing line 562)
 // is missing; as shown, the lambda maps a cost to 0/1 or passes it through.
561 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
563 return Cost == 0 ? 0 : 1;
564 return Cost;
565 };
 // A floating-point element type counts as legal only when the subtarget has
 // the matching feature: f32 needs VFP2, f64 needs FP64, f16 needs full FP16.
566 auto IsLegalFPType = [this](EVT VT) {
567 EVT EltVT = VT.getScalarType();
568 return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
569 (EltVT == MVT::f64 && ST->hasFP64()) ||
570 (EltVT == MVT::f16 && ST->hasFullFP16());
571 };
572
573 EVT SrcTy = TLI->getValueType(DL, Src);
574 EVT DstTy = TLI->getValueType(DL, Dst);
575
 // The tables below are keyed on simple MVTs; defer to the base class for
 // anything else.
576 if (!SrcTy.isSimple() || !DstTy.isSimple())
577 return AdjustCost(
578 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
579
580 // Extending masked load/Truncating masked stores is expensive because we
581 // currently don't split them. This means that we'll likely end up
582 // loading/storing each element individually (hence the high cost).
583 if ((ST->hasMVEIntegerOps() &&
584 (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
585 Opcode == Instruction::SExt)) ||
586 (ST->hasMVEFloatOps() &&
587 (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
588 IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
589 if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
590 return 2 * DstTy.getVectorNumElements() *
591 ST->getMVEVectorCostFactor(CostKind);
592
593 // The extend of other kinds of load is free
 // NOTE(review): the second half of this condition (listing line 595) is
 // missing from this copy.
594 if (CCH == TTI::CastContextHint::Normal ||
596 static const TypeConversionCostTblEntry LoadConversionTbl[] = {
597 {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
598 {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
599 {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
600 {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
601 {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
602 {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
603 {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
604 {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
605 {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
606 {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
607 {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
608 {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
609 };
610 if (const auto *Entry = ConvertCostTableLookup(
611 LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
612 return AdjustCost(Entry->Cost);
613
614 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
615 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
616 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
617 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
618 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
619 {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
620 {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
621 // The following extend from a legal type to an illegal type, so need to
622 // split the load. This introduced an extra load operation, but the
623 // extend is still "free".
624 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
625 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
626 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
627 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
628 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
629 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
630 };
631 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
632 if (const auto *Entry =
633 ConvertCostTableLookup(MVELoadConversionTbl, ISD,
634 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
635 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
636 }
637
638 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
639 // FPExtends are similar but also require the VCVT instructions.
640 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
641 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
642 };
643 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
644 if (const auto *Entry =
645 ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
646 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
647 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
648 }
649
650 // The truncate of a store is free. This is the mirror of extends above.
 // The store tables list the wide type first, so the lookups below pass
 // SrcTy before DstTy (the reverse of the load-table lookups).
651 static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
652 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
653 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
654 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
655 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
656 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
657 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
658 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
659 };
660 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
661 if (const auto *Entry =
662 ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
663 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
664 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
665 }
666
667 static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
668 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
669 {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
670 };
671 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
672 if (const auto *Entry =
673 ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
674 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
675 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
676 }
677 }
678
679 // NEON vector operations that can extend their inputs.
 // This table is keyed on the ISD opcode of the extend's single user rather
 // than on the extend itself.
680 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
681 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
682 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
683 // vaddl
684 { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
685 { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
686 // vsubl
687 { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
688 { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
689 // vmull
690 { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
691 { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
692 // vshll
693 { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
694 { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
695 };
696
697 auto *User = cast<Instruction>(*I->user_begin());
698 int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
699 if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
700 DstTy.getSimpleVT(),
701 SrcTy.getSimpleVT())) {
702 return AdjustCost(Entry->Cost);
703 }
704 }
705
706 // Single to/from double precision conversions.
707 if (Src->isVectorTy() && ST->hasNEON() &&
708 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
709 DstTy.getScalarType() == MVT::f32) ||
710 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
711 DstTy.getScalarType() == MVT::f64))) {
712 static const CostTblEntry NEONFltDblTbl[] = {
713 // Vector fptrunc/fpext conversions.
714 {ISD::FP_ROUND, MVT::v2f64, 2},
715 {ISD::FP_EXTEND, MVT::v2f32, 2},
716 {ISD::FP_EXTEND, MVT::v4f32, 4}};
717
 // Scale the table cost by the legalisation cost of the source type.
718 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
719 if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
720 return AdjustCost(LT.first * Entry->Cost);
721 }
722
723 // Some arithmetic, load and store operations have specific instructions
724 // to cast up/down their types automatically at no extra cost.
725 // TODO: Get these tables to know at least what the related operations are.
726 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
727 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
728 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
729 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
730 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
731 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
732 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
733
734 // The number of vmovl instructions for the extension.
735 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
736 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
737 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
738 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
739 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
740 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
741 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
742 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
743 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
744 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
745 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
746 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
747 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
748 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
749 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
750 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
751 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
752 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
753
754 // Operations that we legalize using splitting.
755 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
756 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
757
758 // Vector float <-> i32 conversions.
759 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
760 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
761
762 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
763 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
764 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
765 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
766 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
767 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
768 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
769 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
770 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
771 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
772 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
773 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
774 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
775 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
776 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
777 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
778 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
779 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
780 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
781 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
782
783 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
784 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
785 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
786 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
787 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
788 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
789
790 // Vector double <-> i32 conversions.
 // NOTE(review): the v2f64 <- v2i32 pair below is listed a second time a few
 // rows further down with the same cost; the duplicate is redundant.
791 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
792 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
793
794 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
795 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
796 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
797 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
798 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
799 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
800
801 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
802 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
803 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
804 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
805 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
806 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
807 };
808
809 if (SrcTy.isVector() && ST->hasNEON()) {
810 if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
811 DstTy.getSimpleVT(),
812 SrcTy.getSimpleVT()))
813 return AdjustCost(Entry->Cost);
814 }
815
816 // Scalar float to integer conversions.
817 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
818 { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
819 { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
820 { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
821 { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
822 { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
823 { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
824 { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
825 { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
826 { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
827 { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
828 { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
829 { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
830 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
831 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
832 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
833 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
834 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
835 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
836 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
837 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
838 };
839 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
840 if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
841 DstTy.getSimpleVT(),
842 SrcTy.getSimpleVT()))
843 return AdjustCost(Entry->Cost);
844 }
845
846 // Scalar integer to float conversions.
847 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
848 { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
849 { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
850 { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
851 { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
852 { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
853 { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
854 { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
855 { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
856 { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
857 { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
858 { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
859 { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
860 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
861 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
862 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
863 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
864 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
865 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
866 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
867 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
868 };
869
870 if (SrcTy.isInteger() && ST->hasNEON()) {
871 if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
872 ISD, DstTy.getSimpleVT(),
873 SrcTy.getSimpleVT()))
874 return AdjustCost(Entry->Cost);
875 }
876
877 // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
878 // instruction, i8->i32 is two. i64 zexts are an VAND with a constant, sext
879 // are linearised so take more.
880 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
881 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
882 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
883 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
884 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
885 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
886 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
887 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
888 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
889 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
890 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
891 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
892 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
893 };
894
895 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
896 if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
897 ISD, DstTy.getSimpleVT(),
898 SrcTy.getSimpleVT()))
899 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
900 }
901
902 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
903 // As general rule, fp converts that were not matched above are scalarized
904 // and cost 1 vcvt for each lane, so long as the instruction is available.
905 // If not it will become a series of function calls.
906 const InstructionCost CallCost =
907 getCallInstrCost(nullptr, Dst, {Src}, CostKind);
908 int Lanes = 1;
909 if (SrcTy.isFixedLengthVector())
910 Lanes = SrcTy.getVectorNumElements();
911
912 if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
913 return Lanes;
914 else
915 return Lanes * CallCost;
916 }
917
918 if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
919 SrcTy.isFixedLengthVector()) {
920 // Treat a truncate with larger than legal source (128bits for MVE) as
921 // expensive, 2 instructions per lane.
922 if ((SrcTy.getScalarType() == MVT::i8 ||
923 SrcTy.getScalarType() == MVT::i16 ||
924 SrcTy.getScalarType() == MVT::i32) &&
925 SrcTy.getSizeInBits() > 128 &&
926 SrcTy.getSizeInBits() > DstTy.getSizeInBits())
927 return SrcTy.getVectorNumElements() * 2;
928 }
929
930 // Scalar integer conversion costs.
931 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
932 // i16 -> i64 requires two dependent operations.
933 { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
934
935 // Truncates on i64 are assumed to be free.
936 { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
937 { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
938 { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
939 { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
940 };
941
942 if (SrcTy.isInteger()) {
943 if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
944 DstTy.getSimpleVT(),
945 SrcTy.getSimpleVT()))
946 return AdjustCost(Entry->Cost);
947 }
948
 // Fallback: base-class cost, multiplied by the MVE vector cost factor for
 // vector sources on MVE subtargets.
949 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
950 ? ST->getMVEVectorCostFactor(CostKind)
951 : 1;
952 return AdjustCost(
953 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
954}
955
// NOTE(review): the line that opens this definition (return type and function
// name) is missing from this listing; only the parameter list survives. The
// body falls back to BaseT::getVectorInstrCost, so this is presumably the ARM
// override of that hook - confirm against the upstream file.
//
// Cost of an insertelement/extractelement on ValTy, checked in this order:
//  - 3 for an insert into a vector with elements of at most 32 bits when the
//    subtarget reports slow D-subregister loads;
//  - with NEON: 3 for integer element types, otherwise at least 2 for
//    elements of at most 32 bits;
//  - with MVE integer ops: LT.first, multiplied by 4 for integer elements;
//  - otherwise whatever the base implementation returns.
957 unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index,
958 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
959 // Penalize inserting into a D-subregister. We end up with a three times
960 // lower estimated throughput on swift.
961 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
962 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
963 return 3;
964
965 if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
966 Opcode == Instruction::ExtractElement)) {
967 // Cross-class copies are expensive on many microarchitectures,
968 // so assume they are expensive by default.
969 if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
970 return 3;
971
972 // Even if it's not a cross class copy, this likely leads to mixing
973 // of NEON and VFP code and should be therefore penalized.
974 if (ValTy->isVectorTy() &&
975 ValTy->getScalarSizeInBits() <= 32)
976 return std::max<InstructionCost>(
977 BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
978 VIC),
979 2U);
980 }
981
982 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
983 Opcode == Instruction::ExtractElement)) {
984 // Integer cross-lane moves are more expensive than float, which can
985 // sometimes just be vmovs. Integer moves involve being passed to GPR
986 // registers, causing more of a delay.
// NOTE(review): the initializer of LT (original line 988) is missing from
// this listing, so the declaration below is incomplete as shown.
987 std::pair<InstructionCost, MVT> LT =
989 return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
990 }
991
992 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
993 VIC);
994}
995
// NOTE(review): the opening line of this definition and part of its parameter
// list (original lines 996 and 998) are missing from this listing. The body
// calls getCmpSelInstrCost recursively and falls back to
// BaseT::getCmpSelInstrCost, so this is presumably the ARM override of that
// hook - confirm against the upstream file.
//
// Cost of a compare or select on ValTy. Special cases are tried before the
// generic fallback: scalar selects on Thumb, vector compares/selects that
// match a min/max/abs pattern, NEON vector selects, and MVE vector compares.
997 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
999 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
1000 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1001
1002 // Thumb scalar code size cost for select.
// NOTE(review): the first line of this condition (original 1003) is missing
// from this listing; only its tail is shown below.
1004 ST->isThumb() && !ValTy->isVectorTy()) {
1005 // Assume expensive structs.
1006 if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
1007 return TTI::TCC_Expensive;
1008
1009 // Select costs can vary because they:
1010 // - may require one or more conditional mov (including an IT),
1011 // - can't operate directly on immediates,
1012 // - require live flags, which we can't copy around easily.
// NOTE(review): the declaration of Cost (original line 1013) is missing from
// this listing.
1014
1015 // Possible IT instruction for Thumb2, or more for Thumb1.
1016 ++Cost;
1017
1018 // i1 values may need rematerialising by using mov immediates and/or
1019 // flag setting instructions.
1020 if (ValTy->isIntegerTy(1))
1021 ++Cost;
1022
1023 return Cost;
1024 }
1025
1026 // If this is a vector min/max/abs, use the cost of that intrinsic directly
1027 // instead. Hopefully when min/max intrinsics are more prevalent this code
1028 // will not be needed.
// When I is a compare with exactly one use, Sel is redirected to that single
// user so the pattern match below runs on the select rather than the compare.
1029 const Instruction *Sel = I;
1030 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
1031 Sel->hasOneUse())
1032 Sel = cast<Instruction>(Sel->user_back());
1033 if (Sel && ValTy->isVectorTy() &&
1034 (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
1035 const Value *LHS, *RHS;
1036 SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
// IID stays 0 when the select pattern has no matching intrinsic.
1037 unsigned IID = 0;
1038 switch (SPF) {
1039 case SPF_ABS:
1040 IID = Intrinsic::abs;
1041 break;
1042 case SPF_SMIN:
1043 IID = Intrinsic::smin;
1044 break;
1045 case SPF_SMAX:
1046 IID = Intrinsic::smax;
1047 break;
1048 case SPF_UMIN:
1049 IID = Intrinsic::umin;
1050 break;
1051 case SPF_UMAX:
1052 IID = Intrinsic::umax;
1053 break;
1054 case SPF_FMINNUM:
1055 IID = Intrinsic::minnum;
1056 break;
1057 case SPF_FMAXNUM:
1058 IID = Intrinsic::maxnum;
1059 break;
1060 default:
1061 break;
1062 }
1063 if (IID) {
1064 // The ICmp is free, the select gets the cost of the min/max/etc
1065 if (Sel != I)
1066 return 0;
1067 IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
1068 return getIntrinsicInstrCost(CostAttrs, CostKind);
1069 }
1070 }
1071
1072 // On NEON a vector select gets lowered to vbsl.
1073 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
1074 // Lowering of some vector selects is currently far from perfect.
1075 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
1076 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
1077 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
1078 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
1079 };
1080
1081 EVT SelCondTy = TLI->getValueType(DL, CondTy);
1082 EVT SelValTy = TLI->getValueType(DL, ValTy);
1083 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1084 if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
1085 SelCondTy.getSimpleVT(),
1086 SelValTy.getSimpleVT()))
1087 return Entry->Cost;
1088 }
1089
// No table entry: charge the first component of the type legalization cost.
1090 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1091 return LT.first;
1092 }
1093
1094 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1095 (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1096 cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
1097 FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
// NOTE(review): the definition of VecCondTy (original line 1098) and the
// statement guarded by the check below (original line 1100) are missing from
// this listing.
1099 if (!VecCondTy)
1101
1102 // If we don't have mve.fp any fp operations will need to be scalarized.
1103 if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1104 // One scalarization insert, one scalarization extract and the cost of the
1105 // fcmps.
1106 return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
1107 /*Extract*/ true, CostKind) +
1108 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1109 /*Extract*/ false, CostKind) +
1110 VecValTy->getNumElements() *
1111 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1112 VecCondTy->getScalarType(), VecPred,
1113 CostKind, Op1Info, Op2Info, I);
1114 }
1115
1116 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1117 int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1118 // There are two types - the input that specifies the type of the compare
1119 // and the output vXi1 type. Because we don't know how the output will be
1120 // split, we may need an expensive shuffle to get two in sync. This has the
1121 // effect of making larger than legal compares (v8i32 for example)
1122 // expensive.
1123 if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1124 if (LT.first > 1)
1125 return LT.first * BaseCost +
1126 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1127 /*Extract*/ false, CostKind);
1128 return BaseCost;
1129 }
1130 }
1131
1132 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1133 // for "multiple beats" potentially needed by MVE instructions.
1134 int BaseCost = 1;
1135 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1136 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1137
1138 return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred,
1139 CostKind, Op1Info, Op2Info, I);
1140}
1141
// NOTE(review): most of this signature (original lines 1142-1143 and 1145) is
// missing from this listing, including the declarations of PtrTy, SE and
// CostKind used below. The body falls back to
// BaseT::getAddressComputationCost, so this is presumably the ARM override of
// that hook - confirm against the upstream file.
//
// With NEON: returns NumVectorInstToHideOverhead (10) for a vector pointer
// type when SE is available and the access is not a constant stride below
// MaxMergeDistance + 1, and 1 otherwise. Without NEON the base implementation
// decides.
1144 const SCEV *Ptr,
1146 // Address computations in vectorized code with non-consecutive addresses will
1147 // likely result in more instructions compared to scalar code where the
1148 // computation can more often be merged into the index mode. The resulting
1149 // extra micro-ops can significantly decrease throughput.
1150 unsigned NumVectorInstToHideOverhead = 10;
1151 int MaxMergeDistance = 64;
1152
1153 if (ST->hasNEON()) {
1154 if (PtrTy->isVectorTy() && SE &&
1155 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1156 return NumVectorInstToHideOverhead;
1157
1158 // In many cases the address computation is not merged into the instruction
1159 // addressing mode.
1160 return 1;
1161 }
1162 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1163}
1164
// NOTE(review): the signature and the statement that binds II (original lines
// 1165-1166) are missing from this listing; the extra closing brace near the
// end belongs to that missing statement. The function name is not visible
// here - presumably the LSR chain-element profitability hook; confirm
// against the upstream file.
//
// Returns true when II is one of the MVE VCTP intrinsics (8/16/32/64-bit
// variants) and false otherwise.
1167 // If a VCTP is part of a chain, it's already profitable and shouldn't be
1168 // optimized, else LSR may block tail-predication.
1169 switch (II->getIntrinsicID()) {
1170 case Intrinsic::arm_mve_vctp8:
1171 case Intrinsic::arm_mve_vctp16:
1172 case Intrinsic::arm_mve_vctp32:
1173 case Intrinsic::arm_mve_vctp64:
1174 return true;
1175 default:
1176 break;
1177 }
1178 }
1179 return false;
1180}
1181
// NOTE(review): the opening line of this definition (return type, name and the
// DataTy/Alignment parameters used below) is missing from this listing. The
// parameter shape matches the isLegalMaskedLoad(Src, Alignment, AddressSpace)
// call made elsewhere in this file, so that is presumably the function being
// defined - confirm against the upstream file.
//
// Returns false when masked loads/stores are disabled through
// EnableMaskedLoadStores or the subtarget has no MVE integer ops. For fixed
// vectors it also rejects two-element vectors and floating-point vectors that
// are not 128 bits wide. Otherwise the access is accepted for 8-bit elements,
// for 16-bit elements aligned to at least 2 and for 32-bit elements aligned
// to at least 4. The address space and mask kind arguments are ignored.
1183 unsigned /*AddressSpace*/,
1184 TTI::MaskKind /*MaskKind*/) const {
1185 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1186 return false;
1187
1188 if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1189 // Don't support v2i1 yet.
1190 if (VecTy->getNumElements() == 2)
1191 return false;
1192
1193 // We don't support extending fp types.
1194 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1195 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1196 return false;
1197 }
1198
// Element width decides the minimum alignment that is accepted.
1199 unsigned EltWidth = DataTy->getScalarSizeInBits();
1200 return (EltWidth == 32 && Alignment >= 4) ||
1201 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1202}
1203
/// Returns true when a masked gather of \p Ty with alignment \p Alignment is
/// considered legal: gathers/scatters must be enabled through
/// EnableMaskedGatherScatters, the subtarget must have MVE integer ops, and
/// the element width must be 8 bits, 16 bits with alignment >= 2, or 32 bits
/// with alignment >= 4.
1204bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) const {
1205 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1206 return false;
1207
// Only the scalar element width and the alignment are inspected here.
1208 unsigned EltWidth = Ty->getScalarSizeInBits();
1209 return ((EltWidth == 32 && Alignment >= 4) ||
1210 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1211}
1212
1213/// Given a memcpy/memset/memmove instruction, return the number of memory
1214/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1215/// call is used.
// NOTE(review): the signature line (original 1216) is missing from this
// listing, so the function name and the declaration of I are not visible.
1217 MemOp MOp;
// ~0u marks an address space that has not been filled in (the source address
// space stays at this value for memset).
1218 unsigned DstAddrSpace = ~0u;
1219 unsigned SrcAddrSpace = ~0u;
1220 const Function *F = I->getParent()->getParent();
1221
1222 if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1223 ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1224 // If 'size' is not a constant, a library call will be generated.
1225 if (!C)
1226 return -1;
1227
1228 const unsigned Size = C->getValue().getZExtValue();
1229 const Align DstAlign = MC->getDestAlign().valueOrOne();
1230 const Align SrcAlign = MC->getSourceAlign().valueOrOne();
1231
1232 MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1233 /*IsVolatile*/ false);
1234 DstAddrSpace = MC->getDestAddressSpace();
1235 SrcAddrSpace = MC->getSourceAddressSpace();
1236 }
1237 else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1238 ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1239 // If 'size' is not a constant, a library call will be generated.
1240 if (!C)
1241 return -1;
1242
1243 const unsigned Size = C->getValue().getZExtValue();
1244 const Align DstAlign = MS->getDestAlign().valueOrOne();
1245
1246 MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1247 /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1248 DstAddrSpace = MS->getDestAddressSpace();
1249 }
1250 else
1251 llvm_unreachable("Expected a memcpy/move or memset!");
1252
// Limit is the per-intrinsic store limit from the target lowering; Factor is
// the number of memory operations charged per entry of MemOps (2 by default,
// 1 for memset).
1253 unsigned Limit, Factor = 2;
1254 switch(I->getIntrinsicID()) {
1255 case Intrinsic::memcpy:
1256 Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1257 break;
1258 case Intrinsic::memmove:
1259 Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1260 break;
1261 case Intrinsic::memset:
1262 Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1263 Factor = 1;
1264 break;
1265 default:
1266 llvm_unreachable("Expected a memcpy/move or memset!");
1267 }
1268
1269 // MemOps will be populated with a list of data types that need to be
1270 // loaded and stored. That's why we multiply the number of elements by
1271 // Factor (2 for memcpy/memmove, 1 for memset) to get the cost.
1272 std::vector<EVT> MemOps;
1273 LLVMContext &C = F->getContext();
1274 if (getTLI()->findOptimalMemOpLowering(C, MemOps, Limit, MOp, DstAddrSpace,
1275 SrcAddrSpace, F->getAttributes(),
1276 nullptr))
1277 return MemOps.size() * Factor;
1278
1279 // If we can't find an optimal memop lowering, return the default cost
1280 return -1;
1281}
1282
// NOTE(review): the opening lines of this definition (original 1283-1284),
// including the statement that defines NumOps, are missing from this listing.
// The function name is not visible here - presumably the memcpy cost hook;
// confirm against the upstream file.
//
// Returns NumOps unchanged unless it is -1, in which case a fixed cost of 4
// is returned instead.
1285
1286 // To model the cost of a library call, we assume 1 for the call, and
1287 // 3 for the argument setup.
1288 if (NumOps == -1)
1289 return 4;
1290 return NumOps;
1291}
1292
// NOTE(review): several lines of this definition are missing from this
// listing (original 1293, 1296, 1298, 1310, 1406, 1408, 1422, 1425 and 1439),
// including the line with the function name and the declarations of Kind,
// CostKind and Args. The body falls back to BaseT::getShuffleCost, so this is
// presumably the ARM override of that hook - confirm against the upstream
// file.
//
// Cost of a shuffle of SrcTy producing DstTy. NEON uses per-type tables for
// broadcast, reverse and select shuffles; MVE uses a broadcast table plus
// mask-based checks for deinterleaving loads, interleaving stores and VREV
// masks. Anything not matched is costed by the base implementation, scaled by
// the MVE vector cost factor when MVE integer ops are available.
1294 VectorType *DstTy, VectorType *SrcTy,
1295 ArrayRef<int> Mask,
1297 int Index, VectorType *SubTp,
1299 const Instruction *CxtI) const {
1300 assert((Mask.empty() || DstTy->isScalableTy() ||
1301 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
1302 "Expected the Mask to match the return size if given");
1303 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
1304 "Expected the same scalar types");
1305
1306 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1307 // Treat extractsubvector as single op permutation.
1308 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
// NOTE(review): the statement guarded by this check (original line 1310) is
// missing from this listing.
1309 if (IsExtractSubvector)
1311 if (ST->hasNEON()) {
1312 if (Kind == TTI::SK_Broadcast) {
1313 static const CostTblEntry NEONDupTbl[] = {
1314 // VDUP handles these cases.
1315 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1316 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1317 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1318 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1319 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1320 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1321
1322 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1323 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1324 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1325 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
1326
1327 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1328 if (const auto *Entry =
1329 CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1330 return LT.first * Entry->Cost;
1331 }
1332 if (Kind == TTI::SK_Reverse) {
1333 static const CostTblEntry NEONShuffleTbl[] = {
1334 // Reverse shuffle cost one instruction if we are shuffling within a
1335 // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1336 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1337 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1338 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1339 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1340 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1341 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1342
1343 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1344 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1345 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
1346 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
1347
1348 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1349 if (const auto *Entry =
1350 CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1351 return LT.first * Entry->Cost;
1352 }
1353 if (Kind == TTI::SK_Select) {
1354 static const CostTblEntry NEONSelShuffleTbl[] = {
1355 // Select shuffle cost table for ARM. Cost is the number of
1356 // instructions
1357 // required to create the shuffled vector.
1358
1359 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1360 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1361 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1362 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1363
1364 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1365 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1366 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
1367
1368 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
1369
1370 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
1371
1372 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1373 if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1374 ISD::VECTOR_SHUFFLE, LT.second))
1375 return LT.first * Entry->Cost;
1376 }
1377 }
1378 if (ST->hasMVEIntegerOps()) {
1379 if (Kind == TTI::SK_Broadcast) {
1380 static const CostTblEntry MVEDupTbl[] = {
1381 // VDUP handles these cases.
1382 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1383 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1384 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
1385 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1386 {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
1387
1388 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1389 if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1390 LT.second))
1391 return LT.first * Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
1392 }
1393
1394 if (!Mask.empty()) {
1395 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1396 // Check for LD2/LD4 instructions, which are represented in llvm IR as
1397 // deinterleaving-shuffle(load). The shuffle cost could potentially be
1398 // free, but we model it with a non-zero cost (derived from LT.first) so
1399 // that LD2/LD4 have a higher cost than just the load.
// NOTE(review): the mask checks that complete the two interleave-factor
// clauses below (original lines 1406 and 1408) are missing from this listing.
1400 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
1401 (LT.second.getScalarSizeInBits() == 8 ||
1402 LT.second.getScalarSizeInBits() == 16 ||
1403 LT.second.getScalarSizeInBits() == 32) &&
1404 LT.second.getSizeInBits() == 128 &&
1405 ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
1407 (TLI->getMaxSupportedInterleaveFactor() == 4 &&
1409 return ST->getMVEVectorCostFactor(CostKind) *
1410 std::max<InstructionCost>(1, LT.first / 4);
1411
1412 // Check for ST2/ST4 instructions, which are represented in llvm IR as
1413 // store(interleaving-shuffle). The shuffle cost could potentially be
1414 // free, but we model it with a cost of LT.first so that ST2/ST4 have a
1415 // higher cost than just the store.
// NOTE(review): the callee names of the two mask checks below (original lines
// 1422 and 1425) are missing from this listing; only their arguments remain.
1416 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
1417 (LT.second.getScalarSizeInBits() == 8 ||
1418 LT.second.getScalarSizeInBits() == 16 ||
1419 LT.second.getScalarSizeInBits() == 32) &&
1420 LT.second.getSizeInBits() == 128 &&
1421 ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
1423 Mask, 2, SrcTy->getElementCount().getKnownMinValue() * 2)) ||
1424 (TLI->getMaxSupportedInterleaveFactor() == 4 &&
1426 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2))))
1427 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1428
// Masks that fit in one legal vector and match a 16/32/64-bit VREV pattern
// are charged one MVE vector op per legalized part.
1429 if (LT.second.isVector() &&
1430 Mask.size() <= LT.second.getVectorNumElements() &&
1431 (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1432 isVREVMask(Mask, LT.second, 64)))
1433 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1434 }
1435 }
1436
1437 // Restore optimal kind.
// NOTE(review): the statement guarded by this check (original line 1439) is
// missing from this listing.
1438 if (IsExtractSubvector)
1440 int BaseCost = ST->hasMVEIntegerOps() && SrcTy->isVectorTy()
1441 ? ST->getMVEVectorCostFactor(CostKind)
1442 : 1;
1443 return BaseCost * BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind,
1444 Index, SubTp);
1445}
1446
// NOTE(review): the opening line of this definition and part of its parameter
// list (original lines 1447 and 1449, which declare Op1Info and Op2Info) are
// missing from this listing. The body calls getArithmeticInstrCost
// recursively, so that is presumably the function being defined - confirm
// against the upstream file.
//
// Cost of an arithmetic instruction on Ty. In order: a code-size special case
// for i1 logic on Thumb, a NEON path (division table, otherwise a Cost value
// with an extra penalty for v2i64 with a uniform constant operand), free
// shifts that fold into their single user, free multiplies that feed a
// DSP (S/U)MLAL pattern, legal/custom/promoted operations, and finally
// scalarization of expanded fixed-vector operations.
1448 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1450 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
1451 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1452 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1453 // Make operations on i1 relatively expensive as this often involves
1454 // combining predicates. AND and XOR should be easier to handle with IT
1455 // blocks.
1456 switch (ISDOpcode) {
1457 default:
1458 break;
1459 case ISD::AND:
1460 case ISD::XOR:
1461 return 2;
1462 case ISD::OR:
1463 return 3;
1464 }
1465 }
1466
1467 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1468
1469 if (ST->hasNEON()) {
1470 const unsigned FunctionCallDivCost = 20;
1471 const unsigned ReciprocalDivCost = 10;
1472 static const CostTblEntry CostTbl[] = {
1473 // Division.
1474 // These costs are somewhat random. Choose a cost of 20 to indicate that
1475 // vectorizing division (added function call) is going to be very expensive.
1476 // Double registers types.
1477 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1478 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1479 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1480 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1481 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1482 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1483 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1484 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1485 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1486 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1487 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1488 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1489 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1490 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1491 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1492 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1493 // Quad register types.
1494 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1495 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1496 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1497 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1498 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1499 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1500 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1501 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1502 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1503 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1504 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1505 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1506 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1507 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1508 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1509 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1510 // Multiplication.
1511 };
1512
1513 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1514 return LT.first * Entry->Cost;
1515
// NOTE(review): the first line of the statement that declares Cost (original
// line 1516) is missing from this listing; only its argument list remains.
1517 Opcode, Ty, CostKind, Op1Info, Op2Info);
1518
1519 // This is somewhat of a hack. The problem that we are facing is that SROA
1520 // creates a sequence of shift, and, or instructions to construct values.
1521 // These sequences are recognized by the ISel and have zero-cost. Not so for
1522 // the vectorized code. Because we have support for v2i64 but not i64 those
1523 // sequences look particularly beneficial to vectorize.
1524 // To work around this we increase the cost of v2i64 operations to make them
1525 // seem less beneficial.
1526 if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
1527 Cost += 4;
1528
1529 return Cost;
1530 }
1531
1532 // If this operation is a shift on arm/thumb2, it might well be folded into
1533 // the following instruction, hence having a cost of 0.
1534 auto LooksLikeAFreeShift = [&]() {
1535 if (ST->isThumb1Only() || Ty->isVectorTy())
1536 return false;
1537
1538 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1539 return false;
1540 if (!Op2Info.isUniform() || !Op2Info.isConstant())
1541 return false;
1542
1543 // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1544 switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1545 case Instruction::Add:
1546 case Instruction::Sub:
1547 case Instruction::And:
1548 case Instruction::Xor:
1549 case Instruction::Or:
1550 case Instruction::ICmp:
1551 return true;
1552 default:
1553 return false;
1554 }
1555 };
1556 if (LooksLikeAFreeShift())
1557 return 0;
1558
1559 // When targets have both DSP and MVE we find that
1560 // the compiler will attempt to vectorize as well as using
1561 // scalar (S/U)MLAL operations. This is in cases where we have
1562 // the pattern ext(mul(ext(i16), ext(i16))) we find
1563 // that codegen performs better when only using (S/U)MLAL scalar
1564 // ops instead of trying to mix vector ops with (S/U)MLAL ops. We therefore
1565 // check if a mul instruction is used in a (U/S)MLAL pattern.
1566 auto MulInDSPMLALPattern = [&](const Instruction *I, unsigned Opcode,
1567 Type *Ty) -> bool {
1568 if (!ST->hasDSP())
1569 return false;
1570
1571 if (!I)
1572 return false;
1573
1574 if (Opcode != Instruction::Mul)
1575 return false;
1576
1577 if (Ty->isVectorTy())
1578 return false;
1579
// The two cast<Instruction> helpers below are only called after IsExtInst has
// confirmed that the value is a zext or sext instruction.
1580 auto ValueOpcodesEqual = [](const Value *LHS, const Value *RHS) -> bool {
1581 return cast<Instruction>(LHS)->getOpcode() ==
1582 cast<Instruction>(RHS)->getOpcode();
1583 };
1584 auto IsExtInst = [](const Value *V) -> bool {
1585 return isa<ZExtInst>(V) || isa<SExtInst>(V);
1586 };
1587 auto IsExtensionFromHalf = [](const Value *V) -> bool {
1588 return cast<Instruction>(V)->getOperand(0)->getType()->isIntegerTy(16);
1589 };
1590
1591 // We check the arguments of the instruction to see if they're extends
1592 auto *BinOp = dyn_cast<BinaryOperator>(I);
1593 if (!BinOp)
1594 return false;
1595 Value *Op0 = BinOp->getOperand(0);
1596 Value *Op1 = BinOp->getOperand(1);
1597 if (IsExtInst(Op0) && IsExtInst(Op1) && ValueOpcodesEqual(Op0, Op1)) {
1598 // We're interested in an ext of an i16
1599 if (!I->getType()->isIntegerTy(32) || !IsExtensionFromHalf(Op0) ||
1600 !IsExtensionFromHalf(Op1))
1601 return false;
1602 // Every user of the result must itself be a sign or zero extension
1603 // (presumably to i64; the destination width is not checked here).
1604 for (auto *U : I->users())
1605 if (!IsExtInst(U))
1606 return false;
1607 return true;
1608 }
1609
1610 return false;
1611 };
1612
1613 if (MulInDSPMLALPattern(CxtI, Opcode, Ty))
1614 return 0;
1615
1616 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1617 // for "multiple beats" potentially needed by MVE instructions.
1618 int BaseCost = 1;
1619 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1620 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1621
1622 // The rest of this mostly follows what is done in
1623 // BaseT::getArithmeticInstrCost, without treating floats as more expensive
1624 // than scalars or increasing the costs for custom operations. The result is
1625 // also multiplied by the MVEVectorCostFactor where appropriate.
1626 if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1627 return LT.first * BaseCost;
1628
1629 // Else this is expand, assume that we need to scalarize this op.
1630 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1631 unsigned Num = VTy->getNumElements();
// NOTE(review): the first line of the statement that declares Cost (original
// line 1632) is missing from this listing; only the initializer remains.
1633 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1634 // Return the cost of multiple scalar invocation plus the cost of
1635 // inserting and extracting the values.
1636 SmallVector<Type *> Tys(Args.size(), Ty);
1637 return BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind) +
1638 Num * Cost;
1639 }
1640
1641 return BaseCost;
1642}
1643
// NOTE(review): the opening line of this definition and one parameter line
// (original lines 1644 and 1647, which declare Opcode, Src and CostKind) are
// missing from this listing. The body falls back to BaseT::getMemoryOpCost, so
// this is presumably the ARM override of that hook - confirm against the
// upstream file.
//
// Cost of a load or store of type Src. Handles, in order: load latency
// queries (base implementation), an early return of 1, types with no simple
// value type such as structs (base implementation), NEON vectors of double
// whose alignment is not 16 (4 per legalized part), MVE half/float
// extending loads and truncating stores, and finally the base cost scaled by
// the MVE vector cost factor for vectors when MVE integer ops are available.
1645 Align Alignment,
1646 unsigned AddressSpace,
1648 TTI::OperandValueInfo OpInfo,
1649 const Instruction *I) const {
1650 // FIXME: Load latency isn't handled here
1651 if (Opcode == Instruction::Load && CostKind == TTI::TCK_Latency)
1652 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1653 CostKind, OpInfo, I);
1654
1655 // TODO: Handle other cost kinds.
// NOTE(review): the condition guarding this return (original line 1656) is
// missing from this listing.
1657 return 1;
1658
1659 // Type legalization can't handle structs
1660 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1661 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1662 CostKind);
1663
1664 if (ST->hasNEON() && Src->isVectorTy() && Alignment != Align(16) &&
1665 cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1666 // Unaligned loads/stores are extremely inefficient.
1667 // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
1668 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1669 return LT.first * 4;
1670 }
1671
1672 // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1673 // Same for stores.
1674 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1675 ((Opcode == Instruction::Load && I->hasOneUse() &&
1676 isa<FPExtInst>(*I->user_begin())) ||
1677 (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
// NOTE(review): the definition of SrcVTy (original line 1678) is missing from
// this listing.
// DstTy is the type on the other side of the conversion: the fpext result for
// a load, or the fptrunc operand type for a store.
1679 Type *DstTy =
1680 Opcode == Instruction::Load
1681 ? (*I->user_begin())->getType()
1682 : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1683 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1684 DstTy->getScalarType()->isFloatTy())
1685 return ST->getMVEVectorCostFactor(CostKind);
1686 }
1687
1688 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1689 ? ST->getMVEVectorCostFactor(CostKind)
1690 : 1;
1691 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1692 CostKind, OpInfo, I);
1693}
1694
// NOTE(review): the signature (original lines 1695-1697, declaring MICA and
// CostKind) and the statement after the switch (original line 1706) are
// missing from this listing. The function name is not visible here -
// presumably the memory-intrinsic cost hook; confirm against the upstream
// file.
//
// Dispatches on the intrinsic ID carried by MICA: masked gathers/scatters go
// to getGatherScatterOpCost, masked loads/stores to getMaskedMemoryOpCost.
1698 switch (MICA.getID()) {
1699 case Intrinsic::masked_scatter:
1700 case Intrinsic::masked_gather:
1701 return getGatherScatterOpCost(MICA, CostKind);
1702 case Intrinsic::masked_load:
1703 case Intrinsic::masked_store:
1704 return getMaskedMemoryOpCost(MICA, CostKind);
1705 }
1707}
1708
// NOTE(review): the signature (original lines 1709-1711, declaring MICA and
// CostKind) is missing from this listing. The function name is not visible
// here; the getMaskedMemoryOpCost(MICA, CostKind) call elsewhere in this file
// suggests this is its definition - confirm against the upstream file.
//
// Cost of a masked load/store described by MICA. With MVE integer ops, a legal
// masked load or store costs one MVE vector op. Other fixed vectors are
// charged 8 per element.
1712 unsigned IID = MICA.getID();
1713 Type *Src = MICA.getDataType();
1714 Align Alignment = MICA.getAlignment();
1715 unsigned AddressSpace = MICA.getAddressSpace();
1716 if (ST->hasMVEIntegerOps()) {
1717 if (IID == Intrinsic::masked_load &&
1718 isLegalMaskedLoad(Src, Alignment, AddressSpace))
1719 return ST->getMVEVectorCostFactor(CostKind);
1720 if (IID == Intrinsic::masked_store &&
1721 isLegalMaskedStore(Src, Alignment, AddressSpace))
1722 return ST->getMVEVectorCostFactor(CostKind);
1723 }
// NOTE(review): the statement guarded by this check (original line 1725) is
// missing from this listing.
1724 if (!isa<FixedVectorType>(Src))
1726 // Scalarized cost, which is currently very high due to the (in)efficiency
1727 // of the generated code.
1728 return cast<FixedVectorType>(Src)->getNumElements() * 8;
1729}
1730
// NOTE(review): the opening line of this definition (original line 1731, with
// the return type and function name) is missing from this listing. The body
// falls back to BaseT::getInterleavedMemoryOpCost, so this is presumably the
// ARM override of that hook - confirm against the upstream file.
//
// Cost of an interleaved load/store group of VecTy with the given Factor.
// The target-specific path is taken only when Factor is within the supported
// maximum, elements are not 64 bits wide and no masking is requested;
// everything else is costed by the base implementation.
1732 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1733 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1734 bool UseMaskForCond, bool UseMaskForGaps) const {
1735 assert(Factor >= 2 && "Invalid interleave factor");
1736 assert(isa<VectorType>(VecTy) && "Expect a vector type");
1737
1738 // vldN/vstN doesn't support vector types of i64/f64 element.
1739 bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1740
1741 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1742 !UseMaskForCond && !UseMaskForGaps) {
1743 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
// SubVecTy is the type of one de-interleaved member: NumElts / Factor lanes.
1744 auto *SubVecTy =
1745 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1746
1747 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1748 // Accesses having vector types that are a multiple of 128 bits can be
1749 // matched to more than one vldN/vstN instruction.
1750 int BaseCost =
1751 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1752 if (NumElts % Factor == 0 &&
1753 TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1754 return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1755
1756 // Some smaller than legal interleaved patterns are cheap as we can make
1757 // use of the vmovn or vrev patterns to interleave a standard load. This is
1758 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1759 // promoted differently). The cost of 2 here is then a load and vrev or
1760 // vmovn.
1761 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1762 VecTy->isIntOrIntVectorTy() &&
1763 DL.getTypeSizeInBits(SubVecTy).getFixedValue() <= 64)
1764 return 2 * BaseCost;
1765 }
1766
1767 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1768 Alignment, AddressSpace, CostKind,
1769 UseMaskForCond, UseMaskForGaps);
1770}
1771
1775
// Cost of a masked gather/scatter access described by MICA. The result is
// either VectorCost (the access is expected to stay a vector gather/scatter)
// or ScalarCost (it is expected to be scalarized), both computed below.
// NOTE(review): the signature and a few statements (listing lines 1772-1774,
// 1784, 1823 and 1840) are absent from this listing. In particular the
// statement executed when MVE or gather/scatter support is disabled is not
// visible. Restore them from the authoritative source.
1776 Type *DataTy = MICA.getDataType();
1777 const Value *Ptr = MICA.getPointer();
1778 bool VariableMask = MICA.getVariableMask();
1779 Align Alignment = MICA.getAlignment();
1780 const Instruction *I = MICA.getInst();
1781
1782 using namespace PatternMatch;
1783 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
// NOTE(review): the statement controlled by the check above is missing here.
1785
1786 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1787 auto *VTy = cast<FixedVectorType>(DataTy);
1788
1789 // TODO: Splitting, once we do that.
1790
1791 unsigned NumElems = VTy->getNumElements();
1792 unsigned EltSize = VTy->getScalarSizeInBits();
1793 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(DataTy);
1794
1795 // For now, it is assumed that for the MVE gather instructions the loads are
1796 // all effectively serialised. This means the cost is the scalar cost
1797 // multiplied by the number of elements being loaded. This is possibly very
1798 // conservative, but even so we still end up vectorising loops because the
1799 // cost per iteration for many loops is lower than for scalar loops.
1800 InstructionCost VectorCost =
1801 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1802 // The scalarization cost should be a lot higher. We use the number of vector
1803 // elements plus the scalarization overhead. If masking is required then a lot
1804 // of little blocks will be needed and potentially a scalarized p0 mask,
1805 // greatly increasing the cost.
1806 InstructionCost ScalarCost =
1807 NumElems * LT.first + (VariableMask ? NumElems * 5 : 0) +
1808 BaseT::getScalarizationOverhead(VTy, /*Insert*/ true, /*Extract*/ false,
1809 CostKind) +
1810 BaseT::getScalarizationOverhead(VTy, /*Insert*/ false, /*Extract*/ true,
1811 CostKind);
1812
// Sub-byte elements, or an alignment below the element size in bytes, are
// costed as scalarized.
1813 if (EltSize < 8 || Alignment < EltSize / 8)
1814 return ScalarCost;
1815
// ExtSize is the element width after folding in an adjacent extend (for
// loads) or truncate (for stores); it defaults to the memory element width.
1816 unsigned ExtSize = EltSize;
1817 // Check whether there's a single user that asks for an extended type
1818 if (I != nullptr) {
1819 // Depending on the caller of this function, a gather instruction will
1820 // either have opcode Instruction::Load or be a call to the masked_gather
1821 // intrinsic
// NOTE(review): the second half of this condition (listing line 1823) is
// missing from this listing.
1822 if ((I->getOpcode() == Instruction::Load ||
1824 I->hasOneUse()) {
1825 const User *Us = *I->users().begin();
1826 if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1827 // only allow valid type combinations
1828 unsigned TypeSize =
1829 cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1830 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1831 (TypeSize == 16 && EltSize == 8)) &&
1832 TypeSize * NumElems == 128) {
1833 ExtSize = TypeSize;
1834 }
1835 }
1836 }
1837 // Check whether the input data needs to be truncated
1838 TruncInst *T;
// NOTE(review): the second half of this condition (listing line 1840) is
// missing from this listing.
1839 if ((I->getOpcode() == Instruction::Store ||
1841 (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1842 // Only allow valid type combinations
1843 unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1844 if (((EltSize == 16 && TypeSize == 32) ||
1845 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1846 TypeSize * NumElems == 128)
1847 ExtSize = TypeSize;
1848 }
1849 }
1850
// Only accesses that fill exactly 128 bits (after any folded extend/trunc)
// with at least four lanes are considered for the vector cost.
1851 if (ExtSize * NumElems != 128 || NumElems < 4)
1852 return ScalarCost;
1853
1854 // Any (aligned) i32 gather will not need to be scalarised.
1855 if (ExtSize == 32)
1856 return VectorCost;
1857 // For smaller types, we need to ensure that the gep's inputs are correctly
1858 // extended from a small enough value. Other sizes (including i64) are
1859 // scalarized for now.
1860 if (ExtSize != 8 && ExtSize != 16)
1861 return ScalarCost;
1862
// Look through a bitcast of the pointer before inspecting the GEP.
1863 if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1864 Ptr = BC->getOperand(0);
1865 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
// Only a GEP with a base and a single index is accepted.
1866 if (GEP->getNumOperands() != 2)
1867 return ScalarCost;
1868 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1869 // Scale needs to be correct (which is only relevant for i16s).
1870 if (Scale != 1 && Scale * 8 != ExtSize)
1871 return ScalarCost;
1872 // And we need to zext (not sext) the indexes from a small enough type.
1873 if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1874 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1875 return VectorCost;
1876 }
1877 return ScalarCost;
1878 }
// A pointer that is not a GEP is costed as scalarized.
1879 return ScalarCost;
1880}
1881
// ARM cost of an arithmetic vector reduction of ValTy with opcode Opcode.
// Handles fadd/fmul, and/or/xor, and MVE integer add reductions; everything
// else is forwarded to BaseT::getArithmeticReductionCost.
// NOTE(review): the first signature lines, the CostKind parameter line, the
// tail of the FP return expression (listing line 1920) and the tail of the
// final guard condition (listing line 1952) are missing from this listing.
1884 std::optional<FastMathFlags> FMF,
1886
1887 EVT ValVT = TLI->getValueType(DL, ValTy);
1888 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1889 unsigned EltSize = ValVT.getScalarSizeInBits();
1890
1891 // In general floating point reductions are a series of elementwise
1892 // operations, with free extracts on each step. These are either in-order or
1893 // treewise depending on whether that is allowed by the fast math flags.
1894 if ((ISD == ISD::FADD || ISD == ISD::FMUL) &&
1895 ((EltSize == 32 && ST->hasVFP2Base()) ||
1896 (EltSize == 64 && ST->hasFP64()) ||
1897 (EltSize == 16 && ST->hasFullFP16()))) {
1898 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
// Widest vector (in bits) that can be operated on directly. The -1 becomes
// the maximum unsigned value, so the halving loop below never runs when
// neither MVE float nor NEON is available.
1899 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1900 InstructionCost VecCost = 0;
// When reassociation is allowed, repeatedly halve the vector with a vector
// op until it fits within VecLimit bits.
1901 while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(NumElts) &&
1902 NumElts * EltSize > VecLimit) {
1903 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1904 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1905 NumElts /= 2;
1906 }
1907
1908 // For fp16 we need to extract the upper lane elements. MVE can add a
1909 // VREV+FMIN/MAX to perform another vector step instead.
1910 InstructionCost ExtractCost = 0;
1911 if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() &&
1912 ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) {
1913 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1914 NumElts /= 2;
1915 } else if (ValVT.getVectorElementType() == MVT::f16)
1916 ExtractCost = NumElts / 2;
1917
// NOTE(review): the right-hand operand of this multiplication is missing
// from this listing.
1918 return VecCost + ExtractCost +
1919 NumElts *
1921 }
1922
1923 if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
1924 (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
1925 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
// Same convention as above: -1 means "no vector limit applies".
1926 unsigned VecLimit =
1927 ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1928 InstructionCost VecCost = 0;
1929 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
1930 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1931 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1932 NumElts /= 2;
1933 }
1934 // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
1935 // step.
1936 if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
1937 NumElts * EltSize == 64) {
1938 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts);
1939 VecCost += ST->getMVEVectorCostFactor(CostKind) +
1940 getArithmeticInstrCost(Opcode, VecTy, CostKind);
1941 NumElts /= 2;
1942 }
1943
1944 // From here we extract the elements and perform the and/or/xor.
1945 InstructionCost ExtractCost = NumElts;
1946 return VecCost + ExtractCost +
1947 (NumElts - 1) * getArithmeticInstrCost(
1948 Opcode, ValTy->getElementType(), CostKind);
1949 }
1950
// Only integer add reductions on MVE with a simple value type are costed
// from the table below; anything else takes the generic cost.
// NOTE(review): the last operand of this condition is missing here.
1951 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
1953 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1954
1955 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1956
// One unit per legalized 128-bit vector, scaled by the MVE cost factor.
1957 static const CostTblEntry CostTblAdd[]{
1958 {ISD::ADD, MVT::v16i8, 1},
1959 {ISD::ADD, MVT::v8i16, 1},
1960 {ISD::ADD, MVT::v4i32, 1},
1961 };
1962 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1963 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1964
1965 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1966}
1967
// ARM cost of a reduction of ValTy whose result is extended to ResTy.
// Only MVE integer add reductions with 128-bit-or-smaller inputs get a
// target-specific cost; everything else is forwarded to
// BaseT::getExtendedReductionCost.
// NOTE(review): the first line of the signature (return type and function
// name) is missing from this listing.
1969 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1970 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
1971 EVT ValVT = TLI->getValueType(DL, ValTy);
1972 EVT ResVT = TLI->getValueType(DL, ResTy);
1973
1974 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1975
1976 switch (ISD) {
1977 case ISD::ADD:
1978 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1979 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1980
1981 // The legal cases are:
1982 // VADDV u/s 8/16/32
1983 // VADDLV u/s 32
1984 // Codegen currently cannot always handle larger than legal vectors very
1985 // well, especially for predicated reductions where the mask needs to be
1986 // split, so restrict to 128bit or smaller input types.
1987 unsigned RevVTSize = ResVT.getSizeInBits();
// A 64-bit result is only accepted for v4i32 inputs; v16i8 and v8i16 inputs
// require a result of at most 32 bits.
1988 if (ValVT.getSizeInBits() <= 128 &&
1989 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1990 (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1991 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1992 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1993 }
1994 break;
1995 default:
1996 break;
1997 }
1998 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, FMF,
1999 CostKind);
2000}
2001
// ARM cost of a multiply-accumulate reduction of ValTy producing ResTy.
// MVE inputs of at most 128 bits that match the legal cases listed below are
// costed as one MVE operation per legalized vector; everything else that
// reaches the end is forwarded to BaseT::getMulAccReductionCost.
// NOTE(review): the return type line, the CostKind parameter line and the
// statement controlled by the RedOpcode check (listing lines 2002, 2005 and
// 2007) are missing from this listing.
2003ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
2004 Type *ResTy, VectorType *ValTy,
2006 if (RedOpcode != Instruction::Add)
// NOTE(review): the statement guarded by the check above is missing here;
// as listed, the guard would attach to the declaration that follows.
2008 EVT ValVT = TLI->getValueType(DL, ValTy);
2009 EVT ResVT = TLI->getValueType(DL, ResTy);
2010
2011 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
2012 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2013
2014 // The legal cases are:
2015 // VMLAV u/s 8/16/32
2016 // VMLALV u/s 16/32
2017 // Codegen currently cannot always handle larger than legal vectors very
2018 // well, especially for predicated reductions where the mask needs to be
2019 // split, so restrict to 128bit or smaller input types.
2020 unsigned RevVTSize = ResVT.getSizeInBits();
// A 64-bit result is accepted for v8i16 and v4i32 inputs; v16i8 inputs
// require a result of at most 32 bits.
2021 if (ValVT.getSizeInBits() <= 128 &&
2022 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
2023 (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
2024 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
2025 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
2026 }
2027
2028 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, ValTy,
2029 CostKind);
2030}
2031
// ARM cost of a min/max vector reduction of Ty for intrinsic IID.
// Floating-point minnum/maxnum and integer smin/smax/umin/umax get
// target-specific costs; everything else is forwarded to
// BaseT::getMinMaxReductionCost.
// NOTE(review): the leading signature lines and the CostKind parameter line
// (listing lines 2032, 2033 and 2035) are missing from this listing.
2034 FastMathFlags FMF,
2036 EVT ValVT = TLI->getValueType(DL, Ty);
2037
2038 // In general floating point reductions are a series of elementwise
2039 // operations, with free extracts on each step. These are either in-order or
2040 // treewise depending on whether that is allowed by the fast math flags.
2041 if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
2042 ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
2043 (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
2044 (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
2045 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
2046 unsigned EltSize = ValVT.getScalarSizeInBits();
// Widest vector (in bits) that can be operated on directly. The -1 becomes
// the maximum unsigned value, so the halving loop below never runs when
// neither MVE float nor NEON is available.
2047 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
2048 InstructionCost VecCost;
// Repeatedly halve the vector with a vector min/max until it fits within
// VecLimit bits.
2049 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
2050 Type *VecTy = FixedVectorType::get(Ty->getElementType(), NumElts/2);
2051 IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
2052 VecCost += getIntrinsicInstrCost(ICA, CostKind);
2053 NumElts /= 2;
2054 }
2055
2056 // For fp16 we need to extract the upper lane elements. MVE can add a
2057 // VREV+FMIN/MAX to perform another vector step instead.
2058 InstructionCost ExtractCost = 0;
2059 if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
2060 NumElts == 8) {
2061 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
2062 NumElts /= 2;
2063 } else if (ValVT.getVectorElementType() == MVT::f16)
// Note: this uses the original element count of Ty, not the halved NumElts.
2064 ExtractCost = cast<FixedVectorType>(Ty)->getNumElements() / 2;
2065
// The remaining elements are combined with NumElts - 1 scalar min/max ops.
2066 IntrinsicCostAttributes ICA(IID, Ty->getElementType(),
2067 {Ty->getElementType(), Ty->getElementType()},
2068 FMF);
2069 return VecCost + ExtractCost +
2070 (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
2071 }
2072
2073 if (IID == Intrinsic::smin || IID == Intrinsic::smax ||
2074 IID == Intrinsic::umin || IID == Intrinsic::umax) {
2075 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2076
2077 // All costs are the same for u/s min/max. These lower to vminv, which are
2078 // given a slightly higher cost as they tend to take multiple cycles for
2079 // smaller type sizes.
// The table is keyed on ISD::SMIN only; the lookup below always passes
// ISD::SMIN regardless of which of the four intrinsics was requested.
2080 static const CostTblEntry CostTblAdd[]{
2081 {ISD::SMIN, MVT::v16i8, 4},
2082 {ISD::SMIN, MVT::v8i16, 3},
2083 {ISD::SMIN, MVT::v4i32, 2},
2084 };
2085 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD::SMIN, LT.second))
2086 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
2087 }
2088
2089 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
2090}
2091
// ARM cost of the intrinsic call described by ICA. Each case below returns a
// target-specific cost when the subtarget and legalized type allow it, and
// otherwise breaks out of the switch.
// NOTE(review): the signature and several statements (listing lines
// 2092-2094, 2123, 2201, 2206, 2210, 2218, 2220, 2227, 2229 and 2235) are
// missing from this listing, including the declarations of Pred and Cost and
// the final statement after the switch. Restore them from the authoritative
// source.
2095 unsigned Opc = ICA.getID();
2096 switch (Opc) {
2097 case Intrinsic::get_active_lane_mask:
2098 // Currently we make a somewhat optimistic assumption that
2099 // active_lane_mask's are always free. In reality it may be freely folded
2100 // into a tail predicated loop, expanded into a VCPT or expanded into a lot
2101 // of add/icmp code. We may need to improve this in the future, but being
2102 // able to detect if it is free or not involves looking at a lot of other
2103 // code. We currently assume that the vectorizer inserted these, and knew
2104 // what it was doing in adding one.
2105 if (ST->hasMVEIntegerOps())
2106 return 0;
2107 break;
2108 case Intrinsic::sadd_sat:
2109 case Intrinsic::ssub_sat:
2110 case Intrinsic::uadd_sat:
2111 case Intrinsic::usub_sat: {
// IsAdd selects the Add vs Sub opcode used for the expansion cost below, so
// it must be true for both the signed and unsigned saturating adds.
2112 bool IsAdd = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::uadd_sat);
2113 bool IsSigned = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
2114 Type *RetTy = ICA.getReturnType();
2115
2116 if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
2117 if (IsSigned && ST->hasDSP() && ITy->getBitWidth() == 32)
2118 return 1; // qadd / qsub
2119 if (ST->hasDSP() && (ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16))
2120 return 2; // uqadd16 / qadd16 / uqsub16 / qsub16 + possible extend.
2121 // Otherwise return the cost of expanding the node. Generally an add +
2122 // icmp + sel.
// NOTE(review): the declaration of Pred (listing line 2123) is missing here.
2124 Type *CondTy = RetTy->getWithNewBitWidth(1);
2125 return getArithmeticInstrCost(IsAdd ? Instruction::Add : Instruction::Sub,
2126 RetTy, CostKind) +
2127 2 * getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, Pred,
2128 CostKind) +
2129 2 * getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, Pred,
2130 CostKind);
2131 }
2132
2133 if (!ST->hasMVEIntegerOps())
2134 break;
2135
2136 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
2137 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2138 LT.second == MVT::v16i8) {
2139 // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
2140 // need to extend the type, as it uses shr(qadd(shl, shl)).
2141 unsigned Instrs =
2142 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1
2143 : 4;
2144 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
2145 }
2146 break;
2147 }
2148 case Intrinsic::abs:
2149 case Intrinsic::smin:
2150 case Intrinsic::smax:
2151 case Intrinsic::umin:
2152 case Intrinsic::umax: {
2153 if (!ST->hasMVEIntegerOps())
2154 break;
2155 Type *VT = ICA.getReturnType();
2156
// One MVE operation per legalized 128-bit integer vector.
2157 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
2158 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2159 LT.second == MVT::v16i8)
2160 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2161 break;
2162 }
2163 case Intrinsic::minnum:
2164 case Intrinsic::maxnum: {
2165 if (!ST->hasMVEFloatOps())
2166 break;
2167 Type *VT = ICA.getReturnType();
// One MVE operation per legalized v4f32 / v8f16 vector.
2168 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
2169 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
2170 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2171 break;
2172 }
2173 case Intrinsic::fptosi_sat:
2174 case Intrinsic::fptoui_sat: {
// Without argument types there is nothing to legalize; leave the switch.
2175 if (ICA.getArgTypes().empty())
2176 break;
2177 bool IsSigned = Opc == Intrinsic::fptosi_sat;
2178 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
2179 EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
2180 // Check for the legal types, with the correct subtarget features.
2181 if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
2182 (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
2183 (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
2184 return LT.first;
2185
2186 // Equally for MVE vector types
2187 if (ST->hasMVEFloatOps() &&
2188 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
2189 LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
2190 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2191
2192 // If we can we use a legal convert followed by a min+max
2193 if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
2194 (ST->hasFP64() && LT.second == MVT::f64) ||
2195 (ST->hasFullFP16() && LT.second == MVT::f16) ||
2196 (ST->hasMVEFloatOps() &&
2197 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
2198 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
2199 Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
2200 LT.second.getScalarSizeInBits());
// NOTE(review): the declaration of Cost (listing line 2201) and the
// statements that add the costs of Attrs1 and Attrs2 (listing lines 2206 and
// 2210) are missing here.
2202 LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
2203 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
2204 : Intrinsic::umin,
2205 LegalTy, {LegalTy, LegalTy});
2207 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
2208 : Intrinsic::umax,
2209 LegalTy, {LegalTy, LegalTy});
2211 return LT.first * Cost;
2212 }
2213 // Otherwise we need to follow the default expansion that clamps the value
2214 // using a float min/max with a fcmp+sel for nan handling when signed.
2215 Type *FPTy = ICA.getArgTypes()[0];
2216 Type *RetTy = ICA.getReturnType();
// NOTE(review): the statements that declare and accumulate Cost from Attrs1
// and Attrs2 (listing lines 2218 and 2220), and the tails of the two
// getCmpSelInstrCost calls (listing lines 2227 and 2229), are missing here.
2217 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
2219 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
2221 Cost +=
2222 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
2223 RetTy, FPTy, TTI::CastContextHint::None, CostKind);
2224 if (IsSigned) {
2225 Type *CondTy = RetTy->getWithNewBitWidth(1);
2226 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
2228 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2230 }
2231 return Cost;
2232 }
2233 }
2234
// NOTE(review): the statement executed after the switch (listing line 2235)
// is missing from this listing.
2236}
2237
// Returns true when a call to F is expected to become a real call rather
// than inline instructions. Non-intrinsics and intrinsics not listed below
// are delegated to BaseT::isLoweredToCall.
// NOTE(review): the signature line (listing line 2238) is missing from this
// listing.
2239 if (!F->isIntrinsic())
2240 return BaseT::isLoweredToCall(F);
2241
2242 // Assume all Arm-specific intrinsics map to an instruction.
2243 if (F->getName().starts_with("llvm.arm"))
2244 return false;
2245
2246 switch (F->getIntrinsicID()) {
2247 default: break;
// These math intrinsics are always treated as calls.
2248 case Intrinsic::powi:
2249 case Intrinsic::sin:
2250 case Intrinsic::cos:
2251 case Intrinsic::sincos:
2252 case Intrinsic::pow:
2253 case Intrinsic::log:
2254 case Intrinsic::log10:
2255 case Intrinsic::log2:
2256 case Intrinsic::exp:
2257 case Intrinsic::exp2:
2258 return true;
// These are calls only when the subtarget lacks the matching FP support for
// the return type.
2259 case Intrinsic::sqrt:
2260 case Intrinsic::fabs:
2261 case Intrinsic::copysign:
2262 case Intrinsic::floor:
2263 case Intrinsic::ceil:
2264 case Intrinsic::trunc:
2265 case Intrinsic::rint:
2266 case Intrinsic::nearbyint:
2267 case Intrinsic::round:
2268 case Intrinsic::canonicalize:
2269 case Intrinsic::lround:
2270 case Intrinsic::llround:
2271 case Intrinsic::lrint:
2272 case Intrinsic::llrint:
2273 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
2274 return true;
2275 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
2276 return true;
2277 // Some operations can be handled by vector instructions and assume
2278 // unsupported vectors will be expanded into supported scalar ones.
2279 // TODO Handle scalar operations properly.
2280 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
// Masked memory intrinsics are treated as calls unless MVE is available.
2281 case Intrinsic::masked_store:
2282 case Intrinsic::masked_load:
2283 case Intrinsic::masked_gather:
2284 case Intrinsic::masked_scatter:
2285 return !ST->hasMVEIntegerOps();
// Overflow-checking and saturating arithmetic are never treated as calls.
2286 case Intrinsic::sadd_with_overflow:
2287 case Intrinsic::uadd_with_overflow:
2288 case Intrinsic::ssub_with_overflow:
2289 case Intrinsic::usub_with_overflow:
2290 case Intrinsic::sadd_sat:
2291 case Intrinsic::uadd_sat:
2292 case Intrinsic::ssub_sat:
2293 case Intrinsic::usub_sat:
2294 return false;
2295 }
2296
2297 return BaseT::isLoweredToCall(F);
2298}
2299
// Conservatively decides whether instruction I may end up as a library or
// function call after lowering. Returns true if it may, false otherwise.
// NOTE(review): the signature line (listing line 2300) is missing from this
// listing.
2301 unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
2302 EVT VT = TLI->getValueType(DL, I.getType(), true);
2303 if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
2304 return true;
2305
2306 // Check if an intrinsic will be lowered to a call and assume that any
2307 // other CallInst will generate a bl.
2308 if (auto *Call = dyn_cast<CallInst>(&I)) {
2309 if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
2310 switch(II->getIntrinsicID()) {
// Memory intrinsics count as calls only when getNumMemOps reports -1.
2311 case Intrinsic::memcpy:
2312 case Intrinsic::memset:
2313 case Intrinsic::memmove:
2314 return getNumMemOps(II) == -1;
2315 default:
// If the callee cannot be determined, control leaves the switch and the
// call is treated as a real call by the `return true` below.
2316 if (const Function *F = Call->getCalledFunction())
2317 return isLoweredToCall(F);
2318 }
2319 }
2320 return true;
2321 }
2322
2323 // FPv5 provides conversions between integer, double-precision,
2324 // single-precision, and half-precision formats.
2325 switch (I.getOpcode()) {
2326 default:
2327 break;
2328 case Instruction::FPToSI:
2329 case Instruction::FPToUI:
2330 case Instruction::SIToFP:
2331 case Instruction::UIToFP:
2332 case Instruction::FPTrunc:
2333 case Instruction::FPExt:
2334 return !ST->hasFPARMv8Base();
2335 }
2336
2337 // FIXME: Unfortunately the approach of checking the Operation Action does
2338 // not catch all cases of Legalization that use library calls. Our
2339 // Legalization step categorizes some transformations into library calls as
2340 // Custom, Expand or even Legal when doing type legalization. So for now
2341 // we have to special case for instance the SDIV of 64bit integers and the
2342 // use of floating point emulation.
2343 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
2344 switch (ISD) {
2345 default:
2346 break;
2347 case ISD::SDIV:
2348 case ISD::UDIV:
2349 case ISD::SREM:
2350 case ISD::UREM:
2351 case ISD::SDIVREM:
2352 case ISD::UDIVREM:
2353 return true;
2354 }
2355 }
2356
2357 // Assume all other non-float operations are supported.
2358 if (!VT.isFloatingPoint())
2359 return false;
2360
2361 // We'll need a library call to handle most floats when using soft.
2362 if (TLI->useSoftFloat()) {
2363 switch (I.getOpcode()) {
2364 default:
2365 return true;
// These opcodes are not treated as calls even under soft-float.
2366 case Instruction::Alloca:
2367 case Instruction::Load:
2368 case Instruction::Store:
2369 case Instruction::Select:
2370 case Instruction::PHI:
2371 return false;
2372 }
2373 }
2374
2375 // We'll need a libcall to perform double precision operations on a single
2376 // precision only FPU.
2377 if (I.getType()->isDoubleTy() && !ST->hasFP64())
2378 return true;
2379
2380 // Likewise for half precision arithmetic.
2381 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
2382 return true;
2383
2384 return false;
2385}
2386
// Decides whether loop L should become a low-overhead hardware loop and, if
// so, fills in HWLoopInfo (counter type, decrement, entry test, nesting).
// Returns false when the LOB extension is unavailable or disabled, the
// backedge-taken count is unusable, the trip count is wider than 32 bits, or
// the loop (or an inner loop) contains an instruction rejected by ScanLoop.
// NOTE(review): the first signature line (listing line 2387) and the
// condition that opens the "No BETC" block (listing line 2398) are missing
// from this listing.
2388 AssumptionCache &AC,
2389 TargetLibraryInfo *LibInfo,
2390 HardwareLoopInfo &HWLoopInfo) const {
2391 // Low-overhead branches are only supported in the 'low-overhead branch'
2392 // extension of v8.1-m.
2393 if (!ST->hasLOB() || DisableLowOverheadLoops) {
2394 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2395 return false;
2396 }
2397
// NOTE(review): the `if (...) {` header for the block below is missing here.
2399 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2400 return false;
2401 }
2402
2403 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2404 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
2405 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2406 return false;
2407 }
2408
// Trip count is the backedge-taken count plus one.
2409 const SCEV *TripCountSCEV =
2410 SE.getAddExpr(BackedgeTakenCount,
2411 SE.getOne(BackedgeTakenCount->getType()));
2412
2413 // We need to store the trip count in LR, a 32-bit register.
2414 if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
2415 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2416 return false;
2417 }
2418
2419 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2420 // point in generating a hardware loop if that's going to happen.
2421
// True for the generic hardware-loop intrinsics, i.e. the loop has already
// been converted.
2422 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2423 if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
2424 switch (Call->getIntrinsicID()) {
2425 default:
2426 break;
2427 case Intrinsic::start_loop_iterations:
2428 case Intrinsic::test_start_loop_iterations:
2429 case Intrinsic::loop_decrement:
2430 case Intrinsic::loop_decrement_reg:
2431 return true;
2432 }
2433 }
2434 return false;
2435 };
2436
2437 // Scan the instructions to see if there's any that we know will turn into a
2438 // call or if this loop is already a low-overhead loop or will become a tail
2439 // predicated loop.
2440 bool IsTailPredLoop = false;
// Returns false on the first disqualifying instruction; as a side effect it
// records in IsTailPredLoop whether an active-lane-mask or VCTP intrinsic
// was seen.
2441 auto ScanLoop = [&](Loop *L) {
2442 for (auto *BB : L->getBlocks()) {
2443 for (auto &I : *BB) {
2444 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2445 isa<InlineAsm>(I)) {
2446 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2447 return false;
2448 }
2449 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2450 IsTailPredLoop |=
2451 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2452 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2453 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2454 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2455 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2456 }
2457 }
2458 return true;
2459 };
2460
2461 // Visit inner loops.
2462 for (auto *Inner : *L)
2463 if (!ScanLoop(Inner))
2464 return false;
2465
2466 if (!ScanLoop(L))
2467 return false;
2468
2469 // TODO: Check whether the trip count calculation is expensive. If L is the
2470 // inner loop but we know it has a low trip count, calculating that trip
2471 // count (in the parent loop) may be detrimental.
2472
// Describe the hardware loop: a 32-bit counter kept in a register,
// decremented by one, no nesting, and an entry test only when WLS loops are
// allowed and the loop is not tail-predicated.
2473 LLVMContext &C = L->getHeader()->getContext();
2474 HWLoopInfo.CounterInReg = true;
2475 HWLoopInfo.IsNestingLegal = false;
2476 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2477 HWLoopInfo.CountType = Type::getInt32Ty(C);
2478 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2479 return true;
2480}
2481
// Returns true if instruction I is acceptable inside a tail-predicated loop.
// ICmpCount is an in/out counter shared across calls: it is incremented for
// every icmp and every integer min/max intrinsic, and the instruction is
// rejected once the count exceeds one.
2482static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2483 // We don't allow icmp's, and because we only look at single block loops,
2484 // we simply count the icmps, i.e. there should only be 1 for the backedge.
2485 if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2486 return false;
2487 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2488 // not currently canonical, but soon will be. Code without them uses icmp, and
2489 // so is not tail predicated as per the condition above. In order to get the
2490 // same performance we treat min and max the same as an icmp for tailpred
2491 // purposes for the moment (we often rely on non-tailpred and higher VF's to
2492 // pick more optimal instructions like VQDMULH. They need to be recognized
2493 // directly by the vectorizer).
2494 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2495 if ((II->getIntrinsicID() == Intrinsic::smin ||
2496 II->getIntrinsicID() == Intrinsic::smax ||
2497 II->getIntrinsicID() == Intrinsic::umin ||
2498 II->getIntrinsicID() == Intrinsic::umax) &&
2499 ++ICmpCount > 1)
2500 return false;
2501
// Floating-point compares are never allowed.
2502 if (isa<FCmpInst>(&I))
2503 return false;
2504
2505 // We could allow extending/narrowing FP loads/stores, but codegen is
2506 // too inefficient so reject this for now.
// NOTE(review): the condition guarding this return (listing line 2507) is
// missing from this listing; as listed the return is unconditional and the
// checks below are unreachable. Restore it from the authoritative source.
2508 return false;
2509
2510 // Extends have to be extending-loads
2511 if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
2512 if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2513 return false;
2514
2515 // Truncs have to be narrowing-stores
2516 if (isa<TruncInst>(&I) )
2517 if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2518 return false;
2519
2520 return true;
2521}
2522
2523// To set up a tail-predicated loop, we need to know the total number of
2524// elements processed by that loop. Thus, we need to determine the element
2525// size and:
2526// 1) it should be uniform for all operations in the vector loop, so we
2527// e.g. don't want any widening/narrowing operations.
2528// 2) it should be smaller than i64s because we don't have vector operations
2529// that work on i64s.
2530// 3) we don't want elements to be reversed or shuffled, to make sure the
2531// tail-predication masks/predicates the right lanes.
2532//
// Checks whether every live-out value and every instruction of loop L is
// compatible with tail-predication. Returns true only if all checks pass.
// NOTE(review): the first signature line and several statements (listing
// lines 2533, 2546, 2549-2550, 2571, 2584 and 2596) are missing from this
// listing, including the declaration of LiveOuts, the initializer of
// ReductionsDisabled, the condition before the first `continue`, the
// definition of Ptr and the opening of one LLVM_DEBUG statement.
2534 const DataLayout &DL,
2535 const LoopAccessInfo *LAI,
2536 const DominatorTree &DT) {
2537 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2538
2539 // If there are live-out values, it is probably a reduction. We can predicate
2540 // most reduction operations freely under MVE using a combination of
2541 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2542 // floating point and integer reductions, but don't check for operators
2543 // specifically here. If the value ends up not being a reduction (and so the
2544 // vectorizer cannot tailfold the loop), we should fall back to standard
2545 // vectorization automatically.
2547 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
// NOTE(review): the initializer of ReductionsDisabled is missing here.
2548 bool ReductionsDisabled =
2551
// Live-outs must be integer, float or half, and are only accepted when
// reductions are not disabled.
2552 for (auto *I : LiveOuts) {
2553 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2554 !I->getType()->isHalfTy()) {
2555 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2556 "live-out value\n");
2557 return false;
2558 }
2559 if (ReductionsDisabled) {
2560 LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2561 return false;
2562 }
2563 }
2564
2565 // Next, check that all instructions can be tail-predicated.
2566 PredicatedScalarEvolution PSE = LAI->getPSE();
2567 int ICmpCount = 0;
2568
2569 for (BasicBlock *BB : L->blocks()) {
2570 for (Instruction &I : *BB) {
// NOTE(review): the condition that selects the instructions skipped by this
// `continue` is missing here.
2572 continue;
2573 if (!canTailPredicateInstruction(I, ICmpCount)) {
2574 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2575 return false;
2576 }
2577
// Results wider than 32 bits per element are rejected.
2578 Type *T = I.getType();
2579 if (T->getScalarSizeInBits() > 32) {
2580 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2581 return false;
2582 }
2583 if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
// NOTE(review): the definition of Ptr is missing here.
2585 Type *AccessTy = getLoadStoreType(&I);
// A stride that cannot be determined is treated as 0.
2586 int64_t NextStride =
2587 getPtrStride(PSE, AccessTy, Ptr, L, DT).value_or(0);
2588 if (NextStride == 1) {
2589 // TODO: for now only allow consecutive strides of 1. We could support
2590 // other strides as long as it is uniform, but let's keep it simple
2591 // for now.
2592 continue;
2593 } else if (NextStride == -1 ||
2594 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2595 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
// Reversed accesses and supported interleave strides are rejected outright.
2597 << "Consecutive strides of 2 found, vld2/vstr2 can't "
2598 "be tail-predicated\n.");
2599 return false;
2600 // TODO: don't tail predicate if there is a reversed load?
2601 } else if (EnableMaskedGatherScatters) {
2602 // Gather/scatters do allow loading from arbitrary strides, at
2603 // least if they are loop invariant.
2604 // TODO: Loop variant strides should in theory work, too, but
2605 // this requires further testing.
2606 const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2607 if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2608 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2609 if (PSE.getSE()->isLoopInvariant(Step, L))
2610 continue;
2611 }
2612 }
2613 LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2614 "tail-predicate\n.");
2615 return false;
2616 }
2617 }
2618 }
2619
2620 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2621 return true;
2622}
2623
// Decides whether the vectorizer should tail-fold the loop described by TFI
// instead of emitting a scalar epilogue. Requires tail-predication to be
// enabled, MVE, a single-block innermost loop that qualifies as a hardware
// loop, and a body accepted by canTailPredicateLoop.
// NOTE(review): the signature line (listing line 2624) and the statements
// that define AC and SE (listing lines 2657-2658) are missing from this
// listing.
2625 if (!EnableTailPredication) {
2626 LLVM_DEBUG(dbgs() << "Tail-folding not enabled.\n");
2627 return false;
2628 }
2629
2630 // Creating a tail-folded vector loop is the first step for generating a
2631 // tail-folded hardware loop, for which we need the MVE masked
2632 // load/stores instructions:
2633 if (!ST->hasMVEIntegerOps())
2634 return false;
2635
2636 LoopVectorizationLegality *LVL = TFI->LVL;
2637 Loop *L = LVL->getLoop();
2638
2639 // For now, restrict this to single block loops.
2640 if (L->getNumBlocks() > 1) {
2641 LLVM_DEBUG(dbgs() << "preferTailFoldingOverEpilogue: not a single block "
2642 "loop.\n");
2643 return false;
2644 }
2645
2646 assert(L->isInnermost() &&
2647 "preferTailFoldingOverEpilogue: inner-loop expected");
2648
2649 LoopInfo *LI = LVL->getLoopInfo();
2650 HardwareLoopInfo HWLoopInfo(L);
2651 if (!HWLoopInfo.canAnalyze(*LI)) {
2652 LLVM_DEBUG(dbgs() << "preferTailFoldingOverEpilogue: hardware-loop is not "
2653 "analyzable.\n");
2654 return false;
2655 }
2656
// NOTE(review): the definitions of AC and SE used below are missing here.
2659
2660 // This checks if we have the low-overhead branch architecture
2661 // extension, and if we will create a hardware-loop:
2662 if (!isHardwareLoopProfitable(L, *SE, *AC, TFI->TLI, HWLoopInfo)) {
2663 LLVM_DEBUG(dbgs() << "preferTailFoldingOverEpilogue: hardware-loop is not "
2664 "profitable.\n");
2665 return false;
2666 }
2667
2668 DominatorTree *DT = LVL->getDominatorTree();
2669 if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT)) {
2670 LLVM_DEBUG(dbgs() << "preferTailFoldingOverEpilogue: hardware-loop is not "
2671 "a candidate.\n");
2672 return false;
2673 }
2674
// The final decision rests on the per-instruction tail-predication checks.
2675 return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI(),
2676 *LVL->getDominatorTree());
2677}
2678
2680 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2682
2683 // Intrinsic @llvm.get.active.lane.mask is supported.
2684 // It is used in the MVETailPredication pass, which requires the number of
2685 // elements processed by this vector loop to setup the tail-predicated
2686 // loop.
2688}
2691 OptimizationRemarkEmitter *ORE) const {
2692 // Enable Upper bound unrolling universally, providing that we do not see an
2693 // active lane mask, which will be better kept as a loop to become tail
2694 // predicated than to be conditionally unrolled.
2695 UP.UpperBound =
2696 !ST->hasMVEIntegerOps() || !any_of(*L->getHeader(), [](Instruction &I) {
2697 return isa<IntrinsicInst>(I) &&
2698 cast<IntrinsicInst>(I).getIntrinsicID() ==
2699 Intrinsic::get_active_lane_mask;
2700 });
2701
2702 // Only currently enable these preferences for M-Class cores.
2703 if (!ST->isMClass())
2704 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2705
2706 // Disable loop unrolling for Oz and Os.
2707 UP.OptSizeThreshold = 0;
2709 if (L->getHeader()->getParent()->hasOptSize())
2710 return;
2711
2712 SmallVector<BasicBlock*, 4> ExitingBlocks;
2713 L->getExitingBlocks(ExitingBlocks);
2714 LLVM_DEBUG(dbgs() << "Loop has:\n"
2715 << "Blocks: " << L->getNumBlocks() << "\n"
2716 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2717
2718 // Only allow another exit other than the latch. This acts as an early exit
2719 // as it mirrors the profitability calculation of the runtime unroller.
2720 if (ExitingBlocks.size() > 2)
2721 return;
2722
2723 // Limit the CFG of the loop body for targets with a branch predictor.
2724 // Allowing 4 blocks permits if-then-else diamonds in the body.
2725 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2726 return;
2727
2728 // Don't unroll vectorized loops, including the remainder loop
2729 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2730 return;
2731
2732 // Scan the loop: don't unroll loops with calls as this could prevent
2733 // inlining.
2735 for (auto *BB : L->getBlocks()) {
2736 for (auto &I : *BB) {
2737 // Don't unroll vectorised loop. MVE does not benefit from it as much as
2738 // scalar code.
2739 if (I.getType()->isVectorTy())
2740 return;
2741
2742 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2743 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2744 if (!isLoweredToCall(F))
2745 continue;
2746 }
2747 return;
2748 }
2749
2750 SmallVector<const Value*, 4> Operands(I.operand_values());
2751 Cost += getInstructionCost(&I, Operands,
2753 }
2754 }
2755
2756 // On v6m cores, there are very few registers available. We can easily end up
2757 // spilling and reloading more registers in an unrolled loop. Look at the
2758 // number of LCSSA phis as a rough measure of how many registers will need to
2759 // be live out of the loop, reducing the default unroll count if more than 1
2760 // value is needed. In the long run, all of this should be being learnt by a
2761 // machine.
2762 unsigned UnrollCount = 4;
2763 if (ST->isThumb1Only()) {
2764 unsigned ExitingValues = 0;
2766 L->getExitBlocks(ExitBlocks);
2767 for (auto *Exit : ExitBlocks) {
2768 // Count the number of LCSSA phis. Exclude values coming from GEP's as
2769 // only the last is expected to be needed for address operands.
2770 unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2771 return PH.getNumOperands() != 1 ||
2772 !isa<GetElementPtrInst>(PH.getOperand(0));
2773 });
2774 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2775 }
2776 if (ExitingValues)
2777 UnrollCount /= ExitingValues;
2778 if (UnrollCount <= 1)
2779 return;
2780 }
2781
2782 // For processors with low overhead branching (LOB), runtime unrolling the
2783 // innermost loop is often detrimental to performance. In these cases the loop
2784 // remainder gets unrolled into a series of compare-and-jump blocks, which in
2785 // deeply nested loops get executed multiple times, negating the benefits of
2786 // LOB. This is particularly noticeable when the loop trip count of the
2787 // innermost loop varies within the outer loop, such as in the case of
2788 // triangular matrix decompositions. In these cases we will prefer to not
2789 // unroll the innermost loop, with the intention for it to be executed as a
2790 // low overhead loop.
2791 bool Runtime = true;
2792 if (ST->hasLOB()) {
2794 const SCEV *BETC = SE.getBackedgeTakenCount(L);
2795 auto *Outer = L->getOutermostLoop();
2796 if ((L != Outer && Outer != L->getParentLoop()) ||
2797 (L != Outer && BETC && !SE.isLoopInvariant(BETC, Outer))) {
2798 Runtime = false;
2799 }
2800 }
2801 }
2802
2803 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2804 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2805
2806 UP.Partial = true;
2807 UP.Runtime = Runtime;
2808 UP.UnrollRemainder = true;
2810 UP.UnrollAndJam = true;
2812
2813 // Force unrolling small loops can be very useful because of the branch
2814 // taken cost of the backedge.
2816 UP.Force = true;
2817}
2818
2823
2825 if (!ST->hasMVEIntegerOps())
2826 return false;
2827
2828 unsigned ScalarBits = Ty->getScalarSizeInBits();
2829 switch (Kind) {
2830 case RecurKind::Add:
2831 return ScalarBits <= 64;
2832 default:
2833 return false;
2834 }
2835}
2836
2838 if (!ST->hasMVEIntegerOps())
2839 return false;
2840 return true;
2841}
2842
2844 StackOffset BaseOffset,
2845 bool HasBaseReg, int64_t Scale,
2846 unsigned AddrSpace) const {
2848 AM.BaseGV = BaseGV;
2849 AM.BaseOffs = BaseOffset.getFixed();
2850 AM.HasBaseReg = HasBaseReg;
2851 AM.Scale = Scale;
2852 AM.ScalableOffset = BaseOffset.getScalable();
2853 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) {
2854 if (ST->hasFPAO())
2855 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2856 return 0;
2857 }
2859}
2860
2862 // MVE only has 8 vector registers, so we should consider register pressure to
2863 // avoid vectorizing when the cost of spills exceeds the gains from
2864 // vectorization.
2865 return ST->hasMVEIntegerOps();
2866}
2867
2868bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
2869 if (Thumb) {
2870 // B.W is available in any Thumb2-supporting target, and also in every
2871 // version of Armv8-M, even Baseline which does not include the rest of
2872 // Thumb2.
2873 return ST->isThumb2() || ST->hasV8MBaselineOps();
2874 } else {
2875 // B is available in all versions of the Arm ISA, so the only question is
2876 // whether that ISA is available at all.
2877 return ST->hasARMOps();
2878 }
2879}
2880
2881/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
2882/// of the vector elements.
2883static bool areExtractExts(Value *Ext1, Value *Ext2) {
2884 using namespace PatternMatch;
2885
2886 auto areExtDoubled = [](Instruction *Ext) {
2887 return Ext->getType()->getScalarSizeInBits() ==
2888 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
2889 };
2890
2891 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
2892 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
2893 !areExtDoubled(cast<Instruction>(Ext1)) ||
2894 !areExtDoubled(cast<Instruction>(Ext2)))
2895 return false;
2896
2897 return true;
2898}
2899
2900/// Check if sinking \p I's operands to I's basic block is profitable, because
2901/// the operands can be folded into a target instruction, e.g.
2902/// sext/zext can be folded into vsubl.
2904 SmallVectorImpl<Use *> &Ops) const {
2905 using namespace PatternMatch;
2906
2907 if (!I->getType()->isVectorTy())
2908 return false;
2909
2910 if (ST->hasNEON()) {
2911 switch (I->getOpcode()) {
2912 case Instruction::Sub:
2913 case Instruction::Add: {
2914 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
2915 return false;
2916 Ops.push_back(&I->getOperandUse(0));
2917 Ops.push_back(&I->getOperandUse(1));
2918 return true;
2919 }
2920 default:
2921 return false;
2922 }
2923 }
2924
2925 if (!ST->hasMVEIntegerOps())
2926 return false;
2927
2928 auto IsFMSMul = [&](Instruction *I) {
2929 if (!I->hasOneUse())
2930 return false;
2931 auto *Sub = cast<Instruction>(*I->users().begin());
2932 return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;
2933 };
2934 auto IsFMS = [&](Instruction *I) {
2935 if (match(I->getOperand(0), m_FNeg(m_Value())) ||
2936 match(I->getOperand(1), m_FNeg(m_Value())))
2937 return true;
2938 return false;
2939 };
2940
2941 auto IsSinker = [&](Instruction *I, int Operand) {
2942 switch (I->getOpcode()) {
2943 case Instruction::Add:
2944 case Instruction::Mul:
2945 case Instruction::FAdd:
2946 case Instruction::ICmp:
2947 case Instruction::FCmp:
2948 return true;
2949 case Instruction::FMul:
2950 return !IsFMSMul(I);
2951 case Instruction::Sub:
2952 case Instruction::FSub:
2953 case Instruction::Shl:
2954 case Instruction::LShr:
2955 case Instruction::AShr:
2956 return Operand == 1;
2957 case Instruction::Call:
2958 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
2959 switch (II->getIntrinsicID()) {
2960 case Intrinsic::fma:
2961 return !IsFMS(I);
2962 case Intrinsic::sadd_sat:
2963 case Intrinsic::uadd_sat:
2964 case Intrinsic::arm_mve_add_predicated:
2965 case Intrinsic::arm_mve_mul_predicated:
2966 case Intrinsic::arm_mve_qadd_predicated:
2967 case Intrinsic::arm_mve_vhadd:
2968 case Intrinsic::arm_mve_hadd_predicated:
2969 case Intrinsic::arm_mve_vqdmull:
2970 case Intrinsic::arm_mve_vqdmull_predicated:
2971 case Intrinsic::arm_mve_vqdmulh:
2972 case Intrinsic::arm_mve_qdmulh_predicated:
2973 case Intrinsic::arm_mve_vqrdmulh:
2974 case Intrinsic::arm_mve_qrdmulh_predicated:
2975 case Intrinsic::arm_mve_fma_predicated:
2976 return true;
2977 case Intrinsic::ssub_sat:
2978 case Intrinsic::usub_sat:
2979 case Intrinsic::arm_mve_sub_predicated:
2980 case Intrinsic::arm_mve_qsub_predicated:
2981 case Intrinsic::arm_mve_hsub_predicated:
2982 case Intrinsic::arm_mve_vhsub:
2983 return Operand == 1;
2984 default:
2985 return false;
2986 }
2987 }
2988 return false;
2989 default:
2990 return false;
2991 }
2992 };
2993
2994 for (auto OpIdx : enumerate(I->operands())) {
2995 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
2996 // Make sure we are not already sinking this operand
2997 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
2998 continue;
2999
3000 Instruction *Shuffle = Op;
3001 if (Shuffle->getOpcode() == Instruction::BitCast)
3002 Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0));
3003 // We are looking for a splat that can be sunk.
3004 if (!Shuffle || !match(Shuffle, m_Shuffle(m_InsertElt(m_Undef(), m_Value(),
3005 m_ZeroInt()),
3006 m_Undef(), m_ZeroMask())))
3007 continue;
3008 if (!IsSinker(I, OpIdx.index()))
3009 continue;
3010
3011 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
3012 // and vector registers
3013 for (Use &U : Op->uses()) {
3014 Instruction *Insn = cast<Instruction>(U.getUser());
3015 if (!IsSinker(Insn, U.getOperandNo()))
3016 return false;
3017 }
3018
3019 Ops.push_back(&Shuffle->getOperandUse(0));
3020 if (Shuffle != Op)
3021 Ops.push_back(&Op->getOperandUse(0));
3022 Ops.push_back(&OpIdx.value());
3023 }
3024 return true;
3025}
3026
3028 Type *ArrayType) const {
3029 if (!UseWidenGlobalArrays) {
3030 LLVM_DEBUG(dbgs() << "Padding global arrays disabled\n");
3031 return false;
3032 }
3033
3034 // Don't modify none integer array types
3035 if (!ArrayType || !ArrayType->isArrayTy() ||
3037 return 0;
3038
3039 // We pad to 4 byte boundaries
3040 if (Size % 4 == 0)
3041 return 0;
3042
3043 unsigned NumBytesToPad = 4 - (Size % 4);
3044 unsigned NewSize = Size + NumBytesToPad;
3045
3046 // Max number of bytes that memcpy allows for lowering to load/stores before
3047 // it uses library function (__aeabi_memcpy).
3048 unsigned MaxMemIntrinsicSize = getMaxMemIntrinsicInlineSizeThreshold();
3049
3050 if (NewSize > MaxMemIntrinsicSize)
3051 return 0;
3052
3053 return NumBytesToPad;
3054}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
This file implements a class to represent arbitrary precision integral constant values and operations...
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), cl::init(2))
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< int > ArmForceUnrollThreshold("arm-force-unroll-threshold", cl::init(12), cl::Hidden, cl::desc("Threshold for forced unrolling of small loops in Arm architecture"))
static Value * isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm)
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, const LoopAccessInfo *LAI, const DominatorTree &DT)
static cl::opt< bool > AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), cl::desc("Enable the generation of WLS loops"))
static Value * simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, InstCombiner::BuilderTy &Builder)
Convert a vector load intrinsic into a simple llvm load instruction.
static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm)
static cl::opt< bool > UseWidenGlobalArrays("widen-global-strings", cl::Hidden, cl::init(true), cl::desc("Enable the widening of global strings to alignment boundaries"))
cl::opt< bool > EnableMaskedGatherScatters
static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount)
cl::opt< TailPredication::Mode > EnableTailPredication
static cl::opt< bool > DisableLowOverheadLoops("disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops"))
static cl::opt< bool > EnableMaskedLoadStores("enable-arm-maskedldst", cl::Hidden, cl::init(true), cl::desc("Enable the generation of masked loads and stores"))
This file defines a TargetTransformInfoImplBase conforming object specific to the ARM target machine.
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
Hexagon Common GEP
This file provides the interface for the instcombine pass implementation.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static cl::opt< unsigned > UnrollCount("unroll-count", cl::Hidden, cl::desc("Use this unroll count for all loops including those with " "unroll_count pragma values, for testing purposes"))
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
#define T
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:119
Value * RHS
Value * LHS
Class for arbitrary precision integers.
Definition APInt.h:78
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:652
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getMemcpyCost(const Instruction *I) const override
bool maybeLoweredToCall(Instruction &I) const
bool preferInLoopReduction(RecurKind Kind, Type *Ty) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *ValTy, TTI::TargetCostKind CostKind) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool hasArmWideBranch(bool Thumb) const override
bool shouldConsiderVectorizationRegPressure() const override
bool preferTailFoldingOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
int getNumMemOps(const IntrinsicInst *I) const
Given a memcpy/memset/memmove instruction, return the number of memory operations performed,...
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty) const override
bool isLoweredToCall(const Function *F) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
bool isLegalMaskedStore(Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::MaskKind MaskKind=TTI::MaskKind::VariableOrConstantMask) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
bool isLegalMaskedLoad(Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::MaskKind MaskKind=TTI::MaskKind::VariableOrConstantMask) const override
TailFoldingStyle getPreferredTailFoldingStyle() const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
bool preferPredicatedReductionSelect() const override
bool isLegalMaskedGather(Type *Ty, Align Alignment) const override
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const override
bool isProfitableLSRChainElement(Instruction *I) const override
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
getScalingFactorCost - Return the cost of the scaling used in addressing mode represented by AM.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
Class to represent array types.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
static LLVM_ABI BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:770
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:767
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This class represents a range of values.
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Container class for subtarget features.
constexpr bool test(unsigned I) const
constexpr size_t size() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition IRBuilder.h:509
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
The core instruction combiner logic.
const DataLayout & getDataLayout() const
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
IRBuilder< TargetFolder, IRBuilderCallbackInserter > BuilderTy
An IRBuilder that automatically inserts new instructions into the worklist.
DominatorTree & getDominatorTree() const
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, const SimplifyQuery &Q, unsigned Depth=0)=0
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
AssumptionCache & getAssumptionCache() const
static InstructionCost getInvalid(CostType Val=0)
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isShift() const
const SmallVectorImpl< Type * > & getArgTypes() const
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Drive the analysis of memory accesses in the loop.
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
AssumptionCache * getAssumptionCache() const
const LoopAccessInfo * getLAI() const
ScalarEvolution * getScalarEvolution() const
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
const FeatureBitset & getFeatureBits() const
Information for memory intrinsic cost model.
const Instruction * getInst() const
The optimization diagnostic interface.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
Provides information about what library functions are available for the current target.
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual bool isLoweredToCall(const Function *F) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
MaskKind
Some targets only support masked load/store with a constant mask.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
@ AMK_PreIndexed
Prefer pre-indexed addressing mode.
@ AMK_None
Don't prefer any addressing mode.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:290
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition Type.h:281
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:65
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:263
Type * getArrayElementType() const
Definition Type.h:427
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:227
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:146
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Type * getElementType() const
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into a shifter_operand immed...
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting an 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:884
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:848
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:800
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:854
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:982
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:930
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:963
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:860
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
auto m_Value()
Match an arbitrary value and ignore it.
auto m_Constant()
Match an arbitrary Constant and ignore it.
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
auto m_Undef()
Match an arbitrary undef constant.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Runtime
Detect stack use after return unless disabled at runtime with (ASAN_OPTIONS=detect_stack_use_after_retur...
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
Align getKnownAlignment(Value *V, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to infer an alignment for the specified pointer.
Definition Local.h:252
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI SmallVector< Instruction *, 8 > findDefsUsedOutsideOfLoop(Loop *L)
Returns the instructions that use values defined in the loop.
SelectPatternFlavor
Specific patterns of select instructions we can match.
@ SPF_ABS
Absolute value.
@ SPF_FMAXNUM
Floating point maxnum.
@ SPF_UMIN
Unsigned minimum.
@ SPF_UMAX
Unsigned maximum.
@ SPF_SMAX
Signed maximum.
@ SPF_FMINNUM
Floating point minnum.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp=nullptr, unsigned Depth=0)
Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind and providing the out param...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
RecurKind
These are the kinds of recurrences that we support.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2018
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
@ Data
Use predicate only to mask operations on data in the loop.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:155
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:381
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:393
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:324
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:331
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:336
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:344
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:160
Attributes of a target dependent hardware loop.
LLVM_ABI bool canAnalyze(LoopInfo &LI)
LLVM_ABI bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
static MemOp Set(uint64_t Size, bool DstAlignCanChange, Align DstAlign, bool IsZeroMemset, bool IsVolatile)
static MemOp Copy(uint64_t Size, bool DstAlignCanChange, Align DstAlign, Align SrcAlign, bool IsVolatile, bool MemcpyStrSrc=false)
SelectPatternFlavor Flavor
TargetLibraryInfo * TLI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).