1//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "ARMSubtarget.h"
12#include "llvm/ADT/APInt.h"
19#include "llvm/IR/BasicBlock.h"
20#include "llvm/IR/DataLayout.h"
22#include "llvm/IR/Instruction.h"
25#include "llvm/IR/Intrinsics.h"
26#include "llvm/IR/IntrinsicsARM.h"
28#include "llvm/IR/Type.h"
37#include <algorithm>
38#include <cassert>
39#include <cstdint>
40#include <optional>
41#include <utility>
42
43using namespace llvm;
44
45#define DEBUG_TYPE "armtti"
46
48 "enable-arm-maskedldst", cl::Hidden, cl::init(true),
49 cl::desc("Enable the generation of masked loads and stores"));
50
52 "disable-arm-loloops", cl::Hidden, cl::init(false),
53 cl::desc("Disable the generation of low-overhead loops"));
54
55static cl::opt<bool>
56 AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
57 cl::desc("Enable the generation of WLS loops"));
58
60 "widen-global-strings", cl::Hidden, cl::init(true),
61 cl::desc("Enable the widening of global strings to alignment boundaries"));
62
64
66
68
70 "arm-force-unroll-threshold", cl::init(12), cl::Hidden,
72 "Threshold for forced unrolling of small loops in Arm architecture"));
73
74/// Convert a vector load intrinsic into a simple llvm load instruction.
75/// This is beneficial when the underlying object being addressed comes
76/// from a constant, since we get constant-folding for free.
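/// For example, a vld1 whose pointer operand points into a constant global
/// becomes an ordinary aligned load that constant folding can then see through.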
77static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
78 InstCombiner::BuilderTy &Builder) {
79 auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
80
81 if (!IntrAlign)
82 return nullptr;
83
84 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
85 ? MemAlign
86 : IntrAlign->getLimitedValue();
87
88 if (!isPowerOf2_32(Alignment))
89 return nullptr;
90
91 return Builder.CreateAlignedLoad(II.getType(), II.getArgOperand(0),
92 Align(Alignment));
93}
94
95bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
96 const Function *Callee) const {
97 const TargetMachine &TM = getTLI()->getTargetMachine();
98 const FeatureBitset &CallerBits =
99 TM.getSubtargetImpl(*Caller)->getFeatureBits();
100 const FeatureBitset &CalleeBits =
101 TM.getSubtargetImpl(*Callee)->getFeatureBits();
102
103 // To inline a callee, all features not in the allowed list must match exactly.
104 bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
105 (CalleeBits & ~InlineFeaturesAllowed);
106 // For features in the allowed list, the callee's features must be a subset of
107 // the caller's.
108 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
109 (CalleeBits & InlineFeaturesAllowed);
110
111 LLVM_DEBUG({
112 if (!MatchExact || !MatchSubset) {
113 dbgs() << "=== Inline compatibility debug ===\n";
114 dbgs() << "Caller: " << Caller->getName() << "\n";
115 dbgs() << "Callee: " << Callee->getName() << "\n";
116
117 // Bit diffs
118 FeatureBitset MissingInCaller = CalleeBits & ~CallerBits; // callee-only
119 FeatureBitset ExtraInCaller = CallerBits & ~CalleeBits; // caller-only
120
121 // Counts
122 dbgs() << "Only-in-caller bit count: " << ExtraInCaller.count() << "\n";
123 dbgs() << "Only-in-callee bit count: " << MissingInCaller.count() << "\n";
124
125 dbgs() << "Only-in-caller feature indices [";
126 {
127 bool First = true;
128 for (size_t I = 0, E = ExtraInCaller.size(); I < E; ++I) {
129 if (ExtraInCaller.test(I)) {
130 if (!First)
131 dbgs() << ", ";
132 dbgs() << I;
133 First = false;
134 }
135 }
136 }
137 dbgs() << "]\n";
138
139 dbgs() << "Only-in-callee feature indices [";
140 {
141 bool First = true;
142 for (size_t I = 0, E = MissingInCaller.size(); I < E; ++I) {
143 if (MissingInCaller.test(I)) {
144 if (!First)
145 dbgs() << ", ";
146 dbgs() << I;
147 First = false;
148 }
149 }
150 }
151 dbgs() << "]\n";
152
153 // Indices map to features as found in
154 // llvm-project/(your_build)/lib/Target/ARM/ARMGenSubtargetInfo.inc
155 dbgs() << "MatchExact=" << (MatchExact ? "true" : "false")
156 << " MatchSubset=" << (MatchSubset ? "true" : "false") << "\n";
157 }
158 });
159 return MatchExact && MatchSubset;
160}
161
162TTI::AddressingModeKind
163ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
164 ScalarEvolution *SE) const {
165 if (ST->hasMVEIntegerOps())
166 return TTI::AMK_PostIndexed;
167
168 if (L->getHeader()->getParent()->hasOptSize())
169 return TTI::AMK_None;
170
171 if (ST->isMClass() && ST->isThumb2() &&
172 L->getNumBlocks() == 1)
173 return TTI::AMK_PreIndexed;
174
175 return TTI::AMK_None;
176}
177
178std::optional<Instruction *>
179ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
180 using namespace PatternMatch;
181 Intrinsic::ID IID = II.getIntrinsicID();
182 switch (IID) {
183 default:
184 break;
185 case Intrinsic::arm_neon_vld1: {
186 Align MemAlign =
187 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
188 &IC.getAssumptionCache(), &IC.getDominatorTree());
189 if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
190 return IC.replaceInstUsesWith(II, V);
191 }
192 break;
193 }
194
195 case Intrinsic::arm_neon_vld2:
196 case Intrinsic::arm_neon_vld3:
197 case Intrinsic::arm_neon_vld4:
198 case Intrinsic::arm_neon_vld2lane:
199 case Intrinsic::arm_neon_vld3lane:
200 case Intrinsic::arm_neon_vld4lane:
201 case Intrinsic::arm_neon_vst1:
202 case Intrinsic::arm_neon_vst2:
203 case Intrinsic::arm_neon_vst3:
204 case Intrinsic::arm_neon_vst4:
205 case Intrinsic::arm_neon_vst2lane:
206 case Intrinsic::arm_neon_vst3lane:
207 case Intrinsic::arm_neon_vst4lane: {
208 Align MemAlign =
209 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
210 &IC.getAssumptionCache(), &IC.getDominatorTree());
211 unsigned AlignArg = II.arg_size() - 1;
212 Value *AlignArgOp = II.getArgOperand(AlignArg);
213 MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
214 if (Align && *Align < MemAlign) {
215 return IC.replaceOperand(
216 II, AlignArg,
217 ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
218 false));
219 }
220 break;
221 }
222
223 case Intrinsic::arm_neon_vld1x2:
224 case Intrinsic::arm_neon_vld1x3:
225 case Intrinsic::arm_neon_vld1x4:
226 case Intrinsic::arm_neon_vst1x2:
227 case Intrinsic::arm_neon_vst1x3:
228 case Intrinsic::arm_neon_vst1x4: {
229 Align NewAlign =
230 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
231 &IC.getAssumptionCache(), &IC.getDominatorTree());
232 Align OldAlign = II.getParamAlign(0).valueOrOne();
233 if (NewAlign > OldAlign)
234 II.addParamAttr(0,
235 Attribute::getWithAlignment(II.getContext(), NewAlign));
236 break;
237 }
238
239 case Intrinsic::arm_mve_pred_i2v: {
240 Value *Arg = II.getArgOperand(0);
241 Value *ArgArg;
242 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
243 PatternMatch::m_Value(ArgArg))) &&
244 II.getType() == ArgArg->getType()) {
245 return IC.replaceInstUsesWith(II, ArgArg);
246 }
247 Constant *XorMask;
248 if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
249 PatternMatch::m_Value(ArgArg)),
250 PatternMatch::m_Constant(XorMask))) &&
251 II.getType() == ArgArg->getType()) {
252 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
253 if (CI->getValue().trunc(16).isAllOnes()) {
254 auto TrueVector = IC.Builder.CreateVectorSplat(
255 cast<FixedVectorType>(II.getType())->getNumElements(),
256 IC.Builder.getTrue());
257 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
258 }
259 }
260 }
261 KnownBits ScalarKnown(32);
262 if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
263 ScalarKnown)) {
264 return &II;
265 }
266 break;
267 }
268 case Intrinsic::arm_mve_pred_v2i: {
269 Value *Arg = II.getArgOperand(0);
270 Value *ArgArg;
271 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
272 PatternMatch::m_Value(ArgArg)))) {
273 return IC.replaceInstUsesWith(II, ArgArg);
274 }
275
276 if (II.getMetadata(LLVMContext::MD_range))
277 break;
278
279 ConstantRange Range(APInt(32, 0), APInt(32, 0x10000));
280
281 if (auto CurrentRange = II.getRange()) {
282 Range = Range.intersectWith(*CurrentRange);
283 if (Range == CurrentRange)
284 break;
285 }
286
287 II.addRangeRetAttr(Range);
288 II.addRetAttr(Attribute::NoUndef);
289 return &II;
290 }
291 case Intrinsic::arm_mve_vadc:
292 case Intrinsic::arm_mve_vadc_predicated: {
293 unsigned CarryOp =
294 (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
295 assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
296 "Bad type for intrinsic!");
297
298 KnownBits CarryKnown(32);
299 if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
300 CarryKnown)) {
301 return &II;
302 }
303 break;
304 }
305 case Intrinsic::arm_mve_vmldava: {
306 Instruction *I = cast<Instruction>(&II);
307 if (I->hasOneUse()) {
308 auto *User = cast<Instruction>(*I->user_begin());
309 Value *OpZ;
310 if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
311 match(I->getOperand(3), m_Zero())) {
312 Value *OpX = I->getOperand(4);
313 Value *OpY = I->getOperand(5);
314 Type *OpTy = OpX->getType();
315
316 IC.Builder.SetInsertPoint(User);
317 Value *V =
318 IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
319 {I->getOperand(0), I->getOperand(1),
320 I->getOperand(2), OpZ, OpX, OpY});
321
322 IC.replaceInstUsesWith(*User, V);
323 return IC.eraseInstFromFunction(*User);
324 }
325 }
326 return std::nullopt;
327 }
328 }
329 return std::nullopt;
330}
331
332std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
333 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
334 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
335 std::function<void(Instruction *, unsigned, APInt, APInt &)>
336 SimplifyAndSetOp) const {
337
338 // Compute the demanded elements for a narrowing MVE intrinsic. TopOpc is the
339 // index of the operand that selects between a Top and a Bottom instruction,
340 // which can change between intrinsics.
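// For example, a "top" narrowing instruction (e.g. vmovnt) writes the narrowed
// values to the odd lanes, so only the even lanes of operand 0 survive and are
// demanded; for a "bottom" instruction it is the other way around.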
341 auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
342 unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
343 unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
344
345 // Only the odd or even lanes of operand 0 are demanded, depending on
346 // whether this is a top or a bottom instruction.
347 APInt DemandedElts =
348 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
349 : APInt::getHighBitsSet(2, 1));
350 SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
351 // The other lanes will be defined from the inserted elements.
352 UndefElts &= APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
353 : APInt::getHighBitsSet(2, 1));
354 return std::nullopt;
355 };
356
357 switch (II.getIntrinsicID()) {
358 default:
359 break;
360 case Intrinsic::arm_mve_vcvt_narrow:
361 SimplifyNarrowInstrTopBottom(2);
362 break;
363 case Intrinsic::arm_mve_vqmovn:
364 SimplifyNarrowInstrTopBottom(4);
365 break;
366 case Intrinsic::arm_mve_vshrn:
367 SimplifyNarrowInstrTopBottom(7);
368 break;
369 }
370
371 return std::nullopt;
372}
373
374InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
375 TTI::TargetCostKind CostKind) const {
376 assert(Ty->isIntegerTy());
377
378 unsigned Bits = Ty->getPrimitiveSizeInBits();
379 if (Bits == 0 || Imm.getActiveBits() >= 64)
380 return 4;
381
382 int64_t SImmVal = Imm.getSExtValue();
383 uint64_t ZImmVal = Imm.getZExtValue();
384 if (!ST->isThumb()) {
385 if ((SImmVal >= 0 && SImmVal < 65536) ||
386 (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
387 (ARM_AM::getSOImmVal(~ZImmVal) != -1))
388 return 1;
389 return ST->hasV6T2Ops() ? 2 : 3;
390 }
391 if (ST->isThumb2()) {
392 if ((SImmVal >= 0 && SImmVal < 65536) ||
393 (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
394 (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
395 return 1;
396 return ST->hasV6T2Ops() ? 2 : 3;
397 }
398 // Thumb1: any i8 immediate costs 1.
399 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
400 return 1;
401 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
402 return 2;
403 // Load from constantpool.
404 return 3;
405}
406
407 // Constants smaller than 256 fit in the immediate field of
408 // Thumb1 instructions, so they cost 0; everything else costs 1.
409InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
410 const APInt &Imm,
411 Type *Ty) const {
412 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
413 return 0;
414
415 return 1;
416}
417
418// Checks whether Inst is part of a min(max()) or max(min()) pattern
419// that will match to an SSAT instruction. Returns the instruction being
420// saturated, or null if no saturation pattern was found.
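// For example, with Imm == -128 the pattern smax(smin(X, 127), -128) clamps X
// to the signed 8-bit range, which a single SSAT can perform.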
421static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
422 Value *LHS, *RHS;
423 ConstantInt *C;
424 SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
425
426 if (InstSPF == SPF_SMAX &&
427 PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
428 C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
429
430 auto isSSatMin = [&](Value *MinInst) {
431 if (isa<SelectInst>(MinInst)) {
432 Value *MinLHS, *MinRHS;
433 ConstantInt *MinC;
434 SelectPatternFlavor MinSPF =
435 matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
436 if (MinSPF == SPF_SMIN &&
437 PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
438 MinC->getValue() == ((-Imm) - 1))
439 return true;
440 }
441 return false;
442 };
443
444 if (isSSatMin(Inst->getOperand(1)))
445 return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
446 if (Inst->hasNUses(2) &&
447 (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
448 return Inst->getOperand(1);
449 }
450 return nullptr;
451}
452
453 // Look for an FP saturation pattern, where the instruction can be simplified
454 // to a fptosi.sat of the form max(min(fptosi)). The constant is always free.
455static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
456 if (Imm.getBitWidth() != 64 ||
457 Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
458 return false;
459 Value *FP = isSSATMinMaxPattern(Inst, Imm);
460 if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
461 FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
462 if (!FP)
463 return false;
464 return isa<FPToSIInst>(FP);
465}
466
467InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
468 const APInt &Imm, Type *Ty,
469 TTI::TargetCostKind CostKind,
470 Instruction *Inst) const {
471 // Division by a constant can be turned into multiplication, but only if we
472 // know it's constant. So it's not so much that the immediate is cheap (it's
473 // not), but that the alternative is worse.
474 // FIXME: this is probably unneeded with GlobalISel.
475 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
476 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
477 Idx == 1)
478 return 0;
479
480 // Leave any gep offsets for CodeGenPrepare, which will do a better job at
481 // splitting any large offsets.
482 if (Opcode == Instruction::GetElementPtr && Idx != 0)
483 return 0;
484
485 if (Opcode == Instruction::And) {
486 // UXTB/UXTH
487 if (Imm == 255 || Imm == 65535)
488 return 0;
489 // Conversion to BIC is free, and means we can use ~Imm instead.
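// e.g. "and r0, r1, #0xffffff00" can be emitted as "bic r0, r1, #0xff", so the
// cheaper of Imm and ~Imm is what matters here.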
490 return std::min(getIntImmCost(Imm, Ty, CostKind),
491 getIntImmCost(~Imm, Ty, CostKind));
492 }
493
494 if (Opcode == Instruction::Add)
495 // Conversion to SUB is free, and means we can use -Imm instead.
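// e.g. "add r0, r1, #-8" can be emitted as "sub r0, r1, #8", so the cheaper of
// Imm and -Imm is what matters here.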
496 return std::min(getIntImmCost(Imm, Ty, CostKind),
497 getIntImmCost(-Imm, Ty, CostKind));
498
499 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
500 Ty->getIntegerBitWidth() == 32) {
501 int64_t NegImm = -Imm.getSExtValue();
502 if (ST->isThumb2() && NegImm < 1<<12)
503 // icmp X, #-C -> cmn X, #C
504 return 0;
505 if (ST->isThumb() && NegImm < 1<<8)
506 // icmp X, #-C -> adds X, #C
507 return 0;
508 }
509
510 // xor a, -1 can always be folded to MVN
511 if (Opcode == Instruction::Xor && Imm.isAllOnes())
512 return 0;
513
514 // Ensure that negative constants in min(max()) or max(min()) patterns that
515 // match SSAT instructions don't get hoisted.
516 if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
517 Ty->getIntegerBitWidth() <= 32) {
518 if (isSSATMinMaxPattern(Inst, Imm) ||
519 (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
520 isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
521 return 0;
522 }
523
524 if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
525 return 0;
526
527 // We can convert <= -1 to < 0, which is generally quite cheap.
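// e.g. "icmp sle X, -1" is equivalent to "icmp slt X, 0", so the cost of
// materialising Imm + 1 (here 0) is considered as well.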
528 if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
529 ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
530 if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
531 return std::min(getIntImmCost(Imm, Ty, CostKind),
532 getIntImmCost(Imm + 1, Ty, CostKind));
533 }
534
535 return getIntImmCost(Imm, Ty, CostKind);
536}
537
538InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
539 TTI::TargetCostKind CostKind,
540 const Instruction *I) const {
541 if (CostKind == TTI::TCK_RecipThroughput &&
542 (ST->hasNEON() || ST->hasMVEIntegerOps())) {
543 // FIXME: The vectorizer is highly sensitive to the cost of these
544 // instructions, which suggests that it may be using the costs incorrectly.
545 // But, for now, just make them free to avoid performance regressions for
546 // vector targets.
547 return 0;
548 }
549 return BaseT::getCFInstrCost(Opcode, CostKind, I);
550}
551
552InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
553 Type *Src,
554 TTI::CastContextHint CCH,
555 TTI::TargetCostKind CostKind,
556 const Instruction *I) const {
557 int ISD = TLI->InstructionOpcodeToISD(Opcode);
558 assert(ISD && "Invalid opcode");
559
560 // TODO: Allow non-throughput costs that aren't binary.
561 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
562 if (CostKind != TTI::TCK_RecipThroughput)
563 return Cost == 0 ? 0 : 1;
564 return Cost;
565 };
566 auto IsLegalFPType = [this](EVT VT) {
567 EVT EltVT = VT.getScalarType();
568 return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
569 (EltVT == MVT::f64 && ST->hasFP64()) ||
570 (EltVT == MVT::f16 && ST->hasFullFP16());
571 };
572
573 EVT SrcTy = TLI->getValueType(DL, Src);
574 EVT DstTy = TLI->getValueType(DL, Dst);
575
576 if (!SrcTy.isSimple() || !DstTy.isSimple())
577 return AdjustCost(
578 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
579
580 // Extending masked loads and truncating masked stores are expensive because we
581 // currently don't split them. This means that we'll likely end up
582 // loading/storing each element individually (hence the high cost).
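// e.g. a zero-extending masked load from <16 x i8> to <16 x i16> gives a
// 256-bit result, modelled below at roughly two operations per element.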
583 if ((ST->hasMVEIntegerOps() &&
584 (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
585 Opcode == Instruction::SExt)) ||
586 (ST->hasMVEFloatOps() &&
587 (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
588 IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
589 if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
590 return 2 * DstTy.getVectorNumElements() *
591 ST->getMVEVectorCostFactor(CostKind);
592
593 // The extension of other kinds of loads is free.
594 if (CCH == TTI::CastContextHint::Normal ||
595 CCH == TTI::CastContextHint::Masked) {
596 static const TypeConversionCostTblEntry LoadConversionTbl[] = {
597 {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
598 {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
599 {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
600 {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
601 {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
602 {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
603 {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
604 {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
605 {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
606 {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
607 {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
608 {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
609 };
610 if (const auto *Entry = ConvertCostTableLookup(
611 LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
612 return AdjustCost(Entry->Cost);
613
614 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
615 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
616 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
617 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
618 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
619 {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
620 {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
621 // The following extend from a legal type to an illegal type, so the load
622 // needs to be split. This introduces an extra load operation, but the
623 // extend is still "free".
624 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
625 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
626 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
627 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
628 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
629 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
630 };
631 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
632 if (const auto *Entry =
633 ConvertCostTableLookup(MVELoadConversionTbl, ISD,
634 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
635 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
636 }
637
638 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
639 // FPExtends are similar but also require the VCVT instructions.
640 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
641 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
642 };
643 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
644 if (const auto *Entry =
645 ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
646 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
647 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
648 }
649
650 // The truncate of a store is free. This is the mirror of extends above.
651 static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
652 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
653 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
654 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
655 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
656 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
657 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
658 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
659 };
660 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
661 if (const auto *Entry =
662 ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
663 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
664 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
665 }
666
667 static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
668 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
669 {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
670 };
671 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
672 if (const auto *Entry =
673 ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
674 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
675 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
676 }
677 }
678
679 // NEON vector operations that can extend their inputs.
680 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
681 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
682 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
683 // vaddl
684 { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
685 { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
686 // vsubl
687 { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
688 { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
689 // vmull
690 { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
691 { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
692 // vshll
693 { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
694 { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
695 };
696
697 auto *User = cast<Instruction>(*I->user_begin());
698 int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
699 if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
700 DstTy.getSimpleVT(),
701 SrcTy.getSimpleVT())) {
702 return AdjustCost(Entry->Cost);
703 }
704 }
705
706 // Single to/from double precision conversions.
707 if (Src->isVectorTy() && ST->hasNEON() &&
708 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
709 DstTy.getScalarType() == MVT::f32) ||
710 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
711 DstTy.getScalarType() == MVT::f64))) {
712 static const CostTblEntry NEONFltDblTbl[] = {
713 // Vector fptrunc/fpext conversions.
714 {ISD::FP_ROUND, MVT::v2f64, 2},
715 {ISD::FP_EXTEND, MVT::v2f32, 2},
716 {ISD::FP_EXTEND, MVT::v4f32, 4}};
717
718 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
719 if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
720 return AdjustCost(LT.first * Entry->Cost);
721 }
722
723 // Some arithmetic, load and store operations have specific instructions
724 // to cast up/down their types automatically at no extra cost.
725 // TODO: Get these tables to know at least what the related operations are.
726 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
727 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
728 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
729 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
730 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
731 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
732 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
733
734 // The number of vmovl instructions for the extension.
735 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
736 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
737 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
738 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
739 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
740 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
741 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
742 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
743 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
744 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
745 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
746 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
747 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
748 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
749 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
750 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
751 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
752 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
753
754 // Operations that we legalize using splitting.
755 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
756 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
757
758 // Vector float <-> i32 conversions.
759 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
760 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
761
762 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
763 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
764 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
765 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
766 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
767 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
768 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
769 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
770 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
771 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
772 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
773 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
774 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
775 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
776 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
777 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
778 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
779 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
780 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
781 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
782
783 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
784 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
785 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
786 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
787 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
788 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
789
790 // Vector double <-> i32 conversions.
791 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
792 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
793
794 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
795 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
796 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
797 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
798 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
799 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
800
801 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
802 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
803 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
804 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
805 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
806 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
807 };
808
809 if (SrcTy.isVector() && ST->hasNEON()) {
810 if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
811 DstTy.getSimpleVT(),
812 SrcTy.getSimpleVT()))
813 return AdjustCost(Entry->Cost);
814 }
815
816 // Scalar float to integer conversions.
817 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
818 { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
819 { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
820 { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
821 { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
822 { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
823 { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
824 { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
825 { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
826 { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
827 { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
828 { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
829 { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
830 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
831 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
832 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
833 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
834 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
835 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
836 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
837 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
838 };
839 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
840 if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
841 DstTy.getSimpleVT(),
842 SrcTy.getSimpleVT()))
843 return AdjustCost(Entry->Cost);
844 }
845
846 // Scalar integer to float conversions.
847 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
848 { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
849 { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
850 { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
851 { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
852 { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
853 { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
854 { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
855 { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
856 { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
857 { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
858 { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
859 { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
860 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
861 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
862 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
863 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
864 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
865 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
866 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
867 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
868 };
869
870 if (SrcTy.isInteger() && ST->hasNEON()) {
871 if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
872 ISD, DstTy.getSimpleVT(),
873 SrcTy.getSimpleVT()))
874 return AdjustCost(Entry->Cost);
875 }
876
877 // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
878 // instruction, i8->i32 is two. i64 zexts are a VAND with a constant, sexts
879 // are linearised so take more.
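// e.g. a v2i64 zext can clear the top half of each lane with a single VAND
// against 0x00000000ffffffff, while a v2i64 sext builds each lane up in turn.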
880 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
881 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
882 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
883 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
884 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
885 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
886 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
887 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
888 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
889 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
890 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
891 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
892 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
893 };
894
895 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
896 if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
897 ISD, DstTy.getSimpleVT(),
898 SrcTy.getSimpleVT()))
899 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
900 }
901
902 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
903 // As a general rule, fp converts that were not matched above are scalarized
904 // and cost 1 vcvt per lane, so long as the instruction is available. If not,
905 // it will become a series of function calls.
906 const InstructionCost CallCost =
907 getCallInstrCost(nullptr, Dst, {Src}, CostKind);
908 int Lanes = 1;
909 if (SrcTy.isFixedLengthVector())
910 Lanes = SrcTy.getVectorNumElements();
911
912 if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
913 return Lanes;
914 else
915 return Lanes * CallCost;
916 }
917
918 if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
919 SrcTy.isFixedLengthVector()) {
920 // Treat a truncate with a larger-than-legal source (128 bits for MVE) as
921 // expensive, 2 instructions per lane.
922 if ((SrcTy.getScalarType() == MVT::i8 ||
923 SrcTy.getScalarType() == MVT::i16 ||
924 SrcTy.getScalarType() == MVT::i32) &&
925 SrcTy.getSizeInBits() > 128 &&
926 SrcTy.getSizeInBits() > DstTy.getSizeInBits())
927 return SrcTy.getVectorNumElements() * 2;
928 }
929
930 // Scalar integer conversion costs.
931 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
932 // i16 -> i64 requires two dependent operations.
933 { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
934
935 // Truncates on i64 are assumed to be free.
936 { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
937 { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
938 { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
939 { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
940 };
941
942 if (SrcTy.isInteger()) {
943 if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
944 DstTy.getSimpleVT(),
945 SrcTy.getSimpleVT()))
946 return AdjustCost(Entry->Cost);
947 }
948
949 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
950 ? ST->getMVEVectorCostFactor(CostKind)
951 : 1;
952 return AdjustCost(
953 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
954}
955
956InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
957 TTI::TargetCostKind CostKind,
958 unsigned Index, const Value *Op0,
959 const Value *Op1) const {
960 // Penalize inserting into a D-subregister. We end up with a three times
961 // lower estimated throughput on Swift.
962 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
963 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
964 return 3;
965
966 if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
967 Opcode == Instruction::ExtractElement)) {
968 // Cross-class copies are expensive on many microarchitectures,
969 // so assume they are expensive by default.
970 if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
971 return 3;
972
973 // Even if it's not a cross class copy, this likely leads to mixing
974 // of NEON and VFP code and should therefore be penalized.
975 if (ValTy->isVectorTy() &&
976 ValTy->getScalarSizeInBits() <= 32)
977 return std::max<InstructionCost>(
978 BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1),
979 2U);
980 }
981
982 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
983 Opcode == Instruction::ExtractElement)) {
984 // Integer cross-lane moves are more expensive than float, which can
985 // sometimes just be vmovs. Integers involve being passed to GPR registers,
986 // causing more of a delay.
987 std::pair<InstructionCost, MVT> LT =
988 getTypeLegalizationCost(ValTy->getScalarType());
989 return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
990 }
991
992 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
993}
994
995InstructionCost ARMTTIImpl::getCmpSelInstrCost(
996 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
997 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
998 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
999 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1000
1001 // Thumb scalar code size cost for select.
1002 if (CostKind == TTI::TCK_CodeSize && Opcode == Instruction::Select &&
1003 ST->isThumb() && !ValTy->isVectorTy()) {
1004 // Assume expensive structs.
1005 if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
1006 return TTI::TCC_Expensive;
1007
1008 // Select costs can vary because they:
1009 // - may require one or more conditional mov (including an IT),
1010 // - can't operate directly on immediates,
1011 // - require live flags, which we can't copy around easily.
1012 InstructionCost Cost = 0;
1013
1014 // Possible IT instruction for Thumb2, or more for Thumb1.
1015 ++Cost;
1016
1017 // i1 values may need rematerialising by using mov immediates and/or
1018 // flag setting instructions.
1019 if (ValTy->isIntegerTy(1))
1020 ++Cost;
1021
1022 return Cost;
1023 }
1024
1025 // If this is a vector min/max/abs, use the cost of that intrinsic directly
1026 // instead. Hopefully when min/max intrinsics are more prevalent this code
1027 // will not be needed.
1028 const Instruction *Sel = I;
1029 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
1030 Sel->hasOneUse())
1031 Sel = cast<Instruction>(Sel->user_back());
1032 if (Sel && ValTy->isVectorTy() &&
1033 (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
1034 const Value *LHS, *RHS;
1035 SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
1036 unsigned IID = 0;
1037 switch (SPF) {
1038 case SPF_ABS:
1039 IID = Intrinsic::abs;
1040 break;
1041 case SPF_SMIN:
1042 IID = Intrinsic::smin;
1043 break;
1044 case SPF_SMAX:
1045 IID = Intrinsic::smax;
1046 break;
1047 case SPF_UMIN:
1048 IID = Intrinsic::umin;
1049 break;
1050 case SPF_UMAX:
1051 IID = Intrinsic::umax;
1052 break;
1053 case SPF_FMINNUM:
1054 IID = Intrinsic::minnum;
1055 break;
1056 case SPF_FMAXNUM:
1057 IID = Intrinsic::maxnum;
1058 break;
1059 default:
1060 break;
1061 }
1062 if (IID) {
1063 // The ICmp is free, the select gets the cost of the min/max/etc
1064 if (Sel != I)
1065 return 0;
1066 IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
1067 return getIntrinsicInstrCost(CostAttrs, CostKind);
1068 }
1069 }
1070
1071 // On NEON a vector select gets lowered to vbsl.
1072 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
1073 // Lowering of some vector selects is currently far from perfect.
1074 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
1075 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
1076 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
1077 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
1078 };
1079
1080 EVT SelCondTy = TLI->getValueType(DL, CondTy);
1081 EVT SelValTy = TLI->getValueType(DL, ValTy);
1082 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1083 if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
1084 SelCondTy.getSimpleVT(),
1085 SelValTy.getSimpleVT()))
1086 return Entry->Cost;
1087 }
1088
1089 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1090 return LT.first;
1091 }
1092
1093 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1094 (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1095 cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
1096 FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
1097 FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
1098 if (!VecCondTy)
1099 VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
1100
1101 // If we don't have mve.fp, any fp operations will need to be scalarized.
1102 if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1103 // One scalarization insert, one scalarization extract and the cost of the
1104 // fcmps.
1105 return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
1106 /*Extract*/ true, CostKind) +
1107 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1108 /*Extract*/ false, CostKind) +
1109 VecValTy->getNumElements() *
1110 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1111 VecCondTy->getScalarType(), VecPred,
1112 CostKind, Op1Info, Op2Info, I);
1113 }
1114
1115 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1116 int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1117 // There are two types - the input that specifies the type of the compare
1118 // and the output vXi1 type. Because we don't know how the output will be
1119 // split, we may need an expensive shuffle to get two in sync. This has the
1120 // effect of making larger than legal compares (v8i32 for example)
1121 // expensive.
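// e.g. a v8i32 compare legalizes to two v4i32 compares but yields a single
// v8i1 predicate, so the two predicate halves have to be recombined.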
1122 if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1123 if (LT.first > 1)
1124 return LT.first * BaseCost +
1125 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1126 /*Extract*/ false, CostKind);
1127 return BaseCost;
1128 }
1129 }
1130
1131 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1132 // for "multiple beats" potentially needed by MVE instructions.
1133 int BaseCost = 1;
1134 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1135 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1136
1137 return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred,
1138 CostKind, Op1Info, Op2Info, I);
1139}
1140
1143 const SCEV *Ptr,
1145 // Address computations in vectorized code with non-consecutive addresses will
1146 // likely result in more instructions compared to scalar code where the
1147 // computation can more often be merged into the index mode. The resulting
1148 // extra micro-ops can significantly decrease throughput.
1149 unsigned NumVectorInstToHideOverhead = 10;
1150 int MaxMergeDistance = 64;
1151
1152 if (ST->hasNEON()) {
1153 if (PtrTy->isVectorTy() && SE &&
1154 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1155 return NumVectorInstToHideOverhead;
1156
1157 // In many cases the address computation is not merged into the instruction
1158 // addressing mode.
1159 return 1;
1160 }
1161 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1162}
1163
1164bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) const {
1165 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
1166 // If a VCTP is part of a chain, it's already profitable and shouldn't be
1167 // optimized, else LSR may block tail-predication.
1168 switch (II->getIntrinsicID()) {
1169 case Intrinsic::arm_mve_vctp8:
1170 case Intrinsic::arm_mve_vctp16:
1171 case Intrinsic::arm_mve_vctp32:
1172 case Intrinsic::arm_mve_vctp64:
1173 return true;
1174 default:
1175 break;
1176 }
1177 }
1178 return false;
1179}
1180
1181bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment,
1182 unsigned /*AddressSpace*/,
1183 TTI::MaskKind /*MaskKind*/) const {
1184 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1185 return false;
1186
1187 if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1188 // Don't support v2i1 yet.
1189 if (VecTy->getNumElements() == 2)
1190 return false;
1191
1192 // We don't support extending fp types.
1193 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1194 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1195 return false;
1196 }
1197
1198 unsigned EltWidth = DataTy->getScalarSizeInBits();
1199 return (EltWidth == 32 && Alignment >= 4) ||
1200 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1201}
1202
1203bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) const {
1204 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1205 return false;
1206
1207 unsigned EltWidth = Ty->getScalarSizeInBits();
1208 return ((EltWidth == 32 && Alignment >= 4) ||
1209 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1210}
1211
1212/// Given a memcpy/memset/memmove instruction, return the number of memory
1213/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1214/// call is used.
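/// For example, a 16-byte memcpy that legalizes to four i32 operations is
/// reported as 4 * 2 = 8, since each element is both loaded and stored.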
1215int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1216 MemOp MOp;
1217 unsigned DstAddrSpace = ~0u;
1218 unsigned SrcAddrSpace = ~0u;
1219 const Function *F = I->getParent()->getParent();
1220
1221 if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1222 ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1223 // If 'size' is not a constant, a library call will be generated.
1224 if (!C)
1225 return -1;
1226
1227 const unsigned Size = C->getValue().getZExtValue();
1228 const Align DstAlign = MC->getDestAlign().valueOrOne();
1229 const Align SrcAlign = MC->getSourceAlign().valueOrOne();
1230
1231 MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1232 /*IsVolatile*/ false);
1233 DstAddrSpace = MC->getDestAddressSpace();
1234 SrcAddrSpace = MC->getSourceAddressSpace();
1235 }
1236 else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1237 ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1238 // If 'size' is not a constant, a library call will be generated.
1239 if (!C)
1240 return -1;
1241
1242 const unsigned Size = C->getValue().getZExtValue();
1243 const Align DstAlign = MS->getDestAlign().valueOrOne();
1244
1245 MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1246 /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1247 DstAddrSpace = MS->getDestAddressSpace();
1248 }
1249 else
1250 llvm_unreachable("Expected a memcpy/move or memset!");
1251
1252 unsigned Limit, Factor = 2;
1253 switch(I->getIntrinsicID()) {
1254 case Intrinsic::memcpy:
1255 Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1256 break;
1257 case Intrinsic::memmove:
1258 Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1259 break;
1260 case Intrinsic::memset:
1261 Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1262 Factor = 1;
1263 break;
1264 default:
1265 llvm_unreachable("Expected a memcpy/move or memset!");
1266 }
1267
1268 // MemOps will be populated with a list of data types that need to be
1269 // loaded and stored. That's why we multiply the number of elements by 2 to
1270 // get the cost for this memcpy.
1271 std::vector<EVT> MemOps;
1272 LLVMContext &C = F->getContext();
1273 if (getTLI()->findOptimalMemOpLowering(C, MemOps, Limit, MOp, DstAddrSpace,
1274 SrcAddrSpace, F->getAttributes()))
1275 return MemOps.size() * Factor;
1276
1277 // If we can't find an optimal memop lowering, return the default cost
1278 return -1;
1279}
1280
1281InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) const {
1282 int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1283
1284 // To model the cost of a library call, we assume 1 for the call, and
1285 // 3 for the argument setup.
1286 if (NumOps == -1)
1287 return 4;
1288 return NumOps;
1289}
1290
1291InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1292 VectorType *DstTy, VectorType *SrcTy,
1293 ArrayRef<int> Mask,
1294 TTI::TargetCostKind CostKind,
1295 int Index, VectorType *SubTp,
1296 ArrayRef<const Value *> Args,
1297 const Instruction *CxtI) const {
1298 assert((Mask.empty() || DstTy->isScalableTy() ||
1299 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
1300 "Expected the Mask to match the return size if given");
1301 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
1302 "Expected the same scalar types");
1303
1304 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1305 // Treat extractsubvector as single op permutation.
1306 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
1307 if (IsExtractSubvector)
1308 Kind = TTI::SK_PermuteSingleSrc;
1309 if (ST->hasNEON()) {
1310 if (Kind == TTI::SK_Broadcast) {
1311 static const CostTblEntry NEONDupTbl[] = {
1312 // VDUP handles these cases.
1313 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1314 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1315 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1316 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1317 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1318 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1319
1320 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1321 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1322 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1323 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
1324
1325 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1326 if (const auto *Entry =
1327 CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1328 return LT.first * Entry->Cost;
1329 }
1330 if (Kind == TTI::SK_Reverse) {
1331 static const CostTblEntry NEONShuffleTbl[] = {
1332 // Reverse shuffles cost one instruction if we are shuffling within a
1333 // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1334 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1335 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1336 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1337 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1338 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1339 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1340
1341 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1342 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1343 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
1344 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
1345
1346 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1347 if (const auto *Entry =
1348 CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1349 return LT.first * Entry->Cost;
1350 }
1351 if (Kind == TTI::SK_Select) {
1352 static const CostTblEntry NEONSelShuffleTbl[] = {
1353 // Select shuffle cost table for ARM. Cost is the number of instructions
1354 // required to create the shuffled vector.
1356
1357 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1358 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1359 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1360 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1361
1362 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1363 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1364 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
1365
1366 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
1367
1368 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
1369
1370 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1371 if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1372 ISD::VECTOR_SHUFFLE, LT.second))
1373 return LT.first * Entry->Cost;
1374 }
1375 }
1376 if (ST->hasMVEIntegerOps()) {
1377 if (Kind == TTI::SK_Broadcast) {
1378 static const CostTblEntry MVEDupTbl[] = {
1379 // VDUP handles these cases.
1380 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1381 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1382 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
1383 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1384 {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
1385
1386 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1387 if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1388 LT.second))
1389 return LT.first * Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
1390 }
1391
1392 if (!Mask.empty()) {
1393 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1394 // Check for LD2/LD4 instructions, which are represented in llvm IR as
1395 // deinterleaving-shuffle(load). The shuffle cost could potentially be
1396 // free, but we model it with a cost of LT.first so that LD2/LD4 have a
1397 // higher cost than just the load.
1398 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
1399 (LT.second.getScalarSizeInBits() == 8 ||
1400 LT.second.getScalarSizeInBits() == 16 ||
1401 LT.second.getScalarSizeInBits() == 32) &&
1402 LT.second.getSizeInBits() == 128 &&
1403 ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
1405 (TLI->getMaxSupportedInterleaveFactor() == 4 &&
1407 return ST->getMVEVectorCostFactor(CostKind) *
1408 std::max<InstructionCost>(1, LT.first / 4);
1409
1410 // Check for ST2/ST4 instructions, which are represented in llvm IR as
1411 // store(interleaving-shuffle). The shuffle cost could potentially be
1412 // free, but we model it with a cost of LT.first so that ST2/ST4 have a
1413 // higher cost than just the store.
1414 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
1415 (LT.second.getScalarSizeInBits() == 8 ||
1416 LT.second.getScalarSizeInBits() == 16 ||
1417 LT.second.getScalarSizeInBits() == 32) &&
1418 LT.second.getSizeInBits() == 128 &&
1419 ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
1421 Mask, 2, SrcTy->getElementCount().getKnownMinValue() * 2)) ||
1422 (TLI->getMaxSupportedInterleaveFactor() == 4 &&
1424 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2))))
1425 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1426
1427 if (LT.second.isVector() &&
1428 Mask.size() <= LT.second.getVectorNumElements() &&
1429 (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1430 isVREVMask(Mask, LT.second, 64)))
1431 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1432 }
1433 }
1434
1435 // Restore optimal kind.
1436 if (IsExtractSubvector)
1437 Kind = TTI::SK_ExtractSubvector;
1438 int BaseCost = ST->hasMVEIntegerOps() && SrcTy->isVectorTy()
1439 ? ST->getMVEVectorCostFactor(CostKind)
1440 : 1;
1441 return BaseCost * BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind,
1442 Index, SubTp);
1443}
1444
1445InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1446 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1447 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1448 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
1449 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1450 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1451 // Make operations on i1 relatively expensive as this often involves
1452 // combining predicates. AND and XOR should be easier to handle with IT
1453 // blocks.
1454 switch (ISDOpcode) {
1455 default:
1456 break;
1457 case ISD::AND:
1458 case ISD::XOR:
1459 return 2;
1460 case ISD::OR:
1461 return 3;
1462 }
1463 }
1464
1465 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1466
1467 if (ST->hasNEON()) {
1468 const unsigned FunctionCallDivCost = 20;
1469 const unsigned ReciprocalDivCost = 10;
1470 static const CostTblEntry CostTbl[] = {
1471 // Division.
1472 // These costs are somewhat random. Choose a cost of 20 to indicate that
1473 // vectorizing division (added function call) is going to be very expensive.
1474 // Double registers types.
1475 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1476 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1477 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1478 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1479 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1480 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1481 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1482 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1483 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1484 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1485 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1486 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1487 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1488 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1489 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1490 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1491 // Quad register types.
1492 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1493 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1494 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1495 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1496 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1497 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1498 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1499 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1500 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1501 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1502 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1503 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1504 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1505 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1506 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1507 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1508 // Multiplication.
1509 };
1510
1511 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1512 return LT.first * Entry->Cost;
1513
1514 InstructionCost Cost = BaseT::getArithmeticInstrCost(
1515 Opcode, Ty, CostKind, Op1Info, Op2Info);
1516
1517 // This is somewhat of a hack. The problem that we are facing is that SROA
1518 // creates a sequence of shift, and, or instructions to construct values.
1519 // These sequences are recognized by ISel and have zero cost. Not so for
1520 // the vectorized code. Because we have support for v2i64 but not i64 those
1521 // sequences look particularly beneficial to vectorize.
1522 // To work around this we increase the cost of v2i64 operations to make them
1523 // seem less beneficial.
1524 if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
1525 Cost += 4;
1526
1527 return Cost;
1528 }
1529
1530 // If this operation is a shift on arm/thumb2, it might well be folded into
1531 // the following instruction, hence having a cost of 0.
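// e.g. "add r0, r1, r2, lsl #2" performs the shift as part of the add's
// flexible second operand, so the shift itself costs nothing.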
1532 auto LooksLikeAFreeShift = [&]() {
1533 if (ST->isThumb1Only() || Ty->isVectorTy())
1534 return false;
1535
1536 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1537 return false;
1538 if (!Op2Info.isUniform() || !Op2Info.isConstant())
1539 return false;
1540
1541 // Folded into an ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB.
1542 switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1543 case Instruction::Add:
1544 case Instruction::Sub:
1545 case Instruction::And:
1546 case Instruction::Xor:
1547 case Instruction::Or:
1548 case Instruction::ICmp:
1549 return true;
1550 default:
1551 return false;
1552 }
1553 };
1554 if (LooksLikeAFreeShift())
1555 return 0;
1556
1557 // When targets have both DSP and MVE we find that the compiler will attempt
1558 // to vectorize as well as using scalar (S/U)MLAL operations. For the pattern
1559 // ext(mul(ext(i16), ext(i16))) codegen performs better when only using
1560 // (S/U)MLAL scalar ops instead of trying to mix vector ops with (S/U)MLAL
1561 // ops. We therefore check if a mul instruction is used in a (U/S)MLAL
1562 // pattern.
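// e.g. "acc64 += sext(i16 a) * sext(i16 b)" can map onto a single scalar
// SMLALBB-style multiply-accumulate on DSP-enabled cores.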
1564 auto MulInDSPMLALPattern = [&](const Instruction *I, unsigned Opcode,
1565 Type *Ty) -> bool {
1566 if (!ST->hasDSP())
1567 return false;
1568
1569 if (!I)
1570 return false;
1571
1572 if (Opcode != Instruction::Mul)
1573 return false;
1574
1575 if (Ty->isVectorTy())
1576 return false;
1577
1578 auto ValueOpcodesEqual = [](const Value *LHS, const Value *RHS) -> bool {
1579 return cast<Instruction>(LHS)->getOpcode() ==
1580 cast<Instruction>(RHS)->getOpcode();
1581 };
1582 auto IsExtInst = [](const Value *V) -> bool {
1583 return isa<ZExtInst>(V) || isa<SExtInst>(V);
1584 };
1585 auto IsExtensionFromHalf = [](const Value *V) -> bool {
1586 return cast<Instruction>(V)->getOperand(0)->getType()->isIntegerTy(16);
1587 };
1588
1589 // We check the arguments of the instruction to see if they're extends
1590 auto *BinOp = dyn_cast<BinaryOperator>(I);
1591 if (!BinOp)
1592 return false;
1593 Value *Op0 = BinOp->getOperand(0);
1594 Value *Op1 = BinOp->getOperand(1);
1595 if (IsExtInst(Op0) && IsExtInst(Op1) && ValueOpcodesEqual(Op0, Op1)) {
1596 // We're interested in an ext of an i16
1597 if (!I->getType()->isIntegerTy(32) || !IsExtensionFromHalf(Op0) ||
1598 !IsExtensionFromHalf(Op1))
1599 return false;
1600 // We need to check if this result will be further extended to i64
1601 // and that all these uses are SExt
1602 for (auto *U : I->users())
1603 if (!IsExtInst(U))
1604 return false;
1605 return true;
1606 }
1607
1608 return false;
1609 };
1610
1611 if (MulInDSPMLALPattern(CxtI, Opcode, Ty))
1612 return 0;
1613
1614 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1615 // for "multiple beats" potentially needed by MVE instructions.
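// Illustrative annotation (not part of the original source): MVE executes a
// 128-bit vector operation over several "beats" on smaller cores, so
// getMVEVectorCostFactor typically reports a throughput factor greater than
// one (for example 2) instead of the scalar default of 1 used here.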
1616 int BaseCost = 1;
1617 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1618 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1619
1620 // The rest of this mostly follows what is done in
1621 // BaseT::getArithmeticInstrCost, without treating floats as more expensive
1622 // than scalars or increasing the costs for custom operations. The result
1623 // is also multiplied by the MVEVectorCostFactor where appropriate.
1624 if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1625 return LT.first * BaseCost;
1626
1627 // Else this is expand, assume that we need to scalarize this op.
1628 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1629 unsigned Num = VTy->getNumElements();
1630 InstructionCost Cost =
1631 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1632 // Return the cost of multiple scalar invocation plus the cost of
1633 // inserting and extracting the values.
1634 SmallVector<Type *> Tys(Args.size(), Ty);
1635 return BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind) +
1636 Num * Cost;
1637 }
1638
1639 return BaseCost;
1640}
1641
1642InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1643 Align Alignment,
1644 unsigned AddressSpace,
1645 TTI::TargetCostKind CostKind,
1646 TTI::OperandValueInfo OpInfo,
1647 const Instruction *I) const {
1648 // TODO: Handle other cost kinds.
1649 if (CostKind != TTI::TCK_RecipThroughput)
1650 return 1;
1651
1652 // Type legalization can't handle structs
1653 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1654 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1655 CostKind);
1656
1657 if (ST->hasNEON() && Src->isVectorTy() && Alignment != Align(16) &&
1658 cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1659 // Unaligned loads/stores are extremely inefficient.
1660 // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
1661 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1662 return LT.first * 4;
1663 }
1664
1665 // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1666 // Same for stores.
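// Illustrative annotation (not part of the original source): the shape being
// matched, assuming a <4 x half> load whose single user widens to
// <4 x float> (and the mirrored fptrunc-before-store case):
//   %v = load <4 x half>, ptr %p, align 2
//   %w = fpext <4 x half> %v to <4 x float>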
1667 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1668 ((Opcode == Instruction::Load && I->hasOneUse() &&
1669 isa<FPExtInst>(*I->user_begin())) ||
1670 (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1671 FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1672 Type *DstTy =
1673 Opcode == Instruction::Load
1674 ? (*I->user_begin())->getType()
1675 : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1676 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1677 DstTy->getScalarType()->isFloatTy())
1678 return ST->getMVEVectorCostFactor(CostKind);
1679 }
1680
1681 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1682 ? ST->getMVEVectorCostFactor(CostKind)
1683 : 1;
1684 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1685 CostKind, OpInfo, I);
1686}
1687
1691 switch (MICA.getID()) {
1692 case Intrinsic::masked_scatter:
1693 case Intrinsic::masked_gather:
1694 return getGatherScatterOpCost(MICA, CostKind);
1695 case Intrinsic::masked_load:
1696 case Intrinsic::masked_store:
1697 return getMaskedMemoryOpCost(MICA, CostKind);
1698 }
1700}
1701
1702 InstructionCost
1703ARMTTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
1704 TTI::TargetCostKind CostKind) const {
1705 unsigned IID = MICA.getID();
1706 Type *Src = MICA.getDataType();
1707 Align Alignment = MICA.getAlignment();
1708 unsigned AddressSpace = MICA.getAddressSpace();
1709 if (ST->hasMVEIntegerOps()) {
1710 if (IID == Intrinsic::masked_load &&
1711 isLegalMaskedLoad(Src, Alignment, AddressSpace))
1712 return ST->getMVEVectorCostFactor(CostKind);
1713 if (IID == Intrinsic::masked_store &&
1714 isLegalMaskedStore(Src, Alignment, AddressSpace))
1715 return ST->getMVEVectorCostFactor(CostKind);
1716 }
1717 if (!isa<FixedVectorType>(Src))
1718 return BaseT::getMaskedMemoryOpCost(MICA, CostKind);
1719 // Scalar cost, which is currently very high due to the inefficiency of
1720 // the generated code.
1721 return cast<FixedVectorType>(Src)->getNumElements() * 8;
1722}
1723
1724InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1725 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1726 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1727 bool UseMaskForCond, bool UseMaskForGaps) const {
1728 assert(Factor >= 2 && "Invalid interleave factor");
1729 assert(isa<VectorType>(VecTy) && "Expect a vector type");
1730
1731 // vldN/vstN doesn't support vector types of i64/f64 element.
1732 bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1733
1734 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1735 !UseMaskForCond && !UseMaskForGaps) {
1736 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1737 auto *SubVecTy =
1738 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1739
1740 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1741 // Accesses having vector types that are a multiple of 128 bits can be
1742 // matched to more than one vldN/vstN instruction.
1743 int BaseCost =
1744 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1745 if (NumElts % Factor == 0 &&
1746 TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1747 return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1748
1749 // Some smaller than legal interleaved patterns are cheap as we can make
1750 // use of the vmovn or vrev patterns to interleave a standard load. This is
1751 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1752 // promoted differently). The cost of 2 here is then a load and vrev or
1753 // vmovn.
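// Illustrative annotation (not part of the original source): e.g. a factor-2
// deinterleave of a <8 x i8> access fits in 64 bits, so it can be done as one
// ordinary load plus a vrev/vmovn style rearrangement rather than a full
// VLD2, which is what the flat "2 * BaseCost" below models.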
1754 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1755 VecTy->isIntOrIntVectorTy() &&
1756 DL.getTypeSizeInBits(SubVecTy).getFixedValue() <= 64)
1757 return 2 * BaseCost;
1758 }
1759
1760 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1761 Alignment, AddressSpace, CostKind,
1762 UseMaskForCond, UseMaskForGaps);
1763}
1764
1765 InstructionCost
1766ARMTTIImpl::getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA,
1767 TTI::TargetCostKind CostKind) const {
1768
1769 Type *DataTy = MICA.getDataType();
1770 const Value *Ptr = MICA.getPointer();
1771 bool VariableMask = MICA.getVariableMask();
1772 Align Alignment = MICA.getAlignment();
1773 const Instruction *I = MICA.getInst();
1774
1775 using namespace PatternMatch;
1776 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1777 return BaseT::getGatherScatterOpCost(MICA, CostKind);
1778
1779 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1780 auto *VTy = cast<FixedVectorType>(DataTy);
1781
1782 // TODO: Splitting, once we do that.
1783
1784 unsigned NumElems = VTy->getNumElements();
1785 unsigned EltSize = VTy->getScalarSizeInBits();
1786 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(DataTy);
1787
1788 // For now, it is assumed that for the MVE gather instructions the loads are
1789 // all effectively serialised. This means the cost is the scalar cost
1790 // multiplied by the number of elements being loaded. This is possibly very
1791 // conservative, but even so we still end up vectorising loops because the
1792 // cost per iteration for many loops is lower than for scalar loops.
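// Illustrative annotation (not part of the original source): e.g. for a
// v4i32 gather with LT.first == 1 and an assumed MVE cost factor of 2, the
// formula below gives VectorCost = 4 * 1 * 2 = 8.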
1793 InstructionCost VectorCost =
1794 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1795 // The scalarization cost should be a lot higher. We use the number of vector
1796 // elements plus the scalarization overhead. If masking is required then a lot
1797 // of little blocks will be needed and potentially a scalarized p0 mask,
1798 // greatly increasing the cost.
1799 InstructionCost ScalarCost =
1800 NumElems * LT.first + (VariableMask ? NumElems * 5 : 0) +
1801 BaseT::getScalarizationOverhead(VTy, /*Insert*/ true, /*Extract*/ false,
1802 CostKind) +
1803 BaseT::getScalarizationOverhead(VTy, /*Insert*/ false, /*Extract*/ true,
1804 CostKind);
1805
1806 if (EltSize < 8 || Alignment < EltSize / 8)
1807 return ScalarCost;
1808
1809 unsigned ExtSize = EltSize;
1810 // Check whether there's a single user that asks for an extended type
1811 if (I != nullptr) {
1812 // Depending on the caller of this function, a gather instruction will
1813 // either have opcode Instruction::Load or be a call to the masked_gather
1814 // intrinsic
1815 if ((I->getOpcode() == Instruction::Load ||
1816 match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1817 I->hasOneUse()) {
1818 const User *Us = *I->users().begin();
1819 if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1820 // only allow valid type combinations
1821 unsigned TypeSize =
1822 cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1823 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1824 (TypeSize == 16 && EltSize == 8)) &&
1825 TypeSize * NumElems == 128) {
1826 ExtSize = TypeSize;
1827 }
1828 }
1829 }
1830 // Check whether the input data needs to be truncated
1831 TruncInst *T;
1832 if ((I->getOpcode() == Instruction::Store ||
1833 match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1834 (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1835 // Only allow valid type combinations
1836 unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1837 if (((EltSize == 16 && TypeSize == 32) ||
1838 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1839 TypeSize * NumElems == 128)
1840 ExtSize = TypeSize;
1841 }
1842 }
1843
1844 if (ExtSize * NumElems != 128 || NumElems < 4)
1845 return ScalarCost;
1846
1847 // Any (aligned) i32 gather will not need to be scalarised.
1848 if (ExtSize == 32)
1849 return VectorCost;
1850 // For smaller types, we need to ensure that the gep's inputs are correctly
1851 // extended from a small enough value. Other sizes (including i64) are
1852 // scalarized for now.
1853 if (ExtSize != 8 && ExtSize != 16)
1854 return ScalarCost;
1855
1856 if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1857 Ptr = BC->getOperand(0);
1858 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1859 if (GEP->getNumOperands() != 2)
1860 return ScalarCost;
1861 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1862 // Scale needs to be correct (which is only relevant for i16s).
1863 if (Scale != 1 && Scale * 8 != ExtSize)
1864 return ScalarCost;
1865 // And we need to zext (not sext) the indexes from a small enough type.
1866 if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1867 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1868 return VectorCost;
1869 }
1870 return ScalarCost;
1871 }
1872 return ScalarCost;
1873}
1874
1875 InstructionCost
1876ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1877 std::optional<FastMathFlags> FMF,
1878 TTI::TargetCostKind CostKind) const {
1879
1880 EVT ValVT = TLI->getValueType(DL, ValTy);
1881 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1882 unsigned EltSize = ValVT.getScalarSizeInBits();
1883
1884 // In general floating point reductions are a series of elementwise
1885 // operations, with free extracts on each step. These are either in-order or
1886 // treewise depending on whether that is allowed by the fast math flags.
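// Illustrative annotation (not part of the original source): e.g. an
// unordered v8f32 fadd reduction with MVE (VecLimit = 128) takes one v4f32
// halving step in the loop below, then the remaining 4 lanes are costed as
// scalar fadds: VecCost(v4f32 fadd) + 4 * Cost(scalar fadd).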
1887 if ((ISD == ISD::FADD || ISD == ISD::FMUL) &&
1888 ((EltSize == 32 && ST->hasVFP2Base()) ||
1889 (EltSize == 64 && ST->hasFP64()) ||
1890 (EltSize == 16 && ST->hasFullFP16()))) {
1891 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1892 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1893 InstructionCost VecCost = 0;
1894 while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(NumElts) &&
1895 NumElts * EltSize > VecLimit) {
1896 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1897 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1898 NumElts /= 2;
1899 }
1900
1901 // For fp16 we need to extract the upper lane elements. MVE can add a
1902 // VREV+FMIN/MAX to perform another vector step instead.
1903 InstructionCost ExtractCost = 0;
1904 if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() &&
1905 ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) {
1906 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1907 NumElts /= 2;
1908 } else if (ValVT.getVectorElementType() == MVT::f16)
1909 ExtractCost = NumElts / 2;
1910
1911 return VecCost + ExtractCost +
1912 NumElts *
1913 getArithmeticInstrCost(Opcode, ValTy->getElementType(), CostKind);
1914 }
1915
1916 if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
1917 (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
1918 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1919 unsigned VecLimit =
1920 ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1921 InstructionCost VecCost = 0;
1922 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
1923 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1924 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1925 NumElts /= 2;
1926 }
1927 // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
1928 // step.
1929 if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
1930 NumElts * EltSize == 64) {
1931 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts);
1932 VecCost += ST->getMVEVectorCostFactor(CostKind) +
1933 getArithmeticInstrCost(Opcode, VecTy, CostKind);
1934 NumElts /= 2;
1935 }
1936
1937 // From here we extract the elements and perform the and/or/xor.
1938 InstructionCost ExtractCost = NumElts;
1939 return VecCost + ExtractCost +
1940 (NumElts - 1) * getArithmeticInstrCost(
1941 Opcode, ValTy->getElementType(), CostKind);
1942 }
1943
1944 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
1945 TTI::requiresOrderedReduction(FMF))
1946 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1947
1948 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1949
1950 static const CostTblEntry CostTblAdd[]{
1951 {ISD::ADD, MVT::v16i8, 1},
1952 {ISD::ADD, MVT::v8i16, 1},
1953 {ISD::ADD, MVT::v4i32, 1},
1954 };
1955 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1956 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1957
1958 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1959}
1960
1961InstructionCost ARMTTIImpl::getExtendedReductionCost(
1962 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1963 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
1964 EVT ValVT = TLI->getValueType(DL, ValTy);
1965 EVT ResVT = TLI->getValueType(DL, ResTy);
1966
1967 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1968
1969 switch (ISD) {
1970 case ISD::ADD:
1971 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1972 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1973
1974 // The legal cases are:
1975 // VADDV u/s 8/16/32
1976 // VADDLV u/s 32
1977 // Codegen currently cannot always handle larger than legal vectors very
1978 // well, especially for predicated reductions where the mask needs to be
1979 // split, so restrict to 128bit or smaller input types.
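// Illustrative annotation (not part of the original source): e.g. an i64 add
// reduction of zero-extended <4 x i32> elements corresponds to a single
// VADDLV.U32, which is why v4i32 with a result of up to 64 bits is accepted.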
1980 unsigned RevVTSize = ResVT.getSizeInBits();
1981 if (ValVT.getSizeInBits() <= 128 &&
1982 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1983 (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1984 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1985 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1986 }
1987 break;
1988 default:
1989 break;
1990 }
1991 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, FMF,
1992 CostKind);
1993}
1994
1995 InstructionCost
1996ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
1997 Type *ResTy, VectorType *ValTy,
1998 TTI::TargetCostKind CostKind) const {
1999 if (RedOpcode != Instruction::Add)
2001 EVT ValVT = TLI->getValueType(DL, ValTy);
2002 EVT ResVT = TLI->getValueType(DL, ResTy);
2003
2004 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
2005 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2006
2007 // The legal cases are:
2008 // VMLAV u/s 8/16/32
2009 // VMLALV u/s 16/32
2010 // Codegen currently cannot always handle larger than legal vectors very
2011 // well, especially for predicated reductions where the mask needs to be
2012 // split, so restrict to 128bit or smaller input types.
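// Illustrative annotation (not part of the original source): e.g. reducing
// mul(zext <8 x i16>, zext <8 x i16>) into a 32- or 64-bit accumulator
// corresponds to VMLAV.U16/VMLALV.U16, matching the v8i16 entry below.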
2013 unsigned RevVTSize = ResVT.getSizeInBits();
2014 if (ValVT.getSizeInBits() <= 128 &&
2015 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
2016 (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
2017 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
2018 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
2019 }
2020
2021 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, ValTy,
2022 CostKind);
2023}
2024
2025 InstructionCost
2026ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
2027 FastMathFlags FMF,
2028 TTI::TargetCostKind CostKind) const {
2029 EVT ValVT = TLI->getValueType(DL, Ty);
2030
2031 // In general floating point reductions are a series of elementwise
2032 // operations, with free extracts on each step. These are either in-order or
2033 // treewise depending on whether that is allowed by the fast math flags.
2034 if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
2035 ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
2036 (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
2037 (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
2038 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
2039 unsigned EltSize = ValVT.getScalarSizeInBits();
2040 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
2041 InstructionCost VecCost;
2042 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
2043 Type *VecTy = FixedVectorType::get(Ty->getElementType(), NumElts/2);
2044 IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
2045 VecCost += getIntrinsicInstrCost(ICA, CostKind);
2046 NumElts /= 2;
2047 }
2048
2049 // For fp16 we need to extract the upper lane elements. MVE can add a
2050 // VREV+FMIN/MAX to perform another vector step instead.
2051 InstructionCost ExtractCost = 0;
2052 if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
2053 NumElts == 8) {
2054 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
2055 NumElts /= 2;
2056 } else if (ValVT.getVectorElementType() == MVT::f16)
2057 ExtractCost = cast<FixedVectorType>(Ty)->getNumElements() / 2;
2058
2059 IntrinsicCostAttributes ICA(IID, Ty->getElementType(),
2060 {Ty->getElementType(), Ty->getElementType()},
2061 FMF);
2062 return VecCost + ExtractCost +
2063 (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
2064 }
2065
2066 if (IID == Intrinsic::smin || IID == Intrinsic::smax ||
2067 IID == Intrinsic::umin || IID == Intrinsic::umax) {
2068 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2069
2070 // All costs are the same for u/s min/max. These lower to vminv, which are
2071 // given a slightly higher cost as they tend to take multiple cycles for
2072 // smaller type sizes.
2073 static const CostTblEntry CostTblAdd[]{
2074 {ISD::SMIN, MVT::v16i8, 4},
2075 {ISD::SMIN, MVT::v8i16, 3},
2076 {ISD::SMIN, MVT::v4i32, 2},
2077 };
2078 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD::SMIN, LT.second))
2079 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
2080 }
2081
2082 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
2083}
2084
2085 InstructionCost
2086ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2087 TTI::TargetCostKind CostKind) const {
2088 unsigned Opc = ICA.getID();
2089 switch (Opc) {
2090 case Intrinsic::get_active_lane_mask:
2091 // Currently we make a somewhat optimistic assumption that
2092 // active_lane_mask's are always free. In reality it may be freely folded
2093 // into a tail predicated loop, expanded into a VCPT or expanded into a lot
2094 // of add/icmp code. We may need to improve this in the future, but being
2095 // able to detect if it is free or not involves looking at a lot of other
2096 // code. We currently assume that the vectorizer inserted these, and knew
2097 // what it was doing in adding one.
2098 if (ST->hasMVEIntegerOps())
2099 return 0;
2100 break;
2101 case Intrinsic::sadd_sat:
2102 case Intrinsic::ssub_sat:
2103 case Intrinsic::uadd_sat:
2104 case Intrinsic::usub_sat: {
2105 bool IsAdd = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
2106 bool IsSigned = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
2107 Type *RetTy = ICA.getReturnType();
2108
2109 if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
2110 if (IsSigned && ST->hasDSP() && ITy->getBitWidth() == 32)
2111 return 1; // qadd / qsub
2112 if (ST->hasDSP() && (ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16))
2113 return 2; // uqadd16 / qadd16 / uqsub16 / qsub16 + possible extend.
2114 // Otherwise return the cost of expanding the node. Generally an add +
2115 // icmp + sel.
2116 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2117 Type *CondTy = RetTy->getWithNewBitWidth(1);
2118 return getArithmeticInstrCost(IsAdd ? Instruction::Add : Instruction::Sub,
2119 RetTy, CostKind) +
2120 2 * getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, Pred,
2121 CostKind) +
2122 2 * getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, Pred,
2123 CostKind);
2124 }
2125
2126 if (!ST->hasMVEIntegerOps())
2127 break;
2128
2129 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
2130 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2131 LT.second == MVT::v16i8) {
2132 // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
2133 // need to extend the type, as it uses shr(qadd(shl, shl)).
2134 unsigned Instrs =
2135 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1
2136 : 4;
2137 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
2138 }
2139 break;
2140 }
2141 case Intrinsic::abs:
2142 case Intrinsic::smin:
2143 case Intrinsic::smax:
2144 case Intrinsic::umin:
2145 case Intrinsic::umax: {
2146 if (!ST->hasMVEIntegerOps())
2147 break;
2148 Type *VT = ICA.getReturnType();
2149
2150 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
2151 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2152 LT.second == MVT::v16i8)
2153 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2154 break;
2155 }
2156 case Intrinsic::minnum:
2157 case Intrinsic::maxnum: {
2158 if (!ST->hasMVEFloatOps())
2159 break;
2160 Type *VT = ICA.getReturnType();
2161 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
2162 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
2163 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2164 break;
2165 }
2166 case Intrinsic::fptosi_sat:
2167 case Intrinsic::fptoui_sat: {
2168 if (ICA.getArgTypes().empty())
2169 break;
2170 bool IsSigned = Opc == Intrinsic::fptosi_sat;
2171 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
2172 EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
2173 // Check for the legal types, with the correct subtarget features.
2174 if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
2175 (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
2176 (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
2177 return LT.first;
2178
2179 // Equally for MVE vector types
2180 if (ST->hasMVEFloatOps() &&
2181 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
2182 LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
2183 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2184
2185 // If we can, use a legal convert followed by a min+max.
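// Illustrative annotation (not part of the original source): e.g.
// llvm.fptosi.sat.i16.f32 can be modelled as one legal f32->i32 convert plus
// an smin/smax clamp to the i16 range, which is what the block below sums.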
2186 if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
2187 (ST->hasFP64() && LT.second == MVT::f64) ||
2188 (ST->hasFullFP16() && LT.second == MVT::f16) ||
2189 (ST->hasMVEFloatOps() &&
2190 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
2191 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
2192 Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
2193 LT.second.getScalarSizeInBits());
2194 InstructionCost Cost =
2195 LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
2196 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
2197 : Intrinsic::umin,
2198 LegalTy, {LegalTy, LegalTy});
2199 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
2200 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
2201 : Intrinsic::umax,
2202 LegalTy, {LegalTy, LegalTy});
2203 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
2204 return LT.first * Cost;
2205 }
2206 // Otherwise we need to follow the default expansion that clamps the value
2207 // using a float min/max with a fcmp+sel for nan handling when signed.
2208 Type *FPTy = ICA.getArgTypes()[0];
2209 Type *RetTy = ICA.getReturnType();
2210 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
2211 InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
2212 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
2213 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
2214 Cost +=
2215 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
2216 RetTy, FPTy, TTI::CastContextHint::None, CostKind);
2217 if (IsSigned) {
2218 Type *CondTy = RetTy->getWithNewBitWidth(1);
2219 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
2220 CmpInst::FCMP_UNO, CostKind);
2221 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2222 CmpInst::FCMP_UNO, CostKind);
2223 }
2224 return Cost;
2225 }
2226 }
2227
2228 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
2229}
2230
2231bool ARMTTIImpl::isLoweredToCall(const Function *F) const {
2232 if (!F->isIntrinsic())
2233 return BaseT::isLoweredToCall(F);
2234
2235 // Assume all Arm-specific intrinsics map to an instruction.
2236 if (F->getName().starts_with("llvm.arm"))
2237 return false;
2238
2239 switch (F->getIntrinsicID()) {
2240 default: break;
2241 case Intrinsic::powi:
2242 case Intrinsic::sin:
2243 case Intrinsic::cos:
2244 case Intrinsic::sincos:
2245 case Intrinsic::pow:
2246 case Intrinsic::log:
2247 case Intrinsic::log10:
2248 case Intrinsic::log2:
2249 case Intrinsic::exp:
2250 case Intrinsic::exp2:
2251 return true;
2252 case Intrinsic::sqrt:
2253 case Intrinsic::fabs:
2254 case Intrinsic::copysign:
2255 case Intrinsic::floor:
2256 case Intrinsic::ceil:
2257 case Intrinsic::trunc:
2258 case Intrinsic::rint:
2259 case Intrinsic::nearbyint:
2260 case Intrinsic::round:
2261 case Intrinsic::canonicalize:
2262 case Intrinsic::lround:
2263 case Intrinsic::llround:
2264 case Intrinsic::lrint:
2265 case Intrinsic::llrint:
2266 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
2267 return true;
2268 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
2269 return true;
2270 // Some operations can be handled by vector instructions and assume
2271 // unsupported vectors will be expanded into supported scalar ones.
2272 // TODO Handle scalar operations properly.
2273 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
2274 case Intrinsic::masked_store:
2275 case Intrinsic::masked_load:
2276 case Intrinsic::masked_gather:
2277 case Intrinsic::masked_scatter:
2278 return !ST->hasMVEIntegerOps();
2279 case Intrinsic::sadd_with_overflow:
2280 case Intrinsic::uadd_with_overflow:
2281 case Intrinsic::ssub_with_overflow:
2282 case Intrinsic::usub_with_overflow:
2283 case Intrinsic::sadd_sat:
2284 case Intrinsic::uadd_sat:
2285 case Intrinsic::ssub_sat:
2286 case Intrinsic::usub_sat:
2287 return false;
2288 }
2289
2290 return BaseT::isLoweredToCall(F);
2291}
2292
2293bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) const {
2294 unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
2295 EVT VT = TLI->getValueType(DL, I.getType(), true);
2296 if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
2297 return true;
2298
2299 // Check if an intrinsic will be lowered to a call and assume that any
2300 // other CallInst will generate a bl.
2301 if (auto *Call = dyn_cast<CallInst>(&I)) {
2302 if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
2303 switch(II->getIntrinsicID()) {
2304 case Intrinsic::memcpy:
2305 case Intrinsic::memset:
2306 case Intrinsic::memmove:
2307 return getNumMemOps(II) == -1;
2308 default:
2309 if (const Function *F = Call->getCalledFunction())
2310 return isLoweredToCall(F);
2311 }
2312 }
2313 return true;
2314 }
2315
2316 // FPv5 provides conversions between integer, double-precision,
2317 // single-precision, and half-precision formats.
2318 switch (I.getOpcode()) {
2319 default:
2320 break;
2321 case Instruction::FPToSI:
2322 case Instruction::FPToUI:
2323 case Instruction::SIToFP:
2324 case Instruction::UIToFP:
2325 case Instruction::FPTrunc:
2326 case Instruction::FPExt:
2327 return !ST->hasFPARMv8Base();
2328 }
2329
2330 // FIXME: Unfortunately the approach of checking the Operation Action does
2331 // not catch all cases of Legalization that use library calls. Our
2332 // Legalization step categorizes some transformations into library calls as
2333 // Custom, Expand or even Legal when doing type legalization. So for now
2334 // we have to special case for instance the SDIV of 64bit integers and the
2335 // use of floating point emulation.
2336 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
2337 switch (ISD) {
2338 default:
2339 break;
2340 case ISD::SDIV:
2341 case ISD::UDIV:
2342 case ISD::SREM:
2343 case ISD::UREM:
2344 case ISD::SDIVREM:
2345 case ISD::UDIVREM:
2346 return true;
2347 }
2348 }
2349
2350 // Assume all other non-float operations are supported.
2351 if (!VT.isFloatingPoint())
2352 return false;
2353
2354 // We'll need a library call to handle most floats when using soft.
2355 if (TLI->useSoftFloat()) {
2356 switch (I.getOpcode()) {
2357 default:
2358 return true;
2359 case Instruction::Alloca:
2360 case Instruction::Load:
2361 case Instruction::Store:
2362 case Instruction::Select:
2363 case Instruction::PHI:
2364 return false;
2365 }
2366 }
2367
2368 // We'll need a libcall to perform double precision operations on a single
2369 // precision only FPU.
2370 if (I.getType()->isDoubleTy() && !ST->hasFP64())
2371 return true;
2372
2373 // Likewise for half precision arithmetic.
2374 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
2375 return true;
2376
2377 return false;
2378}
2379
2380bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
2381 AssumptionCache &AC,
2382 TargetLibraryInfo *LibInfo,
2383 HardwareLoopInfo &HWLoopInfo) const {
2384 // Low-overhead branches are only supported in the 'low-overhead branch'
2385 // extension of v8.1-m.
2386 if (!ST->hasLOB() || DisableLowOverheadLoops) {
2387 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2388 return false;
2389 }
2390
2391 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
2392 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2393 return false;
2394 }
2395
2396 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2397 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
2398 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2399 return false;
2400 }
2401
2402 const SCEV *TripCountSCEV =
2403 SE.getAddExpr(BackedgeTakenCount,
2404 SE.getOne(BackedgeTakenCount->getType()));
2405
2406 // We need to store the trip count in LR, a 32-bit register.
2407 if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
2408 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2409 return false;
2410 }
2411
2412 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2413 // point in generating a hardware loop if that's going to happen.
2414
2415 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2416 if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
2417 switch (Call->getIntrinsicID()) {
2418 default:
2419 break;
2420 case Intrinsic::start_loop_iterations:
2421 case Intrinsic::test_start_loop_iterations:
2422 case Intrinsic::loop_decrement:
2423 case Intrinsic::loop_decrement_reg:
2424 return true;
2425 }
2426 }
2427 return false;
2428 };
2429
2430 // Scan the instructions to see if there's any that we know will turn into a
2431 // call or if this loop is already a low-overhead loop or will become a tail
2432 // predicated loop.
2433 bool IsTailPredLoop = false;
2434 auto ScanLoop = [&](Loop *L) {
2435 for (auto *BB : L->getBlocks()) {
2436 for (auto &I : *BB) {
2437 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2438 isa<InlineAsm>(I)) {
2439 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2440 return false;
2441 }
2442 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2443 IsTailPredLoop |=
2444 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2445 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2446 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2447 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2448 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2449 }
2450 }
2451 return true;
2452 };
2453
2454 // Visit inner loops.
2455 for (auto *Inner : *L)
2456 if (!ScanLoop(Inner))
2457 return false;
2458
2459 if (!ScanLoop(L))
2460 return false;
2461
2462 // TODO: Check whether the trip count calculation is expensive. If L is the
2463 // inner loop but we know it has a low trip count, calculating that trip
2464 // count (in the parent loop) may be detrimental.
2465
2466 LLVMContext &C = L->getHeader()->getContext();
2467 HWLoopInfo.CounterInReg = true;
2468 HWLoopInfo.IsNestingLegal = false;
2469 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2470 HWLoopInfo.CountType = Type::getInt32Ty(C);
2471 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2472 return true;
2473}
2474
2475static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2476 // We don't allow icmp's, and because we only look at single block loops,
2477 // we simply count the icmps, i.e. there should only be 1 for the backedge.
2478 if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2479 return false;
2480 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2481 // not currently canonical, but soon will be. Code without them uses icmp, and
2482 // so is not tail predicated as per the condition above. In order to get the
2483 // same performance we treat min and max the same as an icmp for tailpred
2484 // purposes for the moment (we often rely on non-tailpred and higher VF's to
2485 // pick more optimal instructions like VQDMULH. They need to be recognized
2486 // directly by the vectorizer).
2487 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2488 if ((II->getIntrinsicID() == Intrinsic::smin ||
2489 II->getIntrinsicID() == Intrinsic::smax ||
2490 II->getIntrinsicID() == Intrinsic::umin ||
2491 II->getIntrinsicID() == Intrinsic::umax) &&
2492 ++ICmpCount > 1)
2493 return false;
2494
2495 if (isa<FCmpInst>(&I))
2496 return false;
2497
2498 // We could allow extending/narrowing FP loads/stores, but codegen is
2499 // too inefficient so reject this for now.
2500 if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2501 return false;
2502
2503 // Extends have to be extending-loads
2504 if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
2505 if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2506 return false;
2507
2508 // Truncs have to be narrowing-stores
2509 if (isa<TruncInst>(&I) )
2510 if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2511 return false;
2512
2513 return true;
2514}
2515
2516// To set up a tail-predicated loop, we need to know the total number of
2517// elements processed by that loop. Thus, we need to determine the element
2518// size and:
2519// 1) it should be uniform for all operations in the vector loop, so we
2520// e.g. don't want any widening/narrowing operations.
2521// 2) it should be smaller than i64s because we don't have vector operations
2522// that work on i64s.
2523// 3) we don't want elements to be reversed or shuffled, to make sure the
2524// tail-predication masks/predicates the right lanes.
2525//
2526static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2527 const DataLayout &DL,
2528 const LoopAccessInfo *LAI,
2529 const DominatorTree &DT) {
2530 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2531
2532 // If there are live-out values, it is probably a reduction. We can predicate
2533 // most reduction operations freely under MVE using a combination of
2534 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2535 // floating point and integer reductions, but don't check for operators
2536 // specifically here. If the value ends up not being a reduction (and so the
2537 // vectorizer cannot tailfold the loop), we should fall back to standard
2538 // vectorization automatically.
2539 SmallVector<Instruction *, 8> LiveOuts;
2540 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2541 bool ReductionsDisabled =
2542 EnableTailPredication == TailPredication::EnabledNoReductions ||
2543 EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2544
2545 for (auto *I : LiveOuts) {
2546 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2547 !I->getType()->isHalfTy()) {
2548 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2549 "live-out value\n");
2550 return false;
2551 }
2552 if (ReductionsDisabled) {
2553 LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2554 return false;
2555 }
2556 }
2557
2558 // Next, check that all instructions can be tail-predicated.
2559 PredicatedScalarEvolution PSE = LAI->getPSE();
2560 int ICmpCount = 0;
2561
2562 for (BasicBlock *BB : L->blocks()) {
2563 for (Instruction &I : BB->instructionsWithoutDebug()) {
2564 if (isa<PHINode>(&I))
2565 continue;
2566 if (!canTailPredicateInstruction(I, ICmpCount)) {
2567 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2568 return false;
2569 }
2570
2571 Type *T = I.getType();
2572 if (T->getScalarSizeInBits() > 32) {
2573 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2574 return false;
2575 }
2576 if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2577 Value *Ptr = getLoadStorePointerOperand(&I);
2578 Type *AccessTy = getLoadStoreType(&I);
2579 int64_t NextStride =
2580 getPtrStride(PSE, AccessTy, Ptr, L, DT).value_or(0);
2581 if (NextStride == 1) {
2582 // TODO: for now only allow consecutive strides of 1. We could support
2583 // other strides as long as it is uniform, but let's keep it simple
2584 // for now.
2585 continue;
2586 } else if (NextStride == -1 ||
2587 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2588 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2589 LLVM_DEBUG(dbgs()
2590 << "Consecutive strides of 2 found, vld2/vstr2 can't "
2591 "be tail-predicated.\n");
2592 return false;
2593 // TODO: don't tail predicate if there is a reversed load?
2594 } else if (EnableMaskedGatherScatters) {
2595 // Gather/scatters do allow loading from arbitrary strides, at
2596 // least if they are loop invariant.
2597 // TODO: Loop variant strides should in theory work, too, but
2598 // this requires further testing.
2599 const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2600 if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2601 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2602 if (PSE.getSE()->isLoopInvariant(Step, L))
2603 continue;
2604 }
2605 }
2606 LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2607 "tail-predicate\n.");
2608 return false;
2609 }
2610 }
2611 }
2612
2613 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2614 return true;
2615}
2616
2617bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
2618 if (!EnableTailPredication) {
2619 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2620 return false;
2621 }
2622
2623 // Creating a predicated vector loop is the first step for generating a
2624 // tail-predicated hardware loop, for which we need the MVE masked
2625 // load/stores instructions:
2626 if (!ST->hasMVEIntegerOps())
2627 return false;
2628
2629 LoopVectorizationLegality *LVL = TFI->LVL;
2630 Loop *L = LVL->getLoop();
2631
2632 // For now, restrict this to single block loops.
2633 if (L->getNumBlocks() > 1) {
2634 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2635 "loop.\n");
2636 return false;
2637 }
2638
2639 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2640
2641 LoopInfo *LI = LVL->getLoopInfo();
2642 HardwareLoopInfo HWLoopInfo(L);
2643 if (!HWLoopInfo.canAnalyze(*LI)) {
2644 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2645 "analyzable.\n");
2646 return false;
2647 }
2648
2649 AssumptionCache *AC = LVL->getAssumptionCache();
2650 ScalarEvolution *SE = LVL->getScalarEvolution();
2651
2652 // This checks if we have the low-overhead branch architecture
2653 // extension, and if we will create a hardware-loop:
2654 if (!isHardwareLoopProfitable(L, *SE, *AC, TFI->TLI, HWLoopInfo)) {
2655 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2656 "profitable.\n");
2657 return false;
2658 }
2659
2660 DominatorTree *DT = LVL->getDominatorTree();
2661 if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT)) {
2662 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2663 "a candidate.\n");
2664 return false;
2665 }
2666
2667 return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI(),
2668 *LVL->getDominatorTree());
2669}
2670
2671TailFoldingStyle
2672ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const {
2673 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2674 return TailFoldingStyle::DataWithoutLaneMask;
2675
2676 // Intrinsic @llvm.get.active.lane.mask is supported.
2677 // It is used in the MVETailPredication pass, which requires the number of
2678 // elements processed by this vector loop to setup the tail-predicated
2679 // loop.
2680 return TailFoldingStyle::DataAndControlFlow;
2681}
2682void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2683 TTI::UnrollingPreferences &UP,
2684 OptimizationRemarkEmitter *ORE) const {
2685 // Enable Upper bound unrolling universally, providing that we do not see an
2686 // active lane mask, which will be better kept as a loop to become tail
2687 // predicated than to be conditionally unrolled.
2688 UP.UpperBound =
2689 !ST->hasMVEIntegerOps() || !any_of(*L->getHeader(), [](Instruction &I) {
2690 return isa<IntrinsicInst>(I) &&
2691 cast<IntrinsicInst>(I).getIntrinsicID() ==
2692 Intrinsic::get_active_lane_mask;
2693 });
2694
2695 // Only currently enable these preferences for M-Class cores.
2696 if (!ST->isMClass())
2697 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2698
2699 // Disable loop unrolling for Oz and Os.
2700 UP.OptSizeThreshold = 0;
2701 UP.PartialOptSizeThreshold = 0;
2702 if (L->getHeader()->getParent()->hasOptSize())
2703 return;
2704
2705 SmallVector<BasicBlock*, 4> ExitingBlocks;
2706 L->getExitingBlocks(ExitingBlocks);
2707 LLVM_DEBUG(dbgs() << "Loop has:\n"
2708 << "Blocks: " << L->getNumBlocks() << "\n"
2709 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2710
2711 // Only allow another exit other than the latch. This acts as an early exit
2712 // as it mirrors the profitability calculation of the runtime unroller.
2713 if (ExitingBlocks.size() > 2)
2714 return;
2715
2716 // Limit the CFG of the loop body for targets with a branch predictor.
2717 // Allowing 4 blocks permits if-then-else diamonds in the body.
2718 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2719 return;
2720
2721 // Don't unroll vectorized loops, including the remainder loop
2722 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2723 return;
2724
2725 // Scan the loop: don't unroll loops with calls as this could prevent
2726 // inlining.
2727 InstructionCost Cost = 0;
2728 for (auto *BB : L->getBlocks()) {
2729 for (auto &I : *BB) {
2730 // Don't unroll vectorised loop. MVE does not benefit from it as much as
2731 // scalar code.
2732 if (I.getType()->isVectorTy())
2733 return;
2734
2735 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2736 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2737 if (!isLoweredToCall(F))
2738 continue;
2739 }
2740 return;
2741 }
2742
2743 SmallVector<const Value*, 4> Operands(I.operand_values());
2744 Cost += getInstructionCost(&I, Operands,
2745 TargetTransformInfo::TCK_SizeAndLatency);
2746 }
2747 }
2748
2749 // On v6m cores, there are very few registers available. We can easily end up
2750 // spilling and reloading more registers in an unrolled loop. Look at the
2751 // number of LCSSA phis as a rough measure of how many registers will need to
2752 // be live out of the loop, reducing the default unroll count if more than 1
2753 // value is needed. In the long run, all of this should be learnt by a
2754 // machine.
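// Illustrative annotation (not part of the original source): with the default
// count of 4, two live-out LCSSA values reduce it to 2, and three or more
// make UnrollCount <= 1 below, which disables the partial/runtime unrolling
// this function would otherwise enable.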
2755 unsigned UnrollCount = 4;
2756 if (ST->isThumb1Only()) {
2757 unsigned ExitingValues = 0;
2758 SmallVector<BasicBlock *, 4> ExitBlocks;
2759 L->getExitBlocks(ExitBlocks);
2760 for (auto *Exit : ExitBlocks) {
2761 // Count the number of LCSSA phis. Exclude values coming from GEP's as
2762 // only the last is expected to be needed for address operands.
2763 unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2764 return PH.getNumOperands() != 1 ||
2765 !isa<GetElementPtrInst>(PH.getOperand(0));
2766 });
2767 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2768 }
2769 if (ExitingValues)
2770 UnrollCount /= ExitingValues;
2771 if (UnrollCount <= 1)
2772 return;
2773 }
2774
2775 // For processors with low overhead branching (LOB), runtime unrolling the
2776 // innermost loop is often detrimental to performance. In these cases the loop
2777 // remainder gets unrolled into a series of compare-and-jump blocks, which in
2778 // deeply nested loops get executed multiple times, negating the benefits of
2779 // LOB. This is particularly noticeable when the loop trip count of the
2780 // innermost loop varies within the outer loop, such as in the case of
2781 // triangular matrix decompositions. In these cases we will prefer to not
2782 // unroll the innermost loop, with the intention for it to be executed as a
2783 // low overhead loop.
2784 bool Runtime = true;
2785 if (ST->hasLOB()) {
2786 if (SE.hasLoopInvariantBackedgeTakenCount(L)) {
2787 const auto *BETC = SE.getBackedgeTakenCount(L);
2788 auto *Outer = L->getOutermostLoop();
2789 if ((L != Outer && Outer != L->getParentLoop()) ||
2790 (L != Outer && BETC && !SE.isLoopInvariant(BETC, Outer))) {
2791 Runtime = false;
2792 }
2793 }
2794 }
2795
2796 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2797 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2798
2799 UP.Partial = true;
2800 UP.Runtime = Runtime;
2801 UP.UnrollRemainder = true;
2802 UP.DefaultUnrollRuntimeCount = UnrollCount;
2803 UP.UnrollAndJam = true;
2804 UP.UnrollAndJamInnerLoopThreshold = 60;
2805
2806 // Force unrolling small loops can be very useful because of the branch
2807 // taken cost of the backedge.
2808 if (Cost < ArmForceUnrollThreshold)
2809 UP.Force = true;
2810}
2811
2816
2817bool ARMTTIImpl::preferInLoopReduction(RecurKind Kind, Type *Ty) const {
2818 if (!ST->hasMVEIntegerOps())
2819 return false;
2820
2821 unsigned ScalarBits = Ty->getScalarSizeInBits();
2822 switch (Kind) {
2823 case RecurKind::Add:
2824 return ScalarBits <= 64;
2825 default:
2826 return false;
2827 }
2828}
2829
2830bool ARMTTIImpl::preferPredicatedReductionSelect() const {
2831 if (!ST->hasMVEIntegerOps())
2832 return false;
2833 return true;
2834}
2835
2836InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
2837 StackOffset BaseOffset,
2838 bool HasBaseReg, int64_t Scale,
2839 unsigned AddrSpace) const {
2840 TargetLoweringBase::AddrMode AM;
2841 AM.BaseGV = BaseGV;
2842 AM.BaseOffs = BaseOffset.getFixed();
2843 AM.HasBaseReg = HasBaseReg;
2844 AM.Scale = Scale;
2845 AM.ScalableOffset = BaseOffset.getScalable();
2846 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) {
2847 if (ST->hasFPAO())
2848 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2849 return 0;
2850 }
2851 return InstructionCost::getInvalid();
2852}
2853
2854bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
2855 if (Thumb) {
2856 // B.W is available in any Thumb2-supporting target, and also in every
2857 // version of Armv8-M, even Baseline which does not include the rest of
2858 // Thumb2.
2859 return ST->isThumb2() || ST->hasV8MBaselineOps();
2860 } else {
2861 // B is available in all versions of the Arm ISA, so the only question is
2862 // whether that ISA is available at all.
2863 return ST->hasARMOps();
2864 }
2865}
2866
2867/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
2868/// of the vector elements.
2869static bool areExtractExts(Value *Ext1, Value *Ext2) {
2870 using namespace PatternMatch;
2871
2872 auto areExtDoubled = [](Instruction *Ext) {
2873 return Ext->getType()->getScalarSizeInBits() ==
2874 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
2875 };
2876
2877 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
2878 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
2879 !areExtDoubled(cast<Instruction>(Ext1)) ||
2880 !areExtDoubled(cast<Instruction>(Ext2)))
2881 return false;
2882
2883 return true;
2884}
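// Illustrative annotation (not part of the original source): the check above
// accepts operand pairs such as
//   %e1 = sext <8 x i8> %a to <8 x i16>
//   %e2 = sext <8 x i8> %b to <8 x i16>
// where each result element is exactly twice the width of its source, the
// shape NEON vaddl/vsubl expect.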
2885
2886/// Check if sinking \p I's operands to I's basic block is profitable, because
2887/// the operands can be folded into a target instruction, e.g.
2888/// sext/zext can be folded into vsubl.
2889bool ARMTTIImpl::isProfitableToSinkOperands(Instruction *I,
2890 SmallVectorImpl<Use *> &Ops) const {
2891 using namespace PatternMatch;
2892
2893 if (!I->getType()->isVectorTy())
2894 return false;
2895
2896 if (ST->hasNEON()) {
2897 switch (I->getOpcode()) {
2898 case Instruction::Sub:
2899 case Instruction::Add: {
2900 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
2901 return false;
2902 Ops.push_back(&I->getOperandUse(0));
2903 Ops.push_back(&I->getOperandUse(1));
2904 return true;
2905 }
2906 default:
2907 return false;
2908 }
2909 }
2910
2911 if (!ST->hasMVEIntegerOps())
2912 return false;
2913
2914 auto IsFMSMul = [&](Instruction *I) {
2915 if (!I->hasOneUse())
2916 return false;
2917 auto *Sub = cast<Instruction>(*I->users().begin());
2918 return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;
2919 };
2920 auto IsFMS = [&](Instruction *I) {
2921 if (match(I->getOperand(0), m_FNeg(m_Value())) ||
2922 match(I->getOperand(1), m_FNeg(m_Value())))
2923 return true;
2924 return false;
2925 };
2926
2927 auto IsSinker = [&](Instruction *I, int Operand) {
2928 switch (I->getOpcode()) {
2929 case Instruction::Add:
2930 case Instruction::Mul:
2931 case Instruction::FAdd:
2932 case Instruction::ICmp:
2933 case Instruction::FCmp:
2934 return true;
2935 case Instruction::FMul:
2936 return !IsFMSMul(I);
2937 case Instruction::Sub:
2938 case Instruction::FSub:
2939 case Instruction::Shl:
2940 case Instruction::LShr:
2941 case Instruction::AShr:
2942 return Operand == 1;
2943 case Instruction::Call:
2944 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
2945 switch (II->getIntrinsicID()) {
2946 case Intrinsic::fma:
2947 return !IsFMS(I);
2948 case Intrinsic::sadd_sat:
2949 case Intrinsic::uadd_sat:
2950 case Intrinsic::arm_mve_add_predicated:
2951 case Intrinsic::arm_mve_mul_predicated:
2952 case Intrinsic::arm_mve_qadd_predicated:
2953 case Intrinsic::arm_mve_vhadd:
2954 case Intrinsic::arm_mve_hadd_predicated:
2955 case Intrinsic::arm_mve_vqdmull:
2956 case Intrinsic::arm_mve_vqdmull_predicated:
2957 case Intrinsic::arm_mve_vqdmulh:
2958 case Intrinsic::arm_mve_qdmulh_predicated:
2959 case Intrinsic::arm_mve_vqrdmulh:
2960 case Intrinsic::arm_mve_qrdmulh_predicated:
2961 case Intrinsic::arm_mve_fma_predicated:
2962 return true;
2963 case Intrinsic::ssub_sat:
2964 case Intrinsic::usub_sat:
2965 case Intrinsic::arm_mve_sub_predicated:
2966 case Intrinsic::arm_mve_qsub_predicated:
2967 case Intrinsic::arm_mve_hsub_predicated:
2968 case Intrinsic::arm_mve_vhsub:
2969 return Operand == 1;
2970 default:
2971 return false;
2972 }
2973 }
2974 return false;
2975 default:
2976 return false;
2977 }
2978 };
2979
2980 for (auto OpIdx : enumerate(I->operands())) {
2981 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
2982 // Make sure we are not already sinking this operand
2983 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
2984 continue;
2985
2986 Instruction *Shuffle = Op;
2987 if (Shuffle->getOpcode() == Instruction::BitCast)
2988 Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0));
2989 // We are looking for a splat that can be sunk.
2990 if (!Shuffle || !match(Shuffle, m_Shuffle(m_InsertElt(m_Undef(), m_Value(),
2991 m_ZeroInt()),
2992 m_Undef(), m_ZeroMask())))
2993 continue;
2994 if (!IsSinker(I, OpIdx.index()))
2995 continue;
2996
2997 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
2998 // and vector registers
2999 for (Use &U : Op->uses()) {
3000 Instruction *Insn = cast<Instruction>(U.getUser());
3001 if (!IsSinker(Insn, U.getOperandNo()))
3002 return false;
3003 }
3004
3005 Ops.push_back(&Shuffle->getOperandUse(0));
3006 if (Shuffle != Op)
3007 Ops.push_back(&Op->getOperandUse(0));
3008 Ops.push_back(&OpIdx.value());
3009 }
3010 return true;
3011}
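// Illustrative annotation (not part of the original source): the sinking
// above targets splats such as
//   %ins = insertelement <4 x i32> undef, i32 %s, i32 0
//   %spl = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer
//   %r   = add <4 x i32> %v, %spl
// so that ISel can select the scalar-operand forms (e.g. vadd.i32 Qd, Qn, Rm)
// instead of keeping %s live in both general-purpose and vector registers.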
3012
3013unsigned ARMTTIImpl::getNumBytesToPadGlobalArray(unsigned Size,
3014 Type *ArrayType) const {
3015 if (!UseWidenGlobalArrays) {
3016 LLVM_DEBUG(dbgs() << "Padding global arrays disabled\n");
3017 return false;
3018 }
3019
3020 // Don't modify non-integer array types
3021 if (!ArrayType || !ArrayType->isArrayTy() ||
3022 !ArrayType->getArrayElementType()->isIntegerTy())
3023 return 0;
3024
3025 // We pad to 4 byte boundaries
3026 if (Size % 4 == 0)
3027 return 0;
3028
3029 unsigned NumBytesToPad = 4 - (Size % 4);
3030 unsigned NewSize = Size + NumBytesToPad;
3031
3032 // Max number of bytes that memcpy allows for lowering to load/stores before
3033 // it uses library function (__aeabi_memcpy).
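// Illustrative annotation (not part of the original source): e.g. a 13-byte
// string gets NumBytesToPad = 3 and NewSize = 16, and the padding is only
// applied if 16 still fits under the inline-memcpy threshold checked below.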
3034 unsigned MaxMemIntrinsicSize = getMaxMemIntrinsicInlineSizeThreshold();
3035
3036 if (NewSize > MaxMemIntrinsicSize)
3037 return 0;
3038
3039 return NumBytesToPad;
3040}
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const override
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
bool preferPredicatedReductionSelect() const override
bool isLegalMaskedGather(Type *Ty, Align Alignment) const override
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const override
bool isProfitableLSRChainElement(Instruction *I) const override
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
getScalingFactorCost - Return the cost of the scaling used in addressing mode represented by AM.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
Class to represent array types.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
static LLVM_ABI BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This class represents a range of values.
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
Container class for subtarget features.
constexpr bool test(unsigned I) const
constexpr size_t size() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted to NumElts elements.
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition IRBuilder.h:502
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
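A minimal sketch tying the IRBuilder helpers above together; BB and V are assumed to be an existing basic block and a scalar i32 value supplied by the surrounding pass, and the intrinsic chosen is purely illustrative.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;

static Value *emitSplatCtlz(BasicBlock *BB, Value *V) {
  IRBuilder<> Builder(BB->getContext());
  Builder.SetInsertPoint(BB);                     // append to the end of BB
  Value *Splat = Builder.CreateVectorSplat(4, V); // <4 x i32> splat of V
  // llvm.ctlz is mangled on the value type; the i1 'true' sets is_zero_poison.
  return Builder.CreateIntrinsic(Intrinsic::ctlz, {Splat->getType()},
                                 {Splat, Builder.getTrue()});
}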
The core instruction combiner logic.
const DataLayout & getDataLayout() const
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
IRBuilder< TargetFolder, IRBuilderCallbackInserter > BuilderTy
An IRBuilder that automatically inserts new instructions into the worklist.
DominatorTree & getDominatorTree() const
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, const SimplifyQuery &Q, unsigned Depth=0)=0
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
AssumptionCache & getAssumptionCache() const
static InstructionCost getInvalid(CostType Val=0)
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isShift() const
const SmallVectorImpl< Type * > & getArgTypes() const
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Drive the analysis of memory accesses in the loop.
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
AssumptionCache * getAssumptionCache() const
const LoopAccessInfo * getLAI() const
ScalarEvolution * getScalarEvolution() const
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
const FeatureBitset & getFeatureBits() const
Information for memory intrinsic cost model.
const Instruction * getInst() const
The optimization diagnostic interface.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
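A sketch of a common use of the ScalarEvolution queries above: turning a loop's backedge-taken count into a trip count. L and SE are assumed to come from the surrounding pass, and the 32-bit check is only an example of what a consumer might do with the result.

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

static bool tripCountFitsIn32Bits(const Loop *L, ScalarEvolution &SE) {
  if (!SE.hasLoopInvariantBackedgeTakenCount(L))
    return false;
  const SCEV *BTC = SE.getBackedgeTakenCount(L);
  SmallVector<const SCEV *, 2> Ops = {BTC, SE.getOne(BTC->getType())};
  const SCEV *TripCount = SE.getAddExpr(Ops); // backedge-taken count + 1
  return SE.getUnsignedRangeMax(TripCount).getActiveBits() <= 32;
}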
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
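A small sketch of the de-interleave check above (assuming these are the ShuffleVectorInst static helpers of the same name); the expected result follows from the mask shape described.

#include "llvm/IR/Instructions.h"
using namespace llvm;

void deinterleaveMaskExample() {
  int Mask[] = {0, 2, 4, 6}; // even lanes of a factor-2 interleaving
  unsigned Index = 0;
  bool IsDeinterleave =
      ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, /*Factor=*/2, Index);
  // Expected: IsDeinterleave == true and Index == 0.
  (void)IsDeinterleave;
}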
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
Provides information about what library functions are available for the current target.
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual bool isLoweredToCall(const Function *F) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
MaskKind
Some targets only support masked load/store with a constant mask.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
@ AMK_PreIndexed
Prefer pre-indexed addressing mode.
@ AMK_None
Don't prefer any addressing mode.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition Type.h:264
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:246
Type * getArrayElementType() const
Definition Type.h:408
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into an shifter_operand immed...
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:879
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:843
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:795
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:764
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:849
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:726
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:977
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:925
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:738
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:958
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:855
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
auto m_Undef()
Match an arbitrary undef constant.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
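A short sketch of the PatternMatch combinators listed above; V is an assumed value being inspected, and the pattern chosen here is purely illustrative.

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

static bool isAddOfExtend(Value *V) {
  Value *X = nullptr, *Y = nullptr;
  // Matches "add X, (zext|sext Y)" with the operands in either order.
  return match(V, m_c_Add(m_Value(X), m_ZExtOrSExt(m_Value(Y))));
}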
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
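A sketch of the usual cost-table idiom built on CostTableLookup; the entries below are illustrative, not the ARM backend's real costs.

#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
using namespace llvm;

// Illustrative (opcode, type, cost) triples.
static const CostTblEntry ExampleCostTbl[] = {
    {ISD::ADD, MVT::v4i32, 1},
    {ISD::SHL, MVT::v8i16, 2},
};

unsigned lookupExampleCost(int ISDOpcode, MVT VT) {
  if (const auto *Entry = CostTableLookup(ExampleCostTbl, ISDOpcode, VT))
    return Entry->Cost;
  return 1; // default when the table has no entry for this (opcode, type) pair
}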
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2544
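A tiny sketch of enumerate(), which pairs each element with its index and avoids a manual counter.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

void printMask(const SmallVectorImpl<int> &Mask) {
  for (const auto &E : enumerate(Mask))
    dbgs() << E.index() << " -> " << E.value() << "\n";
}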
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Runtime
Detect stack use after return if not disabled runtime with (ASAN_OPTIONS=detect_stack_use_after_retur...
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
Align getKnownAlignment(Value *V, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to infer an alignment for the specified pointer.
Definition Local.h:252
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
LLVM_ABI SmallVector< Instruction *, 8 > findDefsUsedOutsideOfLoop(Loop *L)
Returns the instructions that use values defined in the loop.
SelectPatternFlavor
Specific patterns of select instructions we can match.
@ SPF_ABS
Absolute value.
@ SPF_FMAXNUM
Floating point maxnum.
@ SPF_UMIN
Unsigned minimum.
@ SPF_UMAX
Unsigned maximum.
@ SPF_SMAX
Signed maximum.
@ SPF_FMINNUM
Floating point minnum.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp=nullptr, unsigned Depth=0)
Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind and providing the out param...
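A sketch of using matchSelectPattern together with the SelectPatternFlavor values above; V is an assumed select-like value.

#include "llvm/Analysis/ValueTracking.h"
using namespace llvm;

static bool isSignedMaxIdiom(Value *V) {
  Value *LHS, *RHS;
  SelectPatternResult SPR = matchSelectPattern(V, LHS, RHS);
  return SPR.Flavor == SPF_SMAX; // e.g. "select (icmp sgt a, b), a, b"
}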
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
RecurKind
These are the kinds of recurrences that we support.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2009
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
@ Data
Use predicate only to mask operations on data in the loop.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
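A sketch of querying getPtrStride with the signature shown above; all of the parameters are assumed to be supplied by the surrounding loop analysis.

#include "llvm/Analysis/LoopAccessAnalysis.h"
using namespace llvm;

static bool isUnitStrideAccess(PredicatedScalarEvolution &PSE, Type *AccessTy,
                               Value *Ptr, const Loop *Lp,
                               const DominatorTree &DT) {
  std::optional<int64_t> Stride = getPtrStride(PSE, AccessTy, Ptr, Lp, DT);
  return Stride && *Stride == 1; // consecutive forward access
}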
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
Attributes of a target dependent hardware loop.
LLVM_ABI bool canAnalyze(LoopInfo &LI)
LLVM_ABI bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
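A sketch of the usual candidacy checks made with HardwareLoopInfo before a target commits to a hardware loop; L, SE, LI and DT are assumed to come from the caller, and the HardwareLoopInfo(Loop *) constructor is an assumption of this sketch.

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
using namespace llvm;

static bool mayUseHardwareLoop(Loop *L, ScalarEvolution &SE, LoopInfo &LI,
                               DominatorTree &DT) {
  HardwareLoopInfo HWLoopInfo(L); // assumed Loop* constructor
  if (!HWLoopInfo.canAnalyze(LI))
    return false;
  return HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT);
}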
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
static MemOp Set(uint64_t Size, bool DstAlignCanChange, Align DstAlign, bool IsZeroMemset, bool IsVolatile)
static MemOp Copy(uint64_t Size, bool DstAlignCanChange, Align DstAlign, Align SrcAlign, bool IsVolatile, bool MemcpyStrSrc=false)
SelectPatternFlavor Flavor
TargetLibraryInfo * TLI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
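A sketch of how a target's getUnrollingPreferences hook might populate a few of the fields above; the values are illustrative defaults, not the ARM backend's actual tuning.

#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

void setExampleUnrollPrefs(TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Partial = true;                // allow partial unrolling
  UP.Runtime = true;                // allow runtime unrolling
  UP.UpperBound = true;             // may unroll using the trip-count upper bound
  UP.UnrollRemainder = true;        // unroll the runtime remainder loop
  UP.DefaultUnrollRuntimeCount = 4; // default count for runtime-unrolled loops
}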